# Pierwszy checkpoint - walidacja
# Fake News Dataset

### Wojtek Grabias, Wiktor Wierzchowski

In [1]:
import pandas as pd
import numpy as np

# Load the raw, uncleaned dataset from the working directory.
df1 = pd.read_csv('original_data.csv')
# Last expression of the cell: renders the frame as the cell output.
df1

Unnamed: 0.1,Unnamed: 0,title,text,Ground Label
0,0,Ann Coulter Make Believes She Has ‘Gay Friend...,"It s hard to believe, but Donald Trump does ha...",fake
1,1,Rating: Moody‘s verbessert Ausblick für Russla...,bankensektor Der russische Staat werde die Ban...,fake
2,2,CAN WE ADD DIRTY MONEY ‘LAUNDERING’ To The Oba...,A member of the House Intelligence Committee i...,fake
3,3,Republicans on Obamacare repeal: 'We're going ...,WASHINGTON (Reuters) - House of Representative...,true
4,4,"Trump, on possible DACA deal, says border wall...",WASHINGTON (Reuters) - U.S. President Donald T...,true
...,...,...,...,...
69040,69040,Burundi opposition platform boycotts new round...,NAIROBI (Reuters) - Burundi s main opposition ...,true
69041,69041,Hillary’s Message To Former Miss Universe Cal...,Miss Universe 1996 Alicia Machado is now an Am...,fake
69042,69042,Cop Crashes Car And Runs Away When More Cops A...,The Daily Sheeple – by Ryan Banister \r\nAn aw...,fake
69043,69043,Trump Stole An Idea From North Korean Propaga...,Jesus f*cking Christ our President* is a moron...,fake


#### Podział danych
Został dokonany po wyczyszczeniu danych – odwrotnie do przyjętej konwencji. Ponieważ jednak w przypadku rozważanej ramki danych zawartość całej ramki nie wpływa na wygląd wektora wynikowego, nie traktujemy tego jako błędu.

#### Interpretacja języków
Jednym z pierwszych założeń grupy preprocessującej było odrzucenie wierszy zawierających artykuły w języku innym niż angielski, co ze względu na charakter ramki danych wydaje się zupełnie zasadne (artykuły w innych językach są wyłącznie fake newsami bądź udział prawdziwych informacji wśród nich jest marginalny).

----
### Data Preprocessing dokonany został z wykorzystaniem poniższego zestawu funkcji:

In [2]:
import re
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk import word_tokenize
# function responsible for cleaning a single row (one text value)

def clean_text(text, punctuation_chars):
    """Normalise one document before vectorisation.

    The steps run in this order: lower-casing, contraction expansion,
    removal of punctuation and digits, removal of one-character tokens,
    collapsing of repeated spaces, stop-word removal and stemming.

    Parameters
    ----------
    text : str
        Raw text of a single title or article body.
    punctuation_chars : iterable of str
        Characters that are deleted from the text.

    Returns
    -------
    str
        The cleaned text, as returned by ``stemming``.
    """
    # convert to lower case
    text = text.lower()
    # expand contractions (``expand_contractions`` is defined outside this cell)
    text = expand_contractions(text)
    # remove punctuation and digits in a single pass; both are plain
    # character deletions, so one translation table covers them
    removal_table = str.maketrans('', '', ''.join(punctuation_chars) + string.digits)
    text = text.translate(removal_table)
    # remove all single characters; lookarounds are used instead of matching
    # the surrounding spaces so that the delimiter is not consumed and runs
    # such as "u s a" are removed completely (not only every second token)
    text = re.sub(r'(?<!\S)\S(?!\S)', ' ', text)
    # remove multiple spaces
    text = re.sub(' +', ' ', text)
    # remove stopwords
    text = delete_stopwords(text)
    # stemming
    text = stemming(text)
    return text

# function responsible for removing stop words

def delete_stopwords(text):
    """Return ``text`` without English stop words.

    The text is split on whitespace; a token is dropped when its lower-cased
    form is in NLTK's English stop-word list (loaded on every call). The
    remaining tokens are joined with single spaces.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept_tokens = []
    for token in text.split():
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# function responsible for stemming

def stemming(text):
    """Tokenise ``text`` with NLTK and return the Porter stems joined by spaces."""
    stemmer = PorterStemmer()
    return ' '.join(map(stemmer.stem, word_tokenize(text)))

#### Poprawność funkcji  procesujących
Powyższe kroki uważamy za zasadne i poprawne w ujęciu technicznym - funkcje przekształcają ramkę danych w sposób zgodny z założeniami.

---
### Zastrzeżenia i propozycje

#### Tekst w ujęciu dosłownym
Przedstawiona przez grupę budującą propozycja przetworzenia danych zwraca uwagę wyłącznie na faktyczną treść tekstu – pominięta została jakakolwiek interpunkcja, styl (formalny/nieformalny) czy użycie wielkich liter.

#### Połączenie kolumn z tytułem i treścią
Jako grupa walidacyjna mamy zastrzeżenia co do scalenia powyższych kolumn. Z czysto praktycznego punktu widzenia sposób, w jaki został napisany tytuł artykułu, czy też sama jego treść, może wskazywać na rzetelność przedstawionej wiadomości. Proponujemy zachować podział na kolumnę z tytułem i kolumnę z treścią artykułu.

#### Propozycja rozwiązań
Powyższe problemy proponujemy rozwiązać poprzez dodanie dwóch kolumn przed obróbką tekstów:
1. Uppercase_ratio - zawiera informacje o stosunku ilości wielkich liter do długości napisu
2. Abbrev_ratio - zawiera informacje o stosunku skrótów typu "i've, you're" do długości tekstu (liczba słów)

In [3]:
def n_upper_chars(string):
    """Return how many characters of ``string`` are upper-case."""
    return sum(map(str.isupper, string))

def upper_ratio(string):
    """Return the share of upper-case characters in ``string``.

    The denominator is the full length of the string (spaces and
    punctuation included). An empty string yields 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    if not string:
        return 0.0
    return n_upper_chars(string) / len(string)

In [4]:
# Lower-case English contractions recognised by abbreviation_counter.
# Only the keys are used for counting; the values document the expansions.
# Built once at definition time instead of on every call.
_CONTRACTIONS = {
    "ain't": "am not / are not / is not / has not / have not",
    "aren't": "are not / am not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he had / he would",
    "he'd've": "he would have",
    "he'll": "he shall / he will",
    "he'll've": "he shall have / he will have",
    "he's": "he has / he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how has / how is / how does",
    "i'd": "I had / I would",
    "i'd've": "I would have",
    "i'll": "I shall / I will",
    "i'll've": "I shall have / I will have",
    "i'm": "I am",
    "i've": "I have",
    "isn't": "is not",
    "it'd": "it had / it would",
    "it'd've": "it would have",
    "it'll": "it shall / it will",
    "it'll've": "it shall have / it will have",
    "it's": "it has / it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she had / she would",
    "she'd've": "she would have",
    "she'll": "she shall / she will",
    "she'll've": "she shall have / she will have",
    "she's": "she has / she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as / so is",
    "that'd": "that would / that had",
    "that'd've": "that would have",
    "that's": "that has / that is",
    "there'd": "there had / there would",
    "there'd've": "there would have",
    "there's": "there has / there is",
    "they'd": "they had / they would",
    "they'd've": "they would have",
    "they'll": "they shall / they will",
    "they'll've": "they shall have / they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we had / we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what shall / what will",
    "what'll've": "what shall have / what will have",
    "what're": "what are",
    "what's": "what has / what is",
    "what've": "what have",
    "when's": "when has / when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where has / where is",
    "where've": "where have",
    "who'll": "who shall / who will",
    "who'll've": "who shall have / who will have",
    "who's": "who has / who is",
    "who've": "who have",
    "why's": "why has / why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you had / you would",
    "you'd've": "you would have",
    "you'll": "you shall / you will",
    "you'll've": "you shall have / you will have",
    "you're": "you are",
    "you've": "you have"
}

# Typographic single quotes are mapped to the ASCII apostrophe used by the keys.
_APOSTROPHE_TABLE = str.maketrans({"\u2019": "'", "\u2018": "'"})

# Anything that is neither a word character nor an apostrophe, at either edge
# of a token (e.g. the full stop in "don't." or the brackets in "(can't)").
_EDGE_PUNCTUATION = re.compile(r"^[^\w']+|[^\w']+$")


def abbreviation_counter(input_text):
    """Count the contractions (e.g. "i've", "you're") that occur in a text.

    Tokens are separated by any whitespace. Before the lookup each token has
    typographic apostrophes normalised, commas/semicolons removed, is
    lower-cased and has surrounding punctuation stripped, so "Don't." and
    "it’s," are counted as well.

    Parameters
    ----------
    input_text : str

    Returns
    -------
    int
        Number of tokens found in the contraction table.
    """
    count = 0
    for word in input_text.split():
        token = word.translate(_APOSTROPHE_TABLE)
        token = re.sub(',|;', '', token).lower()
        token = _EDGE_PUNCTUATION.sub('', token)
        # second lookup covers contractions wrapped in single quotes
        if token in _CONTRACTIONS or token.strip("'") in _CONTRACTIONS:
            count += 1
    return count

def abbrev_ratio(input_text):
    """Return the number of contractions divided by the number of words.

    Words are whitespace-separated tokens, the same tokenisation that
    ``abbreviation_counter`` uses, so the ratio always lies in [0, 1].
    Text without any word yields 0.0.
    """
    n_words = len(input_text.split())
    if n_words == 0:
        return 0.0
    return abbreviation_counter(input_text) / n_words

#### Modyfikacje
Zmodyfikowaliśmy funkcję clean_df, aby unormowanie tekstu nastąpiło po dodaniu wcześniejszych kolumn

In [5]:
def clean_df(df):
    """Select the modelling columns, add style features and clean the text.

    Rows with a missing ``title`` or ``text`` are dropped and remaining
    missing values are replaced with an empty string. ``Uppercase_ratio``
    (from ``title``) and ``Abbrev_ratio`` (from ``text``) are computed on the
    raw strings, before ``clean_text`` lower-cases them and strips
    punctuation. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title``, ``text`` and ``Ground Label``.

    Returns
    -------
    pandas.DataFrame
        New frame with the three original columns (``title`` and ``text``
        cleaned) plus ``Uppercase_ratio`` and ``Abbrev_ratio``.
    """
    # Imported locally so the function does not rely on names that the
    # import cells shown in this notebook never define.
    import sys
    from unicodedata import category

    df = (
        df[['title', 'text', 'Ground Label']]
        .dropna(subset=['title', 'text'])
        .fillna('')
    )

    # every Unicode code point whose general category is a punctuation class
    punctuation_chars = [chr(i) for i in range(sys.maxunicode + 1)
                         if category(chr(i)).startswith("P")]

    # style features must be taken from the raw text, before normalisation
    df['Uppercase_ratio'] = df['title'].apply(upper_ratio)
    df['Abbrev_ratio'] = df['text'].apply(abbrev_ratio)

    df['title'] = df['title'].map(lambda x: clean_text(x, punctuation_chars))
    df['text'] = df['text'].map(lambda x: clean_text(x, punctuation_chars))
    return df

### Wektoryzacja tekstu

Po wykonaniu powyższych zmian otrzymana ramka danych jest gotowa do wektoryzacji. Zespół budowy wykorzystał metodę TF-IDF. Dla urozmaicenia, poniżej wykonaliśmy tę operację wykonując word embedding. To, które z tych podejść okaże się efektywniejsze będzie można zweryfikować na etapie budowania modeli.

In [6]:
# Load the pre-cleaned dataset, keep only the modelling columns and drop
# every row that has a missing value in any of them.
df2 = (
    pd.read_csv('clean_data.csv')
    .loc[:, ['title', 'text', 'Ground Label', 'Uppercase_ratio', 'Abbrev_ratio']]
    .dropna()
)
df2

Unnamed: 0,title,text,Ground Label,Uppercase_ratio,Abbrev_ratio
0,ann coulter make believ gay friend make racist...,hard believ donald trump sizabl amount support...,fake,0.227848,0.000000
1,add dirti money launder obama billion iran ran...,member hous intellig committe accus obama admi...,fake,0.426829,0.000000
2,republican obamacar repeal go get done,washington reuter hous repres republican leade...,true,0.049180,0.000000
3,trump possibl daca deal say border wall would ...,washington reuter us presid donald trump said ...,true,0.079365,0.000000
4,trump administr forc peac jame pinkerton,origin appear american conserv donald trump pl...,fake,0.144578,0.000000
...,...,...,...,...,...
66778,burundi opposit platform boycott new round pea...,nairobi reuter burundi main opposit group said...,true,0.027397,0.000000
66779,hillari messag former miss univers call miss p...,miss univers alicia machado american citizen v...,fake,0.266667,0.000000
66780,cop crash car run away cop arriv,daili sheepl ryan banist awardwin california s...,fake,0.196078,0.000000
66781,trump stole idea north korean propaganda parod...,jesu fcking christ presid moron satisfi simpli...,fake,0.224490,0.001078


Poniższa funkcja zwróci nam krotkę z wektorami word embeddingu dla kolumn title i text oraz kolumnę wartości Ground Label.

In [7]:
import gensim
import nltk

def word_embedding(df):
    """Embed each text as the mean of its Word2Vec word vectors.

    A Word2Vec model (100 dimensions, window 5, ``min_count=1``) is trained
    on the given texts themselves; every text is then represented by the
    average of the vectors of its tokens.

    Parameters
    ----------
    df : iterable of str
        The texts to embed, e.g. one DataFrame column. It is consumed once.

    Returns
    -------
    numpy.ndarray
        One row per input text. A text without any known token gets an
        all-zero row.
    """
    vector_size = 100

    # Tokenise each text exactly once; the token lists are reused for both
    # training and averaging.
    sentences = [nltk.word_tokenize(text) for text in df]

    # Word2Vec needs at least one token to build its vocabulary, so an empty
    # or token-less input is answered directly with zero vectors.
    if not any(sentences):
        return np.zeros((len(sentences), vector_size))

    model = gensim.models.Word2Vec(sentences, vector_size=vector_size, window=5, min_count=1, workers=4)
    keyed_vectors = model.wv

    vector = []
    for words in sentences:
        vectors = [keyed_vectors[word] for word in words if word in keyed_vectors.key_to_index]
        if len(vectors) > 0:
            vector.append(np.mean(vectors, axis=0))
        else:
            vector.append(np.zeros(model.vector_size))
    vector = np.array(vector)
    return vector

def label_to_num(df):
    """Return a copy of ``df`` with ``Ground Label`` encoded as 1 / 0.

    The string ``'true'`` becomes 1 and every other value becomes 0. The
    caller's frame is left untouched, so re-running a cell that calls this
    function does not re-encode already encoded labels into all zeros. A
    column that is already numeric is returned unchanged for the same reason.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Ground Label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    labels = df['Ground Label']
    if pd.api.types.is_numeric_dtype(labels):
        # already encoded - comparing numbers with 'true' would zero them out
        return df.copy()
    return df.assign(**{'Ground Label': np.where(labels == 'true', 1, 0)})
    
def vectorisation(df):
    """Build the model inputs from a cleaned frame.

    Returns a 3-tuple: the embeddings of the ``title`` column, the
    embeddings of the ``text`` column (both produced by ``word_embedding``)
    and the ``Ground Label`` column after ``label_to_num``.
    """
    title_vectors = word_embedding(df['title'])
    text_vectors = word_embedding(df['text'])
    numeric_labels = label_to_num(df)['Ground Label']
    return title_vectors, text_vectors, numeric_labels

In [None]:
# Build the embedding features. Despite its name, df3 is not a DataFrame but
# the tuple (title vectors, text vectors, labels) returned by vectorisation.
df3 = vectorisation(df2)
# Last expression of the cell: displays the tuple as the cell output.
df3