In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
from sklearn.metrics import f1_score
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import re
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
fake_df = pd.read_csv('data/Fake.csv')
true_df = pd.read_csv('data/True.csv')

Combining Datasets for Balance Analysis

In [3]:
fake_df['source'] = 0
true_df['source'] = 1

combined_df = pd.concat([fake_df, true_df], axis=0)
df = combined_df.sample(frac=1).reset_index(drop=True)
df = df[['text', 'title', 'source']] 
df.to_csv('data/combined.csv', index=False)
df

Unnamed: 0,text,title,source
0,SAN DIEGO (Reuters) - U.S. District Judge Gonz...,U.S. judge to mull release of Trump video test...,1
1,Florida Governor Rick Scott is not holding any...,FLORIDA GOVERNOR GOES OFF ON OBAMA: “The Secon...,0
2,BERLIN (Reuters) - The German government is no...,Germany not mulling sanctions against U.S. aft...,1
3,NEW YORK (Reuters) - A U.S. appeals court on T...,U.S. appeals court rejects Trump's bid to bar ...,1
4,UNITED NATIONS (Reuters) - The United Nations ...,U.N. enacts sanctions against anyone hindering...,1
...,...,...,...
44893,Donald Trump is nigh-universally recognized as...,50 Top Republicans Pen BRUTAL Letter Warning ...,0
44894,WASHINGTON (Reuters) - Expanded negotiations a...,U.S. Commerce chief says expanded North Korean...,1
44895,In the interest of wasting as much time and en...,"Christian Nutjobs: ‘Eliminate Evolution, It’s...",0
44896,"Bolling tweeted a couple of messages, thanking...",BOOM! FOX NEWS HOST ERIC BOLLING Hits Back Wit...,0


Source Assessment

In [4]:
df.isna().sum()
df.dropna()

Unnamed: 0,text,title,source
0,SAN DIEGO (Reuters) - U.S. District Judge Gonz...,U.S. judge to mull release of Trump video test...,1
1,Florida Governor Rick Scott is not holding any...,FLORIDA GOVERNOR GOES OFF ON OBAMA: “The Secon...,0
2,BERLIN (Reuters) - The German government is no...,Germany not mulling sanctions against U.S. aft...,1
3,NEW YORK (Reuters) - A U.S. appeals court on T...,U.S. appeals court rejects Trump's bid to bar ...,1
4,UNITED NATIONS (Reuters) - The United Nations ...,U.N. enacts sanctions against anyone hindering...,1
...,...,...,...
44893,Donald Trump is nigh-universally recognized as...,50 Top Republicans Pen BRUTAL Letter Warning ...,0
44894,WASHINGTON (Reuters) - Expanded negotiations a...,U.S. Commerce chief says expanded North Korean...,1
44895,In the interest of wasting as much time and en...,"Christian Nutjobs: ‘Eliminate Evolution, It’s...",0
44896,"Bolling tweeted a couple of messages, thanking...",BOOM! FOX NEWS HOST ERIC BOLLING Hits Back Wit...,0


Combining text and title column, these are our features

In [5]:
text = (df['title'] + df['text']).astype(str)
text

0        U.S. judge to mull release of Trump video test...
1        FLORIDA GOVERNOR GOES OFF ON OBAMA: “The Secon...
2        Germany not mulling sanctions against U.S. aft...
3        U.S. appeals court rejects Trump's bid to bar ...
4        U.N. enacts sanctions against anyone hindering...
                               ...                        
44894    U.S. Commerce chief says expanded North Korean...
44895     Christian Nutjobs: ‘Eliminate Evolution, It’s...
44896    BOOM! FOX NEWS HOST ERIC BOLLING Hits Back Wit...
44897    Hungry South Sudanese refugees risk death in r...
Length: 44898, dtype: object

Cleaning and Lemmatizing Text

In [6]:
def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.strip()
    return text

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
def lemmatize_stopwords(words):
    return [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]

tokens = text.apply(clean_text).apply(word_tokenize).apply(lemmatize_stopwords)
tokens

0        [u, judge, mull, release, trump, video, testim...
1        [florida, governor, go, obama, second, amendme...
2        [germany, mulling, sanction, u, climate, pact,...
3        [u, appeal, court, reject, trump, bid, bar, re...
4        [u, n, enacts, sanction, anyone, hindering, ma...
                               ...                        
44894    [u, commerce, chief, say, expanded, north, kor...
44895    [christian, nutjobs, eliminate, evolution, rel...
44896    [boom, fox, news, host, eric, bolling, hit, ba...
44897    [hungry, south, sudanese, refugee, risk, death...
Length: 44898, dtype: object

Creating Corpus

In [7]:
corpus = tokens.apply(lambda x: ' '.join(x))
corpus

0        u judge mull release trump video testimonysan ...
1        florida governor go obama second amendment kil...
2        germany mulling sanction u climate pact withdr...
3        u appeal court reject trump bid bar refugeesne...
4        u n enacts sanction anyone hindering mali peac...
                               ...                        
44894    u commerce chief say expanded north korean san...
44895    christian nutjobs eliminate evolution religion...
44896    boom fox news host eric bolling hit back milli...
44897    hungry south sudanese refugee risk death retur...
Length: 44898, dtype: object

Vectorizing (TF-IDF Vectorization)

In [8]:
vectorizer = TfidfVectorizer(use_idf=True, max_df=0.5, ngram_range=(1, 1))
tfidf = vectorizer.fit_transform(corpus)
X = tfidf
y = df['source']

#### Model Testing


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier, plot_tree

seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
print("Size of train dataset: {} rows".format(X_train.shape[0]))
print("Size of test dataset: {} rows".format(X_test.shape[0]))

Size of train dataset: 30081 rows
Size of test dataset: 14817 rows


XGB

In [11]:
model = XGBClassifier()
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred, 'Probability': y_pred_proba})
print(predictions_df.head())

       Actual  Predicted  Probability
25168       0          0     0.000072
5096        1          1     0.999991
3123        0          0     0.000004
37809       0          0     0.000005
27507       0          0     0.000642


In [16]:
f1 = f1_score(y_test, y_pred)
print(f"XGBoost F1 Score: {f1}")

# evaluate predictions
from sklearn.metrics import confusion_matrix
Cmat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Confusion Matrix: \n")
print(Cmat)
print("\n Accuracy: %.2f%%" % (accuracy * 100.0))

XGBoost F1 Score: 0.9974536709577027
Confusion Matrix: 

[[7730   25]
 [  11 7051]]

 Accuracy: 99.76%


Multinomial Bayes

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, auc, precision_recall_curve, f1_score, confusion_matrix, ConfusionMatrixDisplay

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [18]:
y_pred_nb = nb_model.predict(X_test)
y_pred_proba_nb = nb_model.predict_proba(X_test)[:, 1]
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_nb, 'Probability': y_pred_proba_nb})
print(predictions_df.head())

       Actual  Predicted  Probability
25168       0          0     0.001055
5096        1          1     0.999830
3123        0          0     0.000004
37809       0          0     0.000550
27507       0          0     0.002909


In [19]:
f1_nb = f1_score(y_test, y_pred_nb)
print(f"Multinomial Naive Bayes F1 Score: {f1_nb}")

cm_nb = confusion_matrix(y_test, y_pred_nb)
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Confusion Matrix: \n")
print(Cmat)
print("\n Accuracy: %.2f%%" % (accuracy * 100.0))

Multinomial Naive Bayes F1 Score: 0.9325206167084976
Confusion Matrix: 

[[7730   25]
 [  11 7051]]

 Accuracy: 99.76%


Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=seed)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

In [21]:
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_rf, 'Probability': y_pred_proba_rf})
print(predictions_df.head())

       Actual  Predicted  Probability
25168       0          0         0.09
5096        1          1         0.92
3123        0          0         0.05
37809       0          0         0.02
27507       0          0         0.12


In [22]:
f1_rf = f1_score(y_test, y_pred_rf)
print(f"Random Forest F1 Score: {f1_rf}")

cm_nb = confusion_matrix(y_test, y_pred_rf)
accuracy_nb = accuracy_score(y_test, y_pred_rf)
print("Confusion Matrix: \n")
print(Cmat)
print("\n Accuracy: %.2f%%" % (accuracy * 100.0))

Random Forest F1 Score: 0.9892259710802381
Confusion Matrix: 

[[7730   25]
 [  11 7051]]

 Accuracy: 99.76%


In [None]:
print('hi')