In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils import resample
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import re
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [3]:
fake_df = pd.read_csv('data/Fake.csv')
true_df = pd.read_csv('data/True.csv')

Combining Datasets for Balance Analysis

In [4]:
fake_df['source'] = 'Fake'
true_df['source'] = 'True'

combined_df = pd.concat([fake_df, true_df], axis=0)
df = combined_df.sample(frac=1).reset_index(drop=True)
df = df[['text', 'title', 'source']] 
df.to_csv('data/combined.csv', index=False)
df

Unnamed: 0,text,title,source
0,A clear violation of Obama s lopsided deal w...,BREAKING: Iran Publicly Humiliates Obama…Unvei...,Fake
1,You have got to be kidding me! After all the t...,STUNNING DEVELOPMENT: OBAMA GIVES $75 MILLION ...,Fake
2,"Meanwhile, families with two full time working...",NO MORE FUN FREEBIES ON BACKS OF HARD WORKING ...,Fake
3,JOHANNESBURG (Reuters) - Zimbabwe s top genera...,Zimbabwe's Mugabe in talks with ousted vice pr...,True
4,NEW YORK (Reuters) - Republican presidential n...,"Trump calls tax avoidance 'smart,' most Americ...",True
...,...,...,...
44893,LIMA (Reuters) - President Pedro Pablo Kuczyns...,Peru's Congress prepares to oust President Kuc...,True
44894,SEOUL (Reuters) - South Korea s environment mi...,South Korea to announce approval of environmen...,True
44895,Donald Trump s self-congratulatory ad to celeb...,CNN Refuses To Air Trump’s First 100 Days Ad ...,Fake
44896,WASHINGTON (Reuters) - U.S. Republican preside...,Santorum to pull out of Republican White House...,True


Source Assessment

In [5]:
df.isna().sum()
df.dropna()

Unnamed: 0,text,title,source
0,A clear violation of Obama s lopsided deal w...,BREAKING: Iran Publicly Humiliates Obama…Unvei...,Fake
1,You have got to be kidding me! After all the t...,STUNNING DEVELOPMENT: OBAMA GIVES $75 MILLION ...,Fake
2,"Meanwhile, families with two full time working...",NO MORE FUN FREEBIES ON BACKS OF HARD WORKING ...,Fake
3,JOHANNESBURG (Reuters) - Zimbabwe s top genera...,Zimbabwe's Mugabe in talks with ousted vice pr...,True
4,NEW YORK (Reuters) - Republican presidential n...,"Trump calls tax avoidance 'smart,' most Americ...",True
...,...,...,...
44893,LIMA (Reuters) - President Pedro Pablo Kuczyns...,Peru's Congress prepares to oust President Kuc...,True
44894,SEOUL (Reuters) - South Korea s environment mi...,South Korea to announce approval of environmen...,True
44895,Donald Trump s self-congratulatory ad to celeb...,CNN Refuses To Air Trump’s First 100 Days Ad ...,Fake
44896,WASHINGTON (Reuters) - U.S. Republican preside...,Santorum to pull out of Republican White House...,True


Combining text and title column, these are our features

In [6]:
text = (df['title'] + df['text']).astype(str)
text

0        BREAKING: Iran Publicly Humiliates Obama…Unvei...
1        STUNNING DEVELOPMENT: OBAMA GIVES $75 MILLION ...
2        NO MORE FUN FREEBIES ON BACKS OF HARD WORKING ...
3        Zimbabwe's Mugabe in talks with ousted vice pr...
4        Trump calls tax avoidance 'smart,' most Americ...
                               ...                        
44893    Peru's Congress prepares to oust President Kuc...
44894    South Korea to announce approval of environmen...
44895     CNN Refuses To Air Trump’s First 100 Days Ad ...
44896    Santorum to pull out of Republican White House...
44897     The World Speaks Out On Trump — Top Foreign D...
Length: 44898, dtype: object

Cleaning and Lemmatizing Text

In [7]:
def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.strip()
    return text

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
def lemmatize_stopwords(words):
    return [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]

tokens = text.apply(clean_text).apply(word_tokenize).apply(lemmatize_stopwords)
tokens

0        [breaking, iran, publicly, humiliates, obama, ...
1        [stunning, development, obama, give, million, ...
2        [fun, freebie, back, hard, working, american, ...
3        [zimbabwe, mugabe, talk, ousted, vice, preside...
4        [trump, call, tax, avoidance, smart, american,...
                               ...                        
44893    [peru, congress, prepares, oust, president, ku...
44894    [south, korea, announce, approval, environment...
44895    [cnn, refuse, air, trump, first, day, ad, one,...
44896    [santorum, pull, republican, white, house, rac...
44897    [world, speaks, trump, top, foreign, diplomat,...
Length: 44898, dtype: object

Creating Corpus

In [8]:
corpus = tokens.apply(lambda x: ' '.join(x))
corpus

0        breaking iran publicly humiliates obama unveil...
1        stunning development obama give million u tax ...
2        fun freebie back hard working american kansa t...
3        zimbabwe mugabe talk ousted vice president arm...
4        trump call tax avoidance smart american call u...
                               ...                        
44893    peru congress prepares oust president kuczynsk...
44894    south korea announce approval environment repo...
44895    cnn refuse air trump first day ad one good rea...
44896    santorum pull republican white house race cnnw...
44897    world speaks trump top foreign diplomat unite ...
Length: 44898, dtype: object

Vectorizing (TF-IDF Vectorization)

In [9]:
vectorizer = TfidfVectorizer(use_idf=True, max_df=0.5, ngram_range=(1, 1))
tfidf = vectorizer.fit_transform(corpus)
X = tfidf
y = df['source']