In [1]:
# Import pandas for data handling
import pandas as pd

# NLTK is the Natural Language Toolkit
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Libraries for helping us with strings
import string
# Regular Expression Library
import re

# Import our text vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Import our classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier


# Import some ML helper functions
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report


# Import our metrics to evaluate our model
from sklearn import metrics
from sklearn.metrics import classification_report


# Library for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the NLTK resources this notebook relies on (stop-word list,
# punkt tokenizer models, WordNet).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load the English stop-word list. The corpus reader is reached through
# `nltk.corpus` rather than the imported `stopwords` name, because this
# assignment rebinds `stopwords` to the word list itself; going through the
# package keeps the line re-runnable on its own without a fresh import.
stopwords = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shariahoque/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shariahoque/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shariahoque/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Data

In [20]:
# Read the news-articles CSV (the path is relative to the current working directory)
df = pd.read_csv('DataNews/news_articles.csv')
# Show the (rows, columns) shape, then preview the first rows
print(df.shape)
df.head()

(2096, 12)


Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [21]:
# Inspect: count the missing values in each column
print(df.isnull().sum())

author                      0
published                   0
title                       0
text                       46
language                    1
site_url                    1
main_img_url                1
type                        1
label                       1
title_without_stopwords     2
text_without_stopwords     50
hasImage                    1
dtype: int64


In [22]:
# Drop every row that contains at least one null value.
# Duplicates are not touched here; `drop_duplicates` runs in a later cell.
df = df.dropna()

# Sanity check: shape after removing the null rows

print(df.shape)

(2045, 12)


In [23]:
# Confirm that no missing values remain in any column
df.isnull().sum()

author                     0
published                  0
title                      0
text                       0
language                   0
site_url                   0
main_img_url               0
type                       0
label                      0
title_without_stopwords    0
text_without_stopwords     0
hasImage                   0
dtype: int64

In [24]:
# Count the rows that are full duplicates of another row
print(df.duplicated().sum())


10


In [25]:
# Remove the duplicated rows
df = df.drop_duplicates()

# Shape after de-duplication
print(df.shape, 'after')

(2035, 12) after


In [27]:
# Verify that no duplicated rows remain
print(df.duplicated().sum())

0


In [28]:
# Class balance of the target column `label`
df['label'].value_counts()

Fake    1281
Real     754
Name: label, dtype: int64

In [29]:
# Distribution of the article `type` column
df['type'].value_counts()

bs            588
conspiracy    430
bias          389
hate          244
satire        146
state         121
junksci       102
fake           15
Name: type, dtype: int64

In [41]:
# Title of the row labelled 10, as provided in the raw `title` column
print("Original TEXT:", df.loc[10, 'title'])

Original TEXT: yikes hillary goes off the railspulls a howard dean video


In [42]:
# Same row, using the dataset's stop-word-free title column
print("Clean TEXT from data:", df.loc[10, 'title_without_stopwords'])

Clean TEXT from data: yikes hillary goes railspulls howard dean video


In [39]:
# print("ORIGINAL TEXT:", df['text'][10])

In [37]:
# Article body of the row labelled 10, from the stop-word-free text column
print("Clean TEXT from data:", df.loc[10, 'text_without_stopwords'])

ORIGINAL TEXT: comedian would move spain buy house another country case people threaten leave country dont leave country said live kelly michael weirdly called trump charming interview neve campbell house cards actress would move canada honesty terrifying told huffington post uk barry diller founder iac interactive would move unspecified donald trump doesnt fall ill either move country join resistance told bloomberg lena dunham creator girls would move vancouver know lot people threatening really said matrix awards keeganmichael key star key peele would move canada easy like minutes detroit thats im told tmz chloë sevigny actress guest star portlandia would move nova scotia answered simply nova scotia question would move trump elected al sharpton activist would move donald trump nominee im open support anyone im also reserving ticket wins said press conference natasha lyonne actress orange new black would move mental hospital ill move mental hospital youre like happening said eddie gri

# Text Pre-Processing

### Stop words are already removed
### Punctuation is already removed
### Already in Lowercase

In [31]:
# Reduce every word of a text to its root (stem)
def root_words(string):
    """Return `string` with every token replaced by its Porter stem.

    The text is split into tokens with ``word_tokenize``, each token is run
    through ``PorterStemmer.stem``, and the stems are joined back together
    with single spaces.

    Parameters
    ----------
    string : str
        Text to stem.

    Returns
    -------
    str
        The space-joined stems, in the original token order.
    """
    stemmer = PorterStemmer()
    stems = [stemmer.stem(token) for token in word_tokenize(string)]
    return ' '.join(stems)

# Quick check of the stemmer on a sentence with several inflections of "play"
sent = 'I played and started playing with players and we all love to play with plays'
root_words(sent)

'I play and start play with player and we all love to play with play'

In [46]:
def text_pipeline(input_string):
    """Apply the text pre-processing steps to `input_string` and return the result.

    The pipeline currently consists of a single step: stemming via `root_words`.
    """
    return root_words(input_string)


In [53]:
# Stem the stop-word-free titles into a new column. The column is created
# directly from the result of `apply`; no placeholder copy is needed first.
df['title_after'] = df['title_without_stopwords'].apply(text_pipeline)

# Compare the title of the row labelled 0 before and after stemming
print("Clean TEXT from data:", df.loc[0, 'title_without_stopwords'])
print("CLEANDED TEXT:", df.loc[0, 'title_after'])

Clean TEXT from data: muslims busted stole millions govt benefits
CLEANDED TEXT: muslim bust stole million govt benefit


In [50]:
# Stem the stop-word-free article bodies into a new column. The column is
# created directly from the result of `apply`; no placeholder copy is needed.
df['text_after'] = df['text_without_stopwords'].apply(text_pipeline)

In [54]:
# Preview the last two rows, including the `title_after` / `text_after` columns
df.tail(2)

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage,title_after,text_after
2044,Ann Coulter,2016-10-27T03:05:01.989+03:00,our new country women and minorities hit hardest,wars and rumors of wars russia unveils satan ...,english,wnd.com,http://www.wnd.com/files/2016/10/danney-willll...,bias,Real,wikileaks bombshells hillary need know,posted eddie skyhigh potency may scare away cr...,1.0,wikileak bombshel hillari need know,post eddi skyhigh potenc may scare away crysta...
2045,Larry Elder,2016-10-27T03:05:05.815+03:00,trump vs clinton a risk vs a disaster,check out hillarythemed haunted house anticlin...,english,wnd.com,http://www.wnd.com/files/2015/10/Hillary-Clint...,bias,Real,fascinated sex,billion even known keeping supposedly deleted ...,0.0,fascin sex,billion even known keep supposedli delet messa...


# Split the data into testing and training