In [5]:
import pandas as pd
import numpy as np
import re
from sklearn.utils import shuffle 

In [6]:
real = pd.read_csv('./Data_NLP/True.csv.zip')
fake = pd.read_csv('./Data_NLP/Fake.csv.zip')

In [7]:
real.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [8]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [10]:
real['label'] = 1
fake['label'] = 0

news_data = pd.concat([real, fake], ignore_index= True)
news_data = shuffle(news_data)

In [11]:
news_data.head()

Unnamed: 0,title,text,subject,date,label
18184,"Austria's conservatives, Social Democrats to s...",VIENNA (Reuters) - Austria s co-governing Soci...,worldnews,"October 6, 2017",1
993,U.S. belatedly begins to comply with Russia sa...,WASHINGTON (Reuters) - The U.S. State Departme...,politicsNews,"October 26, 2017",1
38827,WHY CHRISTIANS ARE ASKING THEIR FELLOW CHRISTI...,"It s about time Rev. Patrick Mahoney, of Churc...",Government News,"Apr 18, 2015",0
30400,Happy 2016! Republicans Already Promise To Be...,Well that didn t take long at all did it? 2016...,News,"January 4, 2016",0
32213,TRIGGERED! FORMER CIA AGENT: ‘Trey Gowdy Ought...,Former CIA Agent Phil Mudd is a jackwagon! He ...,politics,"May 24, 2017",0


In [12]:
news_data.isna().sum()

title      0
text       0
subject    0
date       0
label      0
dtype: int64

In [13]:
news_data.drop(['subject', 'date'], axis= 1)

Unnamed: 0,title,text,label
18184,"Austria's conservatives, Social Democrats to s...",VIENNA (Reuters) - Austria s co-governing Soci...,1
993,U.S. belatedly begins to comply with Russia sa...,WASHINGTON (Reuters) - The U.S. State Departme...,1
38827,WHY CHRISTIANS ARE ASKING THEIR FELLOW CHRISTI...,"It s about time Rev. Patrick Mahoney, of Churc...",0
30400,Happy 2016! Republicans Already Promise To Be...,Well that didn t take long at all did it? 2016...,0
32213,TRIGGERED! FORMER CIA AGENT: ‘Trey Gowdy Ought...,Former CIA Agent Phil Mudd is a jackwagon! He ...,0
...,...,...,...
12432,Australian PM says government at risk if it lo...,SYDNEY (Reuters) - Australian Prime Minister M...,1
28300,Former ‘Apprentice’ Contestants Unite AGAINST...,When you want to learn more about what a perso...,0
6482,U.S. Congress certifies Trump's Electoral Coll...,WASHINGTON (Reuters) - The U.S. Congress on Fr...,1
43646,"Trump Asks O’Reilly, ‘Do you think our country...",21st Century Wire says Regardless of what one ...,0


In [14]:
news_data['text_processed'] = news_data['text'].map(lambda x: re.sub('(Reuters)', "", x))
news_data['text_processed'] = news_data['text_processed'].map(lambda x: re.sub('[^A-Za-z0-9]+', ' ', x))
news_data['text_processed'] = news_data['text_processed'].map(lambda x: x.lower())

In [15]:
news_data['text_processed']

18184    vienna austria s co governing social democrats...
993      washington the u s state department on thursda...
38827    it s about time rev patrick mahoney of church ...
30400    well that didn t take long at all did it 2016 ...
32213    former cia agent phil mudd is a jackwagon he t...
                               ...                        
12432    sydney australian prime minister malcolm turnb...
28300    when you want to learn more about what a perso...
6482     washington the u s congress on friday certifie...
43646    21st century wire says regardless of what one ...
2295     united nations the united nations security cou...
Name: text_processed, Length: 44898, dtype: object

In [18]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize(text):
    lm = WordNetLemmatizer()
    tokens = [lm.lemmatize(word) for word in text.split()]
    return " ".join(tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [19]:
lemmatize(news_data['text_processed'] [0])

'washington the head of a conservative republican faction in the u s congress who voted this month for a huge expansion of the national debt to pay for tax cut called himself a fiscal conservative on sunday and urged budget restraint in 2018 in keeping with a sharp pivot under way among republican u s representative mark meadow speaking on cbs face the nation drew a hard line on federal spending which lawmaker are bracing to do battle over in january when they return from the holiday on wednesday lawmaker will begin trying to pas a federal budget in a fight likely to be linked to other issue such a immigration policy even a the november congressional election campaign approach in which republican will seek to keep control of congress president donald trump and his republican want a big budget increase in military spending while democrat also want proportional increase for non defense discretionary spending on program that support education scientific research infrastructure public heal

In [20]:
news_data['text_processed '] = news_data['text_processed'].apply(lemmatize)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [22]:
y = news_data['label']
X = news_data.drop('label', axis=1)

In [24]:
X.head()

Unnamed: 0,title,text,subject,date,text_processed,text_processed.1
18184,"Austria's conservatives, Social Democrats to s...",VIENNA (Reuters) - Austria s co-governing Soci...,worldnews,"October 6, 2017",vienna austria s co governing social democrats...,vienna austria s co governing social democrat ...
993,U.S. belatedly begins to comply with Russia sa...,WASHINGTON (Reuters) - The U.S. State Departme...,politicsNews,"October 26, 2017",washington the u s state department on thursda...,washington the u s state department on thursda...
38827,WHY CHRISTIANS ARE ASKING THEIR FELLOW CHRISTI...,"It s about time Rev. Patrick Mahoney, of Churc...",Government News,"Apr 18, 2015",it s about time rev patrick mahoney of church ...,it s about time rev patrick mahoney of church ...
30400,Happy 2016! Republicans Already Promise To Be...,Well that didn t take long at all did it? 2016...,News,"January 4, 2016",well that didn t take long at all did it 2016 ...,well that didn t take long at all did it 2016 ...
32213,TRIGGERED! FORMER CIA AGENT: ‘Trey Gowdy Ought...,Former CIA Agent Phil Mudd is a jackwagon! He ...,politics,"May 24, 2017",former cia agent phil mudd is a jackwagon he t...,former cia agent phil mudd is a jackwagon he t...


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, stratify = y , random_state = 100)

In [28]:
tfidf = TfidfVectorizer(stop_words= 'english', ngram_range= (1,3), lowercase= True, max_features= 5000)
X_train_transformed = tfidf.fit_transform(X_train['text_processed'])
X_test_transformed = tfidf.transform(X_test['text_processed'])

In [29]:
X_train_transformed.shape

(31428, 5000)

# Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
lr = LogisticRegression()
lr.fit(X_train_transformed, y_train)

LogisticRegression()

In [32]:
y_pred = lr.predict(X_test_transformed)

In [33]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [34]:
accuracy_score(y_test, y_pred)

0.9806978470675576

In [35]:
confusion_matrix(y_test, y_pred)

array([[6880,  165],
       [  95, 6330]], dtype=int64)

# Decision Tree Classifier

In [36]:
from sklearn.tree import DecisionTreeClassifier

In [37]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train_transformed, y_train)

DecisionTreeClassifier()

In [38]:
y_pred = dtc.predict(X_test_transformed)

In [39]:
accuracy_score(y_test, y_pred)

0.942761692650334

In [40]:
confusion_matrix(y_test, y_pred)

array([[6721,  324],
       [ 447, 5978]], dtype=int64)

# Random Forest Classifier

In [41]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_transformed, y_train)

RandomForestClassifier()

In [42]:
y_pred = rfc.predict(X_test_transformed)

In [43]:
accuracy_score(y_test, y_pred)

0.9844840386043059

In [44]:
confusion_matrix(y_test, y_pred)

array([[6907,  138],
       [  71, 6354]], dtype=int64)