In [3]:
# Import the libraries
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [1]:
# Upload the datasets
from google.colab import files
files.upload()
files.upload()

Saving real_news.csv to real_news.csv


Saving fake_news.csv to fake_news.csv


In [5]:
# Load the two datasets

df_fake = pd.read_csv("fake_news.csv")
df_real = pd.read_csv("real_news.csv")

In [6]:
df_fake

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"
...,...,...,...,...
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016"
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016"
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016"
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016"


In [7]:
df_real

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [8]:
# Set the class of the news (1 real, 0 fake)
df_fake['class'] = 0
df_real['class'] = 1

In [14]:
# Create a new dataframe that combines the previous ones
df = pd.concat([df_fake, df_real])
# Save the new dataframe
df.to_csv("mixed_news.csv")

In [15]:
df

Unnamed: 0,title,text,subject,date,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0
...,...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1


In [16]:
# Drop the useless columns
df = df.drop(['title', 'subject', 'date'], axis=1)
# Random rearrange of the order
df = df.sample(frac=1)
# Show the data
df

Unnamed: 0,text,class
18748,The protesters win by scaring people into sil...,0
9702,NEW YORK (Reuters) - U.S. Treasury Secretary J...,1
20599,WASHINGTON (Reuters) - The U.S.-led coalition ...,1
7607,WASHINGTON (Reuters) - Democratic presidential...,1
21808,More and more dirt on these two grifters who r...,0
...,...,...
2694,"WASHINGTON (Reuters) - Facebook Inc (FB.O), Al...",1
14340,WASHINGTON (Reuters) - The top U.S. general in...,1
22038,Tune in to the Alternate Current Radio Network...,0
4200,Two Donald Trump supporters openly carrying fi...,0


In [17]:
# Reset the index
df.reset_index(inplace=True)
df

Unnamed: 0,index,text,class
0,18748,The protesters win by scaring people into sil...,0
1,9702,NEW YORK (Reuters) - U.S. Treasury Secretary J...,1
2,20599,WASHINGTON (Reuters) - The U.S.-led coalition ...,1
3,7607,WASHINGTON (Reuters) - Democratic presidential...,1
4,21808,More and more dirt on these two grifters who r...,0
...,...,...,...
44893,2694,"WASHINGTON (Reuters) - Facebook Inc (FB.O), Al...",1
44894,14340,WASHINGTON (Reuters) - The top U.S. general in...,1
44895,22038,Tune in to the Alternate Current Radio Network...,0
44896,4200,Two Donald Trump supporters openly carrying fi...,0


In [18]:
# Drop the "Index" column
df.drop(["index"], axis=1, inplace=True)
df

Unnamed: 0,text,class
0,The protesters win by scaring people into sil...,0
1,NEW YORK (Reuters) - U.S. Treasury Secretary J...,1
2,WASHINGTON (Reuters) - The U.S.-led coalition ...,1
3,WASHINGTON (Reuters) - Democratic presidential...,1
4,More and more dirt on these two grifters who r...,0
...,...,...
44893,"WASHINGTON (Reuters) - Facebook Inc (FB.O), Al...",1
44894,WASHINGTON (Reuters) - The top U.S. general in...,1
44895,Tune in to the Alternate Current Radio Network...,0
44896,Two Donald Trump supporters openly carrying fi...,0


In [19]:
# Create a function that cleans the text
def clean_text(text):
	text = text.lower()
	text = re.sub('\[.*?\]', '', text)
	text = re.sub("\\W"," ",text)
	text = re.sub('https?://\S+|www\.\S+', '', text)
	text = re.sub('<.*?>+', '', text)
	text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
	text = re.sub('\n', '', text)
	text = re.sub('\w*\d\w*', '', text)
	return text

In [20]:
# Clean the text
df['text'] = df['text'].apply(clean_text)
df

Unnamed: 0,text,class
0,the protesters win by scaring people into sil...,0
1,new york reuters u s treasury secretary j...,1
2,washington reuters the u s led coalition ...,1
3,washington reuters democratic presidential...,1
4,more and more dirt on these two grifters who r...,0
...,...,...
44893,washington reuters facebook inc fb o al...,1
44894,washington reuters the top u s general in...,1
44895,tune in to the alternate current radio network...,0
44896,two donald trump supporters openly carrying fi...,0


In [21]:
# Prepare the data

x = df["text"]
y = df["class"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [22]:
# Vectorize the data
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

#Logistic Regression

In [24]:
# Train the model
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [25]:
# Predict
pred_lr=LR.predict(xv_test)

In [26]:
# See the results
print(LR.score(xv_test, y_test))
print(classification_report(y_test, pred_lr))

0.9846770601336303
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5854
           1       0.98      0.99      0.98      5371

    accuracy                           0.98     11225
   macro avg       0.98      0.98      0.98     11225
weighted avg       0.98      0.98      0.98     11225



#Decision Tree Classifier

In [27]:
# Train the model
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [28]:
# Predict
pred_dt = DT.predict(xv_test)

In [29]:
# See the results
print(DT.score(xv_test, y_test))
print(classification_report(y_test, pred_dt))

0.9956347438752784
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5854
           1       1.00      0.99      1.00      5371

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



#Gradient Boosting Classifier

In [30]:
# Train the model
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [31]:
# Predict
pred_gbc = GBC.predict(xv_test)

In [32]:
# See the results
print(GBC.score(xv_test, y_test))
print(classification_report(y_test, pred_gbc))

0.995456570155902
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      5854
           1       0.99      1.00      1.00      5371

    accuracy                           1.00     11225
   macro avg       1.00      1.00      1.00     11225
weighted avg       1.00      1.00      1.00     11225



#Random Forest Classifier

In [33]:
# Train the model
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [34]:
# Predict
pred_rfc = RFC.predict(xv_test)

In [35]:
# See the results
print(RFC.score(xv_test, y_test))
print(classification_report(y_test, pred_rfc))

0.987260579064588
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5854
           1       0.99      0.99      0.99      5371

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



#Manual Testing

In [36]:
# Create a function that converts the output label (0 or 1 to "Fake" or "Real")
def output_lablel(n):
	if n == 0:
		return "Fake News"
	if n == 1:
		return "Real News"

In [37]:
# Create a function that test the input news
def manual_testing(news):
	# Convert the news to a dataframe
	testing_news = {"text":[news]}
	new_def_test = pd.DataFrame(testing_news)
	# Clean the text
	new_def_test["text"] = new_def_test["text"].apply(clean_text)
	# Extract the output text
	new_x_test = new_def_test["text"]
	# Vectorize the input
	new_xv_test = vectorization.transform(new_x_test)
	# Predict with the four models
	pred_LR = LR.predict(new_xv_test)
	pred_DT = DT.predict(new_xv_test)
	pred_GBC = GBC.predict(new_xv_test)
	pred_RFC = RFC.predict(new_xv_test)
	# Prepare the output string
	result = "\n\nLR Prediction: {} ".format(output_lablel(pred_LR[0]))
	result += "\nDT Prediction: {} ".format(output_lablel(pred_DT[0]))
	result += "\nGBC Prediction: {} ".format(output_lablel(pred_GBC[0]))
	result += "\nRFC Prediction: {}".format(output_lablel(pred_RFC[0]))
	print(result)

In [39]:
# Manual test
news = input("Manual Test: ")
manual_testing(news)

Manual Test: Indonesia to buy $1.14 billion worth of Russian jets,"JAKARTA (Reuters) - Indonesia will buy 11 Sukhoi fighter jets worth $1.14 billion from Russia in exchange for cash and Indonesian commodities, two cabinet ministers said on Tuesday. The Southeast Asian country has pledged to ship up to $570 million worth of commodities in addition to cash to pay for the Suhkoi SU-35 fighter jets, which are expected to be delivered in stages starting in two years. Indonesian Trade Minister Enggartiasto Lukita said in a joint statement with Defence Minister Ryamizard Ryacudu that details of the type and volume of commodities were  still being negotiated . Previously he had said the exports could include palm oil, tea, and coffee. The deal is expected to be finalised soon between Indonesian state trading company PT Perusahaan Perdangangan Indonesia and Russian state conglomerate Rostec. Russia is currently facing a new round of U.S.-imposed trade sanctions. Meanwhile, Southeast Asia s larg