In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


In [51]:
#mport the dataset 
dataset = pd.read_csv("news_dataset.csv")

dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [30]:
#leyts get information about the dataset
#We have 6335 rows and 4 columns
print(dataset.shape)

(6335, 4)


In [47]:
#Check for missing values in the dataset,as you can see there are no missing values in the dataset.
print(dataset['title'].isnull())
print(dataset['text'].isnull())
print(dataset['label'].isnull())

0       False
1       False
2       False
3       False
4       False
        ...  
6330    False
6331    False
6332    False
6333    False
6334    False
Name: title, Length: 6335, dtype: bool
0       False
1       False
2       False
3       False
4       False
        ...  
6330    False
6331    False
6332    False
6333    False
6334    False
Name: text, Length: 6335, dtype: bool
0       False
1       False
2       False
3       False
4       False
        ...  
6330    False
6331    False
6332    False
6333    False
6334    False
Name: label, Length: 6335, dtype: bool


In [65]:
#lets check the columns we have so that we can clean the dataset.
list(dataset.columns)

['Unnamed: 0', 'title', 'text', 'label']

In [49]:
#We don't neeed the 'Unnamed: 0' column in our model so let's drop it.
dataset.drop(['Unnamed: 0'], axis=1, inplace=True)

#We have a cleaner data. 
dataset.head()

Unnamed: 0,title,text,label
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [56]:
#The label column will be used to forecast the values, while the title column will be 
#utilized to train the machine learning model.

x = np.array(dataset["title"])
y = np.array(dataset["label"])

cv = CountVectorizer()
x = cv.fit_transform(x)

In [58]:
#Divide the dataset into training and testing sets, and then use the Multinomial Naive Bayes technique to train the false news detection model:

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(xtrain, ytrain)
print(model.score(xtest, ytest))

#we get 80% accuracy which is ok. 

0.8074191002367798


In [67]:
# lets test the model, I will just input any news title on Google news, and see if the trained model predicts tp be real or fake.

news_headline = "Ukraine: Austrian leader, Putin meet…other new developments"


data = cv.transform([news_headline]).toarray()
print(model.predict(data))
#Awesome as you can see, it predicted the news to be true.

['REAL']


In [69]:
#Now lets check if it will predict a fake news to be false, I will simply type a random news

news_headline = "A lion was found flying in South America"


data = cv.transform([news_headline]).toarray()
print(model.predict(data))

['FAKE']
