# Fake news detection
# LP-1 Mini project
# Roll no : 33202, 33206, 33207

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the news dataset from news.csv (path is relative to the working directory).
df = pd.read_csv("news.csv")
# Bare last expression: the notebook renders the DataFrame as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [29]:
# Model input is the article body; the target is the FAKE/REAL label column.
x, y = df['text'], df['label']
x

0       Daniel Greenfield, a Shillman Journalism Fello...
1       Google Pinterest Digg Linkedin Reddit Stumbleu...
2       U.S. Secretary of State John F. Kerry said Mon...
3       — Kaydee King (@KaydeeKing) November 9, 2016 T...
4       It's primary day in New York and front-runners...
                              ...                        
6330    The State Department told the Republican Natio...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332     Anti-Trump Protesters Are Tools of the Oligar...
6333    ADDIS ABABA, Ethiopia —President Obama convene...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: text, Length: 6335, dtype: object

In [30]:
# Display the label Series selected from df['label'].
y

0       FAKE
1       FAKE
2       REAL
3       FAKE
4       REAL
        ... 
6330    REAL
6331    FAKE
6332    FAKE
6333    REAL
6334    REAL
Name: label, Length: 6335, dtype: object

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
# Hold out 20% of the articles for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)
x_train

2402    Christian Whiton is a former deputy special en...
1922    Super Tuesday Brings Harsh Light And Heartache...
3475    Prev post Page 1 of 4 Next \nNurses are among ...
6197    The deadly hostage situation at a luxury hotel...
4748    Our new country: Women and minorities hit hard...
                              ...                        
4931    Twenty-two of the 37 corporations nominated fo...
3264    As pieces of luggage, human remains, wreckage ...
1653    0 Add Comment \nIN THE immediate aftermath of ...
2607    Palestine Palestinians check the flat of Amjad...
2732    For the second week in a row, there was a temp...
Name: text, Length: 5068, dtype: object

In [25]:
# Display the training labels produced by the split above.
y_train

2402    REAL
1922    REAL
3475    FAKE
6197    REAL
4748    FAKE
        ... 
4931    REAL
3264    REAL
1653    FAKE
2607    FAKE
2732    REAL
Name: label, Length: 5068, dtype: object

In [26]:
# TF-IDF features: drop English stop words and any term that occurs in more
# than 70% of the documents (max_df=0.7).
tfvect = TfidfVectorizer(stop_words='english',max_df=0.7)
# Learn the vocabulary and IDF weights from the training split only ...
tfid_x_train = tfvect.fit_transform(x_train)
# ... and apply that same fitted vectorizer to the test split (transform only,
# no refit), so nothing is learned from the held-out articles.
tfid_x_test = tfvect.transform(x_test)

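max_df = 0.5 means ignore terms that appear in more than 50% of the documents.

max_df = 0.25 means ignore terms that appear in more than 25% of the documents. (The cell above uses max_df = 0.7, i.e. terms that appear in more than 70% of the documents are ignored.)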

In [32]:
# Passive-Aggressive linear classifier trained on the TF-IDF training features.
# random_state pins the per-epoch shuffling of the training samples so that the
# fitted model, and every metric computed from it, is reproducible across runs.
classifier = PassiveAggressiveClassifier(max_iter=50, random_state=0)
classifier.fit(tfid_x_train, y_train)

PassiveAggressiveClassifier(max_iter=50)

In [36]:
# Predict labels for the held-out articles and compare them with the true labels.
y_pred = classifier.predict( tfid_x_test )
score = accuracy_score( y_test, y_pred )
# accuracy_score is a fraction; scale it to a percentage rounded to two decimals.
print(f"Accuracy: {round(score*100,2)}%")

Accuracy: 93.45%


In [39]:
# labels=['FAKE','REAL'] fixes the ordering: index 0 is FAKE and index 1 is REAL.
# Rows are the true labels and columns are the predicted labels.
cf = confusion_matrix( y_test, y_pred, labels=['FAKE','REAL'])
print(cf)

[[570  45]
 [ 38 614]]


In [46]:
def fake_news_det(news):
    """Classify a single news text with the in-memory classifier and print the result.

    Args:
        news: The text to classify. It is wrapped in a one-element list
            because the vectorizer works on a collection of documents.

    Returns:
        None. The array returned by ``classifier.predict`` is printed.
    """
    # Vectorize with the already-fitted global `tfvect` (transform only, no refit).
    features = tfvect.transform([news])
    label = classifier.predict(features)
    print(label)

In [47]:
# Sample article for a manual check. The quotation marks and apostrophes were
# mis-decoded (UTF-8 read as cp1252) in the pasted text; they are restored here
# so the stray bytes no longer attach to neighbouring words during tokenisation.
fake_news_det('A Czech stockbroker who saved more than 650 Jewish children from Nazi Germany has died at the age of 106. Dubbed “Britain’s Schindler,” Nicholas Winton arranged to transport Jewish youngsters from Prague after Germany annexed Czechoslovakia in March 1939. Though the children were originally set to arrive in Britain by plane, the German invasion forced Winton to transport them by train through Germany before they eventually reached England by boat. Winton arranged eight trains, known as the Kindertransports (children’s transports), to evacuate the children, and died on the anniversary of the 1939 departure of the one carrying the largest number of children: 241. Winton was knighted by Queen Elizabeth II in 2003 for his efforts, despite keeping it secret for nearly 50 years.')

['REAL']


In [48]:
# Second manual check. Mis-decoded quotation marks and apostrophes (UTF-8 read
# as cp1252) in the pasted text are restored so words such as "what’s" and
# "don’t" are tokenised without stray characters.
fake_news_det("having a dead body laying in the yard,” she said. “Want to get people to be a little more focused on the issues, what’s going on in the world. We need to stick together more. We need to come together. And if we don’t, this scene in my yard is going to be reality every single day")

['FAKE']


In [65]:
import pickle

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed before any later cell reads it back.
# NOTE: only the classifier is saved here; the fitted `tfvect` vectorizer is
# still required to turn raw text into features and is not written to disk.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [76]:
# Load the model back from disk; the context manager closes the file handle as
# soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code, so only load files you created.
with open('model.pkl', 'rb') as model_file:
    load_model = pickle.load(model_file)

In [79]:
def fake_news_det1(news):
    """Classify a single news text with the model reloaded from disk and print the result.

    Args:
        news: The text to classify. It is wrapped in a one-element list
            because the vectorizer works on a collection of documents.

    Returns:
        None. The array returned by ``load_model.predict`` is printed.
    """
    # The vectorizer is the in-memory global `tfvect`; only the classifier
    # (`load_model`) comes from the pickle file.
    features = tfvect.transform([news])
    label = load_model.predict(features)
    print(label)

In [86]:
# Spot-check the reloaded model on a (truncated) headline shown in the dataset
# preview. Note the model was trained on the `text` column, not on titles.
fake_news_det1('Watch The Exact Moment Paul Ryan Committed Pol')

['FAKE']


In [87]:
# Spot-check the reloaded model on another headline shown in the dataset
# preview. Note the model was trained on the `text` column, not on titles.
fake_news_det1('Kerry to go to Paris in gesture of sympathy')

['REAL']
