<a href="https://colab.research.google.com/github/swethareddythukkani/GitHubTest/blob/main/Fake_news_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer #convert text into vectors so models can work with it. it actually use TF-IDF params to make vectors
from sklearn.svm import LinearSVC

source: https://github.com/lutzhamel/fake-news/blob/master/data/fake_or_real_news.csv

In [None]:
# Load the news dataset from a CSV file expected in the current working directory.
df = pd.read_csv('fake_or_real_news.csv')

In [None]:
# Show the column labels of the loaded frame.
df.columns

Index(['id', 'title', 'text', 'label'], dtype='object')

In [None]:
# Preview the first three rows.
df.head(3)

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL


In [None]:
# Binary target: 1 where label == 'REAL', 0 otherwise.
# NOTE: despite the column name 'fake', a value of 1 means the article is REAL.
df['fake'] = (df['label'] == 'REAL').astype(int)

In [None]:
# Preview again to check the newly added 'fake' column.
df.head(3)

Unnamed: 0,id,title,text,label,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,1


In [None]:
# Display a copy of the frame without the 'label' column.
# The result is not assigned, so `df` itself still contains 'label'.
df.drop(columns='label')

Unnamed: 0,id,title,text,fake
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1


In [None]:
# Separate the independent feature (article text) from the dependent target.
x = df['text']
y = df['fake']

In [None]:
# Hold out 20% of the articles for testing.
# A fixed random_state makes the shuffle reproducible, so the reported accuracy and
# the positional lookups into x_test / y_test further down give the same result
# on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# Turn the raw article text into TF-IDF feature vectors.
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [None]:
# Create a linear support-vector classifier with its default settings.
classifier = LinearSVC()
# Train it on the TF-IDF training matrix and the matching labels.
classifier.fit(X=x_train_vectorized, y=y_train)



In [None]:
# Evaluate the trained classifier on the held-out test split and show its accuracy.
classifier.score(x_test_vectorized, y_test)

0.9518547750591949

In [None]:
# Size of the test split: 1267 articles in the recorded run, classified with ~95% accuracy.
x_test.shape

(1267,)

In [None]:
# To simulate real-world usage: take one article from the dataset, save it to a text file,
# read it back, ask the model to classify it, and compare the prediction with the true label.
x_test.iloc[10]  # 11th news article from the test split

'Based on the possibility that you will become the nominee of the Republican Party, and perhaps the\xa0 president of the United States, I offer you the following thoughts on what you need to do as this unusual primary season approaches its homestretch.\n\nI do this out of dedication to my country and Party. \xa0I want Republicans to win the presidency and I am worried about the harm that will be done if Secretary Hillary Clinton becomes president.\n\nYour loss in Wisconsin on Tuesday is a sign that your style of campaigning is catching up to you.\n\nYou have done a powerful job of energizing some 35 to 40 percent of Republican primary voters and you have made the Republican Party more attractive to lower-income, working Americans who typically question whether the GOP cares about them.\xa0 But getting to 50 percent is your problem and many of your recent statements and tweets are driving away the people you need to close the deal.\n\nIf you want to close the gap and win a majority of t

In [None]:
# Save the 11th test article to 'test.txt' (mode 'w' opens the file for writing).
with open('test.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(x_test.iloc[10])

In [None]:
# Read the saved article back from the text file into `text`.
with open('test.txt', 'r', encoding="utf-8") as f:
    text = f.read()

In [None]:
# Show the text that was read back from the file.
text

'Based on the possibility that you will become the nominee of the Republican Party, and perhaps the\xa0 president of the United States, I offer you the following thoughts on what you need to do as this unusual primary season approaches its homestretch.\n\nI do this out of dedication to my country and Party. \xa0I want Republicans to win the presidency and I am worried about the harm that will be done if Secretary Hillary Clinton becomes president.\n\nYour loss in Wisconsin on Tuesday is a sign that your style of campaigning is catching up to you.\n\nYou have done a powerful job of energizing some 35 to 40 percent of Republican primary voters and you have made the Republican Party more attractive to lower-income, working Americans who typically question whether the GOP cares about them.\xa0 But getting to 50 percent is your problem and many of your recent statements and tweets are driving away the people you need to close the deal.\n\nIf you want to close the gap and win a majority of t

In [None]:
# Transform the text into a sparse matrix using the already-fitted vectorizer.
# The single string is wrapped in a list so it is passed as a one-document collection.
vectorized_text = vectorizer.transform([text])

In [None]:
# Predict the class of the article (1 = REAL, 0 = FAKE, per the 'fake' column encoding).
classifier.predict(vectorized_text)

array([1])

In [None]:
# The classifier output is 1, i.e. a REAL article.
# Let's confirm it against y_test.

y_test.iloc[10]  # true label of the 11th test article

1

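Since both the prediction and the true label are 1 (REAL), the model classified this article correctly, which is consistent with the roughly 95% accuracy measured on the test set above.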