In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load both corpora with identical parser settings. With on_bad_lines='warn'
# pandas reports malformed rows as warnings and skips them instead of aborting.
fake, true = (
    pd.read_csv(csv_name, engine='python', on_bad_lines='warn')
    for csv_name in ('Fake.csv', 'True.csv')
)

In [3]:
# A notebook cell renders only its final expression, so a bare `fake.head()`
# on the first line would be computed and silently discarded. Send the
# fake-news preview to the output explicitly (`display` is the IPython
# built-in); the real-news preview stays the cell's value.
display(fake.head())
true.head()


Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
# Label convention used for the target column: 0 = fake article, 1 = true article.
fake['class'], true['class'] = 0, 1

In [5]:
# Stack the two labelled frames row-wise. The original row labels are kept,
# so index values repeat across the two sources until the index is rebuilt.
data = pd.concat(objs=[fake, true], axis='index')

In [6]:
# Spot-check a few rows of the combined frame. The seed is pinned so the
# preview is identical on every Restart & Run All.
data.sample(5, random_state=42)

Unnamed: 0,title,text,subject,date,class
5444,Trump knew for weeks that aide was being misle...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"February 14, 2017",1
11967,BREAKING: “AMERICA FIRST” Is Catching Fire…Fia...,FCA US today confirmed the next phase of its i...,politics,"Jan 8, 2017",0
6097,Richard Dreyfuss Just Showed ‘Little-Dick’ Do...,Richard Dreyfuss has provided us with some of ...,News,"May 31, 2016",0
20884,UNCENSORED VIDEO: Real New Yorkers’ Opinions O...,New York City that great melting pot of divers...,left-news,"Mar 13, 2016",0
20895,Germany says EU states must implement court ru...,BERLIN (Reuters) - German Foreign Minister Sig...,worldnews,"September 6, 2017",1


In [7]:
# Remove the three named columns; the remaining columns (including the article
# body and the label) are carried forward untouched.
data = data.drop(columns=["title", "subject", "date"])

In [8]:
# Replace the duplicated row labels left by the concat with a fresh 0..n-1
# index. `drop=True` discards the old labels directly, so no throwaway
# "index" column is created and then removed, and no in-place mutation is
# needed.
data = data.reset_index(drop=True)

In [9]:
# Spot-check the reduced frame after the column drop and index rebuild. The
# seed is pinned so the preview is reproducible across runs.
data.sample(5, random_state=42)

Unnamed: 0,text,class
21929,Shawn Helton 21st Century WireSince late Octob...,0
36691,GENEVA (Reuters) - The United Nations called o...,1
36249,BRUSSELS (Reuters) - Britain will honor all it...,1
23944,(Reuters) - A group of Republicans in the U.S....,1
18182,Speaking at a Rotary Club gathering in Kentuck...,0


In [10]:
def clean_text(text):
    """Normalize one raw article body into lowercase, punctuation-free text.

    Steps, applied in this order:
      1. lowercase the text;
      2. remove square-bracketed spans such as ``[video]`` (a span cannot
         cross a newline);
      3. remove every token that contains a digit;
      4. remove ``http(s)://`` and ``www.`` URLs;
      5. remove angle-bracketed markup such as ``<b>``;
      6. remove all ASCII punctuation (``string.punctuation``);
      7. replace each newline with a single space.

    Args:
        text: The raw document. Non-string values are converted with ``str``;
            ``None`` and float NaN (missing CSV cells) are treated as an
            empty document.

    Returns:
        str: The cleaned text. Runs of spaces left behind by removed tokens
        are not collapsed.
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and leak into the vocabulary. NaN is the only float that
    # is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # All patterns are raw strings: sequences such as "\[" or "\S" in a
    # normal string literal are invalid escapes and raise SyntaxWarning on
    # recent Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace (rather than delete) newlines so the last word of one line is
    # not fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    # No second digit-token pass is needed: the pass above already removed
    # every digit, and none of the later steps can introduce new ones.
    return text

In [11]:
# Run every article body through clean_text and write the result back to the
# same column, so later cells only ever see normalized text.
data['text'] = data['text'].apply(func=clean_text)

In [12]:
# Features are the cleaned article bodies; targets are the 0/1 class labels.
x, y = data['text'], data['class']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [13]:
# The TF-IDF vocabulary and weights are learned from the training split only;
# the test split is transformed with the already-fitted vectorizer.
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(raw_documents=x_train)
xv_test = vectorization.transform(raw_documents=x_test)

In [14]:
# Logistic regression with library-default hyperparameters, trained on the
# TF-IDF features of the training split. The fit call stays the last
# expression so the notebook still renders the fitted estimator.
lr = LogisticRegression()
lr.fit(X=xv_train, y=y_train)

In [15]:
# Predict once on the held-out split and reuse those predictions for the
# accuracy figure. `lr.score(xv_test, y_test)` would run a second, identical
# prediction pass internally; `accuracy_score` on the stored predictions
# yields the same value without repeating the work.
pred_lr = lr.predict(xv_test)
accuracy_score(y_test, pred_lr)

0.9857461024498887

In [16]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# pre-formatted string, so it is printed rather than shown as a cell value.
print(classification_report(y_true=y_test, y_pred=pred_lr))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5895
           1       0.98      0.99      0.99      5330

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [17]:
# Persist the fitted vectorizer and the classifier as two separate files;
# both are needed together to score new text. joblib files are pickle-based,
# so only load them from trusted locations.
joblib.dump(value=vectorization, filename="vectorizer.jb")
joblib.dump(value=lr, filename='lr_model.jb')

['lr_model.jb']