# REQUIRED LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vkris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vkris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1.Data Gathering 

In [5]:
true_df = pd.read_csv(r"D:\projects\False_News_Detection\datasets\true.csv")
false_df = pd.read_csv(r"D:\projects\False_News_Detection\datasets\false.csv")
true_df

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


# 2. Data Analysis

In [61]:
true_df.info

<bound method DataFrame.info of                                                    title  \
0      As U.S. budget fight looms, Republicans flip t...   
1      U.S. military to accept transgender recruits o...   
2      Senior U.S. Republican senator: 'Let Mr. Muell...   
3      FBI Russia probe helped by Australian diplomat...   
4      Trump wants Postal Service to charge 'much mor...   
...                                                  ...   
21412  'Fully committed' NATO backs new U.S. approach...   
21413  LexisNexis withdrew two products from Chinese ...   
21414  Minsk cultural hub becomes haven from authorities   
21415  Vatican upbeat on possibility of Pope Francis ...   
21416  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text       subject  \
0      WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1      WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2      WASHINGTON (Reuter

In [62]:
true_df['title'].value_counts()

title
Factbox: Trump fills top jobs for his administration                                14
Factbox: Contenders for senior jobs in Trump's administration                        8
Highlights: The Trump presidency on April 13 at 9:30 P.M. EDT/0130 GMT on Friday     8
Factbox: International reaction to arrest of Reuters reporters in Myanmar            6
Highlights: The Trump presidency on April 21 at 6:12 p.m. EDT/2212 GMT               5
                                                                                    ..
Obama, on last trip to Europe, warns against nationalism, populism                   1
A post-Trump SEC could shake up current policy                                       1
U.S. panel urges probe on whether China weakening U.S. militarily                    1
Trump team weighs 'infrastructure bank' to fund projects: Trump adviser              1
Indonesia to buy $1.14 billion worth of Russian jets                                 1
Name: count, Length: 20826, dtype: in

In [63]:
true_df.shape

(21417, 4)

In [64]:
true_df.isna().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [65]:
true_df.reset_index(inplace=True)
true_df.head()

Unnamed: 0,index,title,text,subject,date
0,0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [66]:
true_df['title'][0]

'As U.S. budget fight looms, Republicans flip their fiscal script'

In [67]:
true_df = true_df.drop(['subject','date'],axis = 1)
true_df.head()

Unnamed: 0,index,title,text
0,0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...
1,1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...
2,2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...
3,3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...
4,4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...


# 3.Data Preprocessing 

## 1.Tokenization

In [68]:
sample_data = 'The quick brown fox jumps over the lazy dog'
sample_data = sample_data.split()
sample_data

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

## 2. Make Lowercase

In [69]:
sample_data = [data.lower() for data in sample_data]
sample_data

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

## 3. Remove Stopwords

In [71]:
stop_words = stopwords.words('english')
print(stop_words[0:10])
print(len(stop_words))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
179


In [72]:
sample_data = [data for data in sample_data if data not in stop_words]
print(sample_data)
len(sample_data)

['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']


6

## 4. Stemming

In [73]:
ps = PorterStemmer()
sample_data_stemming = [ps.stem(data) for data in sample_data]
print(sample_data_stemming)

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


## 5. Lemmatization

In [74]:
lm = WordNetLemmatizer()
sample_data_lemma = [lm.lemmatize(data) for data in sample_data]
print(sample_data_lemma)

['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [75]:
lm = WordNetLemmatizer()
true_corpus = []
false_corpus = []
for i in range (len(true_df)):
    review = re.sub('[^a-zA-Z0-9]', ' ', true_df['title'][i])
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(x) for x in review if x not in stop_words]
    review = " ".join(review)
    true_corpus.append(review)

for i in range (len(false_df)):
    review = re.sub('[^a-zA-Z0-9]', ' ', false_df['title'][i])
    review = review.lower()
    review = review.split()
    review = [lm.lemmatize(x) for x in review if x not in stop_words]
    review = " ".join(review)
    false_corpus.append(review)

In [76]:
len(true_corpus)

21417

In [47]:
true_df['title'][0]

'As U.S. budget fight looms, Republicans flip their fiscal script'

In [48]:
true_corpus[0]

'u budget fight loom republican flip fiscal script'

# 4.Vectorization (Convert Text data into the Vector)

In [77]:
true_df['label'] = 1
false_df['label'] = 0
df = pd.concat([true_df, false_df], axis=0).reset_index(drop=True)
df['content'] = df['title'] + " " + df['text']
X = df['content']
y = df['label']

In [78]:
tf = TfidfVectorizer()
X_tfidf = tf.fit_transform(X)

## Data splitting into the train and test

In [81]:
x_train, x_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=10, stratify=y)

## 5. Model Building

In [82]:
rf = RandomForestClassifier()

In [83]:
rf.fit(x_train, y_train)

In [84]:
lg=LogisticRegression()


In [85]:
lg.fit(x_train,y_train)

In [106]:
with open(r'D:\projects\false-news-detect\trained\logistic_regression.pkl', 'wb') as model_file:
    pickle.dump(lg, model_file)

In [107]:
with open(r'D:\projects\false-news-detect\trained\tfidf_vectorizer_rf.pkl', 'wb') as vectorizer_file:
    pickle.dump(tf, vectorizer_file)

## 6. Model Evaluation

In [108]:
with open(r'D:\projects\false-news-detect\trained\random_forest.pkl', 'wb') as vectorizer_file:
    pickle.dump(rf, vectorizer_file)

In [86]:
y_pred = rf.predict(x_test)
accuracy_score_ = accuracy_score(y_test,y_pred) 
accuracy_score_

0.9899777282850779

In [87]:
y_predict=lg.predict(x_test)
accuracy_scr=accuracy_score(y_test,y_predict)
print(accuracy_scr)

0.9879732739420936


In [91]:
class Evaluation:
    
    def __init__(self, model, x_train, x_test, y_train, y_test):
        self.model = model
        self.x_train = x_train
        self.x_test = x_test
        self.y_train = y_train
        self.y_test = y_test
        
    def train_evaluation(self):
        y_pred_train = self.model.predict(self.x_train)
        
        acc_scr_train = accuracy_score(self.y_train, y_pred_train)
        print("Accuracy Score On Training Data Set : ", acc_scr_train)
        print()
        
        con_mat_train = confusion_matrix(self.y_train, y_pred_train)
        print("Confusion Matrix On Training Data Set :\n", con_mat_train)
        print()
        
        class_rep_train = classification_report(self.y_train, y_pred_train)
        print("Classification Report On Training Data Set :\n", class_rep_train)
        
    def test_evaluation(self):
        y_pred_test = self.model.predict(self.x_test)
        
        acc_scr_test = accuracy_score(self.y_test, y_pred_test)
        print("Accuracy Score On Testing Data Set :", acc_scr_test)
        print()
        
        con_mat_test = confusion_matrix(self.y_test, y_pred_test)
        print("Confusion Matrix On Testing Data Set :\n", con_mat_test)
        print()
        
        class_rep_test = classification_report(self.y_test, y_pred_test)
        print("Classification Report On Testing Data Set :\n", class_rep_test)

# Predicting acuuracies for Random Forest Classifier

In [92]:
Evaluation(rf, x_train, x_test, y_train, y_test).train_evaluation()

Accuracy Score On Training Data Set :  1.0

Confusion Matrix On Training Data Set :
 [[16436     0]
 [    0 14992]]

Classification Report On Training Data Set :
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     16436
           1       1.00      1.00      1.00     14992

    accuracy                           1.00     31428
   macro avg       1.00      1.00      1.00     31428
weighted avg       1.00      1.00      1.00     31428



In [93]:
Evaluation(rf, x_train, x_test, y_train, y_test).test_evaluation()

Accuracy Score On Testing Data Set : 0.9899777282850779

Confusion Matrix On Testing Data Set :
 [[6990   55]
 [  80 6345]]

Classification Report On Testing Data Set :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      7045
           1       0.99      0.99      0.99      6425

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



# Predicting accuracies for Logistic Regression 

In [94]:
Evaluation(lg, x_train, x_test, y_train, y_test).train_evaluation()

Accuracy Score On Training Data Set :  0.9920771286750668

Confusion Matrix On Training Data Set :
 [[16297   139]
 [  110 14882]]

Classification Report On Training Data Set :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99     16436
           1       0.99      0.99      0.99     14992

    accuracy                           0.99     31428
   macro avg       0.99      0.99      0.99     31428
weighted avg       0.99      0.99      0.99     31428



In [95]:
Evaluation(lg,x_train,x_test,y_train,y_test).test_evaluation()

Accuracy Score On Testing Data Set : 0.9879732739420936

Confusion Matrix On Testing Data Set :
 [[6955   90]
 [  72 6353]]

Classification Report On Testing Data Set :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      7045
           1       0.99      0.99      0.99      6425

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



In [96]:
Evaluation(lg, x_train, x_test, y_train, y_test).test_evaluation()

Accuracy Score On Testing Data Set : 0.9879732739420936

Confusion Matrix On Testing Data Set :
 [[6955   90]
 [  72 6353]]

Classification Report On Testing Data Set :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      7045
           1       0.99      0.99      0.99      6425

    accuracy                           0.99     13470
   macro avg       0.99      0.99      0.99     13470
weighted avg       0.99      0.99      0.99     13470



# Prediction pipeline

In [97]:
class Preprocessing:
    
    def __init__(self, data):
        self.data = data
        
    def text_preprocessing_user(self):
        lm = WordNetLemmatizer()
        pred_data = [self.data]    
        preprocess_data = []
        for data in pred_data:
            review = re.sub('[^a-zA-Z0-9]', ' ', data)
            review = review.lower()
            review = review.split()
            review = [lm.lemmatize(x) for x in review if x not in stop_words]
            review = " ".join(review)
            preprocess_data.append(review)
        return preprocess_data

In [98]:
df['title'][2]


"Senior U.S. Republican senator: 'Let Mr. Mueller do his job'"

In [99]:
data = 'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'
Preprocessing(data).text_preprocessing_user()

['flynn hillary clinton big woman campus breitbart']

# Prediction 

In [100]:
class Prediction:
    
    def __init__(self, pred_data, model):
        self.pred_data = pred_data
        self.model = model
        
    def prediction_model(self):
        preprocess_data = Preprocessing(self.pred_data).text_preprocessing_user()
        data = tf.transform(preprocess_data)
        prediction = self.model.predict(data)
        
        if prediction[0] == 0:
            return "The News Is Fake"
        
        else:
            return "The News Is Real"

In [101]:
data = 'Why the Truth Might Get You Fired'
print(Prediction(data, rf).prediction_model())

The News Is Fake
