# Fake News Detection

1. Pre-processing
2. Feature Extraction
3. Classification Model
4. Evaluation

## Importing Libraries

In [1]:
# Importing The most fundamental libraries

!pip install scikit-plot
# !pip install wordcloud

import pandas as pd
import numpy as np
import nltk
import os
import re
import string
import time

# for pre-processing dataset
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.tokenize import word_tokenize

# for feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

# for Splitting our dataset
from sklearn.model_selection import train_test_split

# for building classification models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# for evaluation our model
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# for plotting our confusion matrix
import matplotlib.pyplot as plt
import scikitplot as skplt

# Fetch the NLTK resources used by the pre-processing step
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# English stop words only, extended with "rt"
stops = stopwords.words('english') + ["rt"]

# Create the stemmer object shared by the pre-processing functions
porter = PorterStemmer()



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Laptop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Pre-Processing

In [2]:
# Loading our dataset, dropping the unused columns and removing rows with missing values
def cleanText(csv_file):
    """Load the dataset from ``csv_file`` and strip what the pipeline does not use.

    The 'id' and 'author' columns are removed (every other column, including
    any title column, is kept), each row that still contains a missing value
    in any remaining column is discarded, and the index is renumbered from 0.

    Parameters
    ----------
    csv_file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with a fresh 0..n-1 index.
    """
    frame = pd.read_csv(csv_file)
    trimmed = frame.drop(columns=['id', 'author'])
    return trimmed.dropna().reset_index(drop=True)

# Cleaning our text: converting it to lower case, removing punctuation and stopwords, and stemming
def stem_tokenize(data):
    """Normalise one raw text string into a space-joined string of stemmed tokens.

    Steps, in order: lower-case the text, strip @mentions and URLs, delete
    punctuation and digits, tokenise with NLTK, drop stop words, then
    Porter-stem the remaining tokens.

    Stop words are removed *before* stemming. The stop list holds unstemmed
    words, so filtering after stemming would let stemmed forms such as "thi"
    (this) or "wa" (was) slip through.

    Parameters
    ----------
    data : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # First convert all letters to lower case
    data = data.lower()

    # Remove @mentions; split/join also collapses runs of whitespace
    data = ' '.join(re.sub(r"(@[A-Za-z0-9]+)", " ", data).split())
    # NOTE(review): this pattern is anchored with ^...$, so it only fires when
    # the whole text is a single word of 1-15 word characters -- confirm intent.
    data = ' '.join(re.sub(r"^@?(\w){1,15}$", " ", data).split())
    # Remove URLs (scheme://...)
    data = ' '.join(re.sub(r"(\w+://\S+)", " ", data).split())

    # Remove punctuation and digits
    table = str.maketrans('', '', string.punctuation + string.digits)
    data = data.translate(table)

    # Split the sentence into word tokens
    tokens = word_tokenize(data)

    # Drop stop words while tokens are still in their surface form;
    # a set gives constant-time membership tests
    stop_words = set(stops)
    kept = [word for word in tokens if word not in stop_words]

    # Reduce the remaining words to their stems
    stems = [porter.stem(word) for word in kept]

    return ' '.join(stems)

# Splitting our dataset into training and testing sets (80/20)
def Splite_clean_data(csv_file, colX, colY):
    """Load, clean and split the dataset into training and testing sets (80/20).

    Parameters
    ----------
    csv_file : path of the CSV file, passed to ``cleanText``.
    colX : str
        Name of the text column; every value is run through ``stem_tokenize``.
    colY : str
        Name of the label column.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as returned by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=7``.
    """
    # Load the dataset with unused columns and incomplete rows removed
    df = cleanText(csv_file)

    # Clean every document in one pass instead of assigning cell by cell
    df[colX] = df[colX].apply(stem_tokenize)

    # Fixed random_state keeps the split identical between runs
    x_train, x_test, y_train, y_test = train_test_split(
        df[colX], df[colY], test_size=0.2, random_state=7
    )

    return x_train, x_test, y_train, y_test

## Feature Extraction

In [3]:
def TF_IDF(Model, title):
    """Train ``Model`` on TF-IDF features of the news text and report its scores.

    The data is loaded from 'train.csv' (columns 'text' and 'label') through
    ``Splite_clean_data``, vectorised with a TfidfVectorizer fitted on the
    training split only, and the model is fitted on the training vectors.
    Train/test accuracy, F1 score and the classification report are printed,
    and the confusion matrix is plotted.

    Parameters
    ----------
    Model : estimator exposing ``fit``, ``predict`` and ``score``.
    title : str
        Heading printed above the results; its length also sets the width of
        the ``*`` separator lines.

    Returns
    -------
    The fitted ``Model`` (the same object that was passed in).
    """
    # Split the dataset after pre-processing, before extracting features
    xtrain, xtest, ytrain, ytest = Splite_clean_data('train.csv', 'text', 'label')

    # TF-IDF vectoriser converts the text to numeric features;
    # it is fitted on the training split only and reused for the test split
    vector = TfidfVectorizer(stop_words='english', max_df=0.7)
    train_vector = vector.fit_transform(xtrain)
    test_vector = vector.transform(xtest)

    TF_IDF_model = Model
    TF_IDF_model.fit(train_vector, ytrain)

    # Accuracy on the training set
    accuracy = TF_IDF_model.score(train_vector, ytrain)*100

    # Predict the test set once and reuse the predictions for every metric
    y_pred = TF_IDF_model.predict(test_vector)

    # Accuracy and per-class report on the test set
    acc_score = accuracy_score(ytest, y_pred)*100
    class_report = classification_report(ytest, y_pred, output_dict=True)
    class_df = pd.DataFrame(class_report).transpose()

    # F1 score with scikit-learn's defaults
    # NOTE(review): the defaults assume binary labels with positive class 1 -- confirm
    test_f1score = f1_score(ytest, y_pred)*100

    # Plot the confusion matrix
    skplt.metrics.plot_confusion_matrix(ytest, y_pred)

    separator = '*'*len(title)
    print(title)
    print(separator)
    print('Accuracy score train set :'+ format(accuracy, '.2f') + "%")
    print('Accuracy score test set  :'+ format(acc_score, '.2f') + "%",'\n')
    print('F1 score:'+ format(test_f1score, '.2f') + "%",'\n')
    print(separator)
    print('Classification Report: ')
    print(class_df, '\n')
    print(separator)
    plt.show()
    print('-'*80)

    return TF_IDF_model

In [4]:
def Count_Vector(Model, title, n):
    """Train ``Model`` on n-gram count features of the news text and report its scores.

    The data is loaded from 'train.csv' (columns 'text' and 'label') through
    ``Splite_clean_data``, vectorised with a CountVectorizer limited to 1000
    features and to n-grams of exactly length ``n``, and the model is fitted
    on the training vectors. Train/test accuracy, F1 score and the
    classification report are printed, and the confusion matrix is plotted.

    Parameters
    ----------
    Model : estimator exposing ``fit``, ``predict`` and ``score``.
    title : str
        Heading printed above the results; its length also sets the width of
        the ``*`` separator lines.
    n : int
        n-gram size, used as ``ngram_range=(n, n)``.

    Returns
    -------
    The fitted ``Model`` (the same object that was passed in).
    """
    # Split the dataset after pre-processing, before extracting features
    xtrain, xtest, ytrain, ytest = Splite_clean_data('train.csv', 'text', 'label')

    # Count vectoriser converts the text to numeric features;
    # it is fitted on the training split only and reused for the test split
    vector = CountVectorizer(max_features=1000 , ngram_range=(n,n))
    train_vector = vector.fit_transform(xtrain)
    test_vector = vector.transform(xtest)

    count_vector_model = Model
    count_vector_model.fit(train_vector, ytrain)

    # Accuracy on the training set
    accuracy = count_vector_model.score(train_vector, ytrain)*100

    # Predict the test set once and reuse the predictions for every metric
    y_pred = count_vector_model.predict(test_vector)

    # Accuracy and per-class report on the test set
    acc_score = accuracy_score(ytest, y_pred)*100
    class_report = classification_report(ytest, y_pred, output_dict=True)
    class_df = pd.DataFrame(class_report).transpose()

    # F1 score with scikit-learn's defaults
    # NOTE(review): the defaults assume binary labels with positive class 1 -- confirm
    test_f1score = f1_score(ytest, y_pred)*100

    # Plot the confusion matrix
    skplt.metrics.plot_confusion_matrix(ytest, y_pred)

    separator = '*'*len(title)
    print("Models with " , n , "-grams :\n")
    print('********************** \n')
    print(title)
    print(separator)
    print('Accuracy score train set : '+ format(accuracy, '.2f') + "%")
    print('Accuracy score test set  : '+ format(acc_score, '.2f') + "%",'\n')
    print('F1 score : '+ format(test_f1score, '.2f') + "%",'\n')
    print(separator)
    print('Classification Report: ')
    print(class_df, '\n')
    print(separator)
    plt.show()
    print('-'*80)

    return count_vector_model

## Classification Models

In [5]:
def TF_IDF_PA_Model():
    """Run the TF-IDF pipeline with a logistic-regression classifier.

    Despite the ``PA`` in the function name, the estimator passed to
    ``TF_IDF`` is ``LogisticRegression``.

    Returns whatever ``TF_IDF`` returns.
    """
    return TF_IDF(
        Model=LogisticRegression(),
        title='TFIDF Logistic Regression Model : \n',
    )

In [6]:
def Count_Vect_PA_Model():
    """Run the bigram count-vectoriser pipeline with a logistic-regression classifier.

    Despite the ``PA`` in the function name, the estimator passed to
    ``Count_Vector`` is ``LogisticRegression``; the printed title names the
    model that is actually trained.

    Returns whatever ``Count_Vector`` returns.
    """
    PA_Model = Count_Vector(Model = LogisticRegression(), 
                            title='Count Vectorizer Logistic Regression Model : \n',
                            n=2)
    return PA_Model

In [7]:
def Count_Vect_RF():
    """Run the bigram count-vectoriser pipeline with a random-forest classifier.

    The forest is seeded so repeated runs report the same scores; the seed
    matches the one used for the train/test split.

    Returns whatever ``Count_Vector`` returns.
    """
    RF_Model = Count_Vector(Model=RandomForestClassifier(random_state=7), 
                            title='Count Vectorizer Random Forest Model : \n ', 
                            n=2)
    return RF_Model

In [8]:
def TF_IDF_RF():
    """Run the TF-IDF pipeline with a random-forest classifier.

    The forest is seeded so repeated runs report the same scores; the seed
    matches the one used for the train/test split.

    Returns whatever ``TF_IDF`` returns.
    """
    RF_Model = TF_IDF(Model=RandomForestClassifier(random_state=7), 
                      title='TF-IDF Random Forest Model : \n ')
    return RF_Model

In [9]:
def Count_Vect_SVM():
    """Run the bigram count-vectoriser pipeline with a linear SVM classifier.

    Returns whatever ``Count_Vector`` returns.
    """
    # Linear SVM classifier with Count Vectorizer
    SVM_Model = Count_Vector(Model=svm.LinearSVC(), 
                             title='Count Vectorizer SVM Model : \n ', 
                             n=2)
    return SVM_Model

In [10]:
def TF_IDF_SVM():
    """Run the TF-IDF pipeline with a linear SVM classifier.

    Returns whatever ``TF_IDF`` returns.
    """
    # Linear SVM classifier with TF-IDF
    SVM_Model = TF_IDF(Model=svm.LinearSVC(), 
                       title='TF-IDF SVM Model : \n ')
    return SVM_Model

## Evaluation Methods

In [None]:
# TF-IDF with the logistic-regression model (the function name says "PA",
# but the classifier it trains is LogisticRegression)
if __name__ == '__main__':
    TF_IDF_PA_Model()

In [None]:
# Count Vectorizer (bigrams) with the logistic-regression model (the function
# name says "PA", but the classifier it trains is LogisticRegression)
if __name__ == '__main__':
    Count_Vect_PA_Model()

In [None]:
# TF-IDF with Random Forest classification model
# (the function defined above is TF_IDF_RF; TF_IDF_RF_Model does not exist)
if __name__ == '__main__':
    TF_IDF_RF()

In [None]:
# Count Vectorizer (bigrams) with Random Forest classification model
if __name__ == '__main__':
    Count_Vect_RF()

In [None]:
# Count Vectorizer (bigrams) with linear SVM (LinearSVC) classification model
if __name__ == '__main__':
    Count_Vect_SVM()

In [None]:
# TF-IDF with linear SVM (LinearSVC) classification model
if __name__ == '__main__':
    TF_IDF_SVM()

<!--  -->

<!--  -->