In [None]:
pip install Unidecode

In [None]:
#Import the necessary libraries
import numpy as np 
import pandas as pd
import re
from bs4 import BeautifulSoup
import unidecode
from nltk import word_tokenize,pos_tag
from nltk.stem import WordNetLemmatizer
import nltk
# Fetch the NLTK resources needed by the helpers imported above:
# 'punkt' for word_tokenize, 'averaged_perceptron_tagger' for pos_tag,
# and 'wordnet' for WordNetLemmatizer.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Importing Datasets

In [None]:
#Function that cleans the dataset represented by a Pandas dataframe passed into it 
def clean_dataset(dataset):
    """Return a cleaned version of a labelled dataset.

    Three filters are applied in order:
      1. exact duplicate rows are removed (expected when sources are combined),
      2. only rows whose ``label`` equals 0 or 1 are kept,
      3. rows with a null value in any column are removed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with at least a ``label`` column. It is not modified; every step
        below produces a new frame.

    Returns
    -------
    pandas.DataFrame
        The filtered frame. The original index values of the surviving rows
        are kept (the index is not renumbered).
    """
    # With combining various datasets, there are bound to be duplicates.
    dataset = dataset.drop_duplicates()
    # Remove all rows that do not have 0 or 1 as the label.
    dataset = dataset[(dataset.label == 0) | (dataset.label == 1)]
    # Remove any rows that have null in any of the columns.
    dataset = dataset.dropna(how='any', axis=0)
    return dataset

Dataset 1 (IEEE DataPort)

In [None]:
true_data = pd.read_csv('Data/IEEE/ieee_true.csv')
# Drop the metadata columns together with the original 'Label' column, which is
# replaced by the lowercase 'label' column below. The columns are named by
# keyword: the positional-axis form drop('Label', 1) is rejected by pandas >= 2.0.
true_data = true_data.drop(
    columns=['Label', 'Publisher', 'Username', 'Region', 'Link', 'Date Posted']
)
# Every row loaded from this file is given label 1.
true_data['label'] = 1

In [None]:
fake_data = pd.read_csv('Data/IEEE/ieee_fake.csv')
# Drop the fact-checking / metadata columns in a single call. The columns are
# named by keyword: the positional-axis form drop('X', 1) is rejected by
# pandas >= 2.0.
fake_data = fake_data.drop(
    columns=[
        'Binary Label',
        'Poynter_Label',
        'Fact_checked_by',
        'Origin_URL',
        'Origin',
        'Explanation',
        'Country',
        'Region',
        'Link',
        'Date Posted',
    ]
)
# Every row loaded from this file is given label 0.
fake_data['label'] = 0

In [None]:
# Stack the true and fake rows, shuffle them, and renumber the index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# reset_index(drop=True) discards the old index directly instead of turning it
# into an 'index' column that then has to be dropped.
dataset_1 = pd.concat([true_data, fake_data]).sample(frac=1).reset_index(drop=True)
dataset_1 = clean_dataset(dataset_1)

In [None]:
# Display the (rows, columns) of dataset 1 after cleaning.
dataset_1.shape

Dataset 2

In [None]:
# Load dataset 2, rename its 'headlines' and 'outcome' columns to the shared
# 'Text' / 'label' names, then run the common cleaning step.
dataset_2 = clean_dataset(
    pd.read_csv('Data/zenodo_dataset.csv').rename(
        columns={'headlines': 'Text', 'outcome': 'label'}
    )
)
dataset_2.shape


Dataset 3

In [None]:
diptamath_first = pd.read_csv('Data/Diptamath/first_set.csv')
# The column is named by keyword: the positional-axis form drop('id', 1) is
# rejected by pandas >= 2.0.
diptamath_first = diptamath_first.drop(columns=['id'])
diptamath_first = diptamath_first.rename(columns={'tweet':'Text'})
# Encode the textual labels: 'fake' -> 0 and 'real' -> 1.
# Any other label value is left unchanged.
diptamath_first.loc[diptamath_first['label'] == 'fake', 'label'] = 0
diptamath_first.loc[diptamath_first['label'] == 'real', 'label'] = 1
diptamath_first.head()

In [None]:
diptamath_second = pd.read_csv('Data/Diptamath/second_set.csv')
# The column is named by keyword: the positional-axis form drop('id', 1) is
# rejected by pandas >= 2.0.
diptamath_second = diptamath_second.drop(columns=['id'])
diptamath_second = diptamath_second.rename(columns={'tweet':'Text'})
# Encode the textual labels: 'fake' -> 0 and 'real' -> 1.
# Any other label value is left unchanged.
diptamath_second.loc[diptamath_second['label'] == 'fake', 'label'] = 0
diptamath_second.loc[diptamath_second['label'] == 'real', 'label'] = 1
diptamath_second.head()

In [None]:
# Stack both Diptamath files, shuffle the rows, and renumber the index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# reset_index(drop=True) discards the old index directly instead of turning it
# into an 'index' column that then has to be dropped.
dataset_3 = pd.concat([diptamath_first, diptamath_second]).sample(frac=1).reset_index(drop=True)
dataset_3.head()

In [None]:
# Display the (rows, columns) of the combined dataset 3.
dataset_3.shape

Dataset 4

In [None]:
dataset_4 = pd.read_csv('Data/github_dataset.csv')
# Rename the source columns to the shared 'Text' / 'label' names.
dataset_4 = dataset_4.rename(columns={'News': 'Text', 'Outcome': 'label'})
# Encode textual labels BEFORE cleaning: clean_dataset keeps only rows whose
# label is already 0 or 1, so any 'fake' / 'real' rows would otherwise be
# discarded. If the file already stores 0/1 these two lines match nothing.
dataset_4.loc[dataset_4['label'] == 'fake', 'label'] = 0
dataset_4.loc[dataset_4['label'] == 'real', 'label'] = 1
dataset_4 = clean_dataset(dataset_4)
dataset_4.shape

In [None]:
# Encode textual labels: 'fake' -> 0, 'real' -> 1.
# NOTE(review): dataset_4 has already been passed through clean_dataset, which
# keeps only rows whose label equals 0 or 1, so no 'fake' / 'real' values can
# remain at this point and these assignments appear to be no-ops. Confirm
# whether this mapping was meant to run before the cleaning step.
dataset_4.loc[dataset_4['label'] == 'fake', 'label'] = 0
dataset_4.loc[dataset_4['label'] == 'real', 'label'] = 1

# Data Cleaning

In [None]:
# remove whitespace from text
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace disappears as well, because the text is
    split on whitespace and the pieces are re-joined with one space.
    """
    pieces = text.split()
    return " ".join(pieces)

In [None]:
#In some cases, there is no space after a period, comma, or dash in an article, and as a result, when the special characters are removed 
#the last word of the previous sentence and first word in the next sentence are added together 

def add_space_after_period(text):
    """Insert a space after each '.', ',' or '-' that is directly followed by
    a non-whitespace character.

    The pattern is zero-width (a lookbehind plus a lookahead), so no existing
    character is consumed or replaced; a space is only inserted.
    """
    return re.sub(r'(?<=[.,-])(?=[^\s])', r' ', text)


In [None]:
#Remove any emails 
def remove_emails(text):
    """Delete e-mail-like substrings (``local@domain.tld``) from ``text``.

    The pattern only contains lowercase letters, so addresses written with
    uppercase characters are not matched.
    """
    without_emails = re.sub(
        r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', "", text
    )
    return without_emails

In [None]:
#Remove any HTML tags
def remove_html_tags(text):
    """Parse ``text`` with BeautifulSoup's "html.parser" and return only its
    text content, with the extracted pieces joined by a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator = " ")

In [None]:
def remove_rt(text):
    """Remove the standalone lowercase word 'rt' and strip the outer whitespace.

    Word boundaries are required on both sides, so words that merely contain
    'rt' (e.g. 'art') are left alone. Spaces that surrounded a removed 'rt' in
    the middle of the text are not collapsed.
    """
    without_rt = re.sub(r'\brt\b', '', text)
    return without_rt.strip()

In [None]:
#Remove any special characters
def remove_special_characters(text):
    """Delete every character that is neither a word character (``\\w``) nor a
    space, then collapse the remaining whitespace to single spaces.
    """
    kept = re.sub(r'[^\w ]+', "", text)
    return ' '.join(kept.split())

In [None]:
def remove_accented_chars(text):
    """Return ``text`` passed through ``unidecode.unidecode``, i.e. its ASCII
    transliteration.
    """
    return unidecode.unidecode(text)

In [None]:
#Remove stop words
def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split on whitespace and every token found in NLTK's English
    stop-word list is dropped; the remaining tokens are re-joined with single
    spaces. The comparison is exact, so tokens are only removed when their
    casing matches the list entry.
    """
    # Imported here because `stopwords` is not among the names brought in by
    # the notebook's import cell; without this the call raises NameError.
    import nltk
    from nltk.corpus import stopwords

    try:
        stop_words = set(stopwords.words("english"))
    except LookupError:
        # The 'stopwords' corpus is not among the resources downloaded at the
        # top of the notebook, so fetch it on first use.
        nltk.download('stopwords', quiet=True)
        stop_words = set(stopwords.words("english"))
    return ' '.join(t for t in text.split() if t not in stop_words)

In [None]:
#Upon inspecting the cleaned data, there were several tokens of the form pictwittercom... that needed to be removed 
def remove_twitter_pics(text):
    """Remove tokens of the form ``pictwittercom...`` from ``text``.

    Only the token itself is removed: the match starts at 'pictwittercom' and
    runs up to the next whitespace character. (A greedy ``.*`` would also
    delete every word that follows the token on the same line.)
    """
    return re.sub(r"pictwittercom\S*", '', text)

In [None]:
#Convert the spelling from British to American
def convert_to_american(text):
    """Convert common British spellings in ``text`` to American spellings.

    The rewrite rules are anchored with ``$`` and therefore describe the END
    OF A WORD. They are applied to each whitespace-delimited token separately,
    so every word of a sentence is converted, not only the last one, and the
    ``(...)`` prefix in the '-our' rule cannot reach across a space into the
    previous word. Whitespace between tokens is preserved exactly. For a
    single-word input the result is the same as applying the rules directly.

    Rules (heuristic; the original comment said they were copied from another
    source, but the reference was not preserved):
      * '...our' -> '...or'  (needs at least three characters before 'our')
      * 'bre' / 'tre' -> 'ber' / 'ter'
      * 'ise', 'ising', 'isation' (also with 'y') -> 'ize', 'izing', 'ization'
      * 'ogue' -> 'og'
    """
    def _americanize(match):
        # Apply the word-level rules to one token.
        word = match.group(0)
        word = re.sub(r"(...)our$", r"\1or", word)
        word = re.sub(r"([bt])re$", r"\1er", word)
        word = re.sub(r"([iy])s(e$|ing|ation)", r"\1z\2", word)
        word = re.sub(r"ogue$", "og", word)
        return word

    return re.sub(r"\S+", _americanize, text)

In [None]:
def lemmatization(text):
    """Lemmatize a sequence of tokens using their part-of-speech tags.

    Parameters
    ----------
    text : list of str
        Tokens, as passed to ``pos_tag``.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    # First letter of the tag returned by pos_tag (lower-cased) -> the POS code
    # passed to WordNetLemmatizer.lemmatize. With pos_tag's default tagset,
    # adjective tags start with 'J' (JJ, JJR, JJS), so 'j' has to be mapped to
    # WordNet's adjective code 'a'; otherwise adjectives fall through to the
    # noun default. 'a' is kept for backward compatibility.
    tag_to_wordnet_pos = {'j': 'a', 'a': 'a', 'r': 'r', 'n': 'n', 'v': 'v'}
    wordnet = WordNetLemmatizer()
    return [
        # Anything that is not an adjective/adverb/noun/verb is treated as a noun.
        wordnet.lemmatize(token, tag_to_wordnet_pos.get(tag[0].lower(), 'n'))
        for token, tag in pos_tag(text)
    ]

In [None]:
def clean_text(text):
    """Run the full text-cleaning pipeline on one string.

    Steps, in order: collapse whitespace, strip, lowercase, remove e-mails,
    remove HTML tags, remove 'rt', add a space after '.', ',' and '-', remove
    special characters, transliterate via unidecode, remove 'pictwittercom'
    tokens, convert British spellings, collapse any character repeated three
    or more times in a row to a single occurrence, tokenize, lemmatize.

    Returns
    -------
    list of str
        The lemmatized tokens (the caller joins them back into a string).
    """
    cleaned = remove_whitespace(text).strip().lower()

    # String -> string steps; the order matters and matches the list above.
    string_steps = (
        remove_emails,
        remove_html_tags,
        remove_rt,
        add_space_after_period,
        remove_special_characters,
        remove_accented_chars,
        remove_twitter_pics,
        convert_to_american,
    )
    for step in string_steps:
        cleaned = step(cleaned)

    # Collapse runs of 3+ identical characters to one.
    cleaned = re.sub("(.)\\1{2,}", "\\1", cleaned)
    return lemmatization(word_tokenize(cleaned))

In [None]:
# Clean the 'Text' column of every dataset in place: clean_text returns a list
# of lemmatized tokens, which is then joined back into one space-separated string.
for frame in (dataset_1, dataset_2, dataset_3, dataset_4):
    frame['Text'] = frame['Text'].apply(lambda x: clean_text(x))
    frame['Text'] = frame['Text'].apply(lambda x: ' '.join(x))

# Running Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn import metrics
import itertools
from sklearn import svm
import xlsxwriter
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
def break_model(data):
    """Split a dataset into train and test portions (80% / 20%).

    Note that ``data`` is modified: its ``label`` column is cast to int before
    splitting. The split is shuffled with a fixed ``random_state`` of 42.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where X comes from ``data['Text']``
        and y from ``data.label``.
    """
    data.label = data.label.astype('int')
    parts = train_test_split(
        data['Text'],
        data.label,
        test_size=0.20,
        random_state=42,
        shuffle=True,
    )
    return tuple(parts)

def feature_extractor(X_train, X_test, y_train, y_test, data):
    """Build a word-level tf-idf vectorizer and fit it on the training texts.

    Parameters
    ----------
    X_train : iterable of str
        Training documents; the vectorizer is fitted on these.
    X_test, y_train, y_test, data
        Unused. Kept so that existing calls keep working.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer (unigrams and bigrams, English stop words
        removed, terms appearing in fewer than 2 documents ignored).
    """
    tfidf_vectorizer_ngram = TfidfVectorizer(analyzer= 'word', stop_words='english', ngram_range=(1, 2), min_df=2)
    # Only the fitted vectorizer is returned, so the transformed train/test
    # matrices and the label casts that used to be computed here (and then
    # discarded) are no longer produced.
    tfidf_vectorizer_ngram.fit(X_train)
    return tfidf_vectorizer_ngram

In [None]:
def _fit_and_tune(label, pipeline, param_grid, X_train, X_test, y_train, y_test, **grid_kwargs):
    """Fit one pipeline, print its test accuracy, then grid-search it and print
    the best parameters and the tuned test accuracy under the same label."""
    pipeline.fit(X_train, y_train)
    accuracy = np.mean(pipeline.predict(X_test) == y_test)
    print(label, accuracy)

    grid = GridSearchCV(pipeline, param_grid, **grid_kwargs)
    grid.fit(X_train, y_train)

    # print best parameter after tuning
    print(grid.best_params_)
    accuracy = np.mean(grid.predict(X_test) == y_test)
    print(label, accuracy)


def run_models(X_train, X_test, y_train, y_test, tfidf_vectorizer_ngram):
    """Train, evaluate and tune six text classifiers on one train/test split.

    For each classifier (Multinomial Naive Bayes, Logistic Regression, Linear
    SVM, Passive Aggressive, Decision Tree, Random Forest) a two-step pipeline
    ``tf-idf -> classifier`` is built around the given vectorizer. The pipeline
    is fitted and its test accuracy printed; then a GridSearchCV over the
    classifier-specific parameter grid is run and the best parameters plus the
    tuned test accuracy are printed.

    Parameters
    ----------
    X_train, X_test : texts for training / testing.
    y_train, y_test : matching labels.
    tfidf_vectorizer_ngram : TfidfVectorizer
        Used as the first step of every pipeline (the same instance is shared).

    Returns
    -------
    None
        Results are only printed.
    """
    # Each entry: (printed label, tf-idf step name, classifier step name,
    #              classifier, parameter grid, extra GridSearchCV keyword args).
    # Step names must match the prefixes used in the parameter-grid keys.
    experiments = [
        (
            "Multinomal Naive Bayes:",
            'nb_tfidf',
            'nb_clf',
            MultinomialNB(),
            {'nb_tfidf__ngram_range': [(1,2), (1,3)],
             'nb_clf__alpha': [0.1, 1.0],
             'nb_clf__fit_prior': [True, False]},
            {'scoring': 'accuracy'},
        ),
        (
            "Logistic Regression:",
            'LogRCV_tfidf_ngram',
            'LogR_clf',
            LogisticRegression(max_iter=10000),
            {'LogR_clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
             'LogR_clf__C': [100, 10],
             'LogR_clf__class_weight': [None, 'balanced'],
             'LogR_clf__multi_class': ['auto', 'ovr'],
             'LogR_clf__tol': [0.01, 0.1]},
            {'scoring': 'accuracy', 'verbose': True},
        ),
        (
            "Linear SVM Classifier:",
            'svm_tfidf',
            'svm_clf',
            svm.LinearSVC(max_iter=100000),
            {'svm_tfidf__ngram_range': [(1, 1), (1, 2),(1,3)],
             'svm_clf__tol': [0.0001, 0.01, 0.1],
             'svm_clf__loss': ['hinge', 'squared_hinge'],
             'svm_clf__class_weight': [None, 'balanced']},
            {},
        ),
        (
            "Passive Aggressive Classifier:",
            'pass_tfidf',
            'pas_clf',
            PassiveAggressiveClassifier(),
            {'pass_tfidf__ngram_range': [(1,1), (1, 2),(1,3)],
             'pas_clf__C': [1, 10],
             'pas_clf__fit_intercept': [True, False],
             'pas_clf__tol': [0.0001, 0.01, 0.1],
             'pas_clf__early_stopping': [True, False]},
            {},
        ),
        (
            "Decision Tree Classifier:",
            'decision_tfidf_ngram',
            'decision_clf',
            DecisionTreeClassifier(),
            {'decision_tfidf_ngram__ngram_range': [(1,2), (1,3)],
             'decision_clf__criterion': ["gini", "entropy"],
             'decision_clf__splitter': ["best", "random"],
             'decision_clf__max_depth': [30],
             'decision_clf__min_samples_split': [0.5, 1.0, 2],
             'decision_clf__class_weight': [None, 'balanced']},
            {},
        ),
        (
            # The tuned Random Forest result used to be printed under the
            # "Decision Tree Classifier:" label; both results now share this one.
            "Random Forest Classifier:",
            'forest_tfidf_ngram',
            'forest_clf',
            RandomForestClassifier(n_estimators=50),
            {'forest_clf__criterion': ["gini", "entropy"],
             'forest_clf__class_weight': [None, "balanced", "balanced_subsample"],
             'forest_clf__bootstrap': [True, False]},
            {},
        ),
    ]

    for label, tfidf_step, clf_step, classifier, param_grid, grid_kwargs in experiments:
        pipeline = Pipeline([
            (tfidf_step, tfidf_vectorizer_ngram),
            (clf_step, classifier),
        ])
        _fit_and_tune(label, pipeline, param_grid,
                      X_train, X_test, y_train, y_test, **grid_kwargs)

In [None]:
# Dataset 1: split into train/test, fit the tf-idf extractor on the training
# texts, then train, evaluate and tune every model. Results are printed.
(X_train, X_test, y_train, y_test) = break_model(dataset_1)
dataset1_extractor = feature_extractor(X_train, X_test, y_train, y_test, dataset_1)
run_models(X_train, X_test, y_train, y_test, dataset1_extractor)
print("\n\n")

# Dataset 2: same procedure (the split variables are overwritten).
(X_train, X_test, y_train, y_test) = break_model(dataset_2)
dataset2_extractor = feature_extractor(X_train, X_test, y_train, y_test, dataset_2)
run_models(X_train, X_test, y_train, y_test, dataset2_extractor)
print("\n\n")


# Dataset 3: same procedure.
(X_train, X_test, y_train, y_test) = break_model(dataset_3)
dataset3_extractor = feature_extractor(X_train, X_test, y_train, y_test, dataset_3)
run_models(X_train, X_test, y_train, y_test, dataset3_extractor)
print("\n\n")


# Dataset 4: same procedure.
(X_train, X_test, y_train, y_test) = break_model(dataset_4)
dataset4_extractor = feature_extractor(X_train, X_test, y_train, y_test, dataset_4)
run_models(X_train, X_test, y_train, y_test, dataset4_extractor)
print("\n\n")