In [98]:
from functools import reduce
import nltk
import warnings
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer 
import string
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
import joblib
from sklearn.pipeline import make_pipeline

warnings.filterwarnings("ignore")

# Read data

In [16]:
data = pd.read_csv('train.csv', index_col=0)
X_train = data[['title', 'author', 'text']]
y_train = data[['label']]
X_test = pd.read_csv('test.csv', index_col=0)
y_test = pd.read_csv('submit.csv', index_col=0)
# Transform class names
y_test[y_test['label'] == 1] = 'Fake'
y_test[y_test['label'] == 0] = 'Truth'
y_train[y_train['label'] == 1] = 'Fake'
y_train[y_train['label'] == 0] = 'Truth'

In [22]:
X_test['title'] + ' ' + X_test['text']

id
20800    Specter of Trump Loosens Tongues, if Not Purse...
20801    Russian warships ready to strike terrorists ne...
20802    #NoDAPL: Native American Leaders Vow to Stay A...
20803    Tim Tebow Will Attempt Another Comeback, This ...
20804    Keiser Report: Meme Wars (E995) 42 mins ago 1 ...
                               ...                        
25995    The Bangladeshi Traffic Jam That Never Ends - ...
25996    John Kasich Signs One Abortion Bill in Ohio bu...
25997    California Today: What, Exactly, Is in Your Su...
25998    300 US Marines To Be Deployed To Russian Borde...
25999    Awkward Sex, Onscreen and Off - The New York T...
Length: 5200, dtype: object

# Create pre-processing class

In [178]:
class Transformer():
    def __init__(self):
        pass
    
    def transform(self, x):
        x['summary'] = x['title'] + ' ' + x['text']
        x['summary'] = x['summary'].map(Transformer.__remove_punctuation)
        x['summary'] = x['summary'].map(Transformer.__clean_text)
        x['summary'] = x['summary'].map(Transformer.__lemmatizer)
        x['summary'] = x['summary'].map(Transformer.__remove_stopwords)

        return x['summary']

    def fit(self, x, y):
        return self
    
    @staticmethod
    def __clean_text(text):
        text= text.lower()
        text= ' '.join(re.sub("(@[A-Za-z0-9]+)", " ", text).split()) #tags
        text= ' '.join(re.sub("^@?(\w){1,15}$", " ", text).split())

        text= ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())   #Links
        text= ' '.join(re.sub("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"," ", text).split()) 
        text= ' '.join(re.sub(r'http\S+', '',text).split())
        text= ' '.join(re.sub(r'www\S+', '',text).split())
        text= ' '.join(re.sub("\s+", " ",text).split()) #Extrem white Space
        text= ' '.join(re.sub("[^-9A-Za-z ]", "" ,text).split()) #digits 
        text= ' '.join(re.sub('-', ' ', text).split()) 
        text= ' '.join(re.sub('_', ' ', text).split()) #underscore     
        return text
    
    @staticmethod
    def __remove_stopwords(text):
        """The function to removing stopwords"""
        stop = stopwords.words('english')
        text = [word.lower() for word in text.split() if word.lower() not in stop]
        return " ".join(text)
    
    @staticmethod
    def __lemmatizer(text):
        """The function to apply lemmatizing"""
        word_list = nltk.word_tokenize(text)
        lemmatized_text = ' '.join([WordNetLemmatizer().lemmatize(w) for w in word_list])
        return lemmatized_text
    
    @staticmethod
    def __remove_punctuation(text):
        """The function to remove punctuation"""
        text = str(text)
        table = str.maketrans('', '', string.punctuation)
        return text.translate(table)    

# Fit models

## Random forest

In [185]:
# fit new model
model_RF = make_pipeline(Transformer(), CountVectorizer(), TfidfTransformer(), RandomForestClassifier())
model_RF = GridSearchCV(model_RF, {'randomforestclassifier__max_depth': [10, 20, 50, 100, 200],
                                   'randomforestclassifier__n_estimators': [100, 200, 400]}).fit(X_train, y_train).best_estimator_
model_RF

Pipeline(steps=[('transformer',
                 <__main__.Transformer object at 0x000002098C2E7340>),
                ('countvectorizer', CountVectorizer()),
                ('tfidftransformer', TfidfTransformer()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=200, n_estimators=400))])

In [186]:
# Dump model
joblib.dump(model_RF, 'models/RF.joblib')

['models/RF.joblib']

In [187]:
# Load fitted model
model_RF = joblib.load('models/RF.joblib')
model_RF

Pipeline(steps=[('transformer',
                 <__main__.Transformer object at 0x00000208D6AA8A90>),
                ('countvectorizer', CountVectorizer()),
                ('tfidftransformer', TfidfTransformer()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=200, n_estimators=400))])

## Logistic regression

In [188]:
# fit new model
model_LR = make_pipeline(Transformer(), CountVectorizer(), TfidfTransformer(), LogisticRegression(C=35)).fit(X_train, y_train)
#model_LR = GridSearchCV(model_LR, {'logisticregression__C': [0.1, 0.5, 1, 3, 7, 15, 20, 35, 50]}).fit(X_train, y_train).best_estimator_
model_LR

Pipeline(steps=[('transformer',
                 <__main__.Transformer object at 0x00000208E019A790>),
                ('countvectorizer', CountVectorizer()),
                ('tfidftransformer', TfidfTransformer()),
                ('logisticregression', LogisticRegression(C=35))])

In [189]:
# Dump model
joblib.dump(model_LR, 'models/LR.joblib')

['models/LR.joblib']

In [190]:
# Load fitted model
model_LR = joblib.load('models/LR.joblib')
model_LR

Pipeline(steps=[('transformer',
                 <__main__.Transformer object at 0x00000208E019A0D0>),
                ('countvectorizer', CountVectorizer()),
                ('tfidftransformer', TfidfTransformer()),
                ('logisticregression', LogisticRegression(C=35))])

# Models testing

## Naive Bayes

In [191]:
y_pred_NB = model_NB.predict(X_test)
accuracy_score(y_test, y_pred_NB)

0.6023076923076923

In [192]:
print(classification_report(y_test, y_pred_NB))

              precision    recall  f1-score   support

        Fake       0.67      0.54      0.60      2861
       Truth       0.55      0.68      0.61      2339

    accuracy                           0.60      5200
   macro avg       0.61      0.61      0.60      5200
weighted avg       0.62      0.60      0.60      5200



## Linear SVC

In [193]:
y_pred_SVC = model_SVC.predict(X_test)
accuracy_score(y_test, y_pred_SVC)

0.6375

In [194]:
print(classification_report(y_test, y_pred_SVC))

              precision    recall  f1-score   support

        Fake       0.69      0.63      0.65      2861
       Truth       0.59      0.65      0.62      2339

    accuracy                           0.64      5200
   macro avg       0.64      0.64      0.64      5200
weighted avg       0.64      0.64      0.64      5200



## Random forest

In [195]:
y_pred_RF = model_RF.predict(X_test)
accuracy_score(y_test, y_pred_RF)

0.6784615384615384

In [196]:
print(classification_report(y_test, y_pred_RF))

              precision    recall  f1-score   support

        Fake       0.74      0.64      0.69      2861
       Truth       0.62      0.73      0.67      2339

    accuracy                           0.68      5200
   macro avg       0.68      0.68      0.68      5200
weighted avg       0.69      0.68      0.68      5200



## Logistic regression

In [197]:
y_pred_LR = model_LR.predict(X_test)
accuracy_score(y_test, y_pred_LR)

0.6388461538461538

In [198]:
print(classification_report(y_test, y_pred_LR))

              precision    recall  f1-score   support

        Fake       0.69      0.63      0.66      2861
       Truth       0.59      0.65      0.62      2339

    accuracy                           0.64      5200
   macro avg       0.64      0.64      0.64      5200
weighted avg       0.64      0.64      0.64      5200

