In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# general untility libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning libraries 
from sklearn import model_selection, naive_bayes, svm
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, LabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import scipy

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import f1_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

In [3]:
# read csv file 
df = pd.read_csv("C://GoogleDrive/dissertation/data/output/final.csv")

# get shape
df.shape

(27499, 8)

In [4]:
df.head()

Unnamed: 0,usrvrfd,label,txt,wrdsize,url,mention,charsize,puncsize
0,False,real,coronavirus disease covid affect eyes review c...,large,absent,present,large,more
1,False,real,covid vaccine candidate shows promise first pe...,large,absent,absent,medium,more
2,False,real,many front line staff self isolating took test...,large,present,absent,medium,less
3,False,real,please abandon pets due unfounded fear spreadi...,large,absent,present,large,more
4,False,real,notext nohashtags haraldharefoot nooter vinland,small,present,absent,small,more


## Perform encoding for ML algorithms
***

### Separate class variable with the rest
***

In [4]:
from sklearn.preprocessing import LabelEncoder

# create label encoder object
lbl_encoder = LabelEncoder()

# encode label feature to 0 and 1
y = lbl_encoder.fit_transform(df['label'])

# Get all the non label features to one variable 
X = df.drop(columns='label', axis=1)

### Split the dataset to train and test
***

In [5]:
# split dataset into training and test 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=12)

### Create pipeline
***

Ref: https://stackoverflow.com/questions/57867974/one-pipeline-to-fit-both-text-and-categorical-features

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer

ct = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=5000), 'txt'), #TfidfVectorizer accepts column name only between quotes
        ('category', OneHotEncoder(), ['usrvrfd','wrdsize','url','mention','charsize','puncsize']),
        ],
    )
pipe = Pipeline(
        steps=[
                ('preprocessor', ct),
                ('classifier', LogisticRegression()),
            ],
        )

df_encode_train = ct.fit_transform(X_train)
df_encode_test = ct.fit_transform(X_test)

In [7]:
df_encode_train.shape, df_encode_test.shape

((18424, 5014), (9075, 5014))

## Perform model training
***

### Create common fuctions and variables used by all the models
***

In [8]:
# evaluation scorers 
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score),
    'f1_score': make_scorer(f1_score)
}

# evaluation parameters
evaluation_params = ['mean_test_f1_score','mean_test_precision_score', 'mean_test_recall_score', 'mean_test_accuracy_score']

# a function that train all the ml models with different parameters 
def grid_search_wrapper(ml_model, param_grid, crss_val=5, refit_score='precision_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=crss_val)
    grid_search = GridSearchCV(ml_model, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(df_encode_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(df_encode_test)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    return grid_search

# a function to return the result in a sorted order in pandas dataframe form
def GetResult(grdSrchObj):
    # Thanks to the article for the help. https://towardsdatascience.com/fine-tuning-a-classifier-in-scikit-learn-66e048c21e65
    results = pd.DataFrame(grdSrchObj.cv_results_)
    results = results.sort_values(by='mean_test_precision_score', ascending=False)
    return results

### 1. Random Forest classifier
***

#### Without parameter tuning

In [37]:
rf = RandomForestClassifier()
rf.fit(df_encode_train,y_train)

# predict the labels on validation dataset
pred_rf = rf.predict(df_encode_test)

# Use accuracy_score function to get the accuracy
print("Random forest Accuracy without performance tuning -> ",accuracy_score(pred_rf, y_test)*100)

Random forest Accuracy without performance tuning ->  78.50137741046832


#### With parameter tuning

In [34]:
clf = RandomForestClassifier(n_jobs=-1)
rnd_frst_params = ['param_min_samples_split', 'param_n_estimators']
combined_params = evaluation_params + rnd_frst_params

param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300]
}

grid_search_clf = grid_search_wrapper(clf, param_grid, refit_score='precision_score')
results = GetResult(grid_search_clf)
results[combined_params].sort_values(by='mean_test_precision_score', ascending=False)

Best params for precision_score
{'min_samples_split': 5, 'n_estimators': 100}


Unnamed: 0,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_min_samples_split,param_n_estimators
2,0.918542,0.886212,0.953341,0.869789,5,100
1,0.918753,0.885986,0.954045,0.87006,3,300
0,0.918006,0.885272,0.95327,0.868866,3,100
3,0.91863,0.885092,0.954821,0.869735,5,300
5,0.918173,0.884493,0.954539,0.868975,10,300
4,0.918109,0.883999,0.954962,0.868812,10,100


### 2. Logistic regression
***

#### Without parameter tuning 

In [14]:
lr = LogisticRegression()
lr.fit(df_encode_train,y_train)

# predict the labels on validation dataset
pred_lr = lr.predict(df_encode_test)

# Use accuracy_score function to get the accuracy
print("Random forest Accuracy without performance tuning -> ",accuracy_score(pred_lr, y_test)*100)

Random forest Accuracy without performance tuning ->  77.63085399449035


#### With parameter tuning

In [38]:
# Initiate Logistic regression model
model_lr = LogisticRegression()
rnd_frst_params = ['param_solver', 'param_C']
combined_params = evaluation_params + rnd_frst_params

# set hyper parameters
param_grid = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'], 
    'C': [1, 1000, 10000000000]
}

grid_search_lr = grid_search_wrapper(model_lr, param_grid, refit_score='precision_score')
results = GetResult(grid_search_lr)
results[combined_params].sort_values(by='mean_test_precision_score', ascending=False).head(5)
# df_result = results[combined_params].round(3).head()
# df_result

Best params for precision_score
{'C': 1000, 'solver': 'lbfgs'}


Unnamed: 0,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_solver,param_C
4,0.930636,0.924239,0.93713,0.892423,lbfgs,1000
7,0.929956,0.923791,0.936214,0.891392,lbfgs,10000000000
5,0.916102,0.91907,0.913166,0.871201,liblinear,1000
3,0.915692,0.918888,0.912532,0.870604,newton-cg,1000
8,0.902898,0.915893,0.890399,0.852638,liblinear,10000000000


### 3. SVM Model
***

#### Without parameter tuning

In [52]:
svm = SVC()
svm.fit(df_encode_train,y_train)

# predict the labels on validation dataset
pred_svm = svm.predict(df_encode_test)

# Use accuracy_score function to get the accuracy
print("Random forest Accuracy without performance tuning -> ",accuracy_score(pred_svm, y_test)*100)

Random forest Accuracy without performance tuning ->  78.92011019283747


#### With parameter tuning

In [41]:
import timeit
start_time = timeit.default_timer()
 
model_svm = SVC()
svm_params = ['param_kernel', 'param_C']
combined_params = evaluation_params + svm_params

# define parameters 
param_grid = {
    'kernel': ['linear', 'sigmoid'], 
#     'C': [1, 1000, 10000000000]
    'C': [1]
}

grid_search_lr = grid_search_wrapper(model_svm, param_grid, refit_score='precision_score')
results = GetResult(grid_search_lr)
df_result = results[combined_params].round(3).head()

end_time = timeit.default_timer()
print('Time taken by SVM model: ', end_time - start_time) 
df_result

Time taken by SVM model:  261.9601743000003


Unnamed: 0,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_kernel,param_C
0,0.941,0.91,0.973,0.906,linear,1
1,0.843,0.814,0.874,0.75,sigmoid,1


## 4. Linear SVC model
***

#### Without parameter tuning

In [54]:
svm = LinearSVC()
svm.fit(df_encode_train,y_train)

# predict the labels on validation dataset
pred_svm = svm.predict(df_encode_test)

# Use accuracy_score function to get the accuracy
print("Linear SVC Accuracy (without performance tuning) -> ",accuracy_score(pred_svm, y_test)*100)

Linear SVC Accuracy (without performance tuning) ->  72.94765840220386


#### With parameter tuning

In [64]:
import timeit
start_time = timeit.default_timer()
 
model_lsvc = LinearSVC()
svm_params = ['param_penalty', 'param_loss','param_multi_class']
combined_params = evaluation_params + svm_params

# define parameters 
param_grid = {
    'penalty': ['l2'],
    'loss': ['hinge', 'squared_hinge'],
    'multi_class': ['ovr', 'crammer_singer']
}

grid_search_lr = grid_search_wrapper(model_lsvc, param_grid, refit_score='precision_score')
results = GetResult(grid_search_lr)
df_result = results[combined_params].round(3).head()

end_time = timeit.default_timer()
dur_secs = int(end_time - start_time)
dur_mins = round(dur_secs/60, 1)
print('Time taken by SVM model: {} seconds.'.format(dur_secs)) 
df_result

Best params for precision_score
{'loss': 'squared_hinge', 'multi_class': 'ovr', 'penalty': 'l2'}
Time taken by SVM model: 35 seconds.


Unnamed: 0,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_penalty,param_loss,param_multi_class
2,0.938,0.92,0.957,0.902,l2,squared_hinge,ovr
1,0.939,0.917,0.962,0.903,l2,hinge,crammer_singer
3,0.939,0.917,0.962,0.903,l2,squared_hinge,crammer_singer
0,0.941,0.91,0.973,0.906,l2,hinge,ovr


## 5. Naive Bayes 
***

#### Without parameter tuning

In [11]:
nb = naive_bayes.MultinomialNB()
nb.fit(df_encode_train,y_train)

# predict the labels on validation dataset
pred_nb = nb.predict(df_encode_test)

# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy (without performance tuning) -> ",accuracy_score(pred_nb, y_test)*100)

Random forest Accuracy without performance tuning ->  77.168044077135


#### With parameter tuning

In [33]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb_params = ['param_alpha', 'param_class_prior', 'param_fit_prior']
combined_params = evaluation_params + mnb_params
param_grid = {
    'alpha': [0.0001, 0.000000001, 1.0], 
    'class_prior' : [None],
    'fit_prior':[True, False]
}

grid_search_clf = grid_search_wrapper(mnb, param_grid, refit_score='accuracy_score')
results_mnb = GetResult(grid_search_clf)
results_mnb[combined_params].sort_values(by='mean_test_accuracy_score', ascending=False).head(5)

Best params for accuracy_score
{'alpha': 1e-09, 'class_prior': None, 'fit_prior': True}


Unnamed: 0,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_alpha,param_class_prior,param_fit_prior
2,0.917168,0.916188,0.918171,0.872286,1e-09,,True
0,0.91659,0.916938,0.916268,0.871581,0.0001,,True
3,0.910134,0.924171,0.896532,0.863656,1e-09,,False
1,0.909009,0.928502,0.89033,0.862733,0.0001,,False
4,0.884013,0.895028,0.873274,0.823546,1.0,,True


## 6. KNN (K-Nearest kneighbour)
***

#### Without parameter tuning

In [68]:
knn = KNeighborsClassifier()
knn.fit(df_encode_train,y_train)

# predict the labels on validation dataset
pred_knn = knn.predict(df_encode_test)

# Use accuracy_score function to get the accuracy
print("Nearest kneighbour -KNN- accuracy (without performance tuning) -> ",accuracy_score(pred_knn, y_test)*100)

Nearest kneighbour -KNN- accuracy (without performance tuning) ->  76.45179063360882


#### With parameter tuning

In [9]:
import timeit
start_time = timeit.default_timer()
 
model_KNN = KNeighborsClassifier()
knn_params = ['param_n_neighbors', 'param_p']
combined_params = evaluation_params + knn_params

# define parameters 
param_grid = {
    'n_neighbors': [5], 
    'p': [1]
}

grid_search_knn = grid_search_wrapper(model_KNN, param_grid, refit_score='precision_score')
results = GetResult(grid_search_knn)
df_result = results[combined_params].round(3).head()

end_time = timeit.default_timer()
print('Time taken by nearest kneighbour (KNN) model: {} secs.'.format(int(end_time - start_time))) 
df_result

Best params for precision_score
{'n_neighbors': 5, 'p': 1}
Time taken by nearest kneighbour (KNN) model: 193 secs.


Unnamed: 0,mean_test_f1_score,mean_test_precision_score,mean_test_recall_score,mean_test_accuracy_score,param_n_neighbors,param_p
0,0.888,0.863,0.915,0.822,5,1


# Deep learning algorithms
***

This is totally a different way and hence handling it separately

In [4]:
# KERAS library use for the neural network related work preprocessing and other important work
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.models import Model
from keras.optimizers import RMSprop
from keras import backend as K
from keras.callbacks import EarlyStopping

# Tensorflow provides key algorithm for modeling
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [6]:
df.head()

Unnamed: 0,usrvrfd,label,txt,wrdsize,url,mention,charsize,puncsize
0,False,real,coronavirus disease covid affect eyes review c...,large,absent,present,large,more
1,False,real,covid vaccine candidate shows promise first pe...,large,absent,absent,medium,more
2,False,real,many front line staff self isolating took test...,large,present,absent,medium,less
3,False,real,please abandon pets due unfounded fear spreadi...,large,absent,present,large,more
4,False,real,notext nohashtags haraldharefoot nooter vinland,small,present,absent,small,more


In [None]:
#categorical data
cat_columns = ['wrdsize', 'url', 'mention', 'charsize', 'puncsize', 'usrvrfd']

#import pandas as pd
df_encode = pd.get_dummies(df, columns = cat_columns)

# display output
df_encode.head(2)

In [19]:
max_words = 5000
max_len = df.shape[0]
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

sequences_matrix.shape, y_train.shape

((7, 27499), (18424,))

In [23]:
sequences_matrix

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 3],
       ...,
       [0, 0, 0, ..., 0, 0, 5],
       [0, 0, 0, ..., 0, 0, 6],
       [0, 0, 0, ..., 0, 0, 7]])

In [28]:
max_words = 5000
max_len = 150

# split data to train and test
X = df.txt
Y = df.label
le = LabelEncoder()
Y = le.fit_transform(Y)
Y = Y.reshape(-1,1)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.15)

# do text tokenization for machine to read the train data
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

# X_train,X_test,Y_train,Y_test
X_train_prcs = tok.texts_to_sequences(X_train)
X_train_prcs = sequence.pad_sequences(X_train_prcs, maxlen=max_len)

X_test_prcs = tok.texts_to_sequences(X_test)
X_test_prcs = sequence.pad_sequences(X_test_prcs, maxlen=max_len)

In [29]:
X_train_prcs.shape

(23374, 150)