# Assignment 1

### Ayush Yadav (MDS202315)

Build a prototype for sms spam classification <br>

In `train.ipynb` write the functions to 
1) Fit a model on train data
2) Score a model on given data
3) Evaluate the model predictions 
4) Validate the model:

   1) Fit on train
   2) Score on train and validation
   3) Evaluate on train and validation
   4) Fine-tune hyper-params using train and validation (if necessary)
5) Score three benchmark models on test data and select the best one 


In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from scipy import sparse
from scipy.sparse import load_npz

import pickle
import warnings

warnings.filterwarnings('ignore')

#### Importing the Data

In [3]:
TRAIN = pd.read_csv('./sms+spam+collection/TRAIN.csv').dropna()
VAL = pd.read_csv('./sms+spam+collection/VALIDATION.csv').dropna()
TEST = pd.read_csv('./sms+spam+collection/TEST.csv').dropna()

bow_msgs = load_npz('./sms+spam+collection/bag_of_words.npz')

with open('./sms+spam+collection/bag_of_words.pkl','rb') as f:
    bag_of_words = pickle.load(f)

In [4]:
train_x = TRAIN.drop(columns=['label'])
train_y = TRAIN.label

val_x = VAL.drop(columns=['label'])
val_y = VAL.label

test_x = TEST.drop(columns=['label'])
test_y = TEST.label

In [5]:
train_x = bag_of_words.transform(train_x['preprocessed'])
val_x = bag_of_words.transform(val_x['preprocessed'])
test_x = bag_of_words.transform(test_x['preprocessed'])

In [6]:
### Vectorizing using TF-IDF Vectorize

tfidf_transformer = TfidfTransformer().fit(bow_msgs)

X_train = tfidf_transformer.transform(train_x)
print("Training data shape:",X_train.shape)

X_val = tfidf_transformer.transform(val_x)
print("Validation data shape:",X_val.shape)

X_test = tfidf_transformer.transform(test_x)
print("Testing data shape:",X_test.shape)

Training data shape: (3785, 7947)
Validation data shape: (947, 7947)
Testing data shape: (834, 7947)


#### Fitting the Model

##### Naive Bayes

In [90]:
def naive_bayes(X_train, y_train, X_val, y_val, X_test, y_test):

    clf=MultinomialNB(fit_prior=True, class_prior=None)

    clf_parameters = {
        'clf__alpha':(0,1),
        }

    pipeline = Pipeline([('clf', clf)])

    parameters={**clf_parameters}

    grid = GridSearchCV(pipeline,parameters,n_jobs=-1,scoring='f1',cv=10)

    grid.fit(X_train, y_train)

    clf = grid.best_estimator_

    val_preds = clf.predict(X_val)
    test_preds = clf.predict(X_test)

    print('='*100)
    print('Performance on validation set\n'.upper())
    print("Accuracy: ",accuracy_score(y_val, val_preds))
    print("F1 Score: ",f1_score(y_val, val_preds))
    print(classification_report(y_val, val_preds))
    print('='*100)
    print('Performance on test set\n'.upper())
    print("Accuracy: ",accuracy_score(y_test, test_preds))
    print("F1 Score: ",f1_score(y_test, test_preds))
    print(classification_report(y_test, test_preds))
    print('='*100)

In [91]:
naive_bayes(X_train, train_y, X_val, val_y, X_test, test_y)

PERFORMANCE ON VALIDATION SET

Accuracy:  0.9640971488912354
F1 Score:  0.8454545454545455
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       820
           1       1.00      0.73      0.85       127

    accuracy                           0.96       947
   macro avg       0.98      0.87      0.91       947
weighted avg       0.97      0.96      0.96       947

PERFORMANCE ON TEST SET

Accuracy:  0.9532374100719424
F1 Score:  0.7891891891891892
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       722
           1       1.00      0.65      0.79       112

    accuracy                           0.95       834
   macro avg       0.97      0.83      0.88       834
weighted avg       0.96      0.95      0.95       834



##### Logistic Regression

In [70]:
def logistic_regression(X_train, y_train, X_val, y_val, X_test, y_test):

    clf = LogisticRegression(class_weight='balanced', n_jobs=-1)
    clf_parameters = {
                    'clf__solver':('newton-cg','lbfgs','liblinear','saga'),
                }

    pipeline = Pipeline([('clf', clf)])

    parameters={**clf_parameters}

    grid = GridSearchCV(pipeline,parameters,n_jobs=-1,scoring='f1',cv=10)

    grid.fit(X_train, y_train)

    clf = grid.best_estimator_

    val_preds = clf.predict(X_val)
    test_preds = clf.predict(X_test)

    print('='*100)
    print('Performance on validation set\n'.upper())
    print("Accuracy: ",accuracy_score(y_val, val_preds))
    print("F1 Score: ",f1_score(y_val, val_preds))
    print(classification_report(y_val, val_preds))
    print('='*100)
    print('Performance on test set\n'.upper())
    print("Accuracy: ",accuracy_score(y_test, test_preds))
    print("F1 Score: ",f1_score(y_test, test_preds))
    print(classification_report(y_test, test_preds))
    print('='*100)

In [94]:
logistic_regression(X_train, train_y, X_val, val_y, X_test, test_y)

PERFORMANCE ON VALIDATION SET

Accuracy:  0.9704329461457233
F1 Score:  0.889763779527559
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       820
           1       0.89      0.89      0.89       127

    accuracy                           0.97       947
   macro avg       0.94      0.94      0.94       947
weighted avg       0.97      0.97      0.97       947

PERFORMANCE ON TEST SET

Accuracy:  0.9688249400479616
F1 Score:  0.8818181818181818
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       722
           1       0.90      0.87      0.88       112

    accuracy                           0.97       834
   macro avg       0.94      0.93      0.93       834
weighted avg       0.97      0.97      0.97       834



##### Random Forest

In [73]:
def random_forest(X_train, y_train, X_val, y_val, X_test, y_test):

    clf = RandomForestClassifier(class_weight='balanced', max_depth=10)
    clf_parameters = {
                'clf__criterion':('gini', 'entropy'), 
                'clf__max_features':('sqrt', 'log2'),   
                'clf__n_estimators':(10, 30,50,100,200),
                'clf__max_depth':(10,20),
                } 

    pipeline = Pipeline([('clf', clf)])

    parameters={**clf_parameters}

    grid = GridSearchCV(pipeline,parameters,n_jobs=-1,scoring='f1',cv=10)

    grid.fit(X_train, y_train)

    clf = grid.best_estimator_

    val_preds = clf.predict(X_val)
    test_preds = clf.predict(X_test)

    print('='*100)
    print('Performance on validation set\n'.upper())
    print("Accuracy: ",accuracy_score(y_val, val_preds))
    print("F1 Score: ",f1_score(y_val, val_preds))
    print(classification_report(y_val, val_preds))
    print('='*100)
    print('Performance on test set\n'.upper())
    print("Accuracy: ",accuracy_score(y_test, test_preds))
    print("F1 Score: ",f1_score(y_test, test_preds))
    print(classification_report(y_test, test_preds))
    print('='*100)

In [74]:
random_forest(X_train, train_y, X_val, val_y, X_test, test_y)

PERFORMANCE ON VALIDATION SET

Accuracy:  0.9757127771911299
F1 Score:  0.9037656903765691
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       820
           1       0.96      0.85      0.90       127

    accuracy                           0.98       947
   macro avg       0.97      0.92      0.94       947
weighted avg       0.98      0.98      0.98       947

PERFORMANCE ON TEST SET

Accuracy:  0.973621103117506
F1 Score:  0.8921568627450981
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       722
           1       0.99      0.81      0.89       112

    accuracy                           0.97       834
   macro avg       0.98      0.91      0.94       834
weighted avg       0.97      0.97      0.97       834



##### Gradient Boosting

In [78]:
def grad_boost(X_train, y_train, X_val, y_val, X_test, y_test):

    clf = GradientBoostingClassifier()
    clf_parameters = {
                'clf__loss':('log_loss','exponential'),        
                'clf__criterion':('friedman_mse', 'squared_error'), 
                'clf__max_features':('sqrt', 'log2'),   
                'clf__n_estimators':(50,100,200),
                'clf__max_depth':(5,10),
                } 

    pipeline = Pipeline([('clf', clf)])

    parameters={**clf_parameters}

    grid = GridSearchCV(pipeline,parameters,n_jobs=-1,scoring='f1',cv=10)

    grid.fit(X_train, y_train)

    clf = grid.best_estimator_

    val_preds = clf.predict(X_val)
    test_preds = clf.predict(X_test)

    print('='*100)
    print('Performance on validation set\n'.upper())
    print("Accuracy: ",accuracy_score(y_val, val_preds))
    print("F1 Score: ",f1_score(y_val, val_preds))
    print(classification_report(y_val, val_preds))
    print('='*100)
    print('Performance on test set\n'.upper())
    print("Accuracy: ",accuracy_score(y_test, test_preds))
    print("F1 Score: ",f1_score(y_test, test_preds))
    print(classification_report(y_test, test_preds))
    print('='*100)

In [79]:
grad_boost(X_train, train_y, X_val, val_y, X_test, test_y)

PERFORMANCE ON VALIDATION SET

Accuracy:  0.9725448785638859
F1 Score:  0.8859649122807017
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       820
           1       1.00      0.80      0.89       127

    accuracy                           0.97       947
   macro avg       0.98      0.90      0.94       947
weighted avg       0.97      0.97      0.97       947

PERFORMANCE ON TEST SET

Accuracy:  0.9664268585131894
F1 Score:  0.8571428571428571
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       722
           1       1.00      0.75      0.86       112

    accuracy                           0.97       834
   macro avg       0.98      0.88      0.92       834
weighted avg       0.97      0.97      0.96       834



##### Support Vector Machine

In [82]:
def SVM(X_train, y_train, X_val, y_val, X_test, y_test):

    clf = SVC(class_weight='balanced')  
    clf_parameters = {
        'clf__C':(0.1,0.5,1,2,10,50,100),
        'clf__kernel': ('linear', 'rbf','poly')
        }

    pipeline = Pipeline([('clf', clf)])

    parameters={**clf_parameters}

    grid = GridSearchCV(pipeline,parameters,n_jobs=-1,scoring='f1',cv=10)

    grid.fit(X_train, y_train)

    clf = grid.best_estimator_

    val_preds = clf.predict(X_val)
    test_preds = clf.predict(X_test)

    print('='*100)
    print('Performance on validation set\n'.upper())
    print("Accuracy: ",accuracy_score(y_val, val_preds))
    print("F1 Score: ",f1_score(y_val, val_preds))
    print(classification_report(y_val, val_preds))
    print('='*100)
    print('Performance on test set\n'.upper())
    print("Accuracy: ",accuracy_score(y_test, test_preds))
    print("F1 Score: ",f1_score(y_test, test_preds))
    print(classification_report(y_test, test_preds))
    print('='*100)

In [83]:
SVM(X_train, train_y, X_val, val_y, X_test, test_y)

PERFORMANCE ON VALIDATION SET

Accuracy:  0.9725448785638859
F1 Score:  0.8925619834710744
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       820
           1       0.94      0.85      0.89       127

    accuracy                           0.97       947
   macro avg       0.96      0.92      0.94       947
weighted avg       0.97      0.97      0.97       947

PERFORMANCE ON TEST SET

Accuracy:  0.9772182254196643
F1 Score:  0.9107981220657277
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       722
           1       0.96      0.87      0.91       112

    accuracy                           0.98       834
   macro avg       0.97      0.93      0.95       834
weighted avg       0.98      0.98      0.98       834



#### Results

|          Model         | Validation Accuracy | Validation F1 Score | Test Accuracy | Test F1 Score |
|:----------------------:|---------------------|---------------------|---------------|---------------|
| Naive Bayes            | 0.96                | 0.85                | 0.95          | 0.79          |
| Logistic Regression    | 0.97                | 0.89                | 0.97          | 0.88          |
| Random Forest          | 0.97                | 0.90                | 0.97          | 0.89          |
| Gradient Boosting      | 0.97                | 0.89                | 0.97          | 0.86          |
| Support Vector Machine | 0.97                | 0.89                | 0.98          | 0.91          |

**Best Model (acc to F1 Score)**: Support Vector Machine Classifier