# SVM Model
* Build base SVM model
* Over sampling with SMOTE
* Under sampling with RandomUnderSampler
* Compare performance of all three models
* Use Grid Search to tune hyperparameters
* Compare all the models and save the model with best performance
* Store all models' metrics to an excel file for report

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns; sns.set()
%matplotlib inline
import nltk
from sklearn.feature_extraction import text
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from nltk.probability import FreqDist
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics, model_selection, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc, classification_report
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Mount Google Drive at /content/drive so the project folder can be read and written (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read preprocessed train data (X & Y)

In [3]:
# Project root on the mounted Drive; the data and model paths used in this notebook are built from it.
path = '/content/drive/MyDrive/Guvi Files/Final Project2-V2/'

# Load the preprocessed training tweets and pull out the feature and target columns.
# 'lem_tweet' is presumably the lemmatized tweet text - confirm against the preprocessing notebook.
df = pd.read_pickle(path + 'data/train_tweets_clean.pkl')
train_x, train_y = df['lem_tweet'], df['label']

In [4]:
# NLTK's English stop-word list (the corpus is downloaded in the imports cell).
stop_words = stopwords.words('english')

## Split train & test from the training data (note: the actual test data provided has no labels)

In [5]:
# TF-IDF over unigrams and bigrams, with the NLTK English stop words removed
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words)

# Hold out 20% of the labelled tweets as a local test set (fixed seed for repeatability).
# NOTE(review): the split is not stratified although the classes are imbalanced -
# consider stratify=train_y; confirm before changing, since every score below would shift.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary and IDF weights from the training split only, then reuse them for the test split
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
%%time
#build svm model
svm_base_model = svm.SVC(C=1.0,kernel='linear',degree=3, gamma='auto', class_weight='balanced', random_state=20)
svm_base_model.fit(X_train_tfidf, y_train)

CPU times: user 2min 4s, sys: 316 ms, total: 2min 5s
Wall time: 2min 7s


In [7]:
# Predict labels for the held-out test split with the base linear SVM
y_pred = svm_base_model.predict(X_test_tfidf)

In [8]:
# Per-class precision / recall / F1 and support for the base model on the held-out split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5498
           1       0.75      0.67      0.71       408

    accuracy                           0.96      5906
   macro avg       0.86      0.83      0.84      5906
weighted avg       0.96      0.96      0.96      5906



## The result above is better than the V1 SVM model: keeping '#' tags and removing duplicates helped

## Print metrics

In [9]:
# Score the base model on the held-out split.
# precision / recall / F1 use scikit-learn's default binary averaging, i.e. they describe label 1.
(svm_base_accuracy,
 svm_base_precision,
 svm_base_recall,
 svm_base_f1) = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line
for metric_label, metric_value in (
    ("Accuracy: ", svm_base_accuracy),
    ("Precision: ", svm_base_precision),
    ("Recall: ", svm_base_recall),
    ("F1 Score: ", svm_base_f1),
):
    print(metric_label, metric_value)

Accuracy:  0.9617338300033864
Precision:  0.7513812154696132
Recall:  0.6666666666666666
F1 Score:  0.7064935064935064


In [10]:
# Registry of scores keyed by model name; later cells add one entry per model
# and the final cell exports the whole table to Excel.
svm_metrics = {
    'SVM - Base model': {
        'Accuracy': svm_base_accuracy,
        'Precision': svm_base_precision,
        'Recall': svm_base_recall,
        'F1 Score': svm_base_f1,
    },
}

## Over-sampling with SMOTE and evaluate the model

In [11]:
# import smote
from imblearn.over_sampling import SMOTE

In [12]:
# Over-sample the training data with SMOTE (seeded for repeatability).
# Only the training split is resampled; X_test_tfidf / y_test are left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

In [13]:
# Same linear SVM configuration as the base model, trained on the SMOTE-resampled matrix
svm_smote_model = svm.SVC(
    kernel='linear',
    C=1.0,
    degree=3,
    gamma='auto',
    class_weight='balanced',
    random_state=20,
)
svm_smote_model.fit(X_train_smote, y_train_smote)

# Evaluate on the original (un-resampled) test split
y_pred_smote = svm_smote_model.predict(X_test_tfidf)


In [14]:
# Score the SMOTE-trained model on the held-out split.
# precision / recall / F1 use scikit-learn's default binary averaging, i.e. they describe label 1.
(svm_smote_accuracy,
 svm_smote_precision,
 svm_smote_recall,
 svm_smote_f1) = (
    score_fn(y_test, y_pred_smote)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line
for metric_label, metric_value in (
    ("Accuracy: ", svm_smote_accuracy),
    ("Precision: ", svm_smote_precision),
    ("Recall: ", svm_smote_recall),
    ("F1 Score: ", svm_smote_f1),
):
    print(metric_label, metric_value)

Accuracy:  0.9419234676600068
Precision:  0.5743707093821511
Recall:  0.6151960784313726
F1 Score:  0.5940828402366864


In [15]:
# Record the SMOTE scores next to the base model's, then display the table so far
svm_metrics['SVM - SMOTE'] = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
    (svm_smote_accuracy, svm_smote_precision, svm_smote_recall, svm_smote_f1),
))
svm_metrics

{'SVM - Base model': {'Accuracy': 0.9617338300033864,
  'Precision': 0.7513812154696132,
  'Recall': 0.6666666666666666,
  'F1 Score': 0.7064935064935064},
 'SVM - SMOTE': {'Accuracy': 0.9419234676600068,
  'Precision': 0.5743707093821511,
  'Recall': 0.6151960784313726,
  'F1 Score': 0.5940828402366864}}

## Under-sample using RandomUnderSampler

In [16]:
# import RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler

In [17]:
# Randomly under-sample the training data (seeded for repeatability).
# Only the training split is resampled; X_test_tfidf / y_test are left untouched.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_tfidf, y_train)

In [18]:
%%time
# build SVM model using under sampled data
svm_rus_model = svm.SVC(C=1.0,kernel='linear',degree=3, gamma='auto', class_weight='balanced', random_state=20)
svm_rus_model.fit(X_train_rus, y_train_rus)
#predict using the svm_rus_model
y_pred_rus = svm_rus_model.predict(X_test_tfidf)

CPU times: user 3.38 s, sys: 7.92 ms, total: 3.39 s
Wall time: 3.52 s


In [19]:
# Score the under-sampled model on the held-out split.
# precision / recall / F1 use scikit-learn's default binary averaging, i.e. they describe label 1.
(svm_rus_accuracy,
 svm_rus_precision,
 svm_rus_recall,
 svm_rus_f1) = (
    score_fn(y_test, y_pred_rus)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line
for metric_label, metric_value in (
    ("Accuracy: ", svm_rus_accuracy),
    ("Precision: ", svm_rus_precision),
    ("Recall: ", svm_rus_recall),
    ("F1 Score: ", svm_rus_f1),
):
    print(metric_label, metric_value)

Accuracy:  0.8801219099221131
Precision:  0.34909456740442657
Recall:  0.8504901960784313
F1 Score:  0.49500713266761764


In [20]:
# Add the under-sampling scores to the registry, then display the table so far
svm_metrics.update({
    'SVM - RUS': {
        'Accuracy': svm_rus_accuracy,
        'Precision': svm_rus_precision,
        'Recall': svm_rus_recall,
        'F1 Score': svm_rus_f1,
    },
})
svm_metrics

{'SVM - Base model': {'Accuracy': 0.9617338300033864,
  'Precision': 0.7513812154696132,
  'Recall': 0.6666666666666666,
  'F1 Score': 0.7064935064935064},
 'SVM - SMOTE': {'Accuracy': 0.9419234676600068,
  'Precision': 0.5743707093821511,
  'Recall': 0.6151960784313726,
  'F1 Score': 0.5940828402366864},
 'SVM - RUS': {'Accuracy': 0.8801219099221131,
  'Precision': 0.34909456740442657,
  'Recall': 0.8504901960784313,
  'F1 Score': 0.49500713266761764}}

## Conclusion 1: SVM trained on the base (un-resampled) data has the best F1 score; under-sampling gives the highest recall but much lower precision

## Tune Hyperparameters using GridSearchCV

In [21]:
# Estimator template for the search: class-balanced SVC with a fixed seed.
# degree=3 is passed as before; it only matters for a 'poly' kernel, which is not in the grid.
svm_model = svm.SVC(class_weight='balanced', degree=3, random_state=20)

# Search space: 3 values of C x 3 values of gamma x 2 kernels = 18 candidates
params = {
    'C': [0.1, 1, 10],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['sigmoid', 'rbf'],
}

In [None]:
%%time
# GridSearchCV
svm_grid = GridSearchCV(estimator=svm_model, param_grid=params, cv=5, scoring='f1',verbose=3)
svm_grid.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.594 total time=  56.6s
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.601 total time=  56.5s
[CV 3/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.611 total time=  57.0s
[CV 4/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.604 total time=  56.5s
[CV 5/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.585 total time=  55.3s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.552 total time= 1.2min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.539 total time= 1.2min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.513 total time= 1.2min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.567 total time= 1.2min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.534 total time= 1.2min
[CV 1/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.388 total time= 1.2min
[CV 2/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;

In [23]:
# Summarise the search: best mean cross-validated F1 (.best_score_), the winning
# hyperparameters (.best_params_), and the estimator built from them (.best_estimator_)
print('F1 Score:', svm_grid.best_score_)
print('Best Hyperparameters:', svm_grid.best_params_)
print('Model object with best parameters: ', svm_grid.best_estimator_, sep='\n')

F1 Score: 0.6419744316286261
Best Hyperparameters: {'C': 1, 'gamma': 1, 'kernel': 'sigmoid'}
Model object with best parameters: 
SVC(C=1, class_weight='balanced', gamma=1, kernel='sigmoid', random_state=20)


In [25]:
# Reuse the estimator that GridSearchCV already refitted on the full training matrix
# (refit=True is the default) instead of re-typing the winning hyperparameters by hand:
# a hand-copied SVC(...) silently goes stale whenever the grid or the data changes,
# and fitting it again only repeats work the search has already done.
svm_tuned_model = svm_grid.best_estimator_
svm_tuned_model

In [26]:
# Predict labels for the held-out test split with the tuned SVM
y_pred_tuned = svm_tuned_model.predict(X_test_tfidf)

In [27]:
# Score the tuned model on the held-out split.
# precision / recall / F1 use scikit-learn's default binary averaging, i.e. they describe label 1.
(svm_tuned_accuracy,
 svm_tuned_precision,
 svm_tuned_recall,
 svm_tuned_f1) = (
    score_fn(y_test, y_pred_tuned)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the four scores, one per line
for metric_label, metric_value in (
    ("Accuracy: ", svm_tuned_accuracy),
    ("Precision: ", svm_tuned_precision),
    ("Recall: ", svm_tuned_recall),
    ("F1 Score: ", svm_tuned_f1),
):
    print(metric_label, metric_value)

Accuracy:  0.9576701659329495
Precision:  0.6872037914691943
Recall:  0.7107843137254902
F1 Score:  0.6987951807228916


In [28]:
# Add the tuned model's scores to the registry, then display the full comparison table
svm_metrics['SVM - Tuned'] = {
    'Accuracy': svm_tuned_accuracy,
    'Precision': svm_tuned_precision,
    'Recall': svm_tuned_recall,
    'F1 Score': svm_tuned_f1,
}
svm_metrics

{'SVM - Base model': {'Accuracy': 0.9617338300033864,
  'Precision': 0.7513812154696132,
  'Recall': 0.6666666666666666,
  'F1 Score': 0.7064935064935064},
 'SVM - SMOTE': {'Accuracy': 0.9419234676600068,
  'Precision': 0.5743707093821511,
  'Recall': 0.6151960784313726,
  'F1 Score': 0.5940828402366864},
 'SVM - RUS': {'Accuracy': 0.8801219099221131,
  'Precision': 0.34909456740442657,
  'Recall': 0.8504901960784313,
  'F1 Score': 0.49500713266761764},
 'SVM - Tuned': {'Accuracy': 0.9576701659329495,
  'Precision': 0.6872037914691943,
  'Recall': 0.7107843137254902,
  'F1 Score': 0.6987951807228916}}

## Conclusion:
1. In V1 SVM Base and SVM tuned performed similarly
2. In V2 SVM Base has better performance in terms of F1 Score

## Pickle and save svm_base_model

In [29]:
# Pickle and save svm_base_model (the model with the best F1 score).
# The context manager closes the file handle even if pickling fails, so the write is
# flushed to Drive instead of depending on garbage collection of an anonymous handle.
# NOTE(review): new text must be transformed with the fitted `tfidf` vectorizer before it can
# be passed to this model; the vectorizer is not saved here - confirm it is persisted elsewhere.
with open(path + 'models/svm_base_model.pkl', 'wb') as model_file:
    pickle.dump(svm_base_model, model_file)


In [30]:
# Save every model's scores (one row per model) to an Excel sheet for the report.
# The table gets its own name so the training DataFrame `df` loaded at the top of the
# notebook is not overwritten (re-running the cell that reads df['label'] would otherwise fail).
# NOTE(review): the file name says 'svn' rather than 'svm' - left unchanged in case the report
# reads this exact path; confirm before renaming.
svm_metrics_df = pd.DataFrame.from_dict(svm_metrics, orient='index')
with pd.ExcelWriter(path + 'models/svn_model_metrics.xlsx') as writer:
    svm_metrics_df.to_excel(writer, sheet_name='SVM')