# SVM Model
* Build base SVM model
* Over sampling with SMOTE
* Under sampling with RandomUnderSampler
* Compare performance of all three models
* Use Grid Search to tune hyperparameters
* Compare all the models and save the model with best performance
* Store all models' metrics to an excel file for report

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns; sns.set()
%matplotlib inline
import nltk
from sklearn.feature_extraction import text
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from nltk.probability import FreqDist
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics, model_selection, svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc, classification_report
import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Mount Google Drive at /content/drive (Colab only); the data path used later
# in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read preprocessed train data (X & Y)

In [2]:
# Project folder on the mounted drive; everything below is read from or
# written to sub-folders of this path.
path = '/content/drive/MyDrive/Guvi Files/Final Project2(Tweets)/'

# Load the pickled training inputs and their targets from the data/ sub-folder.
train_x, train_y = (
    pd.read_pickle(f'{path}data/{file_name}')
    for file_name in ('lemmatized_data.pkl', 'train_target.pkl')
)

In [3]:
# English stop-word list from the NLTK corpus; handed to TfidfVectorizer later on
stop_words = stopwords.words('english')

## Split train & test from the training data (Note the actual test data given doesn't have label)

In [4]:
# Hold out 20% of the labelled data for evaluation; the seed is fixed so the
# split is repeatable.
# NOTE(review): no `stratify` argument is passed, so the class ratio of the
# two parts is left to chance — consider stratify=train_y.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams with the NLTK stop words removed.
# The vectorizer is fitted on the training part only and then applied
# unchanged to the held-out part.
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [5]:
%%time
# Base model: linear-kernel SVC on the TF-IDF features with balanced class
# weights. `degree` and `gamma` are still passed explicitly, although
# scikit-learn documents them as applying to non-linear kernels only.
svm_base_model = svm.SVC(
    kernel='linear',
    C=1.0,
    degree=3,
    gamma='auto',
    class_weight='balanced',
    random_state=20,
)
svm_base_model.fit(X_train_tfidf, y_train)

CPU times: user 1min 27s, sys: 590 ms, total: 1min 27s
Wall time: 1min 28s


In [6]:
# Predict labels for the held-out TF-IDF matrix with the fitted base model
y_pred = svm_base_model.predict(X_test_tfidf)

In [7]:
# Per-class precision / recall / F1 and support for the base model's predictions
base_report = classification_report(y_test, y_pred)
print(base_report)

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5937
           1       0.62      0.59      0.61       456

    accuracy                           0.95      6393
   macro avg       0.79      0.78      0.79      6393
weighted avg       0.94      0.95      0.94      6393



In [8]:
# Score the base model's held-out predictions with the four headline metrics.
# precision, recall and F1 are called with their default arguments.
svm_base_accuracy, svm_base_precision, svm_base_recall, svm_base_f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the values, one metric per line
print(
    f"Accuracy:  {svm_base_accuracy!s}",
    f"Precision:  {svm_base_precision!s}",
    f"Recall:  {svm_base_recall!s}",
    f"F1 Score:  {svm_base_f1!s}",
    sep="\n",
)

Accuracy:  0.9450961989676209
Precision:  0.620137299771167
Recall:  0.5942982456140351
F1 Score:  0.6069428891377379


In [9]:
# Start the results table: one entry per model label, each mapping
# metric name -> value. The base model is the first entry.
svm_metrics = {
    'SVM - Base model': dict(zip(
        ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
        (svm_base_accuracy, svm_base_precision, svm_base_recall, svm_base_f1),
    )),
}

## Over-sampling with SMOTE and evaluate the model

In [10]:
# import smote
from imblearn.over_sampling import SMOTE

In [11]:
# Over-sample the training data with SMOTE (fixed seed for repeatability).
# Only the training TF-IDF matrix is resampled; the held-out split is not touched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_tfidf, y_train)

In [12]:
# Same SVC configuration as the base model, trained on the SMOTE-resampled data
svm_smote_model = svm.SVC(
    kernel='linear',
    C=1.0,
    degree=3,
    gamma='auto',
    class_weight='balanced',
    random_state=20,
)
svm_smote_model.fit(X_train_smote, y_train_smote)

# Evaluate on the original (non-resampled) held-out split
y_pred_smote = svm_smote_model.predict(X_test_tfidf)


In [13]:
# Score the SMOTE model's held-out predictions with the four headline metrics
svm_smote_accuracy, svm_smote_precision, svm_smote_recall, svm_smote_f1 = (
    score_fn(y_test, y_pred_smote)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the values, one metric per line
print(
    f"Accuracy:  {svm_smote_accuracy!s}",
    f"Precision:  {svm_smote_precision!s}",
    f"Recall:  {svm_smote_recall!s}",
    f"F1 Score:  {svm_smote_f1!s}",
    sep="\n",
)

Accuracy:  0.900672610667918
Precision:  0.36779911373707536
Recall:  0.5460526315789473
F1 Score:  0.43954104148278905


In [14]:
# Add the SMOTE model's scores to the results table, then display the table
svm_metrics['SVM - SMOTE'] = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
    (svm_smote_accuracy, svm_smote_precision, svm_smote_recall, svm_smote_f1),
))
svm_metrics

{'SVM - Base model': {'Accuracy': 0.9450961989676209,
  'Precision': 0.620137299771167,
  'Recall': 0.5942982456140351,
  'F1 Score': 0.6069428891377379},
 'SVM - SMOTE': {'Accuracy': 0.900672610667918,
  'Precision': 0.36779911373707536,
  'Recall': 0.5460526315789473,
  'F1 Score': 0.43954104148278905}}

## Under-sample using RandomUnderSampler

In [15]:
# import RandomUnderSampler
from imblearn.under_sampling import RandomUnderSampler

In [16]:
# Randomly under-sample the training data (fixed seed for repeatability).
# Only the training TF-IDF matrix is resampled; the held-out split is not touched.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train_tfidf, y_train)

In [17]:
%%time
# Same SVC configuration as the base model, trained on the under-sampled data
svm_rus_model = svm.SVC(
    kernel='linear',
    C=1.0,
    degree=3,
    gamma='auto',
    class_weight='balanced',
    random_state=20,
)
svm_rus_model.fit(X_train_rus, y_train_rus)

# Evaluate on the original (non-resampled) held-out split
y_pred_rus = svm_rus_model.predict(X_test_tfidf)

CPU times: user 2.3 s, sys: 6 ms, total: 2.31 s
Wall time: 2.32 s


In [18]:
# Score the under-sampling model's held-out predictions with the four headline metrics
svm_rus_accuracy, svm_rus_precision, svm_rus_recall, svm_rus_f1 = (
    score_fn(y_test, y_pred_rus)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the values, one metric per line
print(
    f"Accuracy:  {svm_rus_accuracy!s}",
    f"Precision:  {svm_rus_precision!s}",
    f"Recall:  {svm_rus_recall!s}",
    f"F1 Score:  {svm_rus_f1!s}",
    sep="\n",
)

Accuracy:  0.8362271234162365
Precision:  0.27216653816499614
Recall:  0.7741228070175439
F1 Score:  0.40273816314888755


In [19]:
# Add the under-sampling model's scores to the results table, then display the table
svm_metrics['SVM - RUS'] = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
    (svm_rus_accuracy, svm_rus_precision, svm_rus_recall, svm_rus_f1),
))
svm_metrics

{'SVM - Base model': {'Accuracy': 0.9450961989676209,
  'Precision': 0.620137299771167,
  'Recall': 0.5942982456140351,
  'F1 Score': 0.6069428891377379},
 'SVM - SMOTE': {'Accuracy': 0.900672610667918,
  'Precision': 0.36779911373707536,
  'Recall': 0.5460526315789473,
  'F1 Score': 0.43954104148278905},
 'SVM - RUS': {'Accuracy': 0.8362271234162365,
  'Precision': 0.27216653816499614,
  'Recall': 0.7741228070175439,
  'F1 Score': 0.40273816314888755}}

## Conclusion 1: SVM trained on the base (non-resampled) data has the best accuracy, precision and F1 score; under-sampling only improves recall

## Tune Hyperparameters using GridSearchCV

In [20]:
# Estimator to tune: class-weighted SVC; C, gamma and kernel are supplied by the grid
svm_model = svm.SVC(class_weight='balanced', degree=3, random_state=20)

# Search space: 4 values of C x 3 values of gamma x 2 kernels = 24 candidates.
# Note that the linear kernel used by the base model is not part of this grid.
params = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01],
    'kernel': ['sigmoid', 'rbf'],
}

In [21]:
%%time
# 5-fold cross-validated grid search over `params`, ranking candidates by F1.
# NOTE(review): X_train_tfidf was produced by a vectorizer fitted on the whole
# training split, so every validation fold shares vocabulary/IDF statistics
# with its training folds — consider putting the vectorizer in a Pipeline.
svm_grid = GridSearchCV(
    estimator=svm_model,
    param_grid=params,
    scoring='f1',
    cv=5,
    verbose=3,
)
svm_grid.fit(X_train_tfidf, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.519 total time=  43.8s
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.541 total time=  44.7s
[CV 3/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.561 total time=  45.0s
[CV 4/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.539 total time=  45.3s
[CV 5/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.493 total time=  44.6s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.509 total time=  57.7s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.509 total time=  58.3s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.596 total time=  58.2s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.571 total time=  58.6s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.471 total time=  57.4s
[CV 1/5] END ..C=0.1, gamma=0.1, kernel=sigmoid;, score=0.218 total time=  54.2s
[CV 2/5] END ..C=0.1, gamma=0.1, kernel=sigmoid

In [22]:
# Report the winning candidate: its score (.best_score_), its hyperparameters
# (.best_params_) and the corresponding estimator object (.best_estimator_)
print(
    f"F1 Score: {svm_grid.best_score_!s}",
    f"Best Hyperparameters: {svm_grid.best_params_!s}",
    "Model object with best parameters: ",
    svm_grid.best_estimator_,
    sep="\n",
)

F1 Score: 0.5836969358896887
Best Hyperparameters: {'C': 100, 'gamma': 0.01, 'kernel': 'sigmoid'}
Model object with best parameters: 
SVC(C=100, class_weight='balanced', gamma=0.01, kernel='sigmoid',
    random_state=20)


In [23]:
# Build and fit an SVC with the tuned hyperparameters. The values are written
# out literally here rather than read from svm_grid.
# NOTE(review): GridSearchCV refits the best candidate by default, so
# svm_grid.best_estimator_ is presumably an equivalent, already-fitted model —
# confirm before dropping this refit.
svm_tuned_model = svm.SVC(
    kernel='sigmoid',
    C=100,
    gamma=0.01,
    class_weight='balanced',
    random_state=20,
)
svm_tuned_model.fit(X_train_tfidf, y_train)

In [24]:
# Predict labels for the held-out TF-IDF matrix with the tuned model
y_pred_tuned = svm_tuned_model.predict(X_test_tfidf)

In [25]:
# Score the tuned model's held-out predictions with the four headline metrics
svm_tuned_accuracy, svm_tuned_precision, svm_tuned_recall, svm_tuned_f1 = (
    score_fn(y_test, y_pred_tuned)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the values, one metric per line
print(
    f"Accuracy:  {svm_tuned_accuracy!s}",
    f"Precision:  {svm_tuned_precision!s}",
    f"Recall:  {svm_tuned_recall!s}",
    f"F1 Score:  {svm_tuned_f1!s}",
    sep="\n",
)

Accuracy:  0.9450961989676209
Precision:  0.620137299771167
Recall:  0.5942982456140351
F1 Score:  0.6069428891377379


In [26]:
# Add the tuned model's scores to the results table, then display the table
svm_metrics['SVM - Tuned'] = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
    (svm_tuned_accuracy, svm_tuned_precision, svm_tuned_recall, svm_tuned_f1),
))
svm_metrics

{'SVM - Base model': {'Accuracy': 0.9450961989676209,
  'Precision': 0.620137299771167,
  'Recall': 0.5942982456140351,
  'F1 Score': 0.6069428891377379},
 'SVM - SMOTE': {'Accuracy': 0.900672610667918,
  'Precision': 0.36779911373707536,
  'Recall': 0.5460526315789473,
  'F1 Score': 0.43954104148278905},
 'SVM - RUS': {'Accuracy': 0.8362271234162365,
  'Precision': 0.27216653816499614,
  'Recall': 0.7741228070175439,
  'F1 Score': 0.40273816314888755},
 'SVM - Tuned': {'Accuracy': 0.9450961989676209,
  'Precision': 0.620137299771167,
  'Recall': 0.5942982456140351,
  'F1 Score': 0.6069428891377379}}

## Conclusion 2: SVM Base and SVM Tuned produce identical metrics on the held-out split

## Pickle and save svm_base_model

In [27]:
# Pickle the fitted base model into the models/ sub-folder of the project path.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, instead of being left open until garbage collection.
# NOTE(review): the fitted `tfidf` vectorizer is not saved here, yet it is
# needed to turn raw text into features for this model — confirm it is
# persisted elsewhere.
with open(path + 'models/svm_base_model.pkl', 'wb') as model_file:
    pickle.dump(svm_base_model, model_file)


In [29]:
# Export the results table to an Excel workbook for the report:
# one row per model label, one column per metric, on a sheet named "SVM".
# NOTE(review): the file name is spelled 'svn_...' (not 'svm_...'); it is kept
# unchanged because other code or reports may already refer to it.
report_path = path + 'models/svn_model_metrics.xlsx'
with pd.ExcelWriter(report_path) as writer:
    df = pd.DataFrame.from_dict(svm_metrics, orient='index')
    df.to_excel(excel_writer=writer, sheet_name='SVM')