In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
spammail=pd.read_csv('spambase.csv')

In [3]:
X=spammail.drop('spam',axis=1)

In [4]:
y=spammail['spam']

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [6]:
#Feature scaling-standardizing features by removing mean and scaling to unit variance
from sklearn.preprocessing import StandardScaler
scaler_x=StandardScaler()
X_Train=scaler_x.fit_transform(X_train)
X_Test=scaler_x.transform(X_test)

In [15]:
tuned_parameters_quad = [{'kernel':['poly'],'degree':[2],'C':[1,10,100,1000,10000,30000,50000]}]
tuned_parameters_linear=[{'kernel':['linear'],'C':[1,2]}]
tuned_parameters_rbf=[{'kernel':['rbf'],'C':[1,10,100,1000,10000]}]

In [8]:
#SVM model to predict if a mail is spam or non spam
#In order to vary regulation parameter C and decide an optimal value, we are using an exhaustive grid search
#C has been given the values of 1,10,100,1000 and 10000

def svmmodel(tuned_parameters):
  from sklearn.svm import SVC
  from sklearn.model_selection import GridSearchCV
  from sklearn.metrics import confusion_matrix
  svclassifier = GridSearchCV(SVC(), param_grid=tuned_parameters, scoring='accuracy',verbose=10,n_jobs=-1)
  svclassifier.fit(X_train, y_train)
  print('Scores:')
  means = svclassifier.cv_results_['mean_test_score']
  stds = svclassifier.cv_results_['std_test_score']
  for mean, std, params in zip(means, stds, svclassifier.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
  print('Best score:')
  print(svclassifier.best_params_)
  y_true_test, y_predtest = y_test, svclassifier.predict(X_test)
  y_true_train, y_predtrain = y_train, svclassifier.predict(X_train)
  cfmatrixtrain=confusion_matrix(y_true_train,y_predtrain)
  cfmatrixtest=confusion_matrix(y_true_test,y_predtest)
  cfmetrics(cfmatrixtrain,cfmatrixtest)

In [9]:
def cfmetrics(cfmatrixtrain,cfmatrixtest):  
  print('confusion matrix for training data:')
  print(cfmatrixtrain)
  TN=cfmatrixtrain[0][0]
  FN=cfmatrixtrain[1][0]
  TP=cfmatrixtrain[1][1]
  FP=cfmatrixtrain[0][1]
  accuracy_train=(TN+TP)/(TN+TP+FN+FP)
  precision_train=(TP)/(TP+FP)
  recall_train=TP/(TP+FN)
  print('Training accuracy')
  print(accuracy_train)
  print('Training precision')
  print(precision_train)
  print('Training recall')
  print(recall_train)
  print('confusion matrix for test data:')
  print(cfmatrixtest)
  TN=cfmatrixtest[0][0]
  FN=cfmatrixtest[1][0]
  TP=cfmatrixtest[1][1]
  FP=cfmatrixtest[0][1]
  accuracy_test=(TN+TP)/(TN+TP+FN+FP)
  precision_test=(TP)/(TP+FP)
  recall_test=TP/(TP+FN)
  print('Test accuracy')
  print(accuracy_test)
  print('Test precision')
  print(precision_test)
  print('Test recall')
  print(recall_test)

In [12]:
#Linear kernel function for SVM
svmmodel(tuned_parameters_linear)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 32.1min remaining: 32.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 54.8min remaining: 23.5min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 62.7min finished


Scores:
0.927 (+/-0.024) for {'C': 1, 'kernel': 'linear'}

0.926 (+/-0.022) for {'C': 2, 'kernel': 'linear'}

Best score:
{'C': 1, 'kernel': 'linear'}
confusion matrix for traing data:
[[2122  106]
 [ 142 1310]]
Training accuracy
0.9326086956521739
Training precision
0.9251412429378532
Training recall
0.9022038567493113
confusion matrix for traing data:
[[535  25]
 [ 34 327]]
Test accuracy
0.9359391965255157
Test precision
0.9289772727272727
Test recall
0.9058171745152355


In [16]:
#Quadratic kernel function for SVM
svmmodel(tuned_parameters_quad)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  32 out of  35 | elapsed: 15.7min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed: 19.1min finished


Scores:
0.671 (+/-0.023) for {'C': 1, 'degree': 2, 'kernel': 'poly'}

0.687 (+/-0.023) for {'C': 10, 'degree': 2, 'kernel': 'poly'}

0.709 (+/-0.030) for {'C': 100, 'degree': 2, 'kernel': 'poly'}

0.771 (+/-0.020) for {'C': 1000, 'degree': 2, 'kernel': 'poly'}

0.833 (+/-0.019) for {'C': 10000, 'degree': 2, 'kernel': 'poly'}

0.862 (+/-0.026) for {'C': 30000, 'degree': 2, 'kernel': 'poly'}

0.874 (+/-0.027) for {'C': 50000, 'degree': 2, 'kernel': 'poly'}

Best score:
{'C': 50000, 'degree': 2, 'kernel': 'poly'}
confusion matrix for traing data:
[[2182   64]
 [ 368 1066]]
Training accuracy
0.8826086956521739
Training precision
0.9433628318584071
Training recall
0.7433751743375174
confusion matrix for traing data:
[[529  13]
 [ 98 281]]
Test accuracy
0.8794788273615635
Test precision
0.95578231292517
Test recall
0.741424802110818


In [14]:
#RBF kernel function for SVM
svmmodel(tuned_parameters_rbf)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  21 out of  25 | elapsed:   13.3s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   16.3s finished


Scores:
0.707 (+/-0.022) for {'C': 1, 'kernel': 'rbf'}

0.734 (+/-0.015) for {'C': 10, 'kernel': 'rbf'}

0.821 (+/-0.015) for {'C': 100, 'kernel': 'rbf'}

0.902 (+/-0.011) for {'C': 1000, 'kernel': 'rbf'}

0.926 (+/-0.021) for {'C': 10000, 'kernel': 'rbf'}

Best score:
{'C': 10000, 'kernel': 'rbf'}
confusion matrix for traing data:
[[2122  106]
 [ 137 1315]]
Training accuracy
0.9339673913043478
Training precision
0.9254046446164673
Training recall
0.9056473829201102
confusion matrix for traing data:
[[535  25]
 [ 41 320]]
Test accuracy
0.9283387622149837
Test precision
0.927536231884058
Test recall
0.8864265927977839
