In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
from sklearn.svm import SVC


In [None]:
"""
Cross-validation and grid search (GridSearchCV) are used for hyperparameter tuning.
Grid search fits one model per hyperparameter combination, scores each with
cross-validation, and keeps the best one.
"""

In [2]:
# Load the data set and separate the target column from the features.
df = pd.read_csv('spam.csv')
y = df.pop('spam')  # pop() removes 'spam' from df in place and returns it
X = df              # X is the same object as df (all remaining columns)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [3]:
# Grid search: score every (C, gamma) combination of an RBF-kernel SVC with
# shuffled 5-fold cross-validation on the training split.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
hyper_params = [{
    'gamma': [1e-2, 1e-3, 1e-4],
    'C': [1, 10, 100, 1000],
}]

# 3 gamma values x 4 C values = 12 candidate models. Each candidate is fit
# once per fold, so the search performs 12 * 5 = 60 fits, followed by one
# final refit of the best candidate on all of X_train (refit=True is the
# GridSearchCV default).
model = SVC(kernel="rbf")
model_cv = GridSearchCV(estimator=model,
                        param_grid=hyper_params,
                        scoring='accuracy',
                        cv=folds,
                        verbose=1,
                        return_train_score=True,  # adds the *_train_score columns to cv_results_
                        n_jobs=-1)  # use all available cores rather than a machine-specific count
model_cv.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:   30.2s
[Parallel(n_jobs=12)]: Done  60 out of  60 | elapsed:   53.7s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=12,
             param_grid=[{'C': [1, 10, 100, 1000],
                          'gamma': [0.01, 0.001, 0.0001]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=1)

In [4]:
# Tabulate the grid-search results: one row per (C, gamma) candidate, with
# timing columns plus per-split and aggregated train/test scores.
cv_results = pd.DataFrame(data=model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,3.944223,0.05577,0.682654,0.063937,1,0.01,"{'C': 1, 'gamma': 0.01}",0.820652,0.807065,0.838315,...,0.823641,0.010433,10,0.915421,0.927989,0.926291,0.920177,0.921535,0.922283,0.004488
1,3.36904,0.248778,0.565577,0.063781,1,0.001,"{'C': 1, 'gamma': 0.001}",0.807065,0.80163,0.809783,...,0.807065,0.00285,11,0.839334,0.836617,0.835598,0.841712,0.845448,0.839742,0.003565
2,3.4496,0.061611,0.580186,0.047737,1,0.0001,"{'C': 1, 'gamma': 0.0001}",0.747283,0.762228,0.747283,...,0.755435,0.006711,12,0.769361,0.765285,0.771739,0.768003,0.764946,0.767867,0.002547
3,4.895158,0.126496,0.644228,0.040303,10,0.01,"{'C': 10, 'gamma': 0.01}",0.850543,0.835598,0.865489,...,0.853261,0.010903,7,0.982677,0.986073,0.986413,0.987772,0.984035,0.985394,0.00181
4,3.124079,0.207478,0.517551,0.037779,10,0.001,"{'C': 10, 'gamma': 0.001}",0.873641,0.845109,0.880435,...,0.865489,0.012975,5,0.922554,0.929348,0.924592,0.927989,0.929008,0.926698,0.002669
5,3.140897,0.120459,0.469313,0.072522,10,0.0001,"{'C': 10, 'gamma': 0.0001}",0.868207,0.847826,0.860054,...,0.855163,0.009118,6,0.865489,0.873641,0.867867,0.873641,0.871264,0.87038,0.003234
6,5.750824,0.629944,0.681854,0.050176,100,0.01,"{'C': 100, 'gamma': 0.01}",0.835598,0.835598,0.870924,...,0.84837,0.013134,8,0.996264,0.997622,0.997622,0.998302,0.996264,0.997215,0.000815
7,5.099391,0.238063,0.446297,0.036213,100,0.001,"{'C': 100, 'gamma': 0.001}",0.903533,0.887228,0.908967,...,0.898098,0.00754,2,0.968071,0.974864,0.970448,0.973845,0.970448,0.971535,0.002481
8,3.932614,0.163147,0.347432,0.045313,100,0.0001,"{'C': 100, 'gamma': 0.0001}",0.910326,0.875,0.903533,...,0.896467,0.011857,3,0.923234,0.929348,0.919497,0.92697,0.924253,0.92466,0.003352
9,6.379658,0.283233,0.663041,0.041868,1000,0.01,"{'C': 1000, 'gamma': 0.01}",0.83288,0.820652,0.857337,...,0.840217,0.013845,9,0.998302,0.998981,0.999321,0.998981,0.998302,0.998777,0.000408


In [7]:
# Report the winning candidate: its mean cross-validated accuracy, its
# hyperparameters, and the refitted estimator, one per line.
print(
    model_cv.best_score_,
    model_cv.best_params_,
    model_cv.best_estimator_,
    sep='\n',
)

0.9138586956521738
{'C': 1000, 'gamma': 0.0001}
SVC(C=1000, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [10]:
# 3-fold cross-validated accuracy, one score per split.
# NOTE: `model` is the untuned SVC(kernel="rbf") that was handed to the grid
# search (default C and gamma), not model_cv.best_estimator_, so these scores
# describe the default hyperparameters. Pass model_cv.best_estimator_ instead
# to cross-validate the tuned model.
# The scores get their own name so that the grid-search results DataFrame
# stored in `cv_results` is not overwritten by this array.
baseline_cv_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
baseline_cv_scores

array([0.70660147, 0.71312143, 0.71533442])