In [66]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose  import make_column_transformer, make_column_selector

In [2]:
df = pd.read_csv('Cases/Wisconsin/BreastCancer.csv', index_col = 0)

In [3]:
X = df.drop('Class', axis = 1)
y = df.Class
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 24, test_size = .3, stratify = y)
svm = SVC(kernel = 'linear')

# SVC
- probabilitybool, default=False
- - Whether to enable probability estimates. This must be enabled prior to calling fit, will slow down that method as it internally uses 5-fold cross-validation, and predict_proba may be inconsistent with predict

- random_state(int, RandomState instance or None, default=None- -  
Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when probability is False. Pass an int for reproducible output across multiple function calls.

In [5]:
svm.fit(X_train, y_train)

In [6]:
y_pred = svm.predict(X_test)
accuracy_score(y_test, y_pred)

0.9714285714285714

In [7]:
svm = SVC(kernel = 'linear', probability = True, random_state = 24)
svm.fit(X_train, y_train)

- predict_proba will only work when predict = True

In [9]:
y_pred_proba = svm.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred_proba)

0.9960748792270531

### Using GridSearchCV

In [11]:
svm = SVC(kernel = 'linear', probability = True, random_state = 24)
kfold = StratifiedKFold(n_splits= 5, shuffle = True , random_state =24)
params = {'C': np.linspace(0.001,5,10)}
gcv = GridSearchCV(svm, param_grid= params, cv = kfold, scoring = 'roc_auc', verbose= 3)
gcv.fit(X,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END ...........................C=0.001;, score=0.998 total time=   0.0s
[CV 2/5] END ...........................C=0.001;, score=0.997 total time=   0.0s
[CV 3/5] END ...........................C=0.001;, score=0.993 total time=   0.0s
[CV 4/5] END ...........................C=0.001;, score=0.991 total time=   0.0s
[CV 5/5] END ...........................C=0.001;, score=0.996 total time=   0.0s
[CV 1/5] END ..............C=0.5564444444444444;, score=0.995 total time=   0.0s
[CV 2/5] END ..............C=0.5564444444444444;, score=0.995 total time=   0.0s
[CV 3/5] END ..............C=0.5564444444444444;, score=0.995 total time=   0.0s
[CV 4/5] END ..............C=0.5564444444444444;, score=0.987 total time=   0.0s
[CV 5/5] END ..............C=0.5564444444444444;, score=0.997 total time=   0.0s
[CV 1/5] END ..............C=1.1118888888888887;, score=0.995 total time=   0.0s
[CV 2/5] END ..............C=1.1118888888888887;

In [12]:
print(gcv.best_params_)
print(gcv.best_score_)


{'C': 0.001}
0.9948458438559371


#### Polynomial Kernel
-kernel{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’


In [14]:
svm = SVC(kernel = 'poly', probability = True, random_state = 24)
kfold = StratifiedKFold(n_splits= 5, shuffle = True , random_state =24)
params = {'C': np.linspace(0.001,5,10),'degree': [2,3,4,5]}
gcv = GridSearchCV(svm, param_grid= params, cv = kfold, scoring = 'roc_auc', verbose= 3)
gcv.fit(X,y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END .................C=0.001, degree=2;, score=0.998 total time=   0.0s
[CV 2/5] END .................C=0.001, degree=2;, score=0.997 total time=   0.0s
[CV 3/5] END .................C=0.001, degree=2;, score=0.993 total time=   0.0s
[CV 4/5] END .................C=0.001, degree=2;, score=0.990 total time=   0.0s
[CV 5/5] END .................C=0.001, degree=2;, score=0.995 total time=   0.0s
[CV 1/5] END .................C=0.001, degree=3;, score=0.998 total time=   0.0s
[CV 2/5] END .................C=0.001, degree=3;, score=0.997 total time=   0.0s
[CV 3/5] END .................C=0.001, degree=3;, score=0.993 total time=   0.0s
[CV 4/5] END .................C=0.001, degree=3;, score=0.990 total time=   0.0s
[CV 5/5] END .................C=0.001, degree=3;, score=0.995 total time=   0.0s
[CV 1/5] END .................C=0.001, degree=4;, score=0.998 total time=   0.0s
[CV 2/5] END .................C=0.001, degree=4

In [15]:
print(gcv.best_score_, gcv.best_params_)

0.9949384143289424 {'C': 0.001, 'degree': 5}


#### Radial Kernel
- radial basis function
- also called Gaussian Kernel

In [17]:
svm = SVC(kernel = 'rbf', probability = True, random_state = 24)
kfold = StratifiedKFold(n_splits= 5, shuffle = True , random_state =24)
params = {'C': np.linspace(0.001,5,10),'gamma': np.linspace(0.001, 5, 10)} # gamma is inversely proportional to radial size.
gcv = GridSearchCV(svm, param_grid= params, cv = kfold, scoring = 'roc_auc', verbose= 3)
gcv.fit(X,y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END ..............C=0.001, gamma=0.001;, score=0.998 total time=   0.1s
[CV 2/5] END ..............C=0.001, gamma=0.001;, score=0.997 total time=   0.0s
[CV 3/5] END ..............C=0.001, gamma=0.001;, score=0.993 total time=   0.0s
[CV 4/5] END ..............C=0.001, gamma=0.001;, score=0.990 total time=   0.0s
[CV 5/5] END ..............C=0.001, gamma=0.001;, score=0.994 total time=   0.0s
[CV 1/5] END .C=0.001, gamma=0.5564444444444444;, score=0.994 total time=   0.0s
[CV 2/5] END .C=0.001, gamma=0.5564444444444444;, score=0.990 total time=   0.0s
[CV 3/5] END .C=0.001, gamma=0.5564444444444444;, score=0.988 total time=   0.0s
[CV 4/5] END .C=0.001, gamma=0.5564444444444444;, score=0.969 total time=   0.0s
[CV 5/5] END .C=0.001, gamma=0.5564444444444444;, score=0.975 total time=   0.0s
[CV 1/5] END .C=0.001, gamma=1.1118888888888887;, score=0.984 total time=   0.0s
[CV 2/5] END .C=0.001, gamma=1.111888888888888

In [18]:
print(gcv.best_score_, gcv.best_params_)

0.9947580674649219 {'C': 1.6673333333333331, 'gamma': 0.001}


## On HR_comma_sep dataset

In [20]:
hr = pd.read_csv('Cases/human-resources-analytics/HR_comma_sep.csv')

In [21]:
X= hr.drop('left', axis = 1 )
y= hr.left
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 24, train_size= 0.7)

In [22]:
ohe = OneHotEncoder(drop= 'first', handle_unknown= 'ignore')
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude= object)), (ohe, make_column_selector(dtype_include= object)))

In [24]:
scl_std = StandardScaler()

In [26]:
svm = SVC(kernel = 'linear', probability = True, random_state = 24)
pipe = Pipeline([('CT', ct),('SCL',scl_std),('SVM', svm)])
kfold= StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
params = {'SVM__C' : np.linspace(0.001, 5, 3)}
gcv = GridSearchCV(pipe, param_grid= params, cv= kfold, scoring= 'roc_auc', verbose= 3)
gcv.fit(X,y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV 1/5] END ......................SVM__C=0.001;, score=0.804 total time=  28.1s
[CV 2/5] END ......................SVM__C=0.001;, score=0.792 total time=  29.0s
[CV 3/5] END ......................SVM__C=0.001;, score=0.800 total time=  29.0s
[CV 4/5] END ......................SVM__C=0.001;, score=0.796 total time=  28.6s
[CV 5/5] END ......................SVM__C=0.001;, score=0.812 total time=  28.0s
[CV 1/5] END .........SVM__C=2.5004999999999997;, score=0.808 total time=  53.3s
[CV 2/5] END .........SVM__C=2.5004999999999997;, score=0.801 total time=  51.3s
[CV 3/5] END .........SVM__C=2.5004999999999997;, score=0.809 total time=  52.8s
[CV 4/5] END .........SVM__C=2.5004999999999997;, score=0.802 total time=  51.8s
[CV 5/5] END .........SVM__C=2.5004999999999997;, score=0.818 total time=  53.9s
[CV 1/5] END ........................SVM__C=5.0;, score=0.808 total time= 1.2min
[CV 2/5] END ........................SVM__C=5.0;,

KeyboardInterrupt: 

In [None]:
print(gcv.best_params_, gcv.best_score_, sep = "\n")

### for polynomial kernal

In [None]:
svm = SVC(kernel = 'poly', probability = True, random_state = 24)
pipe = Pipeline([('CT', ct),('SCL',scl_std),('SVM', svm)])
kfold= StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
params = {'SVM__C' : np.linspace(0.001, 5, 3), 'SVM__degree' : [2, 3]}
gcv = GridSearchCV(pipe, param_grid= params, cv= kfold, scoring= 'roc_auc', verbose= 3)
gcv.fit(X,y)

### for rbf

In [34]:
scl_mm = MinMaxScaler()

In [36]:
svm = SVC(probability = True, random_state = 24)
pipe = Pipeline([('CT', ct),('SCL',scl_mm),('SVM', svm)])
kfold= StratifiedKFold(n_splits = 5, shuffle = True, random_state = 24)
params = {'SVM__C' : np.linspace(0.001, 5, 3), 'SVM__gamma' : np.linspace(0.001, 5, 3)}
gcv = GridSearchCV(pipe, param_grid= params, cv= kfold, scoring= 'roc_auc', verbose= 3)
gcv.fit(X,y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.767 total time=  35.3s
[CV 2/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.771 total time=  36.2s
[CV 3/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.762 total time=  35.0s
[CV 4/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.761 total time=  34.9s
[CV 5/5] END ....SVM__C=0.001, SVM__gamma=0.001;, score=0.792 total time=  36.0s
[CV 1/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.931 total time=  39.0s
[CV 2/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.931 total time=  38.0s
[CV 3/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.941 total time=  38.2s
[CV 4/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.928 total time=  38.7s
[CV 5/5] END SVM__C=0.001, SVM__gamma=2.5004999999999997;, score=0.935 total time=  38.8s
[CV 1/5] END ......SVM__C=0.001, SVM__gamma=5.0;, score=0.936 total time=  41.6s
[CV 


KeyboardInterrupt



# multiclass svm

In [38]:
satellite = pd.read_csv("Cases/Satellite Imaging/Satellite.csv",sep = ';')

In [40]:
satellite.head(1)

Unnamed: 0,x.1,x.2,x.3,x.4,x.5,x.6,x.7,x.8,x.9,x.10,...,x.28,x.29,x.30,x.31,x.32,x.33,x.34,x.35,x.36,classes
0,92,115,120,94,84,102,106,79,84,102,...,104,88,121,128,100,84,107,113,87,grey soil


In [42]:
y= satellite.classes
X= satellite.drop('classes', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state= 24, train_size= 0.7, stratify = y)

#### Linear Kernel

In [71]:
svm = SVC (kernel = 'linear', probability= True, random_state= 24, decision_function_shape= 'ovr') #default= ovr
scaler_mm=MinMaxScaler()

In [73]:
pipe= Pipeline([('SCL', scaler_mm),('SVM', svm)])
kfold= StratifiedKFold(n_splits=5, shuffle= True, random_state=24)
params= {'SVM__C': np.linspace(0.001,5,3), 'SVM__decision_function_shape':['ovo','ovr']}
gcv= GridSearchCV(pipe, param_grid = params, cv = kfold, scoring= 'neg_log_loss', verbose= 3)
gcv.fit(X,y)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.487 total time=   9.0s
[CV 2/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.492 total time=   9.6s
[CV 3/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.526 total time=   9.6s
[CV 4/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.497 total time=   9.7s
[CV 5/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.472 total time=   9.3s
[CV 1/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.487 total time=   9.1s
[CV 2/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.492 total time=   9.6s
[CV 3/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.526 total time=   9.6s
[CV 4/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.497 total time=   9.5s
[CV 5/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.472 total time=   9.5s


In [83]:
print('neg_log_loss: ',gcv.best_score_ )
print('Best parameters:',gcv.best_params_)

neg_log_loss:  -0.33206246904927117
Best parameters: {'SVM__C': 5.0, 'SVM__decision_function_shape': 'ovo'}


In [89]:
svm = SVC (probability= True, random_state= 24, decision_function_shape= 'ovr') #default= ovr
scaler_mm=MinMaxScaler()

In [91]:
pipe= Pipeline([('SCL', scaler_mm),('SVM', svm)]) #kernel = 'kbf' is the default 
kfold= StratifiedKFold(n_splits=5, shuffle= True, random_state=24)
params= {'SVM__C': np.linspace(0.001,5,3), 'SVM__decision_function_shape':['ovo','ovr']}
gcv= GridSearchCV(pipe, param_grid = params, cv = kfold, scoring= 'neg_log_loss', verbose= 3)
gcv.fit(X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.613 total time=  18.7s
[CV 2/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.577 total time=  18.9s
[CV 3/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.607 total time=  18.8s
[CV 4/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.604 total time=  19.0s
[CV 5/5] END SVM__C=0.001, SVM__decision_function_shape=ovo;, score=-0.563 total time=  18.6s
[CV 1/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.613 total time=  18.4s
[CV 2/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.577 total time=  18.9s
[CV 3/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.607 total time=  18.8s
[CV 4/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.604 total time=  18.9s
[CV 5/5] END SVM__C=0.001, SVM__decision_function_shape=ovr;, score=-0.563 total time=  19.1s


In [93]:
print('neg_log_loss: ',gcv.best_score_ )
print('Best parameters:',gcv.best_params_)

neg_log_loss:  -0.2469870158619638
Best parameters: {'SVM__C': 5.0, 'SVM__decision_function_shape': 'ovo'}
