In [29]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [30]:
df = pd.read_csv('data/small-data-proc.csv')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,target_encoded
0,2.362074,1.470039,0.441633,1.520508,-1.641380,-1.065624,0.150034,0.390391,-1.141347,-0.911345,...,-0.568474,0.107667,-0.573991,0.060431,0.738714,0.344006,0.224462,0.015576,-0.361662,0
1,-1.549877,0.573138,-3.410800,0.208847,0.063588,0.108842,-0.882114,1.101050,0.235559,0.808586,...,-0.895983,1.209265,-0.191432,0.307348,0.144694,0.327906,-0.024191,0.159230,0.257426,0
2,-2.355207,0.742339,-1.831391,0.907496,0.385618,-0.737358,0.782288,-0.189488,-0.403835,-0.373098,...,0.672292,-1.151802,-0.398247,0.012279,-0.168192,-0.172165,-0.374906,0.286748,0.790820,0
3,0.914082,-1.836789,1.182471,-0.810971,-0.955504,0.348837,0.539104,-0.350276,0.329440,-0.136489,...,0.391929,0.062351,0.505980,0.197659,0.099709,0.082185,-1.169788,0.641136,-1.583918,0
4,2.806884,-1.767765,-2.125613,0.292358,-3.223884,0.878356,0.597292,0.076083,-1.228277,1.406282,...,-6.686914,-2.292819,0.953884,1.695459,-2.747924,6.014440,-3.858811,1.074051,1.567524,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3702,-0.996163,-1.754557,1.839770,2.200369,0.188324,-2.170293,0.926515,-1.038025,2.110224,-0.690487,...,0.476494,1.553268,0.870252,-1.159454,-0.344648,-0.029891,-0.165434,0.097239,0.302724,0
3703,-1.791339,-0.465957,-0.581370,0.132726,0.240000,-0.797515,0.644898,-0.064089,-0.045678,0.180980,...,-0.526343,0.204700,0.083357,-0.328189,0.711965,0.415585,-0.224218,0.110719,-0.644359,0
3704,5.857167,1.672363,-3.541175,-0.262264,2.832429,-1.123075,-0.930276,-0.163315,0.513262,2.255137,...,-0.592097,1.731605,0.717432,0.946930,0.108542,0.690355,1.341345,-0.745431,0.197613,0
3705,-0.750934,-1.595330,0.831358,-1.780411,0.113803,1.373188,-1.230884,1.178736,0.835277,0.671154,...,0.328884,-0.025657,-1.010121,-0.351540,0.591530,0.988056,-0.285795,-0.277615,0.220822,0


In [31]:
from sklearn.model_selection import train_test_split

X = df.drop('target_encoded', axis=1)
y = df['target_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
df['target_encoded']

0       0
1       0
2       0
3       0
4       0
       ..
3702    0
3703    0
3704    0
3705    0
3706    0
Name: target_encoded, Length: 3707, dtype: int64

In [33]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, recall_score, precision_score, f1_score

clf = SVC()
params = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']}
scores = {'accuracy': make_scorer(accuracy_score), 
           'precision': make_scorer(precision_score, average='weighted'), 
           'recall': make_scorer(recall_score, average='weighted'), 
           'f1_score': make_scorer(f1_score, average='weighted')}
grid_clf = GridSearchCV(clf, params, scoring=scores, refit='f1_score')

grid_clf.fit(X_train, y_train)

In [34]:
best_params = grid_clf.best_params_
best_scores = grid_clf.best_score_
all_scores = pd.DataFrame(grid_clf.cv_results_)[['params','mean_test_accuracy','mean_test_precision', 'mean_test_recall', 'mean_test_f1_score', 'rank_test_f1_score']]
print('Best Parameters: ', best_params)
print('Best Scores: ', best_scores)

Best Parameters:  {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Scores:  0.9033674028533497


In [35]:
all_scores

Unnamed: 0,params,mean_test_accuracy,mean_test_precision,mean_test_recall,mean_test_f1_score,rank_test_f1_score
0,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}",0.915346,0.883316,0.915346,0.88509,8
1,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}",0.91602,0.839094,0.91602,0.875871,11
2,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}",0.915346,0.883316,0.915346,0.88509,8
3,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}",0.91602,0.839094,0.91602,0.875871,11
4,"{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}",0.91602,0.889368,0.91602,0.888345,5
5,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.916695,0.888432,0.916695,0.886604,7
6,"{'C': 1, 'gamma': 'auto', 'kernel': 'linear'}",0.91602,0.889368,0.91602,0.888345,5
7,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.91602,0.884719,0.91602,0.884116,10
8,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",0.916695,0.892403,0.916695,0.890958,3
9,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.910287,0.898722,0.910287,0.903367,1


In [36]:
best_model = grid_clf.best_estimator_
y_pred = best_model.predict(X_test)
best_model.score(X_test, y_test)
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Metrics of testing data')
print('Accuracy:', acc)
print('precision: ', prec)
print('Recall: ', rec)
print('F1 score: ', f1)

Metrics of testing data
Accuracy: 0.9164420485175202
precision:  0.48484848484848486
Recall:  0.26229508196721313
F1 score:  0.34042553191489355
