In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn import model_selection
from sklearn.svm import SVC

In [2]:
# Load the complete shot table from the comma-separated file and preview it.
data_full = pd.read_csv(filepath_or_buffer='dados.csv', sep=',')
display(data_full)

Unnamed: 0,MADE,SHOT TYPE,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),is_home,off_win_rate,def_blocks,def_win_rate,Age,Height,Weight,DRAFT NUMBER
0,1,2,1,675,13,1,0.621951,435,0.621951,29,195.072,100.000000,6
1,0,3,1,402,22,1,0.621951,435,0.621951,29,195.072,100.000000,6
2,0,2,1,205,6,1,0.621951,435,0.621951,29,195.072,100.000000,6
3,1,2,2,453,9,1,0.621951,435,0.621951,29,195.072,100.000000,6
4,0,2,2,393,6,1,0.621951,435,0.621951,29,195.072,100.000000,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209621,1,2,1,189,13,0,0.439024,460,0.560976,29,201.168,90.909091,40
209622,0,3,2,614,24,0,0.439024,460,0.560976,29,201.168,90.909091,40
209623,0,2,2,225,7,0,0.439024,460,0.560976,29,201.168,90.909091,40
209624,0,2,4,640,8,0,0.439024,460,0.560976,29,201.168,90.909091,40


In [3]:
# Work on a reproducible 20,000-row random subset of the full table
# (sampling without replacement; the fixed seed makes the draw repeatable).
data = data_full.sample(
    n=20000,
    replace=False,
    random_state=23,
)

### Normalizando os dados

In [4]:
# Min-max scale every column of the sampled table into [0, 1], producing a new
# frame and leaving `data` untouched.
#
# A constant column has max == min; dividing by that zero range would fill the
# column with NaN and break the SVC fit later on, so a zero range is replaced
# by 1 (the column then becomes all 0.0).
#
# NOTE(review): the minima/maxima are taken over the whole sample, i.e. before
# the train/test split done further down, so test rows influence the scaling.
# Consider computing them from the training rows only.
data_normalizado = (data - data.min()) / (data.max() - data.min()).replace(0, 1)

display(data_normalizado)

Unnamed: 0,MADE,SHOT TYPE,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),is_home,off_win_rate,def_blocks,def_win_rate,Age,Height,Weight,DRAFT NUMBER
183099,1.0,1.0,0.6,0.130070,0.316456,1.0,0.227273,0.436567,0.000000,0.541667,0.519651,0.285714,1.000000
173516,1.0,0.0,0.2,0.348252,0.012658,1.0,1.000000,1.000000,0.818182,0.541667,0.694323,0.452381,1.000000
204465,0.0,1.0,0.6,0.801399,0.316456,1.0,0.363636,0.436567,0.000000,0.500000,0.519651,0.206349,0.090909
121609,0.0,0.0,0.4,0.881119,0.000000,1.0,0.590909,0.597015,0.295455,0.500000,0.868996,0.746032,0.262626
11882,0.0,1.0,0.4,0.381818,0.329114,1.0,0.545455,0.350746,1.000000,0.083333,0.519651,0.365079,0.262626
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31855,0.0,1.0,0.0,0.307692,0.316456,1.0,0.704545,0.380597,0.045455,0.291667,0.606987,0.325397,0.535354
187623,0.0,0.0,0.6,0.167832,0.037975,0.0,0.727273,0.514925,0.318182,0.041667,0.519651,0.269841,0.363636
10570,0.0,0.0,0.0,0.367832,0.012658,1.0,0.545455,0.500000,0.386364,0.333333,0.563319,0.253968,0.515152
207474,1.0,1.0,0.8,0.327273,0.303797,0.0,0.363636,0.429104,0.090909,0.166667,0.737991,0.333333,0.161616


### Dividindo X e Y para o modelo

In [5]:
# Separate the target column ('MADE') from the remaining feature columns,
# materialising both as independent NumPy arrays.
y = data_normalizado['MADE'].to_numpy(copy=True)
X = data_normalizado.drop(columns=['MADE']).to_numpy(copy=True)

### Dividindo dados de treino e teste para o modelo

In [6]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=23,
)

## SVM

In [None]:
# Exhaustive search over C and gamma for an RBF-kernel SVM, scored with
# GridSearchCV's default cross-validation on the training split.
param_grid = {'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['rbf']}
# n_jobs=-1 spreads the independent candidate/fold fits over all CPU cores; the
# selected parameters are the same, the search just finishes sooner.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)
grid.fit(X_train, y_train)

# With refit=True, `grid` predicts with the best candidate retrained on all of X_train.
grid_predictions = grid.predict(X_test)

In [None]:
# Predict the test split once and reuse the result for both the error rate and
# the classification report.
y_pred = grid.predict(X_test)

# Ein / Eout: misclassification rate (1 - accuracy) on the training and test splits.
print('Ein: %0.4f' % (1 - accuracy_score(y_train, grid.predict(X_train))))
print('Eout: %0.4f' % (1 - accuracy_score(y_test, y_pred)))

# NOTE(review): the report saved with this cell lists 2000 test rows, whereas a
# 20% split of the 20,000-row sample gives 4000 -- the recorded output appears to
# come from an earlier run; re-run the notebook to refresh it.
print(classification_report(y_test, y_pred))

Ein: 0.3640
Eout: 0.3765
              precision    recall  f1-score   support

         0.0       0.63      0.71      0.67      1078
         1.0       0.61      0.52      0.56       922

    accuracy                           0.62      2000
   macro avg       0.62      0.62      0.62      2000
weighted avg       0.62      0.62      0.62      2000



### Abaixo temos, com base nos testes do gridsearch, os melhores parâmetros C e gamma para o modelo

In [None]:
# Show the parameter combination selected by the grid search above.
grid.best_params_

{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}

### Recriando o modelo com base nos melhores parâmetros encontrados no grid

In [None]:
# k is usually set to 5 or 10
kfold = KFold(n_splits=10, shuffle=True, random_state=0)

# Build the classifier from the grid-search result instead of hard-coding
# C/gamma, so this cell stays in sync whenever the search is re-run.
model = SVC(**grid.best_params_)

# Cross validation: mean score over the 10 shuffled folds of the training split
results = cross_val_score(model, X_train, y_train, cv=kfold)
results.mean()

0.611375