In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Importando os dados

In [13]:
df = pd.read_csv('dados_jogos.csv')
df = df[df['season']>2010]
df.head()

Unnamed: 0,season,date_game,team_1,team_2,team_1_mp,team_1_fg,team_1_fga,team_1_fg_pct,team_1_fg3,team_1_fg3a,...,team_2_trb_pct,team_2_ast_pct,team_2_stl_pct,team_2_blk_pct,team_2_tov_pct,team_2_usg_pct,team_2_off_rtg,team_2_def_rtg,team_2_ws,game_result
8569,2011,"Tue, Oct 26, 2010",BOS,MIA,,,,,,,...,,,,,,,,,,
8570,2011,"Fri, Oct 29, 2010",BOS,NYK,240.0,34.0,72.0,0.472,3.0,12.0,...,52.1,31.6,4.1,11.9,14.4,100.0,100.6,95.5,W 1,W
8571,2011,"Wed, Nov 3, 2010",BOS,MIL,240.0,42.0,81.0,0.519,7.0,21.0,...,50.6,59.3,4.7,7.1,17.0,100.0,89.5,106.0,L 1,W
8572,2011,"Fri, Nov 5, 2010",BOS,CHI,265.0,37.0,79.0,0.468,4.0,12.0,...,52.5,64.3,8.1,10.7,17.7,100.0,113.6,121.7,L 1,W
8573,2011,"Wed, Nov 17, 2010",BOS,WAS,265.0,45.0,78.0,0.577,7.0,12.0,...,56.6,62.2,9.8,6.7,15.3,100.0,118.4,102.1,W 1,W


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16683 entries, 8569 to 25251
Data columns (total 73 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   season                   16683 non-null  int64  
 1   date_game                16683 non-null  object 
 2   team_1                   16683 non-null  object 
 3   team_2                   16683 non-null  object 
 4   team_1_mp                16450 non-null  float64
 5   team_1_fg                16450 non-null  float64
 6   team_1_fga               16450 non-null  float64
 7   team_1_fg_pct            16450 non-null  float64
 8   team_1_fg3               16450 non-null  float64
 9   team_1_fg3a              16450 non-null  float64
 10  team_1_fg3_pct           16450 non-null  float64
 11  team_1_ft                16450 non-null  float64
 12  team_1_fta               16450 non-null  float64
 13  team_1_ft_pct            16449 non-null  float64
 14  team_1_orb              

In [15]:
df.columns

Index(['season', 'date_game', 'team_1', 'team_2', 'team_1_mp', 'team_1_fg',
       'team_1_fga', 'team_1_fg_pct', 'team_1_fg3', 'team_1_fg3a',
       'team_1_fg3_pct', 'team_1_ft', 'team_1_fta', 'team_1_ft_pct',
       'team_1_orb', 'team_1_drb', 'team_1_trb', 'team_1_ast', 'team_1_stl',
       'team_1_blk', 'team_1_tov', 'team_1_pf', 'team_1_pts', 'team_1_ts_pct',
       'team_1_efg_pct', 'team_1_fg3a_per_fga_pct', 'team_1_fta_per_fga_pct',
       'team_1_orb_pct', 'team_1_drb_pct', 'team_1_trb_pct', 'team_1_ast_pct',
       'team_1_stl_pct', 'team_1_blk_pct', 'team_1_tov_pct', 'team_1_usg_pct',
       'team_1_off_rtg', 'team_1_def_rtg', 'team_1_ws', 'team_2_mp',
       'team_2_fg', 'team_2_fga', 'team_2_fg_pct', 'team_2_fg3', 'team_2_fg3a',
       'team_2_fg3_pct', 'team_2_ft', 'team_2_fta', 'team_2_ft_pct',
       'team_2_orb', 'team_2_drb', 'team_2_trb', 'team_2_ast', 'team_2_stl',
       'team_2_blk', 'team_2_tov', 'team_2_pf', 'team_2_pts', 'team_2_ts_pct',
       'team_2_efg_pct

In [53]:
df['team_2_ws'] = df['team_2_ws'].apply(lambda x: int(x[1:].strip()) if x[0]=='W' else -int(x[1:].strip()))

In [56]:
df['team_1_ws'] = df['team_1_ws'].apply(lambda x: int(x[1:].strip()) if x[0]=='W' else -int(x[1:].strip()))

In [57]:
df.dropna(subset=['team_1_mp', 'team_1_fg',
       'team_1_fga', 'team_1_fg_pct', 'team_1_fg3', 'team_1_fg3a',
       'team_1_fg3_pct', 'team_1_ft', 'team_1_fta', 'team_1_ft_pct',
       'team_1_orb', 'team_1_drb', 'team_1_trb', 'team_1_ast', 'team_1_stl',
       'team_1_blk', 'team_1_tov', 'team_1_pf', 'team_1_pts', 'team_1_ts_pct',
       'team_1_efg_pct', 'team_1_fg3a_per_fga_pct', 'team_1_fta_per_fga_pct',
       'team_1_orb_pct', 'team_1_drb_pct', 'team_1_trb_pct', 'team_1_ast_pct',
       'team_1_stl_pct', 'team_1_blk_pct', 'team_1_tov_pct', 'team_1_usg_pct',
       'team_1_off_rtg', 'team_1_def_rtg', 'team_1_ws','team_2_mp',
       'team_2_fg', 'team_2_fga', 'team_2_fg_pct', 'team_2_fg3', 'team_2_fg3a',
       'team_2_fg3_pct', 'team_2_ft', 'team_2_fta', 'team_2_ft_pct',
       'team_2_orb', 'team_2_drb', 'team_2_trb', 'team_2_ast', 'team_2_stl',
       'team_2_blk', 'team_2_tov', 'team_2_pf', 'team_2_pts', 'team_2_ts_pct',
       'team_2_efg_pct', 'team_2_fg3a_per_fga_pct', 'team_2_fta_per_fga_pct',
       'team_2_orb_pct', 'team_2_drb_pct', 'team_2_trb_pct', 'team_2_ast_pct',
       'team_2_stl_pct', 'team_2_blk_pct', 'team_2_tov_pct', 'team_2_usg_pct',
       'team_2_off_rtg', 'team_2_def_rtg', 'team_2_ws'], inplace = True )

In [58]:
X = df[['team_1_mp', 'team_1_fg',
       'team_1_fga', 'team_1_fg_pct', 'team_1_fg3', 'team_1_fg3a',
       'team_1_fg3_pct', 'team_1_ft', 'team_1_fta', 'team_1_ft_pct',
       'team_1_orb', 'team_1_drb', 'team_1_trb', 'team_1_ast', 'team_1_stl',
       'team_1_blk', 'team_1_tov', 'team_1_pf', 'team_1_pts', 'team_1_ts_pct',
       'team_1_efg_pct', 'team_1_fg3a_per_fga_pct', 'team_1_fta_per_fga_pct',
       'team_1_orb_pct', 'team_1_drb_pct', 'team_1_trb_pct', 'team_1_ast_pct',
       'team_1_stl_pct', 'team_1_blk_pct', 'team_1_tov_pct', 'team_1_usg_pct',
       'team_1_off_rtg', 'team_1_def_rtg', 'team_1_ws','team_2_mp',
       'team_2_fg', 'team_2_fga', 'team_2_fg_pct', 'team_2_fg3', 'team_2_fg3a',
       'team_2_fg3_pct', 'team_2_ft', 'team_2_fta', 'team_2_ft_pct',
       'team_2_orb', 'team_2_drb', 'team_2_trb', 'team_2_ast', 'team_2_stl',
       'team_2_blk', 'team_2_tov', 'team_2_pf', 'team_2_pts', 'team_2_ts_pct',
       'team_2_efg_pct', 'team_2_fg3a_per_fga_pct', 'team_2_fta_per_fga_pct',
       'team_2_orb_pct', 'team_2_drb_pct', 'team_2_trb_pct', 'team_2_ast_pct',
       'team_2_stl_pct', 'team_2_blk_pct', 'team_2_tov_pct', 'team_2_usg_pct',
       'team_2_off_rtg', 'team_2_def_rtg', 'team_2_ws']]

y = df['game_result']

In [59]:
# Separando em validação e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y)

In [60]:
scaler = MinMaxScaler()
X_train_norma =  pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

In [9]:
# Definir o modelo SVC
svc = SVC(kernel='linear')

# Configurar o RFE para selecionar o número desejado de features
rfe = RFE(estimator=svc, step=1)

# Criar um pipeline que inclui RFE e o classificador SVC
pipeline = make_pipeline(rfe, svc)

# Treinar o pipeline
pipeline.fit(X_train_norma, y_train)

In [50]:
X_test_norma = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

# Fazer previsões no conjunto de teste
y_pred = pipeline.predict(X_test_norma)

# Avaliar o desempenho do modelo
accuracy = accuracy_score(y_test, y_pred)

print(f'Número de features selecionadas: {rfe.n_features_}')
print(f'Features selecionadas: {np.where(rfe.support_)[0]}')
print(f'Acurácia do modelo: {accuracy:.2f}')

Número de features selecionadas: 33
Features selecionadas: [ 0  1  2  3  4  5  6  7  8 13 16 18 19 20 21 22 26 29 31 34 35 36 37 38
 40 41 47 51 52 53 54 55 60]
Acurácia do modelo: 0.58


In [53]:
X_test.columns[[ 0, 1,  2,  3,  4,  5,  6,  7,  8, 13, 16, 18, 19, 20, 21, 22, 26, 29, 31, 34, 35, 36, 37, 38, 40, 41, 47, 51, 52, 53, 54, 55, 60]]

Index(['team_1_mp', 'team_1_fg', 'team_1_fga', 'team_1_fg_pct', 'team_1_fg3',
       'team_1_fg3a', 'team_1_fg3_pct', 'team_1_ft', 'team_1_fta',
       'team_1_ast', 'team_1_tov', 'team_1_pts', 'team_1_ts_pct',
       'team_1_efg_pct', 'team_1_fg3a_per_fga_pct', 'team_1_fta_per_fga_pct',
       'team_1_ast_pct', 'team_1_tov_pct', 'team_1_off_rtg', 'team_2_fg',
       'team_2_fga', 'team_2_fg_pct', 'team_2_fg3', 'team_2_fg3a', 'team_2_ft',
       'team_2_fta', 'team_2_stl', 'team_2_pts', 'team_2_ts_pct',
       'team_2_efg_pct', 'team_2_fg3a_per_fga_pct', 'team_2_fta_per_fga_pct',
       'team_2_stl_pct'],
      dtype='object')

# Usando o grid-search para descobrir os hiper-parametros iniciais da árvore

In [65]:
model_forest = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': list(range(10,201,10)),
    'max_depth': [4, 6, 8, 10, None]
}

# Executar a busca em grade
clf = GridSearchCV(model_forest, param_grid, cv=10, n_jobs=10)
clf.fit(X_train_norma, y_train)

In [66]:
clf.best_estimator_

In [67]:
rf = RandomForestClassifier(max_depth=8 ,random_state=42)

rf.fit(X_train_norma, y_train)

In [68]:
X_test_norma = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

rf.score(X_test_norma, y_test)

0.595621769534813

In [48]:
rf = RandomForestClassifier(max_depth=8, n_estimators=180, random_state=42)

selector = RFE(rf, step=1)
selector = selector.fit(X, y)

In [49]:
feature_names = selector.get_feature_names_out()

feature_names

array(['team_1_fg_pct', 'team_1_fg3_pct', 'team_1_ft_pct',
       'team_1_ts_pct', 'team_1_efg_pct', 'team_1_fg3a_per_fga_pct',
       'team_1_fta_per_fga_pct', 'team_1_orb_pct', 'team_1_drb_pct',
       'team_1_trb_pct', 'team_1_ast_pct', 'team_1_stl_pct',
       'team_1_blk_pct', 'team_1_tov_pct', 'team_1_off_rtg',
       'team_1_def_rtg', 'team_2_fg_pct', 'team_2_fg3_pct',
       'team_2_ft_pct', 'team_2_pts', 'team_2_ts_pct', 'team_2_efg_pct',
       'team_2_fg3a_per_fga_pct', 'team_2_fta_per_fga_pct',
       'team_2_orb_pct', 'team_2_drb_pct', 'team_2_trb_pct',
       'team_2_ast_pct', 'team_2_stl_pct', 'team_2_blk_pct',
       'team_2_tov_pct', 'team_2_off_rtg', 'team_2_def_rtg'], dtype=object)

In [51]:
X = df[['team_1_fg_pct', 'team_1_fg3_pct', 'team_1_ft_pct',
       'team_1_ts_pct', 'team_1_efg_pct', 'team_1_fg3a_per_fga_pct',
       'team_1_fta_per_fga_pct', 'team_1_orb_pct', 'team_1_drb_pct',
       'team_1_trb_pct', 'team_1_ast_pct', 'team_1_stl_pct',
       'team_1_blk_pct', 'team_1_tov_pct', 'team_1_off_rtg',
       'team_1_def_rtg', 'team_2_fg_pct', 'team_2_fg3_pct',
       'team_2_ft_pct', 'team_2_pts', 'team_2_ts_pct', 'team_2_efg_pct',
       'team_2_fg3a_per_fga_pct', 'team_2_fta_per_fga_pct',
       'team_2_orb_pct', 'team_2_drb_pct', 'team_2_trb_pct',
       'team_2_ast_pct', 'team_2_stl_pct', 'team_2_blk_pct',
       'team_2_tov_pct', 'team_2_off_rtg', 'team_2_def_rtg']]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

X_train_norma =  pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_norma = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

model_forest = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': list(range(10,201,10)),
    'max_depth': [4, 6, 8, 10, None]
}

# Executar a busca em grade
clf = GridSearchCV(model_forest, param_grid, cv=10, n_jobs=10)
clf.fit(X_train_norma, y_train)

In [52]:
rf = RandomForestClassifier(max_depth=10, n_estimators=130, random_state=42)

rf.fit(X_train_norma, y_train)
rf.score(X_test_norma, y_test)

0.582547886895713

In [22]:
model_svm = SVC()

param_grid = {
    'C': [np.power(2, x, dtype=float) for x in range(-5, 13, 2)],
    'gamma': [np.power(2, x, dtype=float) for x in range(-9, 4, 2)]
}

# Executar a busca em grade
clf = GridSearchCV(model_svm, param_grid, cv=5, n_jobs=10)
clf.fit(X_train_norma, y_train)


KeyboardInterrupt



In [63]:
model_svm = SVC()

model_svm.fit(X_train_norma, y_train)

In [64]:
X_test_norma = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns)

model_svm.score(X_test_norma, y_test)

0.5886287625418061