In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib
import sklearn
import keras
import sys

from pandas.plotting import scatter_matrix
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from sklearn import model_selection
from keras.layers import Dense
from keras import regularizers
from keras import optimizers


## Importação dos datasets

In [None]:
# Shot-attempt table (file name suggests the 2022-23 season); the CSV is semicolon-delimited.
fga = pd.read_csv('NBA_22_23_FGA.csv', delimiter=';')
fga.head()

In [None]:
# Lookup table with the name, code and statistics of every NBA team.
cod_teams = pd.read_csv('cod_teams.csv', delimiter=';')
cod_teams.head()

In [None]:
# Per-player table for the 2022-2023 NBA season: name, height, weight and draft position.
# The "Player" column is renamed to "PLAYER" so it shares a key name with the shot table.
info_players = pd.read_csv('players_info.csv').rename(columns={"Player": "PLAYER"})
info_players.head(10)

## Tratamento dos datasets

### cod_teams

In [None]:
# Win rate from the W21_22 / L21_22 columns: wins divided by games played.
games_played = cod_teams["W21_22"] + cod_teams["L21_22"]
cod_teams['win_rate'] = cod_teams["W21_22"] / games_played

# The raw win/loss counts are not needed once the rate exists.
cod_teams = cod_teams.drop(columns=['W21_22', 'L21_22'])
cod_teams.head()

### info_players

In [None]:
# Normalise DRAFT NUMBER to an integer column:
#   - missing values become 100
#   - the text "Undrafted" becomes 100
#   - everything is then cast to int
info_players['DRAFT NUMBER'] = (
    info_players['DRAFT NUMBER']
    .fillna(100)
    .replace('Undrafted', 100)
    .astype(int)
)

info_players.head()

In [None]:
# Convert "Height" from "feet-inches" text (e.g. "6-10") to centimetres and
# "Weight" from pounds to kilograms (using 1 kg = 2.20 lb).
#
# The dash separates feet from inches, so it must not be read as a decimal point:
# "6-10" is 6 ft 10 in (208.28 cm), not 6.10 ft (185.9 cm), and "6-1" != "6-10".
# Assumes the part after the dash is whole inches.
CM_PER_FOOT = 30.48
CM_PER_INCH = 2.54
LB_PER_KG = 2.20


def _height_to_cm(height):
    """Return a "feet-inches" height such as "6-10" in centimetres (NaN if missing)."""
    if pd.isna(height):
        return np.nan
    feet, _, inches = str(height).partition('-')
    # A value without a dash (e.g. "7") is treated as whole feet.
    return int(feet) * CM_PER_FOOT + int(inches or 0) * CM_PER_INCH


info_players['Height'] = info_players['Height'].map(_height_to_cm)
info_players['Weight'] = info_players['Weight'].astype(float) / LB_PER_KG

info_players.head()

### fga

In [None]:
# Preview the shot table before the transformations below.
fga.head()

In [None]:
# 1) Encode the shot outcome: "✔ Made Shot" -> 1 and "✖ Missed Shot" -> 0
shot_outcome_codes = {'✔ Made Shot': 1, '✖ Missed Shot': 0}
fga['MADE'] = fga['MADE'].replace(shot_outcome_codes)

In [None]:
# 2) Encode the shot type by its point value: "2PT Field Goal" -> 2, "3PT Field Goal" -> 3
shot_type_points = {'2PT Field Goal': 2, '3PT Field Goal': 3}
fga['SHOT TYPE'] = fga['SHOT TYPE'].replace(shot_type_points)

In [None]:
# 3) Remove columns that the model does not need.
# Note: the date column's name contains a non-breaking space (\xa0), not a regular space.
fga.drop(columns=['BOXSCORE', 'Game\xa0Date'], inplace=True)
fga.head()

In [None]:
# 4) Flag whether the shooting team is the home team of the game
# 5) Attach the shooting team's win rate from the previous season
# 6) Convert "TIME REMAINING" from "MM:SS" text to seconds

# One row per team name, so every lookup is a single vectorised map instead of a
# DataFrame filter per shot. keep='first' mirrors taking the first match per name.
teams_by_name = cod_teams.drop_duplicates(subset='name', keep='first').set_index('name')

# Fail loudly on a team that has no row in cod_teams rather than silently producing NaN.
unknown_teams = fga.loc[~fga['TEAM'].isin(teams_by_name.index), 'TEAM'].unique().tolist()
if unknown_teams:
    raise ValueError(f"Teams in fga['TEAM'] missing from cod_teams['name']: {unknown_teams}")

# 4) The shooter is at home when its team code equals the game's home-team code (HTM).
fga['is_home'] = (fga['TEAM'].map(teams_by_name['cod']) == fga['HTM']).astype(int)

# 5)
fga['off_win_rate'] = fga['TEAM'].map(teams_by_name['win_rate'])

# 6) minutes * 60 + seconds
clock_parts = fga['TIME REMAINING'].str.split(':', expand=True)
fga['TIME REMAINING'] = clock_parts[0].astype(int) * 60 + clock_parts[1].astype(int)

In [None]:
# 7) Attach the defending team's number of blocks from the previous season
# 8) Attach the defending team's win rate from the previous season

# One row per team code, so every lookup is a single vectorised map instead of a
# DataFrame filter per shot. keep='first' mirrors taking the first match per code.
teams_by_cod = cod_teams.drop_duplicates(subset='cod', keep='first').set_index('cod')

# The defender is the side that did not shoot: the visitor (VTM) when the shooter
# is at home, otherwise the home team (HTM).
defender_cod = fga['VTM'].where(fga['is_home'] == 1, fga['HTM'])

# Fail loudly on a code that has no row in cod_teams rather than silently producing NaN.
unknown_codes = defender_cod[~defender_cod.isin(teams_by_cod.index)].unique().tolist()
if unknown_codes:
    raise ValueError(f"Team codes missing from cod_teams['cod']: {unknown_codes}")

fga['def_blocks'] = defender_cod.map(teams_by_cod['blocks'])
fga['def_win_rate'] = defender_cod.map(teams_by_cod['win_rate'])

In [None]:
# Preview the player table that is merged into the shot table next.
info_players.head()

In [None]:
# Attach each shooter's attributes to the shot rows. The join is an inner join on
# PLAYER, so shots by players with no row in info_players are dropped.
fga = pd.merge(fga, info_players, how='inner', on='PLAYER')
fga.head()

In [None]:
# Remove the columns that will not be used by the model.
columns_not_for_model = ['PLAYER', 'PLAY TYPE', 'HTM', 'VTM']
fga.drop(columns=columns_not_for_model, inplace=True)

fga.head()

## Análise exploratória

In [None]:
import plotly.express as px

# Which teams attempt the most shots? (five teams with the most rows in fga)
top_attempts = fga.value_counts('TEAM').head(5)

fig = px.bar(
    top_attempts,
    x=top_attempts.index,
    y=top_attempts.values,
    title='Times que mais arriscam arremessos',
)
fig.update_layout(xaxis_title="Times", yaxis_title="Número de arremessos")
# The y axis is restricted to the fixed band [7000, 8000].
fig.update_yaxes(range=[7000, 8000])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()


In [None]:
# And which teams have the most made shots? (five teams with the most MADE == 1 rows)
top_made = fga[fga["MADE"] == 1].value_counts('TEAM').head(5)

fig = px.bar(
    top_made,
    x=top_made.index,
    y=top_made.values,
    title='Times que mais possuem arremessos convertidos',
)
fig.update_layout(xaxis_title="Times", yaxis_title="Número de arremessos convertidos")
# The y axis is restricted to the fixed band [3000, 4000].
fig.update_yaxes(range=[3000, 4000])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()

In [None]:
# And which teams have the fewest made shots? (five teams with the fewest MADE == 1 rows)
bottom_made = fga[fga["MADE"] == 1].value_counts('TEAM').tail(5)

fig = px.bar(
    bottom_made,
    x=bottom_made.index,
    y=bottom_made.values,
    # The title now says "menos" (fewest); it previously repeated the "mais" (most)
    # title of the preceding chart.
    title='Times que menos possuem arremessos convertidos',
)
fig.update_layout(xaxis_title="Times", yaxis_title="Número de arremessos convertidos")
# NOTE(review): the y axis is fixed to [3000, 4000]; bars below 3000 would be cut off — confirm against the data.
fig.update_yaxes(range=[3000, 4000])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()

In [None]:
# Which teams have the highest shot-conversion rate? (made shots / attempts, top five)
made_per_team = fga[fga["MADE"] == 1].value_counts('TEAM')
attempts_per_team = fga.value_counts('TEAM')

hit_rate = made_per_team / attempts_per_team
hit_rate.sort_values(ascending=False).head(5).map('{:.2%}'.format)

In [None]:
# Which teams have the highest miss rate? (missed shots / attempts, top five)
missed_per_team = fga[fga["MADE"] == 0].value_counts('TEAM')
attempts_per_team = fga.value_counts('TEAM')

miss_rate = missed_per_team / attempts_per_team
miss_rate.sort_values(ascending=False).head(5).map('{:.2%}'.format)

In [None]:
# 9) Remove the "TEAM" column; nothing below this point uses it.
fga.drop(columns='TEAM', inplace=True)

## Dataset após a análise exploratória e tratamento dos dados

In [None]:
# Report the (rows, columns) of the final table, then preview it.
final_shape = fga.shape
print(f"Shape: {final_shape}")
fga.head()

In [None]:
# Feature matrix: every column except the target. Target vector: the MADE column.
feature_frame = fga.drop(columns='MADE')
X = np.array(feature_frame)
y = np.array(fga['MADE'])

### Matriz de correlação

In [None]:
# Pairwise correlation of every column in fga, annotated to one decimal place.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(fga.corr(), annot=True, fmt='.1f', ax=ax)
plt.show()

## SVM

In [None]:
from sklearn.svm import SVC

# The decision-boundary helper evaluates the model on a 2-column grid, so the
# classifier drawn here is trained on the same two columns that are scattered.
# A model fitted on every column of X rejects that grid with a feature-count error.
X_plot = X[:, :2]

# NOTE(review): degree=300 is kept from the original experiment; a polynomial kernel
# of that order is likely to be numerically unstable on unscaled features — confirm.
clf = SVC(kernel='poly', degree=300).fit(X_plot, y)

plt.scatter(X_plot[:, 0], X_plot[:, 1], c=y, s=50, cmap='autumn')
# NOTE(review): plot_svc_decision_function is defined in the following cell, so on
# a fresh kernel that cell has to be executed before this one.
plot_svc_decision_function(clf)

# plot support vectors
plt.scatter(clf.support_vectors_[:, 0],
            clf.support_vectors_[:, 1],
            s=10, linewidth=10, facecolors='k')
print("#VS : " + str(len((clf.support_vectors_))))

In [None]:
import numpy as np

def plot_svc_decision_function(model, ax=None):
    """Draw the decision boundary and margins of a 2-feature classifier.

    A 30 x 30 grid spanning the axes' current x/y limits is passed to
    ``model.decision_function`` and the contour levels -1, 0 and 1 are drawn
    (dashed, solid, dashed).

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``decision_function``. It receives an array
        with exactly two columns, so it must have been fitted on two features.
    ax : matplotlib Axes, optional
        Axes to read the limits from and draw on. Defaults to the current
        pyplot axes.
    """
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # create grid to evaluate model
    grid_x = np.linspace(xlim[0], xlim[1], 30)
    grid_y = np.linspace(ylim[0], ylim[1], 30)
    # Local names deliberately avoid X / y, which are notebook-level data arrays.
    mesh_y, mesh_x = np.meshgrid(grid_y, grid_x)
    grid_points = np.vstack([mesh_x.ravel(), mesh_y.ravel()]).T
    scores = model.decision_function(grid_points).reshape(mesh_x.shape)

    # plot decision boundary and margins
    ax.contour(mesh_x, mesh_y, scores, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])

# Redraw the SVM plot now that the helper exists. This uses the fitted SVC `clf`:
# `model` is not defined at this point of the notebook, and the name is later bound
# to the Keras network, which has neither decision_function nor support_vectors_.
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf)

# plot support vectors
plt.scatter(clf.support_vectors_[:, 0],
            clf.support_vectors_[:, 1],
            s=10, linewidth=10, facecolors='k')

In [None]:
# RBF-kernel SVM for the 2-D decision-boundary plot in the next cell. That plot
# evaluates the model on a 2-column grid, so the model is fitted on the two columns
# that are plotted; a fit on every column of X would make the plot raise a
# feature-count error.
clf = SVC(kernel='rbf')
clf.fit(X[:, :2], y)

In [None]:
# Scatter the first two feature columns coloured by the target, then overlay the
# classifier's decision boundary and margins.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf)

# Mark the support vectors and report how many there are.
support_vectors = clf.support_vectors_
plt.scatter(support_vectors[:, 0],
            support_vectors[:, 1],
            s=10, linewidth=10, facecolors='k')
print("#VS : " + str(len(support_vectors)))

In [None]:
# TALVEZ USAR A QUANTIA DE PONTOS GERADAS PELO ARREMESSO EM QUESTÃO AO INVÉS DE USAR SE A BOLA CAIU OU NÃO

## Rede Neural

### Criando a rede neural

In [None]:
from keras.callbacks import EarlyStopping

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=2, test_size=0.2)

# Network architecture: one hidden layer of 5 ReLU units (L2-regularised) and a
# sigmoid output unit for the binary target. Per the original author's note, the
# size was chosen from VC-dimension considerations and the "golden rule".
model = Sequential()

# Take the input width from the data instead of hard-coding 12, so the model keeps
# working if a feature column is added or removed upstream.
n_features = X_train.shape[1]
model.add(Dense(5, input_dim=n_features, kernel_initializer='normal', kernel_regularizer=regularizers.l2(0.01), activation='relu'))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

# Early stopping on the validation loss.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

# SGD optimizer instance. It is not passed to compile() below, which uses Adam.
sgd = optimizers.SGD(learning_rate=0.01)

# Compile the model with the Adam optimizer and its default parameters.
model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

### Treinando a rede neural

In [None]:
# 167700 examples in 'X_train'; upper bound on the number of neurons according to the golden rule:
# n <= ( (167700 - 10) / (10*(12+2) ) )  --> n <= 1197

In [None]:
BATCH_SIZE = 32

# Carve a validation set (20%) out of the training data.
# NOTE: this rebinds X_train / y_train, so re-running the cell shrinks them again.
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train, random_state=2, test_size=0.2)

# Training WITHOUT early stopping
#history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=BATCH_SIZE)

# Training WITH early stopping.
# Validation uses the validation split, not the test set: monitoring val_loss on
# X_test would leak the test data into training decisions and bias the test accuracy.
# NOTE(review): `es` is created with patience=20 while epochs=10, so early stopping
# cannot trigger with these settings — confirm which value is intended.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=BATCH_SIZE, callbacks=[es])


### Performance do modelo

In [None]:
# Loss per epoch: training curve first, then the validation curve.
for curve_name in ('loss', 'val_loss'):
    plt.plot(history.history[curve_name])
plt.title('Metrica de erro')
plt.ylabel('Erro')
plt.xlabel('Epoca')
plt.legend(['Treinamento', 'Validacao'])
plt.show()

# Round the sigmoid outputs to hard 0/1 predictions.
test_probabilities = model.predict(X_test)
pred = np.round(test_probabilities)
train_probabilities = model.predict(X_train)
pred_train = np.round(train_probabilities)

train_accuracy = accuracy_score(y_train, pred_train)
test_accuracy = accuracy_score(y_test, pred)
print(f'--> Acuracia (train): {train_accuracy:.4f}')
print(f'--> Acuracia (test): {test_accuracy:.4f}')