In [1]:
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import pandas as pd



## Importação dos datasets

In [2]:
fga = pd.read_csv('NBA_22_23_FGA.csv', sep=';')
fga.head()

Unnamed: 0,PLAYER,PLAY TYPE,MADE,SHOT TYPE,BOXSCORE,VTM,HTM,Game Date,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),TEAM
0,Marcus Smart,Driving Floating Bank Jump Shot,✔ Made Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,11:15,13,Boston Celtics
1,Jayson Tatum,Jump Shot,✔ Made Shot,3PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,10:46,23,Boston Celtics
2,Derrick White,Running Layup Shot,✖ Missed Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,10:04,1,Boston Celtics
3,Al Horford,Cutting Layup Shot,✖ Missed Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,09:53,2,Boston Celtics
4,Jayson Tatum,Running Layup Shot,✔ Made Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,09:42,0,Boston Celtics


In [3]:
## Tabela com o nome, código e estatísticas de todos os time da NBA
cod_teams = pd.read_csv('cod_teams.csv', sep=';')
cod_teams.head()

Unnamed: 0,name,cod,W21_22,L21_22,blocks
0,Atlanta Hawks,ATL,43,39,348
1,Boston Celtics,BOS,51,31,478
2,Brooklyn Nets,BKN,44,38,448
3,Charlotte Hornets,CHA,43,39,402
4,Chicago Bulls,CHI,46,36,336


In [4]:
## Tabela com o nome, altura, peso e posição do draft de todos os jogadores da temporada 2022-2023 da NBA
info_players = pd.read_csv('players_info.csv', sep=',')
info_players = info_players.rename(columns={"Player": "PLAYER"})
info_players.head(10)

Unnamed: 0,PLAYER,Age,Height,Weight,DRAFT NUMBER
0,A.J. Lawson,22,6-6,179,Undrafted
1,AJ Green,23,6-5,190,Undrafted
2,AJ Griffin,19,6-6,220,16
3,Aaron Gordon,27,6-8,235,4
4,Aaron Holiday,26,6-0,185,23
5,Aaron Nesmith,23,6-5,215,14
6,Aaron Wiggins,24,6-5,190,55
7,Admiral Schofield,26,6-5,241,42
8,Al Horford,37,6-9,240,3
9,Alec Burks,31,6-6,214,12


## Tratamento dos datasets

### cod_teams

In [5]:
cod_teams['win_rate'] = cod_teams["W21_22"] / (cod_teams["W21_22"] + cod_teams["L21_22"])
cod_teams.drop(columns=['W21_22', 'L21_22'], inplace=True)
cod_teams.head()

Unnamed: 0,name,cod,blocks,win_rate
0,Atlanta Hawks,ATL,348,0.52439
1,Boston Celtics,BOS,478,0.621951
2,Brooklyn Nets,BKN,448,0.536585
3,Charlotte Hornets,CHA,402,0.52439
4,Chicago Bulls,CHI,336,0.560976


### info_players

In [6]:
## preencher valores nulos de DRAFT NUMBER com 100
info_players['DRAFT NUMBER'] = info_players['DRAFT NUMBER'].fillna(100)

## substituir "Undrafted" por 100
info_players['DRAFT NUMBER'] = info_players['DRAFT NUMBER'].replace('Undrafted', 100)

## converter DRAFT NUMBER para int
info_players['DRAFT NUMBER'] = info_players['DRAFT NUMBER'].astype(int)

info_players.head()

Unnamed: 0,PLAYER,Age,Height,Weight,DRAFT NUMBER
0,A.J. Lawson,22,6-6,179,100
1,AJ Green,23,6-5,190,100
2,AJ Griffin,19,6-6,220,16
3,Aaron Gordon,27,6-8,235,4
4,Aaron Holiday,26,6-0,185,23


In [7]:
## Transformar a coluna "Height" em centímetros, assumindo que 1 ft = 30.48 cm:
## Transformar a coluna "Weight" em quilos, assumindo que 1 kg = 2.20 libras

heights = []
weights = []

for i in range(len(info_players)):
    height = float(info_players['Height'][i].replace('-', '.')) * 30.48
    heights.append(height)
    weight = float(info_players['Weight'][i]) / 2.20
    weights.append(weight)

info_players['Height'] = heights
info_players['Weight'] = weights

info_players.head()

Unnamed: 0,PLAYER,Age,Height,Weight,DRAFT NUMBER
0,A.J. Lawson,22,201.168,81.363636,100
1,AJ Green,23,198.12,86.363636,100
2,AJ Griffin,19,201.168,100.0,16
3,Aaron Gordon,27,207.264,106.818182,4
4,Aaron Holiday,26,182.88,84.090909,23


### fga

In [8]:
fga.head()

Unnamed: 0,PLAYER,PLAY TYPE,MADE,SHOT TYPE,BOXSCORE,VTM,HTM,Game Date,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),TEAM
0,Marcus Smart,Driving Floating Bank Jump Shot,✔ Made Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,11:15,13,Boston Celtics
1,Jayson Tatum,Jump Shot,✔ Made Shot,3PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,10:46,23,Boston Celtics
2,Derrick White,Running Layup Shot,✖ Missed Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,10:04,1,Boston Celtics
3,Al Horford,Cutting Layup Shot,✖ Missed Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,09:53,2,Boston Celtics
4,Jayson Tatum,Running Layup Shot,✔ Made Shot,2PT Field Goal,PHI @ BOS,PHI,BOS,"Tuesday, October 18",1,09:42,0,Boston Celtics


In [9]:
## 1) Substituir "✔ Made Shot" por 1 e "✘ Missed Shot" por 0

fga['MADE'] = fga['MADE'].replace('✔ Made Shot', 1)
fga['MADE'] = fga['MADE'].replace('✖ Missed Shot', 0)

In [10]:
## 2) Substituir "2PT Field Goal" por 2 e "3PT Field Goal" por 3

fga['SHOT TYPE'] = fga['SHOT TYPE'].replace('2PT Field Goal', 2)
fga['SHOT TYPE'] = fga['SHOT TYPE'].replace('3PT Field Goal', 3)

In [11]:
## 3) Retirar colunas descenessárias para o modelo

fga.drop('BOXSCORE', axis='columns', inplace=True)
fga.drop('Game\xa0Date', axis='columns', inplace=True)
fga.head()

Unnamed: 0,PLAYER,PLAY TYPE,MADE,SHOT TYPE,VTM,HTM,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),TEAM
0,Marcus Smart,Driving Floating Bank Jump Shot,1,2,PHI,BOS,1,11:15,13,Boston Celtics
1,Jayson Tatum,Jump Shot,1,3,PHI,BOS,1,10:46,23,Boston Celtics
2,Derrick White,Running Layup Shot,0,2,PHI,BOS,1,10:04,1,Boston Celtics
3,Al Horford,Cutting Layup Shot,0,2,PHI,BOS,1,09:53,2,Boston Celtics
4,Jayson Tatum,Running Layup Shot,1,2,PHI,BOS,1,09:42,0,Boston Celtics


In [12]:
## 4) Criar uma coluna para identificar se o time que arremessou é mandante ou visitante da partida
## 5) Criar uma coluna para identificar a taxa de vitórias da temporada anterior
## 6) Transformar a coluna "TIME REMAINING" em segundos

is_home = []
owr = []
blocks = []
dwr = []
time_remaining = []
for i in range(len(fga)):
    # 4)
    if (cod_teams[cod_teams['name'] == fga['TEAM'][i]]["cod"].values[0]) == fga['HTM'][i]:
        is_home.append(1)
    else:
        is_home.append(0)
    
    # 5)
    owr.append(cod_teams[cod_teams['name'] == fga["TEAM"][i]]['win_rate'].values[0])

    # 6)
    time = fga['TIME REMAINING'][i].split(':')
    time_remaining.append(int(time[0])*60 + int(time[1]))


fga['is_home'] = is_home
fga['off_win_rate'] = owr
fga['TIME REMAINING'] = time_remaining

In [13]:
## 7) Criar uma coluna com o número de bloqueios da temporada anterior do time defensor
## 8) Criar uma coluna com o índice de vitórias do time defensor na temporada anterior

dwr = []
blocks = []
for i in range(len(fga)):
    if fga["is_home"][i] == 1:
        blocks.append(cod_teams[cod_teams['cod'] == fga["VTM"][i]]['blocks'].values[0])
        dwr.append(cod_teams[cod_teams['cod'] == fga["VTM"][i]]['win_rate'].values[0])
    else:
        blocks.append(cod_teams[cod_teams['cod'] == fga["HTM"][i]]['blocks'].values[0])
        dwr.append(cod_teams[cod_teams['cod'] == fga["HTM"][i]]['win_rate'].values[0])

fga['def_blocks'] = blocks
fga['def_win_rate'] = dwr

In [14]:
fga = fga.merge(info_players, on='PLAYER', how='inner')
fga.head()

Unnamed: 0,PLAYER,PLAY TYPE,MADE,SHOT TYPE,VTM,HTM,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),TEAM,is_home,off_win_rate,def_blocks,def_win_rate,Age,Height,Weight,DRAFT NUMBER
0,Marcus Smart,Driving Floating Bank Jump Shot,1,2,PHI,BOS,1,675,13,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
1,Marcus Smart,Jump Shot,0,3,PHI,BOS,1,402,22,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
2,Marcus Smart,Driving Floating Jump Shot,0,2,PHI,BOS,1,205,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
3,Marcus Smart,Pullup Jump shot,1,2,PHI,BOS,2,453,9,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
4,Marcus Smart,Hook Shot,0,2,PHI,BOS,2,393,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6


In [15]:
## Removendo colunas que não serão utilizadas no modelo
fga.drop('PLAY TYPE', axis='columns', inplace=True)
fga.drop('HTM', axis='columns', inplace=True)
fga.drop('VTM', axis='columns', inplace=True)

fga.head()

Unnamed: 0,PLAYER,MADE,SHOT TYPE,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),TEAM,is_home,off_win_rate,def_blocks,def_win_rate,Age,Height,Weight,DRAFT NUMBER
0,Marcus Smart,1,2,1,675,13,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
1,Marcus Smart,0,3,1,402,22,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
2,Marcus Smart,0,2,1,205,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
3,Marcus Smart,1,2,2,453,9,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
4,Marcus Smart,0,2,2,393,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6


## Análise exploratória

In [16]:
import plotly.express as px

## Quais são os times que mais arriscam arremessos?

fig = px.bar(fga.value_counts('TEAM').head(5), x=fga.value_counts('TEAM').head(5).index, y=fga.value_counts('TEAM').head(5).values, title='Times que mais arriscam arremessos')
fig.update_layout(xaxis_title="Times", yaxis_title="Número de arremessos")
fig.update_yaxes(range=[7000, 8000])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()


In [17]:
## E os que mais possuem arremessos convertidos?

fig = px.bar(fga[fga["MADE"] == 1].value_counts('TEAM').head(5), x=fga[fga["MADE"] == 1].value_counts('TEAM').head(5).index, y=fga[fga["MADE"] == 1].value_counts('TEAM').head(5).values, title='Times que mais possuem arremessos convertidos')
fig.update_layout(xaxis_title="Times", yaxis_title="Número de arremessos convertidos")
fig.update_yaxes(range=[3000, 4000])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()

In [105]:
## E os que menos possuem arremessos convertidos?

fig = px.bar(fga[fga["MADE"] == 1].value_counts('TEAM').tail(5), x=fga[fga["MADE"] == 1].value_counts('TEAM').tail(5).index, y=fga[fga["MADE"] == 1].value_counts('TEAM').tail(5).values, title='Times que mais possuem arremessos convertidos')
fig.update_layout(xaxis_title="Times", yaxis_title="Número de arremessos convertidos")
fig.update_yaxes(range=[3000, 3500])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()

In [114]:
## Quais são os times com maior taxa de acerto nos arremessos?

fig = px.bar((fga[fga["MADE"] == 1].value_counts('TEAM') / fga.value_counts('TEAM')).sort_values(ascending=False).head(5),
             y=(fga[fga["MADE"] == 1].value_counts('TEAM') / fga.value_counts('TEAM')).sort_values(ascending=False).head(5).values,
             title='Times com maior taxa de arremessos convertidos')
fig.update_layout(xaxis_title="Times",
                  yaxis_title="Taxa de arremessos convertidos")
fig.update_yaxes(range=[0.45, 0.55])
fig.update_traces(texttemplate='%{value:.2%}', textposition='outside')


fig.show()

In [32]:
## Quais são os times com maior taxa de erro nos arremessos?

fig = px.bar((fga[fga["MADE"] == 0].value_counts('TEAM') / fga.value_counts('TEAM')).sort_values(ascending=False).head(5), y=(fga[fga["MADE"] == 0].value_counts('TEAM') / fga.value_counts('TEAM')).sort_values(ascending=False).head(5).values, title='Times com maior taxa de arremessos não convertidos')
fig.update_layout(xaxis_title="Times", yaxis_title="Taxa de arremessos não convertidos")
fig.update_yaxes(range=[0.5, 0.55])
fig.update_traces(texttemplate='%{value:.2%}', textposition='outside')
fig.show()

In [55]:
## Percentual de arremessos convertidos e não convertidos

labels = ['Arremessos convertidos', 'Arremessos não convertidos']
values = [fga['MADE'].value_counts()[1], fga['MADE'].value_counts()[0]]

fig = px.pie(fga, values=values, names=labels, title='Percentual de arremessos convertidos e não convertidos')
fig.update_traces(textposition='inside', textinfo='percent')
fig.update_traces(marker=dict(colors=['blue', 'red']))
fig.update_layout(width=600, height=400)

fig.show()

In [46]:
## Em qual período do jogo ocorrem mais arremessos?

fig = px.bar(fga.value_counts('PERIOD').sort_values(ascending=False),
             x=fga.value_counts('PERIOD').sort_values(ascending=False).index,
             y=fga.value_counts('PERIOD').sort_values(ascending=False).values,
             title='Arremessos por período do jogo')
fig.update_layout(xaxis_title="Período", yaxis_title="Número de arremessos")
fig.update_yaxes(range=[0,60000])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()

In [91]:
## Número de arremessos certos vs errados por período

fig = go.Figure(data=[
    go.Bar(name='Arremessos certos', x=fga[fga["MADE"]==1].value_counts('PERIOD').sort_values(ascending=False).index,
           y=fga[fga["MADE"]==1].value_counts('PERIOD').sort_values(ascending=False).values),
    go.Bar(name='Arremessos errados', x=fga[fga["MADE"]==0].value_counts('PERIOD').sort_values(ascending=False).index,
           y=fga[fga["MADE"]==0].value_counts('PERIOD').sort_values(ascending=False).values)
])

fig.update_layout(title='Arremessos certos e errados por período do jogo')

fig.update_layout(barmode='group', xaxis_title="Período", yaxis_title="Número de arremessos")
fig.update_yaxes(range=[0,32000], showticklabels=False)
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.show()

In [50]:
## Qual é a taxa de acerto nos arremessos por período?
fig = px.bar(fga[fga["MADE"]==1].value_counts('PERIOD').sort_values(ascending=False) / fga.value_counts('PERIOD').sort_values(ascending=False),
             x=(fga[fga["MADE"]==1].value_counts('PERIOD').sort_values(ascending=False) / fga.value_counts('PERIOD').sort_values(ascending=False)).index,
             y=(fga[fga["MADE"]==1].value_counts('PERIOD').sort_values(ascending=False) / fga.value_counts('PERIOD').sort_values(ascending=False)).values,
             title='Taxa de conversão dos arremessos por período')
fig.update_layout(xaxis_title="Período", yaxis_title="Taxa de conversão dos arremessos")
fig.update_yaxes(range=[0.3,0.5])
fig.update_traces(texttemplate='%{value:.2%}', textposition='outside')
fig.show()

In [115]:
## Tentativas de arremesso por idade do jogador

fig = px.bar(fga.value_counts('Age').sort_values(ascending=False),
                x=fga.value_counts('Age').sort_values(ascending=False).index,
                y=fga.value_counts('Age').sort_values(ascending=False).values,
                title='Arremessos convertidos por idade do jogador')

fig.update_layout(xaxis_title="Idade", yaxis_title="Número de arremessos convertidos")
fig.update_yaxes(range=[0,25000], showticklabels=False)
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.update_xaxes(tick0=0, dtick=1)
fig.show()

In [117]:
## taxa de acerto por idade do jogador

fig = px.bar(fga[fga["MADE"]==1].value_counts('Age').sort_values(ascending=False) / fga.value_counts('Age').sort_values(ascending=False),
                x=(fga[fga["MADE"]==1].value_counts('Age').sort_values(ascending=False) / fga.value_counts('Age').sort_values(ascending=False)).index,
                y=(fga[fga["MADE"]==1].value_counts('Age').sort_values(ascending=False) / fga.value_counts('Age').sort_values(ascending=False)).values,
                title='Taxa de conversão dos arremessos por idade do jogador')

fig.update_layout(xaxis_title="Idade", yaxis_title="Taxa de conversão dos arremessos")
fig.update_yaxes(range=[0.3,0.6], showticklabels=False)
fig.update_traces(texttemplate='%{value:.2%}', textposition='outside')
fig.update_xaxes(tick0=0, dtick=1)
fig.show()


In [131]:
fig = px.line(fga[fga["MADE"]==1].value_counts('Height').sort_values(ascending=False)/fga.value_counts('Height').sort_values(ascending=False),
                x=(fga[fga["MADE"]==1].value_counts('Height').sort_values(ascending=False)/fga.value_counts('Height').sort_values(ascending=False)).index,
                y=(fga[fga["MADE"]==1].value_counts('Height').sort_values(ascending=False)/fga.value_counts('Height').sort_values(ascending=False)).values,
                title='Taxa de conversão dos arremessos por altura do jogador')

fig.update_layout(xaxis_title="Altura", yaxis_title="Taxa de conversão dos arremessos")
fig.update_yaxes(range=[0.2,0.7])

fig.show()

In [119]:
## gráfico de linhas correlacionando altura e taxa de acerto dos jogadores


ValueError: 
    Invalid value of type 'builtins.str' received for the 'textposition' property of scattergl
        Received value: 'outside'

    The 'textposition' property is an enumeration that may be specified as:
      - One of the following enumeration values:
            ['top left', 'top center', 'top right', 'middle left',
            'middle center', 'middle right', 'bottom left', 'bottom
            center', 'bottom right']
      - A tuple, list, or one-dimensional numpy array of the above

In [23]:
## Hora de retirar as colunas "TEAM" e "PLAYER", elas não têm mais uso daqui em diante
# fga.drop('TEAM', axis='columns', inplace=True)
# fga.drop('PLAYER', axis='columns', inplace=True)

## Dataset após a análise exploratória e tratamento dos dados

In [24]:
print(f"Shape: {fga.shape}")
fga.head()

Shape: (209626, 15)


Unnamed: 0,PLAYER,MADE,SHOT TYPE,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),TEAM,is_home,off_win_rate,def_blocks,def_win_rate,Age,Height,Weight,DRAFT NUMBER
0,Marcus Smart,1,2,1,675,13,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
1,Marcus Smart,0,3,1,402,22,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
2,Marcus Smart,0,2,1,205,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
3,Marcus Smart,1,2,2,453,9,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
4,Marcus Smart,0,2,2,393,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6


### Matriz de correlação

In [25]:
# plt.figure(figsize=(10,10))
# sns.heatmap(fga.corr(), annot=True, fmt='.1f')
# plt.show()

### Salva o dataset tratado num arquivo csv

In [26]:
# fga.to_csv('dados.csv', index=False)

In [27]:
fga.head()

Unnamed: 0,PLAYER,MADE,SHOT TYPE,PERIOD,TIME REMAINING,SHOT DISTANCE (FT),TEAM,is_home,off_win_rate,def_blocks,def_win_rate,Age,Height,Weight,DRAFT NUMBER
0,Marcus Smart,1,2,1,675,13,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
1,Marcus Smart,0,3,1,402,22,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
2,Marcus Smart,0,2,1,205,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
3,Marcus Smart,1,2,2,453,9,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
4,Marcus Smart,0,2,2,393,6,Boston Celtics,1,0.621951,435,0.621951,29,195.072,100.0,6
