# Guide to Advanced Statistics

## Dependencies

In [10]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from modules.charts.plotlyTemplates import PlotlyTemplates
from modules.datasets import BReferenceScraper

## Web Scraping Data

### Advanced Stats

In [11]:
advanced = BReferenceScraper.players_table("https://www.basketball-reference.com/leagues/NBA_2023_advanced.html")
advanced.head()

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Precious Achiuwa,C,23,TOR,26,548,15.1,0.537,0.284,0.338,...,0,0.3,0.6,0.9,0.077,0,-1.7,-1.3,-2.9,-0.1
1,Steven Adams,C,29,MEM,42,1133,17.6,0.564,0.004,0.49,...,0,1.3,2.1,3.4,0.144,0,-0.1,0.8,0.7,0.8
2,Bam Adebayo,C,25,MIA,45,1576,20.8,0.588,0.014,0.334,...,0,2.1,2.6,4.7,0.144,0,0.8,0.9,1.7,1.5
3,Ochai Agbaji,SG,22,UTA,28,355,9.0,0.59,0.552,0.161,...,0,0.4,0.1,0.5,0.074,0,-1.5,-1.3,-2.8,-0.1
4,Santi Aldama,PF,22,MEM,46,1009,14.7,0.602,0.519,0.28,...,0,1.6,1.5,3.2,0.15,0,0.2,1.3,1.5,0.9


In [12]:
# change type of columns
advanced = advanced.astype({
    'Age': 'int64',
    'G': 'int64',
    'MP': 'int64',
    'PER': 'float64',
    'TS%': 'float64',
    '3PAr': 'float64',
    'FTr': 'float64',
    'ORB%': 'float64',
    'DRB%': 'float64',
    'TRB%': 'float64',
    'AST%': 'float64',
    'STL%': 'float64',
    'BLK%': 'float64',
    'TOV%': 'float64',
    'USG%': 'float64',
    'OWS': 'float64',
    'DWS': 'float64',
    'WS': 'float64',
    'WS/48': 'float64',
    'OBPM': 'float64',
    'DBPM': 'float64',
    'BPM': 'float64',
    'VORP': 'float64'
})

In [14]:
# remove column by number
advanced.drop(advanced.columns[[18, -5]], axis=1, inplace=True)

In [15]:
advanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 508 entries, 0 to 531
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  508 non-null    object 
 1   Pos     508 non-null    object 
 2   Age     508 non-null    int64  
 3   Tm      508 non-null    object 
 4   G       508 non-null    int64  
 5   MP      508 non-null    int64  
 6   PER     508 non-null    float64
 7   TS%     508 non-null    float64
 8   3PAr    508 non-null    float64
 9   FTr     508 non-null    float64
 10  ORB%    508 non-null    float64
 11  DRB%    508 non-null    float64
 12  TRB%    508 non-null    float64
 13  AST%    508 non-null    float64
 14  STL%    508 non-null    float64
 15  BLK%    508 non-null    float64
 16  TOV%    508 non-null    float64
 17  USG%    508 non-null    float64
 18  OWS     508 non-null    float64
 19  DWS     508 non-null    float64
 20  WS      508 non-null    float64
 21  WS/48   508 non-null    float64
 22  OB

In [16]:
advanced.describe()

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
count,508.0,508.0,508.0,508.0,508.0,508.0,508.0,508.0,508.0,508.0,...,508.0,508.0,508.0,508.0,508.0,508.0,508.0,508.0,508.0,508.0
mean,25.714567,31.07874,707.923228,13.442323,0.562077,0.392951,0.247059,5.213189,15.038976,10.129134,...,12.532087,18.249213,0.775591,0.715748,1.49252,0.085663,-1.190157,-0.017913,-1.203543,0.354724
std,4.33499,14.321805,511.54123,7.178015,0.130687,0.221992,0.172008,4.421179,8.002947,5.254146,...,7.333499,6.15562,1.132466,0.614316,1.608441,0.108251,4.129972,1.888733,5.157785,0.769793
min,19.0,1.0,1.0,-26.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.9,-0.1,-0.5,-0.797,-36.4,-9.4,-44.7,-0.7
25%,22.0,21.0,239.25,9.675,0.52,0.25,0.13875,2.1,10.3,6.6,...,9.2,14.0,0.0,0.2,0.2,0.046,-3.0,-0.9,-3.1,-0.1
50%,25.0,35.0,639.5,13.0,0.5685,0.4065,0.2265,3.8,13.3,8.6,...,12.1,17.3,0.4,0.6,1.0,0.085,-1.2,-0.1,-1.2,0.1
75%,28.0,43.0,1153.0,16.4,0.614,0.53675,0.334,7.125,18.5,12.825,...,14.925,21.525,1.2,1.1,2.3,0.131,0.6,0.8,0.9,0.5
max,42.0,51.0,1859.0,75.0,1.5,1.0,2.0,29.3,100.0,53.2,...,100.0,43.6,7.3,2.8,9.5,1.172,20.4,15.0,25.3,5.4


In [17]:
advanced = advanced.groupby('Player', as_index=False).mean()
advanced = advanced[(advanced.G > 10) & (advanced.MP > 200)]

### Basic Stats

In [18]:
basic = BReferenceScraper.players_table("https://www.basketball-reference.com/leagues/NBA_2023_per_game.html")
basic.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,C,23,TOR,26,2,21.1,3.5,7.7,0.458,...,0.735,1.9,4.0,5.9,1.0,0.5,0.7,1.2,1.8,9.5
1,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,0.597,...,0.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,Bam Adebayo,C,25,MIA,45,45,35.0,8.6,16.0,0.54,...,0.8,2.8,7.3,10.1,3.1,1.1,0.8,2.7,2.9,21.5
3,Ochai Agbaji,SG,22,UTA,28,0,12.7,1.5,3.1,0.483,...,0.571,0.5,1.0,1.6,0.5,0.1,0.1,0.2,1.2,3.9
4,Santi Aldama,PF,22,MEM,46,16,21.9,3.3,6.8,0.481,...,0.705,1.1,3.6,4.7,1.2,0.7,0.7,0.7,2.0,9.2


In [19]:
basic = basic.astype({
    'Age': 'int64',
    'G': 'int64',
    'GS': 'int64',
    'MP': 'float64',
    'FG': 'float64',
    'FGA': 'float64',
    'FG%': 'float64',
    '3P': 'float64',
    '3PA': 'float64',
    '3P%': 'float64',
    '2P': 'float64',
    '2PA': 'float64',
    '2P%': 'float64',
    'eFG%': 'float64',
    'FT': 'float64',
    'FTA': 'float64',
    'FT%': 'float64',
    'ORB': 'float64',
    'DRB': 'float64',
    'TRB': 'float64',
    'AST': 'float64',
    'STL': 'float64',
    'BLK': 'float64',
    'TOV': 'float64',
    'PF': 'float64',
    'PTS': 'float64'
})

In [20]:
basic = basic.groupby('Player', as_index=False).mean()
basic = basic[basic.G > 10]

## Joining Basic and Advanced Data

In [23]:
all_stats = pd.merge(basic, advanced, on='Player', how='left')
all_stats.drop(columns=["Age_x", "G_x", "MP_x", "GS"], inplace=True)
all_stats.head()

Unnamed: 0,Player,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,A.J. Green,1.5,3.5,0.44,1.2,2.9,0.4,0.4,0.6,0.643,...,7.5,17.8,0.3,0.2,0.5,0.112,-0.4,-0.6,-1.0,0.1
1,AJ Griffin,3.9,7.9,0.489,1.6,4.1,0.4,2.2,3.8,0.585,...,7.6,17.7,1.1,0.8,1.8,0.093,0.4,0.1,0.5,0.6
2,Aaron Gordon,6.3,11.0,0.574,1.0,2.5,0.393,5.3,8.5,0.628,...,11.3,21.5,3.1,1.4,4.5,0.165,2.7,-0.5,2.2,1.4
3,Aaron Holiday,1.6,3.9,0.419,0.6,1.5,0.424,1.0,2.4,0.415,...,11.9,13.0,0.3,0.5,0.8,0.057,-3.0,0.9,-2.1,0.0
4,Aaron Nesmith,3.1,7.8,0.404,1.4,4.0,0.345,1.8,3.8,0.467,...,10.3,16.9,0.2,0.8,1.0,0.045,-3.0,-0.3,-3.3,-0.4


## PER
### Player Efficiency Rating

In [36]:
def get_and_plot_top_players(advanced, col, n=10, n_decimals=1, season="2022-23"):
    """ Retorna e plota os n melhores jogadores de acordo com a coluna col
    """
    advanced.sort_values(by=col, ascending=False, inplace=True)
    top10 = advanced[["Player", col]].head(n)[::-1]
    title = f"<b>Top-{n} players by {col}</b><br>{season}"
    
    fig = go.Figure(data=[
    go.Bar(y=top10['Player'], x=top10[col], orientation='h', 
           text=top10[col], textposition='outside',
           texttemplate='%{text:.'+str(n_decimals)+'f}',
           marker_color='#8433ad',
           ),
    ])
    fig.update_layout(
        title = title,
        template=PlotlyTemplates.custom_template(type_='bar'),
        width=700,
        height=500,
        )
    fig.add_layout_image(
        dict(
            source="../images/logo-nba.png",
            xref="paper", yref="paper",
            x=1, y=1.05,
            sizex=0.2, sizey=0.2,
            xanchor="right", yanchor="bottom",
        )
    )
    return fig

In [37]:
fig = get_and_plot_top_players(advanced, 'PER')
fig.update_xaxes(range=[0,36])
fig.show()

In [38]:
PER_alltime = pd.read_csv('../data/breference/raw/PER_alltime.csv', skipfooter=1, skip_blank_lines=True, engine='python')
PER_alltime.Player = PER_alltime.Player.str.replace("*", "")
PER_alltime.head(10)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



Unnamed: 0,Player,PER
0,Michael Jordan,27.91
1,LeBron James,27.34
2,Nikola Jokić,27.09
3,Anthony Davis,26.89
4,Shaquille O'Neal,26.43
5,David Robinson,26.18
6,Wilt Chamberlain,26.16
7,Bob Pettit,25.45
8,Kevin Durant,25.26
9,Neil Johnston,24.86


In [39]:
fig = get_and_plot_top_players(PER_alltime, 'PER', n=10, season="All-time")
fig.update_xaxes(range=[0,36])
fig.show()

## TS%
### True Shooting Percentage

In [40]:
fig = get_and_plot_top_players(advanced, 'TS%', n=10, n_decimals=3)
fig.update_xaxes(range=[0,1])
fig.show()

In [41]:
TS_alltime = pd.read_csv('../data/breference/raw/TS_alltime.csv', skipfooter=1, skip_blank_lines=True, engine='python')
TS_alltime.Player = TS_alltime.Player.str.replace("*", "")


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [42]:
fig = get_and_plot_top_players(TS_alltime, 'TS%', n=10, n_decimals=3, season="All-time")
fig.update_xaxes(range=[0,1])
fig.show()

## 3PAr
### 3-Point Attempt Rate

In [43]:
fig = get_and_plot_top_players(advanced, '3PAr', n=10, n_decimals=3)
fig.update_xaxes(range=[0,1])
fig.show()

## FTr
### Free Throw Rate

In [44]:
fig = get_and_plot_top_players(advanced, 'FTr', n=10, n_decimals=3)
fig.update_xaxes(range=[0,1])
fig.show()

## ORB%, DRB%, TRB%, AST%, STL%, BLK%, TOV%, USG%

In [45]:
cols = ['TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%']

In [46]:
corr_matrix = all_stats[cols].corr()
corr_matrix = corr_matrix.round(2)
fig = px.imshow(corr_matrix, x=cols, y=cols, text_auto=True)
fig.update_layout(width=500, height=500,
                  plot_bgcolor="#ffffb2",
                  paper_bgcolor="#ffffb2",
                  font_color="#000000",
                  font={
                        "color": "black",
                        "size": 16
                    },
                    title={
                        "text": "<b>% Stats Correlation Map</b>",
                        "font": {
                            "size": 22,
                            "color": "black",
                        },
                    })
fig.show()

In [47]:
fig = get_and_plot_top_players(advanced, 'USG%', n=10, n_decimals=1)
fig.update_xaxes(range=[0,45])
fig.show()