# Guide to Advanced Statistics

## Dependencies

In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from modules.charts.plotlyTemplates import PlotlyTemplates
from modules.datasets import BReferenceScraper

## Web Scraping Data

### Advanced Stats

In [2]:
# Build the players table from the Basketball Reference 2023 advanced-stats
# page and preview its first rows.
advanced_url = "https://www.basketball-reference.com/leagues/NBA_2023_advanced.html"
advanced = BReferenceScraper.players_table(advanced_url)
advanced.head()

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,Unnamed: 12,OWS,DWS,WS,WS/48,Unnamed: 17,OBPM,DBPM,BPM,VORP
0,Precious Achiuwa,C,23,TOR,34,793,15.7,0.552,0.261,0.339,...,0,0.7,0.9,1.6,0.099,0,-1.3,-0.9,-2.2,0.0
1,Steven Adams,C,29,MEM,42,1133,17.6,0.564,0.004,0.49,...,0,1.3,2.1,3.4,0.145,0,-0.2,0.9,0.6,0.8
2,Bam Adebayo,C,25,MIA,53,1871,21.2,0.596,0.013,0.356,...,0,3.0,3.1,6.1,0.157,0,1.2,1.1,2.3,2.0
3,Ochai Agbaji,SG,22,UTA,36,518,8.4,0.608,0.593,0.136,...,0,0.6,0.1,0.7,0.066,0,-1.7,-1.5,-3.2,-0.2
4,Santi Aldama,PF,22,MEM,53,1172,14.7,0.604,0.534,0.276,...,0,1.7,1.8,3.6,0.146,0,0.2,1.2,1.4,1.0


In [3]:
# Give every stat column an explicit numeric dtype: the three columns listed
# first become int64, all remaining stat columns become float64.
advanced_int_cols = ['Age', 'G', 'MP']
advanced_float_cols = [
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]
advanced = advanced.astype({
    **dict.fromkeys(advanced_int_cols, 'int64'),
    **dict.fromkeys(advanced_float_cols, 'float64'),
})

In [4]:
# Remove the two blank spacer columns of the scraped table (they show up as
# "Unnamed: ..." in the preview of `advanced`). They are selected by name
# rather than by position so the cell is safe to re-run: a second positional
# drop would silently delete real stat columns instead.
spacer_cols = [c for c in advanced.columns if str(c).startswith('Unnamed')]
advanced = advanced.drop(columns=spacer_cols)

In [5]:
# Sanity check after the casts and column drop: column list, non-null counts and dtypes.
advanced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 571 entries, 0 to 595
Data columns (total 26 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Player  571 non-null    object 
 1   Pos     571 non-null    object 
 2   Age     571 non-null    int64  
 3   Tm      571 non-null    object 
 4   G       571 non-null    int64  
 5   MP      571 non-null    int64  
 6   PER     571 non-null    float64
 7   TS%     571 non-null    float64
 8   3PAr    571 non-null    float64
 9   FTr     571 non-null    float64
 10  ORB%    571 non-null    float64
 11  DRB%    571 non-null    float64
 12  TRB%    571 non-null    float64
 13  AST%    571 non-null    float64
 14  STL%    571 non-null    float64
 15  BLK%    571 non-null    float64
 16  TOV%    571 non-null    float64
 17  USG%    571 non-null    float64
 18  OWS     571 non-null    float64
 19  DWS     571 non-null    float64
 20  WS      571 non-null    float64
 21  WS/48   571 non-null    float64
 22  OB

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
advanced.describe()

Unnamed: 0,Age,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
count,571.0,571.0,571.0,571.0,571.0,571.0,571.0,571.0,571.0,571.0,...,571.0,571.0,571.0,571.0,571.0,571.0,571.0,571.0,571.0,571.0
mean,25.851138,34.297723,786.563923,13.29965,0.560665,0.398037,0.243282,5.07373,14.923292,10.008581,...,12.424694,18.151138,0.871804,0.791944,1.658669,0.08342,-1.279335,-0.04711,-1.326095,0.392644
std,4.25345,17.880764,604.426747,7.477853,0.138906,0.224496,0.204647,4.363008,8.073178,5.292893,...,7.584842,6.119151,1.291291,0.712375,1.845965,0.113338,4.536529,1.98345,5.57574,0.863905
min,19.0,1.0,1.0,-26.7,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.2,-0.1,-0.5,-0.813,-36.2,-10.0,-44.6,-1.0
25%,23.0,21.0,224.5,9.9,0.5235,0.2505,0.134,1.9,10.3,6.6,...,9.0,14.05,0.0,0.2,0.2,0.044,-2.9,-0.95,-3.2,-0.1
50%,25.0,39.0,680.0,13.0,0.564,0.418,0.221,3.7,13.3,8.7,...,11.9,17.3,0.4,0.7,1.1,0.085,-1.1,-0.1,-1.2,0.1
75%,29.0,49.0,1299.0,16.3,0.6125,0.549,0.3285,7.0,18.15,12.5,...,15.1,21.4,1.3,1.2,2.55,0.127,0.5,0.8,0.7,0.6
max,42.0,60.0,2174.0,74.7,1.293,1.0,3.0,29.2,100.0,54.0,...,100.0,43.7,8.6,3.6,11.2,1.176,35.5,15.0,40.4,6.5


In [7]:
# Collapse the table to one row per player by averaging the numeric columns.
# numeric_only=True is required because text columns (Pos, Tm) cannot be
# averaged: pandas >= 2.0 raises a TypeError without it, and pandas 1.5
# emits a FutureWarning while dropping them silently.
# NOTE(review): presumably a player appears on several rows when he changed
# teams mid-season; confirm that a plain mean is the intended way to combine
# those rows (it also averages G and MP).
advanced = advanced.groupby('Player', as_index=False).mean(numeric_only=True)

# Keep only players with more than 10 games and more than 200 minutes.
advanced = advanced[(advanced.G > 10) & (advanced.MP > 200)]

### Basic Stats

In [8]:
# Build the players table from the Basketball Reference 2023 per-game page
# and preview its first rows.
basic_url = "https://www.basketball-reference.com/leagues/NBA_2023_per_game.html"
basic = BReferenceScraper.players_table(basic_url)
basic.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,C,23,TOR,34,10,23.3,4.0,8.2,0.486,...,0.684,2.1,4.4,6.5,1.1,0.7,0.7,1.1,2.2,10.4
1,Steven Adams,C,29,MEM,42,42,27.0,3.7,6.3,0.597,...,0.364,5.1,6.5,11.5,2.3,0.9,1.1,1.9,2.3,8.6
2,Bam Adebayo,C,25,MIA,53,53,35.3,8.5,15.6,0.544,...,0.81,2.7,7.2,9.9,3.2,1.2,0.8,2.5,2.8,21.5
3,Ochai Agbaji,SG,22,UTA,36,1,14.4,1.6,3.3,0.483,...,0.625,0.6,1.1,1.7,0.5,0.1,0.1,0.3,1.4,4.2
4,Santi Aldama,PF,22,MEM,53,18,22.1,3.4,7.0,0.477,...,0.738,1.0,3.7,4.7,1.2,0.7,0.7,0.8,1.9,9.5


In [9]:
# Give every stat column an explicit numeric dtype: the three columns listed
# first become int64, all remaining per-game columns become float64.
basic_int_cols = ['Age', 'G', 'GS']
basic_float_cols = [
    'MP',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]
basic = basic.astype({
    **dict.fromkeys(basic_int_cols, 'int64'),
    **dict.fromkeys(basic_float_cols, 'float64'),
})

In [10]:
# Collapse the table to one row per player by averaging the numeric columns.
# numeric_only=True is required because text columns (Pos, Tm) cannot be
# averaged: pandas >= 2.0 raises a TypeError without it, and pandas 1.5
# emits a FutureWarning while dropping them silently.
# NOTE(review): presumably a player appears on several rows when he changed
# teams mid-season; confirm that a plain mean is the intended way to combine
# those rows.
basic = basic.groupby('Player', as_index=False).mean(numeric_only=True)

# Keep only players with more than 10 games.
basic = basic[basic.G > 10]

## Joining Basic and Advanced Data

In [11]:
# Left-join the advanced stats onto the per-game stats by player name, so
# every row of `basic` is kept. Columns present in both tables get the
# pandas default suffixes (_x for `basic`, _y for `advanced`); the `basic`
# copies of Age/G/MP and the GS column are then discarded.
redundant_cols = ["Age_x", "G_x", "MP_x", "GS"]
all_stats = basic.merge(advanced, on='Player', how='left').drop(columns=redundant_cols)
all_stats.head()

Unnamed: 0,Player,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,A.J. Green,1.5,3.4,0.44,1.1,2.8,0.4,0.4,0.6,0.643,...,7.5,17.6,0.3,0.2,0.5,0.116,-0.5,-0.5,-1.0,0.1
1,AJ Griffin,3.6,7.7,0.476,1.5,3.8,0.398,2.1,3.9,0.553,...,7.2,17.6,1.0,0.8,1.9,0.081,-0.3,-0.1,-0.4,0.5
2,Aaron Gordon,6.6,11.2,0.587,1.0,2.6,0.397,5.6,8.7,0.644,...,10.7,21.5,3.8,1.6,5.5,0.176,3.2,-0.3,2.9,1.8
3,Aaron Holiday,1.6,3.8,0.424,0.6,1.5,0.419,1.0,2.2,0.427,...,12.9,13.3,0.2,0.6,0.8,0.054,-3.0,0.8,-2.2,0.0
4,Aaron Nesmith,3.3,8.0,0.413,1.5,4.2,0.353,1.8,3.8,0.48,...,9.5,16.6,0.4,0.9,1.4,0.052,-2.7,-0.4,-3.1,-0.4


## PER
### Player Efficiency Rating

In [12]:
def get_and_plot_top_players(advanced, col, n=10, n_decimals=1, season="2022-23"):
    """Build a horizontal bar chart of the ``n`` players with the highest ``col``.

    The figure is returned, not displayed; call ``.show()`` on the result.
    The input frame is left untouched (ranking is done on a sorted copy).

    Parameters
    ----------
    advanced : pandas.DataFrame
        Table containing a ``'Player'`` column and the column named ``col``.
    col : str
        Name of the column to rank by (descending). Also used in the title.
    n : int, default 10
        Number of players to show.
    n_decimals : int, default 1
        Number of decimals used for the value labels next to the bars.
    season : str, default "2022-23"
        Subtitle shown under the chart title.

    Returns
    -------
    plotly.graph_objects.Figure
        Bar chart with the best player on the top bar.
    """
    # Sort a copy: sorting in place would reorder the caller's DataFrame as a
    # hidden side effect of a plotting helper.
    ranked = advanced.sort_values(by=col, ascending=False)
    # Reverse the top-n slice because horizontal bars are drawn bottom-up,
    # which puts the highest value on top.
    top_players = ranked[["Player", col]].head(n)[::-1]
    title = f"<b>Top-{n} players by {col}</b><br>{season}"

    fig = go.Figure(data=[
        go.Bar(
            y=top_players['Player'],
            x=top_players[col],
            orientation='h',
            text=top_players[col],
            textposition='outside',
            # d3-format spec, e.g. '%{text:.1f}' for one decimal.
            texttemplate=f'%{{text:.{n_decimals}f}}',
            marker_color='#8433ad',
        ),
    ])
    fig.update_layout(
        title=title,
        template=PlotlyTemplates.custom_template(type_='bar'),
        width=700,
        height=500,
    )
    # Logo anchored by its bottom-right corner just above the top-right of
    # the plotting area (paper coordinates).
    fig.add_layout_image(
        dict(
            source="../images/logo-nba.png",
            xref="paper", yref="paper",
            x=1, y=1.05,
            sizex=0.2, sizey=0.2,
            xanchor="right", yanchor="bottom",
        )
    )
    return fig

In [13]:
# Top-10 players by PER (default n and season); x-axis pinned to 0-36.
fig = get_and_plot_top_players(advanced, 'PER').update_xaxes(range=[0, 36])
fig.show()

In [14]:
# Load the all-time PER leaderboard. skipfooter=1 discards the last line of
# the file; that option is only supported by the python parser engine.
PER_alltime = pd.read_csv(
    '../data/breference/raw/PER_alltime.csv',
    skipfooter=1,
    skip_blank_lines=True,
    engine='python',
)
# Strip the literal "*" marker from player names. regex=False treats the
# pattern as a plain string: a lone "*" is not a valid regular expression,
# and relying on the default raises a FutureWarning on pandas 1.x.
PER_alltime['Player'] = PER_alltime['Player'].str.replace("*", "", regex=False)
PER_alltime.head(10)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



Unnamed: 0,Player,PER
0,Michael Jordan,27.91
1,LeBron James,27.34
2,Nikola Jokić,27.09
3,Anthony Davis,26.89
4,Shaquille O'Neal,26.43
5,David Robinson,26.18
6,Wilt Chamberlain,26.16
7,Bob Pettit,25.45
8,Kevin Durant,25.26
9,Neil Johnston,24.86


In [15]:
# Top-10 players by all-time PER, on the same 0-36 x-axis as the season chart.
fig = get_and_plot_top_players(PER_alltime, 'PER', season="All-time").update_xaxes(range=[0, 36])
fig.show()

## TS%
### True Shooting Percentage

In [16]:
# Top-10 players by TS% with three-decimal labels; x-axis pinned to 0-1.
fig = get_and_plot_top_players(advanced, 'TS%', n_decimals=3).update_xaxes(range=[0, 1])
fig.show()

In [17]:
# Load the all-time TS% leaderboard. skipfooter=1 discards the last line of
# the file; that option is only supported by the python parser engine.
TS_alltime = pd.read_csv(
    '../data/breference/raw/TS_alltime.csv',
    skipfooter=1,
    skip_blank_lines=True,
    engine='python',
)
# Strip the literal "*" marker from player names. regex=False treats the
# pattern as a plain string: a lone "*" is not a valid regular expression,
# and relying on the default raises a FutureWarning on pandas 1.x.
TS_alltime['Player'] = TS_alltime['Player'].str.replace("*", "", regex=False)


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



In [18]:
# Top-10 players by all-time TS%, on the same 0-1 x-axis as the season chart.
fig = get_and_plot_top_players(
    TS_alltime, 'TS%', n_decimals=3, season="All-time"
).update_xaxes(range=[0, 1])
fig.show()

## 3PAr
### 3-Point Attempt Rate

In [19]:
# Top-10 players by 3PAr with three-decimal labels; x-axis pinned to 0-1.
fig = get_and_plot_top_players(advanced, '3PAr', n_decimals=3).update_xaxes(range=[0, 1])
fig.show()

## FTr
### Free Throw Rate

In [20]:
# Top-10 players by FTr with three-decimal labels; x-axis pinned to 0-1.
fig = get_and_plot_top_players(advanced, 'FTr', n_decimals=3).update_xaxes(range=[0, 1])
fig.show()

## ORB%, DRB%, TRB%, AST%, STL%, BLK%, TOV%, USG%

In [21]:
# Percentage stats fed to the correlation map below (TRB% is used; ORB% and DRB% are not listed).
cols = ['TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%']

In [22]:
# Pairwise correlations between the selected percentage stats, rounded to
# two decimals for the cell annotations.
corr_matrix = all_stats[cols].corr().round(2)

# Heatmap with the correlation value written inside each cell.
fig = px.imshow(corr_matrix, x=cols, y=cols, text_auto=True)

title_style = dict(
    text="<b>% Stats Correlation Map</b>",
    font=dict(size=22, color="black"),
)
fig.update_layout(
    width=500,
    height=500,
    plot_bgcolor="#ffffb2",
    paper_bgcolor="#ffffb2",
    font_color="#000000",
    font=dict(color="black", size=16),
    title=title_style,
)
fig.show()

In [23]:
# Top-10 players by USG% (default one-decimal labels); x-axis pinned to 0-45.
fig = get_and_plot_top_players(advanced, 'USG%').update_xaxes(range=[0, 45])
fig.show()

## WS, OWS, DWS and WS/48

In [24]:
# Top-10 players by WS, using the helper's defaults (n=10, one decimal) and
# plotly's automatic x-axis range.
fig = get_and_plot_top_players(advanced, 'WS')
fig.show()

In [27]:
# Top-10 players by WS/48 with three-decimal labels; x-axis pinned to 0-0.4.
fig = get_and_plot_top_players(advanced, 'WS/48', n_decimals=3).update_xaxes(range=[0, 0.4])
fig.show()

In [29]:
# Top-10 players by OWS (default one-decimal labels); x-axis pinned to 0-10.
fig = get_and_plot_top_players(advanced, 'OWS').update_xaxes(range=[0, 10])
fig.show()

In [31]:
# Top-10 players by DWS (default one-decimal labels); x-axis pinned to 0-4.
fig = get_and_plot_top_players(advanced, 'DWS').update_xaxes(range=[0, 4])
fig.show()