In [79]:
from io import StringIO

from bs4 import BeautifulSoup as bs
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

# Lift pandas' display limits so frames are shown without row/column truncation.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Load the per-game stats CSV (path is relative to the notebook's working directory).
nba2021_df = pd.read_csv('nba2021_per_game.csv')

# Preview the first rows to confirm the load and see the available columns.
nba2021_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,PF,21,MIA,28,2,14.6,2.6,4.4,0.59,0.0,0.0,0.0,2.6,4.4,0.59,0.59,1.3,2.4,0.561,1.3,2.7,4.0,0.6,0.4,0.5,1.0,1.9,6.5
1,Jaylen Adams,PG,24,MIL,6,0,2.8,0.2,1.3,0.125,0.0,0.3,0.0,0.2,1.0,0.167,0.125,0.0,0.0,0.0,0.0,0.5,0.5,0.3,0.0,0.0,0.0,0.2,0.3
2,Steven Adams,C,27,NOP,27,27,28.1,3.5,5.8,0.603,0.0,0.0,0.0,3.5,5.7,0.606,0.603,1.1,2.3,0.468,4.3,4.6,8.9,2.1,1.0,0.6,1.7,1.9,8.0
3,Bam Adebayo,C,23,MIA,26,26,33.6,7.4,12.9,0.573,0.1,0.2,0.4,7.3,12.7,0.576,0.576,5.1,6.0,0.841,1.9,7.3,9.2,5.3,1.0,1.0,3.0,2.6,19.9
4,LaMarcus Aldridge,C,35,SAS,18,18,26.7,5.9,12.5,0.476,1.3,3.7,0.358,4.6,8.8,0.525,0.529,0.9,1.2,0.762,0.8,3.5,4.3,1.9,0.4,0.9,0.9,1.5,14.1


In [80]:
# A player who played for multiple teams has one row per team plus a 'TOT' (total)
# row; for those players we only want to keep the total stats.
players = nba2021_df.groupby('Player')

# Names with more than one row. groupby yields keys in sorted order, so the list
# order matches what iterating over the groups would produce.
rows_per_player = players.size()
mult_teams = rows_per_player[rows_per_player > 1].index.tolist()

# Remove every non-'TOT' row of those players with one vectorized filter instead of
# dropping rows one at a time inside iterrows(), which rebuilt the frame on each drop.
is_partial_row = nba2021_df['Player'].isin(mult_teams) & (nba2021_df['Tm'] != 'TOT')
nba2021_df = nba2021_df[~is_partial_row].copy()  # .copy() avoids chained-assignment warnings later

# Sanity check: each multi-team player should now show exactly one row, with Tm == 'TOT'.
nba2021_df[nba2021_df['Player'].isin(mult_teams)]

       Player Pos  Age   Tm   G  GS    MP   FG  FGA    FG%   3P  3PA    3P%  \
265  Alex Len   C   27  TOT  20   6  14.6  2.4  3.8  0.627  0.3  1.0  0.316   

      2P  2PA    2P%   eFG%   FT  FTA    FT%  ORB  DRB  TRB  AST  STL  BLK  \
265  2.1  2.8  0.732  0.667  1.0  1.7  0.588  0.8  2.8  3.5  1.1  0.4  1.1   

     TOV   PF  PTS  
265  1.2  1.9  6.0  
               Player Pos  Age   Tm  G  GS    MP   FG  FGA    FG%   3P  3PA  \
425  Dennis Smith Jr.  PG   23  TOT  7   0  12.1  1.6  4.7  0.333  0.3  1.7   

       3P%   2P  2PA    2P%   eFG%   FT  FTA  FT%  ORB  DRB  TRB  AST  STL  \
425  0.167  1.3  3.0  0.429  0.364  1.1  1.4  0.8  0.3  0.9  1.1  1.6  0.4   

     BLK  TOV   PF  PTS  
425  0.1  0.7  0.7  4.6  
           Player Pos  Age   Tm   G  GS    MP   FG   FGA    FG%   3P  3PA  \
403  Derrick Rose  PG   32  TOT  20   0  22.3  5.0  11.7  0.427  0.8  2.4   

       3P%   2P  2PA    2P%   eFG%   FT  FTA    FT%  ORB  DRB  TRB  AST  STL  \
403  0.333  4.2  9.3  0.452  0.462  2.7

In [92]:
# Now that we have a fully unique dataset of current players, let's study HOF players.

URL = 'https://www.basketball-reference.com/awards/hof.html'

# Bound the wait and fail loudly on an HTTP error rather than parsing an error page.
req = requests.get(URL, timeout=30)
req.raise_for_status()

soup = bs(req.content, 'html.parser')

# find() returns None when there is no match; report that clearly instead of
# failing with an AttributeError on .prettify().
table_tag = soup.find('table')
if table_tag is None:
    raise ValueError(f'No <table> element found at {URL}')
table = table_tag.prettify()

# Passing a literal HTML string to read_html is deprecated in recent pandas;
# wrap it in a file-like StringIO instead.
hof_df = pd.read_html(StringIO(table), flavor = 'bs4')[0]

In [93]:
# The scraped table has a multi-level header; drop its outer level. Guarded so that
# re-running this cell on already-flattened columns does not raise.
if isinstance(hof_df.columns, pd.MultiIndex):
    hof_df.columns = hof_df.columns.droplevel()

# Keep only the rows whose Category is 'Player'. A single boolean mask replaces the
# row-by-row iterrows() + drop loop, which rebuilt the frame on every dropped row.
hof_df = hof_df[hof_df['Category'] == 'Player'].copy()

hof_df.head()

Unnamed: 0,Year,Name,Category,G,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,Unnamed: 14_level_1,G.1,W,L,W/L%
1,2020,Kobe Bryant Player / Oly,Player,1346.0,25.0,5.2,4.7,1.4,0.5,0.447,0.329,0.837,172.7,0.17,,,,,
2,2020,Tamika Catchings WNBA,Player,,,,,,,,,,,,,,,,
3,2020,Tim Duncan Player / Coach / Oly / CBB ...,Player,1392.0,19.0,10.8,3.0,0.7,2.2,0.506,0.179,0.696,206.4,0.209,,,,,
4,2020,Kevin Garnett Player / Oly,Player,1462.0,17.8,10.0,3.7,1.3,1.4,0.497,0.275,0.789,191.4,0.182,,,,,
14,2019,Carl Braun Player / Coach,Player,788.0,13.5,3.4,3.7,,,0.383,,0.804,64.3,0.119,,,,,


In [94]:
# Remove the placeholder 'Unnamed: 14_level_1' column left over from the scraped header.
hof_df = hof_df.drop(columns=['Unnamed: 14_level_1'])

In [96]:
# The Name column has extra text after the name (e.g. 'Kobe Bryant Player / Oly');
# reduce it to the first two whitespace-separated tokens: <firstName lastName>.
# The vectorized .str chain splits each value once, leaves missing values as NaN and
# keeps a single-token value as is, where x.split()[1] raised IndexError.
# NOTE(review): names longer than two words are still cut to two tokens -- confirm
# this is acceptable before joining against other player tables.
hof_df['Name'] = hof_df['Name'].str.split().str[:2].str.join(' ')

hof_df.head()

Unnamed: 0,Year,Name,Category,G,PTS,TRB,AST,STL,BLK,FG%,3P%,FT%,WS,WS/48,G.1,W,L,W/L%
1,2020,Kobe Bryant,Player,1346.0,25.0,5.2,4.7,1.4,0.5,0.447,0.329,0.837,172.7,0.17,,,,
2,2020,Tamika Catchings,Player,,,,,,,,,,,,,,,
3,2020,Tim Duncan,Player,1392.0,19.0,10.8,3.0,0.7,2.2,0.506,0.179,0.696,206.4,0.209,,,,
4,2020,Kevin Garnett,Player,1462.0,17.8,10.0,3.7,1.3,1.4,0.497,0.275,0.789,191.4,0.182,,,,
14,2019,Carl Braun,Player,788.0,13.5,3.4,3.7,,,0.383,,0.804,64.3,0.119,,,,


In [None]:
# now that we have data on HOF players, we can build a model for what stats get a player into the HOF
# we can use this model on active players, and then calculate probabilities for their chances of getting into the HOF

# other things to do: 
# 1. explore other variables (i.e. make plots) to study what sets a HOF player apart from a non-HOF player
# 2. find the least impactful metrics -- what's the worst a player could do and still have a better than 1/2 chance at the HOF?