In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import requests
import pandas_profiling
import numpy as np

In [2]:
# create dataframes
# ------------------ #

# per game dataframe:

df_pg = pd.DataFrame(
columns = ['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],index=[])

# advanced dataframe:

df_adv = pd.DataFrame(
columns = ['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 
'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],index=[])


Below is the Glossary for the Per Game dataset:

* Rk -- Rank; alphabetical
* Pos -- Position
* Age -- Age of Player at the start of February 1st of that season.
* Tm -- Team
* G -- Games
* GS -- Games Started
* MP -- Minutes Played Per Game
* FG -- Field Goals Per Game
* FGA -- Field Goal Attempts Per Game
* FG% -- Field Goal Percentage
* 3P -- 3-Point Field Goals Per Game
* 3PA -- 3-Point Field Goal Attempts Per Game
* 3P% -- FG% on 3-Pt FGAs.
* 2P -- 2-Point Field Goals Per Game
* 2PA -- 2-Point Field Goal Attempts Per Game
* 2P% -- FG% on 2-Pt FGAs.
* eFG% -- Effective Field Goal Percentage
This statistic adjusts for the fact that a 3-point field goal is worth one more point than a 2-point field goal.
* FT -- Free Throws Per Game
* FTA -- Free Throw Attempts Per Game
* FT% -- Free Throw Percentage
* ORB -- Offensive Rebounds Per Game
* DRB -- Defensive Rebounds Per Game
* TRB -- Total Rebounds Per Game
* AST -- Assists Per Game
* STL -- Steals Per Game
* BLK -- Blocks Per Game
* TOV -- Turnovers Per Game
* PF -- Personal Fouls Per Game
* PTS -- Points Per Game

Below is the Glossary for the Advanced dataset:
    
* Rk -- Rank; alpahbetical (also in per game)
* Pos -- Position (also in per game)
* Age -- Age of Player at the start of February 1st of that season. (also in per game)
* Tm -- Team (also in per game)
* G -- Games (also in per game)

* MP -- Minutes Played (Season)
* PER -- Player Efficiency Rating;
A measure of per-minute production standardized such that the league average is 15.
* TS% -- True Shooting Percentage;
A measure of shooting efficiency that takes into account 2-point field goals, 3-point field goals, and free throws.
* 3PAr -- 3-Point Attempt Rate;
Percentage of FG Attempts from 3-Point Range
* FTr -- Free Throw Attempt Rate;
Number of FT Attempts Per FG Attempt
* ORB% -- Offensive Rebound Percentage;
An estimate of the percentage of available offensive rebounds a player grabbed while he was on the floor.
* DRB% -- Defensive Rebound Percentage;
An estimate of the percentage of available defensive rebounds a player grabbed while he was on the floor.
* TRB% -- Total Rebound Percentage;
An estimate of the percentage of available rebounds a player grabbed while he was on the floor.
* AST% -- Assist Percentage;
An estimate of the percentage of teammate field goals a player assisted while he was on the floor.
* STL% -- Steal Percentage;
An estimate of the percentage of opponent possessions that end with a steal by the player while he was on the floor.
* BLK% -- Block Percentage;
An estimate of the percentage of opponent two-point field goal attempts blocked by the player while he was on the floor.
* TOV% -- Turnover Percentage;
An estimate of turnovers committed per 100 plays.
* USG% -- Usage Percentage;
An estimate of the percentage of team plays used by a player while he was on the floor.
* OWS -- Offensive Win Shares;
An estimate of the number of wins contributed by a player due to his offense.
* DWS -- Defensive Win Shares;
An estimate of the number of wins contributed by a player due to his defense.
* WS -- Win Shares;
An estimate of the number of wins contributed by a player.
* WS/48 -- Win Shares Per 48 Minutes;
An estimate of the number of wins contributed by a player per 48 minutes (league average is approximately .100)
* OBPM -- Offensive Box Plus/Minus;
A box score estimate of the offensive points per 100 possessions a player contributed above a league-average player, translated to an average team.
* DBPM -- Defensive Box Plus/Minus;
A box score estimate of the defensive points per 100 possessions a player contributed above a league-average player, translated to an average team.
* BPM -- Box Plus/Minus;
A box score estimate of the points per 100 possessions a player contributed above a league-average player, translated to an average team.
* VORP -- Value over Replacement Player;
A box score estimate of the points per 100 TEAM possessions that a player contributed above a replacement-level (-2.0) player, translated to an average team and prorated to an 82-game season. Multiply by 2.70 to convert to wins over replacement.

In [33]:
t = pd.Timestamp.now()
t.year

# startyr = t.year-40
startyr = t.year-1
endyr = t.year+1

print (startyr)
print (endyr)

2018
2020


In [4]:
# populate initial dataframe
# Player Per Game Stats:

# create variable for current date/time and extract year:
t = pd.Timestamp.now()
t.year

# assign start year and end year:
startyr = t.year-40
endyr = t.year+1


x = range(startyr,endyr)
for n in x:
    year = n
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(n)
    html = urlopen(url)
    soup = BeautifulSoup(html)
    

    soup.findAll('tr', limit=2)
    # use getText()to extract the text we need into a list
    headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    
    # exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
    headers = headers[1:]
    headers
    
    # avoid the first header row
    rows = soup.findAll('tr')[1:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
    
    # create dataframe:
    stats = pd.DataFrame(player_stats, columns = headers)
    
    # remove missing values from dataframe:
    stats = stats.dropna()
    stats['Year'] = n

    # create copy of dataframe:
    df_orig = stats
    
    # find distinct Players with more than one record per year:
    df_dups = df_orig[df_orig.duplicated(['Player'], keep=False)]

    # then find players where Team does not equal "TOT":
    df_del = df_dups.loc[df_dups['Tm'] != "TOT"]

    # drop duplicate rows from original dataframe
    df_rem = df_orig.drop(index=df_del.index)
    
    # append records to initial dataframe:
    df_pg = df_pg.append(df_rem,sort=False)
    

In [5]:
print(df_orig.shape)
print (df_dups.shape)
print (df_del.shape)
print (df_rem.shape)
print (df_pg.shape)

(708, 30)
(264, 30)
(178, 30)
(530, 30)
(16848, 30)


In [6]:
# populate initial dataframe
# ------------------------- #

# Advanced Stats:

x = range(startyr,endyr)
for n in x:
    year = n
    url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(n)
    html = urlopen(url)
    soup = BeautifulSoup(html)
    

    soup.findAll('tr', limit=2)
    # use getText()to extract the text we need into a list
    headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    
    # exclude the first column as we will not need the ranking order from Basketball Reference for the analysis
    headers = headers[1:]
    headers
    
    # avoid the first header row
    rows = soup.findAll('tr')[1:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
    
    # create dataframe:
    stats = pd.DataFrame(player_stats, columns = headers)
    
    # remove missing values from dataframe:
    stats = stats.dropna()
    stats['Year'] = n
    
    # remove invalid columns:
    stats = stats.drop(columns=('\xa0'))
    
    # drop all rows from temporary tables:
    df_orig = df_orig.iloc[0:0]
    df_dups = df_dups.iloc[0:0]
    df_del = df_del.iloc[0:0]
    df_rem = df_rem.iloc[0:0]

    # create copy of dataframe:
    df_orig = stats
    
    # find distinct Players with more than one record per year:
    df_dups = df_orig[df_orig.duplicated(['Player'], keep=False)]

    # then find players where Team does not equal "TOT":
    df_del = df_dups.loc[df_dups['Tm'] != "TOT"]

    # drop duplicate rows from original dataframe
    df_rem = df_orig.drop(index=df_del.index)
    
    # append records to initial dataframe:
    df_adv = df_adv.append(df_rem,sort=False)

In [7]:
print(df_orig.shape)
print (df_dups.shape)
print (df_del.shape)
print (df_rem.shape)
print (df_adv.shape)

(708, 27)
(264, 27)
(178, 27)
(530, 27)
(16848, 27)


In [8]:
df_adv.head()

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Year
0,Kareem Abdul-Jabbar*,C,31,LAL,80,3157,25.5,0.612,,0.352,...,23.3,8.8,5.6,14.4,0.219,4.0,3.7,7.8,7.8,1979.0
1,Tom Abernethy,PF,24,GSW,70,1219,13.6,0.55,,0.275,...,13.8,2.3,1.3,3.7,0.144,0.6,0.0,0.5,0.8,1979.0
2,Alvan Adams,C,24,PHO,77,2364,20.4,0.57,,0.269,...,24.1,3.9,3.7,7.6,0.154,2.9,1.9,4.8,4.1,1979.0
3,Lucius Allen,PG,31,KCK,31,413,8.7,0.416,,0.19,...,20.3,-0.4,0.5,0.1,0.007,-3.3,-0.7,-3.9,-0.2,1979.0
4,Kim Anderson,SF,23,POR,21,224,3.0,0.353,,0.364,...,19.6,-0.6,0.2,-0.4,-0.078,-6.6,-1.7,-8.3,-0.4,1979.0


In [9]:
# drop columns from advanced dataframe that exist in the per game dataframe:

df_adv.drop(columns=(['Player','Pos','Age','Tm','Year']),inplace = True)

# reanme advanced dataframe columns:

df_adv.rename(columns = {'G':'TotG','MP':'TotMP'}, inplace = True)
            

# merge per game and advanced dataframes:

df = df_pg.merge(df_adv, left_index=True, right_index=True)


# convert all relevant columns to numeric datatypes:

df[
['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P','3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'TotG', 'TotMP','G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 
 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 
 'VORP']] = df[
['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P','3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS','TotG', 'TotMP','G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%',
 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']].apply(pd.to_numeric)


# calculate averages for all relevant columns:

df_avg = df.groupby("Player", as_index=True)[['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P','3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'TotG', 'TotMP','G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 
'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']].mean()


df_avg.head()

Unnamed: 0_level_0,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A.C. Green,79.691882,52.070111,28.161255,3.47952,7.058487,0.487293,0.083948,0.361808,0.157851,3.38893,...,14.718669,18.93155,1.526568,1.437638,2.962362,0.074865,-1.380627,-0.419188,-1.801476,0.770664
A.J. Bramlett,8.0,0.0,7.6,0.5,2.6,0.19,0.0,0.0,,0.5,...,12.984848,18.424242,1.278788,1.490909,2.745455,0.075818,-1.733333,-0.272727,-2.012121,0.621212
A.J. English,75.584615,8.953846,20.6,4.056923,9.316923,0.435954,0.050769,0.4,0.137108,3.956923,...,14.44375,18.518462,1.167692,1.356923,2.527692,0.0604,-2.035385,-0.312308,-2.349231,0.573846
A.J. Guyton,26.916667,4.625,12.25,1.458333,4.391667,0.254729,0.604167,1.779167,0.254646,0.7875,...,14.832292,19.640625,1.473958,1.417708,2.897917,0.07276,-1.533333,-0.779167,-2.309375,0.69375
A.J. Hammons,22.0,0.0,7.4,0.8,1.9,0.405,0.2,0.5,0.5,0.5,...,14.460526,19.763158,1.978947,1.705263,3.697368,0.091895,-0.615789,-0.034211,-0.668421,1.128947


In [10]:
# 2.) NBA MVP   <Done>
# 3.) Rookie of the Year   <Done>
# 4.) Defensive Player of the Year   <Done>
# 5.) Sixth Man of the Year   <Done>
# 6.) Most Improved Player   <Done>
# 7.) NBA Finals MVP   <Done>
# 8.) All-Star Game MVP   <Done>
# 9.) NBA All-League; same as #12   <Done>

# 10.) NBA All-Rookie
# 11.) NBA All-Defensive; same as #14
# 12.) All-NBA & All-ABA Selections by Player   <Done>
# 13.) All-Star Game Selections by Player   <Done>
# 14.) All-Defensive Selections by Player   <Done>
# 15.) Hall of Fame   <Done>

# nba mvp dataframe:

url = "https://www.basketball-reference.com/awards/mvp.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'mvp_summary'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_mvp = pd.read_html(html_str)[0]
df_mvp = df_mvp.dropna()


# r.o.y. dataframe:
url = "https://www.basketball-reference.com/awards/roy.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'roy_NBA'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_roy = pd.read_html(html_str,header=1)[0]
df_roy = df_roy.dropna()


# defensive p.o.y. dataframe:
url = "https://www.basketball-reference.com/awards/dpoy.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'dpoy_NBA'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_dpoy = pd.read_html(html_str,header=1)[0]
df_dpoy = df_dpoy.dropna()


# sixth man of the year dataframe:
url = "https://www.basketball-reference.com/awards/smoy.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'smoy_NBA'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_smoy = pd.read_html(html_str,header=1)[0]
df_smoy = df_smoy.dropna()


# most improved player dataframe:
url = "https://www.basketball-reference.com/awards/mip.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'mip_NBA'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_mip = pd.read_html(html_str,header=1)[0]
df_mip = df_mip.dropna()


# finals mvp dataframe:
url = "https://www.basketball-reference.com/awards/finals_mvp.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'finals_mvp_NBA'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_fmvp = pd.read_html(html_str,header=1)[0]
df_fmvp = df_fmvp.dropna()


# all star game mvp dataframe:
url = "https://www.basketball-reference.com/awards/all_star_mvp.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'all_star_mvp_NBA'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_asgmvp = pd.read_html(html_str,header=1)[0]
df_asgmvp = df_asgmvp.dropna()


# all league dataframe:
url = "https://www.basketball-reference.com/awards/all_league_by_player.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'all_league_by_player'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_albp = pd.read_html(html_str,header=1)[0]
df_albp = df_albp.dropna()


# all star game dataframe:
url = "https://www.basketball-reference.com/awards/all_star_by_player.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'all_star_by_player'})
hdr = [th.getText() for th in s.findAll('tr', limit=1)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_asg = pd.read_html(html_str,header=0)[0]
df_asg = df_asg.dropna()


# all defense dataframe:
url = "https://www.basketball-reference.com/awards/all_defense_by_player.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'all_defense_by_player'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_ad = pd.read_html(html_str,header=1)[0]
df_ad = df_ad.dropna()


# hall of fame dataframe:
url = "https://en.wikipedia.org/wiki/List_of_players_in_the_Naismith_Memorial_Basketball_Hall_of_Fame"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table')
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_hof = pd.read_html(html_str,header=0)[0]
df_hof = df_hof.dropna()


In [11]:
# all star game mvp dataframe:all_defense_by_player.html



In [12]:
# df_albp.dropna()
# df_asg.dropna()
# df_ad.head()
# df_ad
# hdr

In [13]:
# nba all league dataframe:
# ------------------------- #

url = "https://www.basketball-reference.com/awards/all_league.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'awards_all_league'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_al = pd.read_html(html_str)[0]
df_al = df_al.dropna()


# clean up player names in all-league dataframe:
df_al['Unnamed: 3'] = df_al['Unnamed: 3'].str.rstrip(' C')
df_al['Unnamed: 4'] = df_al['Unnamed: 4'].str.rstrip(' C')
df_al['Unnamed: 5'] = df_al['Unnamed: 5'].str.rstrip(' C')
df_al['Unnamed: 6'] = df_al['Unnamed: 6'].str.rstrip(' C')
df_al['Unnamed: 7'] = df_al['Unnamed: 7'].str.rstrip(' C')

df_al['Unnamed: 3'] = df_al['Unnamed: 3'].str.rstrip(' F')
df_al['Unnamed: 4'] = df_al['Unnamed: 4'].str.rstrip(' F')
df_al['Unnamed: 5'] = df_al['Unnamed: 5'].str.rstrip(' F')
df_al['Unnamed: 6'] = df_al['Unnamed: 6'].str.rstrip(' F')
df_al['Unnamed: 7'] = df_al['Unnamed: 7'].str.rstrip(' F')

df_al['Unnamed: 3'] = df_al['Unnamed: 3'].str.rstrip(' G')
df_al['Unnamed: 4'] = df_al['Unnamed: 4'].str.rstrip(' G')
df_al['Unnamed: 5'] = df_al['Unnamed: 5'].str.rstrip(' G')
df_al['Unnamed: 6'] = df_al['Unnamed: 6'].str.rstrip(' G')
df_al['Unnamed: 7'] = df_al['Unnamed: 7'].str.rstrip(' G')


# create seperate all-league dataframes:
df_al1 = df_al[df_al.Tm=='1st']
df_al2 = df_al[df_al.Tm=='2nd']
df_al3 = df_al[df_al.Tm=='3rd']

# create unique list of names for every all league level

# first team dataframe:
Team1 = pd.DataFrame(columns=['Player'])

for n in df_al1.columns[3:]:
    w1 = df_al1[n].unique()
    x1 = pd.DataFrame(w1)
    x1.rename(columns={0:'Player'}, inplace=True)
    Team1 = Team1.append(x1)


# second team dataframe:    
Team2 = pd.DataFrame(columns=['Player'])

for n in df_al2.columns[3:]:
    w2 = df_al2[n].unique()
    x2 = pd.DataFrame(w2)
    x2.rename(columns={0:'Player'}, inplace=True)
    Team2 = Team2.append(x2)

    
# third team dataframe:    
Team3 = pd.DataFrame(columns=['Player'])

for n in df_al3.columns[3:]:
    w3 = df_al3[n].unique()
    x3 = pd.DataFrame(w3)
    x3.rename(columns={0:'Player'}, inplace=True)
    Team3 = Team3.append(x3)

    
# get unique values:
Team1 = pd.DataFrame(Team1['Player'].unique())
Team2 = pd.DataFrame(Team2['Player'].unique())
Team3 = pd.DataFrame(Team3['Player'].unique())


# rename columns
Team1.rename(columns={0:'Player'}, inplace=True)
Team2.rename(columns={0:'Player'}, inplace=True)
Team3.rename(columns={0:'Player'}, inplace=True)



In [14]:
# clean nba all-league dataframe
# ------------------------------ #

# parse out records where players were tied for award

# first team:

new = Team1["Player"].str.split(",", n = 1, expand = True)
new = new.dropna()
new.rename(columns={1:'Player'}, inplace=True)

new1 = pd.DataFrame(new[0])
new1.rename(columns={0:'Player'}, inplace=True)
Team1 = Team1.append(new1)

new['Player'] = new['Player'].str.rstrip(' (T)')
new.drop([0], axis = 1, inplace = True) 
Team1 = Team1.append(new)

# second team:

new = Team2["Player"].str.split(",", n = 1, expand = True)
if len(new.columns) > 1:
    new = new.dropna()
    new.rename(columns={1:'Player'}, inplace=True)

    new1 = pd.DataFrame(new[0])
    new1.rename(columns={0:'Player'}, inplace=True)
    Team2 = Team2.append(new1)

    new['Player'] = new['Player'].str.rstrip(' (T)')
    new.drop([0], axis = 1, inplace = True) 
    Team2 = Team2.append(new)


# third team:

new = Team3["Player"].str.split(",", n = 1, expand = True)
if len(new.columns) > 1:
    new = Team3["Player"].str.split(",", n = 1, expand = True)
    new = new.dropna()
    new.rename(columns={1:'Player'}, inplace=True)

    new1 = pd.DataFrame(new[0])
    new1.rename(columns={0:'Player'}, inplace=True)
    Team3 = Team3.append(new1)

    new['Player'] = new['Player'].str.rstrip(' (T)')
    new.drop([0], axis = 1, inplace = True) 
    Team3 = Team3.append(new)

# remove records with multiple players:

Team1 = Team1[~Team1['Player'].str.contains (",")]
Team2 = Team2[~Team2['Player'].str.contains (",")]
Team3 = Team3[~Team3['Player'].str.contains (",")]   

    
# get unique values:
Team1 = pd.DataFrame(Team1['Player'].unique())
Team2 = pd.DataFrame(Team2['Player'].unique())
Team3 = pd.DataFrame(Team3['Player'].unique())


# rename columns:
Team1.rename(columns={0:'Player'}, inplace=True)
Team2.rename(columns={0:'Player'}, inplace=True)
Team3.rename(columns={0:'Player'}, inplace=True)


# strip invalid characters:
Team1['Player'] = Team1['Player'].str.rstrip(' C')
Team2['Player'] = Team2['Player'].str.rstrip(' C')
Team3['Player'] = Team3['Player'].str.rstrip(' C')

Team1['Player'] = Team1['Player'].str.rstrip(' F')
Team2['Player'] = Team2['Player'].str.rstrip(' F')
Team3['Player'] = Team3['Player'].str.rstrip(' F')

Team1['Player'] = Team1['Player'].str.rstrip(' G')
Team2['Player'] = Team2['Player'].str.rstrip(' G')
Team3['Player'] = Team3['Player'].str.rstrip(' G')

In [15]:
# nba all rookie dataframe:
# ------------------------- #

url = "https://www.basketball-reference.com/awards/all_rookie.html"
html = urlopen(url)
soup = BeautifulSoup(html)
s = soup.find(name='table', attrs={'id':'awards_all_rookie'})
hdr = [th.getText() for th in s.findAll('tr', limit=2)[0].findAll('th')]
rows = s.findAll('tr')[1:]
html_str = str(s)
df_ar = pd.read_html(html_str)[0]
df_ar = df_ar.dropna()


# clean up player names in all-rookie dataframe:
df_ar['Unnamed: 3'] = df_ar['Unnamed: 3'].str.rstrip(' C')
df_ar['Unnamed: 4'] = df_ar['Unnamed: 4'].str.rstrip(' C')
df_ar['Unnamed: 5'] = df_ar['Unnamed: 5'].str.rstrip(' C')
df_ar['Unnamed: 6'] = df_ar['Unnamed: 6'].str.rstrip(' C')
df_ar['Unnamed: 7'] = df_ar['Unnamed: 7'].str.rstrip(' C')

df_ar['Unnamed: 3'] = df_ar['Unnamed: 3'].str.rstrip(' F')
df_ar['Unnamed: 4'] = df_ar['Unnamed: 4'].str.rstrip(' F')
df_ar['Unnamed: 5'] = df_ar['Unnamed: 5'].str.rstrip(' F')
df_ar['Unnamed: 6'] = df_ar['Unnamed: 6'].str.rstrip(' F')
df_ar['Unnamed: 7'] = df_ar['Unnamed: 7'].str.rstrip(' F')

df_ar['Unnamed: 3'] = df_ar['Unnamed: 3'].str.rstrip(' G')
df_ar['Unnamed: 4'] = df_ar['Unnamed: 4'].str.rstrip(' G')
df_ar['Unnamed: 5'] = df_ar['Unnamed: 5'].str.rstrip(' G')
df_ar['Unnamed: 6'] = df_ar['Unnamed: 6'].str.rstrip(' G')
df_ar['Unnamed: 7'] = df_ar['Unnamed: 7'].str.rstrip(' G')


# # create seperate all-league dataframes:
df_ar1 = df_ar[df_ar.Tm=='1st']
df_ar2 = df_ar[df_ar.Tm=='2nd']


# create unique list of names for every all league level

# first team dataframe:
tar1 = pd.DataFrame(columns=['Player'])

for n in df_ar1.columns[3:]:
    w1 = df_ar1[n].unique()
    x1 = pd.DataFrame(w1)
    x1.rename(columns={0:'Player'}, inplace=True)
    tar1 = tar1.append(x1)


# second team dataframe:    
tar2 = pd.DataFrame(columns=['Player'])

for n in df_ar2.columns[3:]:
    w2 = df_ar2[n].unique()
    x2 = pd.DataFrame(w2)
    x2.rename(columns={0:'Player'}, inplace=True)
    tar2 = tar2.append(x2)

    
# get unique values:
tar1 = pd.DataFrame(tar1['Player'].unique())
tar2 = pd.DataFrame(tar2['Player'].unique())


# rename columns
tar1.rename(columns={0:'Player'}, inplace=True)
tar2.rename(columns={0:'Player'}, inplace=True)




In [16]:
# clean nba all-rookie dataframe
# ------------------------- #

# parse out records where players were tied for award

# first team:

new = tar1["Player"].str.split(",", n = 1, expand = True)
new = new.dropna()
new.rename(columns={1:'Player'}, inplace=True)

new1 = pd.DataFrame(new[0])
new1.rename(columns={0:'Player'}, inplace=True)
tar1 = tar1.append(new1)

new['Player'] = new['Player'].str.rstrip(' (T)')
new.drop([0], axis = 1, inplace = True) 
tar1 = tar1.append(new)

# second team:

new = tar2["Player"].str.split(",", n = 1, expand = True)
if len(new.columns) > 1:
    new = new.dropna()
    new.rename(columns={1:'Player'}, inplace=True)

    new1 = pd.DataFrame(new[0])
    new1.rename(columns={0:'Player'}, inplace=True)
    tar2 = tar2.append(new1)

    new['Player'] = new['Player'].str.rstrip(' (T)')
    new.drop([0], axis = 1, inplace = True) 
    tar2 = tar2.append(new)



# remove records with multiple players:

tar1 = tar1[~tar1['Player'].str.contains (",")]
tar2 = tar2[~tar2['Player'].str.contains (",")]


# get unique values:
tar1 = pd.DataFrame(tar1['Player'].unique())
tar2 = pd.DataFrame(tar2['Player'].unique())


# rename columns:
tar1.rename(columns={0:'Player'}, inplace=True)
tar2.rename(columns={0:'Player'}, inplace=True)


# strip invalid characters:
tar1['Player'] = tar1['Player'].str.rstrip(' C')
tar2['Player'] = tar2['Player'].str.rstrip(' C')


tar1['Player'] = tar1['Player'].str.rstrip(' F')
tar2['Player'] = tar2['Player'].str.rstrip(' F')


tar1['Player'] = tar1['Player'].str.rstrip(' G')
tar2['Player'] = tar2['Player'].str.rstrip(' G')


In [17]:
# tar1

In [18]:
# tar2
# df.head()
# df.mean(axis=0)
# df.mean(axis=1)
# df.info()
# df.groupby("Player").agg({'GS': 'mean'})

# df[['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P','3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
#        'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'TotG', 'TotMP','G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 
# 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']] = df[['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
#        '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
#        'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS','TotG', 'TotMP','G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 
# 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']].apply(pd.to_numeric)

# # df.info()
# df.groupby("Player").agg({'G': 'mean'})


# df.groupby("Player", as_index=True)[['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P','3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
#        'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'TotG', 'TotMP','G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 
# 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']].mean()



# df[df.Player=='Álex Abrines']

# df_pg = pd.DataFrame(
# columns = ['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
#        '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
#        'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],index=[])

# # advanced dataframe:

# df_adv = pd.DataFrame(
# columns = ['Player', 'Pos', 'Age', 'Tm', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 
# 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],index=[])


In [34]:
# df_pg.head()
# df_pg[df_pg.Player=='Aaron Gordon']
# df_adv[df_adv.Player=='Aaron Gordon']

# df_adv.head()

df.columns

# df[df.Player=='Aaron Gordon']
# df_avg.head()

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'TotG',
       'TotMP', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%',
       'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM',
       'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [20]:
<th aria-label="" class="over_header center" colspan="6" data-stat="header_per_g">Per Game</th>

<th aria-label="Player" data-stat="player" scope="col" class="poptip sort_default_asc left sort_col sorttable_sorted">Player<span class="sorttable_elSortDir3"><br>▲</span></th>



SyntaxError: invalid syntax (<ipython-input-20-f3aaa866a9ab>, line 1)

In [None]:
# pandas_profiling.ProfileReport(df)