In [2]:
import time
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import numpy as np
import pandas as pd
import os
import re
from bs4 import BeautifulSoup
import requests
import csv
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from basketball_reference_web_scraper import client
from datetime import date
from functools import reduce
from datetime import datetime
from collections import Counter
from dateutil.rrule import rrule, DAILY
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import Team
from basketball_reference_web_scraper.data import Outcome
from basketball_reference_web_scraper.data import Position

In [None]:
def get_response(url):
    print(f"Requesting \'{url}\'")
    try:
        r = requests.get(url)
        text = r.text
        text= text.replace("<!--", "")
        text= text.replace("-->", "")
        return BeautifulSoup(text, 'lxml')
    except requests.exceptions.RequestException as e:
        print(e)
    return None

# create directory for data
directory = os.path.join(os.getcwd(), 'scrapedata')
print(f"directory is '{directory}'")
if not os.path.exists(directory):
    print(f"Creating directory \'{directory}\'")
    os.makedirs(directory)

In [None]:
#Downloads NBA Player data 1988-2019 and NBA award voting data from www.basketball-reference.com 

# scrape player per game statistic data 1988-2019
for year in range(1988,2020):
    data = []
    url = f"http://www.basketball-reference.com/leagues/NBA_{year}_per_game.html"
    soup = get_response(url)
    table = soup.find('table', attrs={'id':'per_game_stats'})
    table_head = table.find('thead')
    hrow = table_head.find('tr')
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = [ele.text.strip() for ele in row.find_all('th')]
        # ignore rows that contain header information
        if cols[0] == 'Rk':
            continue
        cols += [ele.text.strip() for ele in row.find_all('td')]
        # data.append([ele for ele in cols if ele])
        # occasionally records representing pct are blank if num and denom are zero
        data.append([ele for ele in cols])
    fname = f"{directory}/{table.get('id')}_{year}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)

In [None]:
# scrape player advanced statistic data 1988-2019
for year in range(1988,2020):
    data = []
    url = f"http://www.basketball-reference.com/leagues/NBA_{year}_advanced.html"
    soup = get_response(url)
    table = soup.find('table', attrs={'id':'advanced_stats'})
    table_head = table.find('thead')
    hrow = table_head.find('tr')
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = [ele.text.strip() for ele in row.find_all('th')]
        # ignore rows that contain header information
        if cols[0] == 'Rk':
            continue
        cols += [ele.text.strip() for ele in row.find_all('td')]
        data.append([ele for ele in cols if ele])
    fname = f"{directory}/{table.get('id')}_{year}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)

In [None]:
#scrape MVP voting data 1988-2018
for year in range(1988,2019):
    data = []
    url = f"http://www.basketball-reference.com/awards/awards_{year}.html"
    soup = get_response(url)
    if not soup:
        continue
    table = soup.find('table', attrs={'id':'mvp'})
    table_head = table.find('thead')
    hrow = table_head.find_all('tr')[1]
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    # because the 'Rank' column is broken we'll do it ourselves
    rank = 0
    for row in rows:
        rank += 1
        line = [rank]
        cols = [ele.text.strip() for ele in row.find_all('td')]
        line += [ele for ele in cols]
        data.append([ele for ele in line if ele])
    fname = f"{directory}/mvp_voting_{year}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)

In [None]:
#scrape Rookie of the Year (ROY) voting data 1988-2018
for year in range(1988,2019):
    data = []
    url = f"http://www.basketball-reference.com/awards/awards_{year}.html"
    soup = get_response(url)
    if not soup:
        continue
    table = soup.find('table', attrs={'id':'roy'})
    table_head = table.find('thead')
    hrow = table_head.find_all('tr')[1]
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    # because the 'Rank' column is broken we'll do it ourselves
    rank = 0
    for row in rows:
        rank += 1
        line = [rank]
        cols = [ele.text.strip() for ele in row.find_all('td')]
        line += [ele for ele in cols]
        data.append([ele for ele in line if ele])
    fname = f"{directory}/roy_voting_{year}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)

In [None]:
#scrape Defensive Player of the Year (DPOY) voting data 1988-2018
for year in range(1988,2019):
    data = []
    url = f"http://www.basketball-reference.com/awards/awards_{year}.html"
    soup = get_response(url)
    if not soup:
        continue
    table = soup.find('table', attrs={'id':'dpoy'})
    table_head = table.find('thead')
    hrow = table_head.find_all('tr')[1]
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    # because the 'Rank' column is broken we'll do it ourselves
    rank = 0
    for row in rows:
        rank += 1
        line = [rank]
        cols = [ele.text.strip() for ele in row.find_all('td')]
        line += [ele for ele in cols]
        data.append([ele for ele in line if ele])
    fname = f"{directory}/dpoy_voting_{year}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)

In [None]:
#scrape Sixth Man of the Year (SMOY) voting data 1988-2018
for year in range(1988,2019):
    data = []
    url = f"http://www.basketball-reference.com/awards/awards_{year}.html"
    soup = get_response(url)
    if not soup:
        continue
    table = soup.find('table', attrs={'id':'smoy'})
    table_head = table.find('thead')
    hrow = table_head.find_all('tr')[1]
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    # because the 'Rank' column is broken we'll do it ourselves
    rank = 0
    for row in rows:
        rank += 1
        line = [rank]
        cols = [ele.text.strip() for ele in row.find_all('td')]
        line += [ele for ele in cols]
        data.append([ele for ele in line if ele])
    fname = f"{directory}/smoy_voting_{year}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)

In [None]:
#scrape Most Improved Player (MIP) voting data 1988-2018
for year in range(1988,2019):
    data = []
    url = f"http://www.basketball-reference.com/awards/awards_{year}.html"
    soup = get_response(url)
    if not soup:
        continue
    table = soup.find('table', attrs={'id':'mip'})
    table_head = table.find('thead')
    hrow = table_head.find_all('tr')[1]
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    # because the 'Rank' column is broken we'll do it ourselves
    rank = 0
    for row in rows:
        rank += 1
        line = [rank]
        cols = [ele.text.strip() for ele in row.find_all('td')]
        line += [ele for ele in cols]
        data.append([ele for ele in line if ele])
    fname = f"{directory}/mip_voting_{year}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)              

In [None]:
# scrape team standings data 1988-2019
for year in range(1988,2020):
    url = f"http://www.basketball-reference.com/leagues/NBA_{year}_standings.html"
    soup = get_response(url)
    tables = soup.find_all('table')
    for table in tables:
        tid = table.get('id')
        data = []
        table_head = table.find('thead')
        hrow = table_head.find('tr')
        hcols = hrow.find_all('th')
        hcols = [ele.text.strip() for ele in hcols]
        data.append([ele for ele in hcols if ele])
        table_body = table.find('tbody')
        rows = table_body.find_all('tr')
        for row in rows:
            cols = [ele.text.strip() for ele in row.find_all('th')]
            cols += [ele.text.strip() for ele in row.find_all('td')]
            data.append([ele for ele in cols if ele])
        fname = f"{directory}/{tid}_{year}.csv"
        with open(fname, 'w') as file:
            wr = csv.writer(file)
            wr.writerows(data)

In [None]:
#scrape All-NBA selection data 1947-2018
data = []
url = "https://www.basketball-reference.com/awards/all_league.html"
soup = get_response(url)
if not soup:
    print("AHH")
table = soup.find('table', attrs={'id':'awards_all_league'})
table_head = table.find('thead')
hrow = table_head.find_all('tr')[0]
hcols = hrow.find_all('th')
hcols = [ele.text.strip() for ele in hcols]
data.append([ele for ele in hcols if ele])
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
    line = []
    yr = [ele.text.strip() for ele in row.find_all('th')]
    cols = [ele.text.strip() for ele in row.find_all('td')]
    line += [ele for ele in yr]
    line += [ele for ele in cols]
    data.append([ele for ele in line if ele])
    
alldf=pd.DataFrame(data) #Clean up the df
alldf.columns=alldf.loc[0]
alldf=alldf.drop(0).dropna()
alldf.Season=alldf.Season.apply(lambda x:int(x[:4])+1) 
for i in range(5):
    alldf.columns.values[i+3]='Player'+str(i+1)
new_df = pd.DataFrame()
for index, row in alldf.iterrows():
    if row["Tm"] == "1st":
        all_nba_1st, all_nba_2nd, all_nba_3rd = 1, 0, 0
    if row["Tm"] == "2nd":
        all_nba_1st, all_nba_2nd, all_nba_3rd = 0, 1, 0
    if row["Tm"] == "3rd":
        all_nba_1st, all_nba_2nd, all_nba_3rd = 0, 0, 1
    for player in ["Player1","Player2","Player3","Player4","Player5"]:
        new_row = pd.DataFrame([row[player],row["Season"], all_nba_1st, all_nba_2nd, all_nba_3rd])
        new_row = new_row.T
        if new_df.empty == True:
            new_df = new_row
        else:
            new_df=pd.concat([new_row,new_df])
new_df.columns = ["Player","Year", "all_nba_1st", "all_nba_2nd", "all_nba_3rd"]
new_df=new_df.sort_values(by=["Year",'all_nba_1st','all_nba_2nd'],ascending=False)
new_df["Player"]=new_df["Player"].apply(lambda x: x[:-2])
new_df.to_csv(f"{directory}/all_nba.csv",index=False) # To CSV

In [None]:
#scrape All-Rookie selections data 1947-2018
data = []
url = "https://www.basketball-reference.com/awards/all_rookie.html"
soup = get_response(url)
if not soup:
    print("AHH! URL broken?")
table = soup.find('table', attrs={'id':'awards_all_rookie'})
table_head = table.find('thead')
hrow = table_head.find_all('tr')[0]
hcols = hrow.find_all('th')
hcols = [ele.text.strip() for ele in hcols]
data.append([ele for ele in hcols if ele])
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
    line = []
    yr = [ele.text.strip() for ele in row.find_all('th')]
    cols = [ele.text.strip() for ele in row.find_all('td')]
    line += [ele for ele in yr]
    line += [ele for ele in cols]
    data.append([ele for ele in line if ele])
    
alldf=pd.DataFrame(data) #Clean up the df
alldf.columns=alldf.loc[0]
alldf=alldf.drop(0).dropna()
alldf.Season=alldf.Season.apply(lambda x:int(x[:4])+1) 
for i in range(5):
    alldf.columns.values[i+3]='Player'+str(i+1)
new_df = pd.DataFrame()
for index, row in alldf.iterrows():
    if row["Tm"] == "1st":
        all_nba_1st, all_nba_2nd = 1, 0
    if row["Tm"] == "2nd":
        all_nba_1st, all_nba_2nd = 0, 1
        
    for player in ["Player1","Player2","Player3","Player4","Player5"]:
        new_row = pd.DataFrame([row[player],row["Season"], all_nba_1st, all_nba_2nd])
        new_row = new_row.T
        if new_df.empty == True:
            new_df = new_row
        else:
            new_df=pd.concat([new_row,new_df])
new_df.columns = ["Player","Year", "all_rookie_1st", "all_rookie_2nd"]

#Fix Rows with Ties
ties=new_df[new_df["Player"].str.contains(",")]
new_df=new_df[~new_df["Player"].str.contains(",")]
for index, row in ties.iterrows():
    new_rows = pd.DataFrame([[row["Player"].split(',')[0],row["Year"], row["all_rookie_1st"], row["all_rookie_2nd"]],[row["Player"].split(', ')[1][:-3],row["Year"],row["all_rookie_1st"],row["all_rookie_2nd"]]],columns=new_df.columns)
    new_df=new_df.append(new_rows)
    
new_df=new_df.sort_values(by=["Year",'all_rookie_1st'],ascending=False)
new_df.index=range(len(new_df))
new_df.to_csv(f"{directory}/all_rookie.csv",index=False) # To CSV

In [None]:
#scrape All-Defensive selections data 1947-2018
data = []
url = "https://www.basketball-reference.com/awards/all_defense.html"
soup = get_response(url)
if not soup:
    print("AHH! URL broken?")
table = soup.find('table', attrs={'id':'awards_all_defense'})
table_head = table.find('thead')
hrow = table_head.find_all('tr')[0]
hcols = hrow.find_all('th')
hcols = [ele.text.strip() for ele in hcols]
data.append([ele for ele in hcols if ele])
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
    line = []
    yr = [ele.text.strip() for ele in row.find_all('th')]
    cols = [ele.text.strip() for ele in row.find_all('td')]
    line += [ele for ele in yr]
    line += [ele for ele in cols]
    data.append([ele for ele in line if ele])
    
alldf=pd.DataFrame(data) #Clean up the df
alldf.columns=alldf.loc[0]
alldf=alldf.drop(0).dropna()
alldf.Season=alldf.Season.apply(lambda x:int(x[:4])+1) 
for i in range(5):
    alldf.columns.values[i+3]='Player'+str(i+1)
new_df = pd.DataFrame()
for index, row in alldf.iterrows():
    if row["Tm"] == "1st":
        all_nba_1st, all_nba_2nd = 1, 0
    if row["Tm"] == "2nd":
        all_nba_1st, all_nba_2nd = 0, 1
        
    for player in ["Player1","Player2","Player3","Player4","Player5"]:
        new_row = pd.DataFrame([row[player],row["Season"], all_nba_1st, all_nba_2nd])
        new_row = new_row.T
        if new_df.empty == True:
            new_df = new_row
        else:
            new_df=pd.concat([new_row,new_df])
new_df.columns = ["Player","Year", "all_defense_1st", "all_defense_2nd"]

#Fix Rows with Ties
ties=new_df[new_df["Player"].str.contains(",")]
new_df=new_df[~new_df["Player"].str.contains(",")]
for index, row in ties.iterrows():
    new_rows = pd.DataFrame([[row["Player"].split(',')[0],row["Year"], row["all_defense_1st"], row["all_defense_2nd"]],[row["Player"].split(', ')[1][:-3],row["Year"],row["all_defense_1st"],row["all_defense_2nd"]]],columns=new_df.columns)
    new_df=new_df.append(new_rows)
    
new_df=new_df.sort_values(by=["Year",'all_defense_1st'],ascending=False)
new_df.index=range(len(new_df))
new_df.to_csv(f"{directory}/all_defense.csv",index=False) # To CSV

In [None]:
#Scrape Rookie Draft classes
for year in range(1987,2019):
    data = []
    url = f"https://www.basketball-reference.com/draft/NBA_{year}.html"
    soup = get_response(url)
    table = soup.find('table', attrs={'id':'stats'})
    table_head = table.find('thead')
    hrow = table_head.find('tr')
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = [ele.text.strip() for ele in row.find_all('th')]
        # ignore rows that contain header information
        if cols[0] == 'Rk':
            continue
        cols += [ele.text.strip() for ele in row.find_all('td')]
        # data.append([ele for ele in cols if ele])
        # occasionally records representing pct are blank if num and denom are zero
        data.append([ele for ele in cols])
    fname = f"{directory}/rookie_class_{year+1}.csv"
    with open(fname, 'w') as file:
        wr = csv.writer(file)
        wr.writerows(data)

In [None]:
#Scraping top 2019 MVP candidates from Basketball Reference MVP Tracker https://www.basketball-reference.com/friv/mvp.html
url = f"https://www.basketball-reference.com/friv/mvp.html"
soup = get_response(url)
tables = soup.find_all('table')
for table in tables:
    tid = table.get('id')
    data = []
    table_head = table.find('thead')
    hrow = table_head.find('tr')
    hcols = hrow.find_all('th')
    hcols = [ele.text.strip() for ele in hcols]
    data.append([ele for ele in hcols if ele])
    table_body = table.find('tbody')
    rows = table_body.find_all('tr')
    for row in rows:
        cols = [ele.text.strip() for ele in row.find_all('th')]
        cols += [ele.text.strip() for ele in row.find_all('td')]
        data.append([ele for ele in cols if ele])
df=pd.DataFrame(data)
df.columns=df.iloc[0]
df=df.drop([0]).drop(columns=['Rk'])
df['Prob%'] = df['Prob%'].str.replace('%', '').astype(float)
cols = df.columns.drop(['Player','Tm'])
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
df["Rank_mvp"]=df.index
df["Share_mvp"]=df["Prob%"]/58 #convert the given probability to match share

df.to_csv(f"{directory}/mvp_voting_2019.csv") # to CSV

In [390]:
'''
NOTE: This section of code is heavily adapted from https://github.com/stevenrdungan/mvp

Merges all our scraped data together - MVP voting data, Per game data, Seed, Advanced stats, all NBA awards and honors, rookie data

I'll use this to make MVP (among other) predictions.  I will use leaders dataset to measure the quality of the leader variable.

'''

teams = {'Atlanta Hawks':'ATL',
'Boston Celtics':'BOS',
'Brooklyn Nets':'BRK',
'Charlotte Bobcats':'CHA',
'Charlotte Hornets':'CHO',
'Chicago Bulls':'CHI',
'Cleveland Cavaliers':'CLE',
'Dallas Mavericks':'DAL',
'Denver Nuggets':'DEN',
'Detroit Pistons':'DET',
'Golden State Warriors':'GSW',
'Houston Rockets':'HOU',
'Indiana Pacers':'IND',
'Los Angeles Clippers':'LAC',
'Los Angeles Lakers':'LAL',
'Memphis Grizzlies':'MEM',
'Miami Heat':'MIA',
'Milwaukee Bucks':'MIL',
'Minnesota Timberwolves':'MIN',
'New Jersey Nets':'NJN',
'New Orleans Hornets':'NOH',
'New Orleans Pelicans':'NOP',
'New OrleansOklahoma City Hornets':'NOK',
'New York Knicks':'NYK',
'Oklahoma City Thunder':'OKC',
'Orlando Magic':'ORL',
'Philadelphia 76ers':'PHI',
'Phoenix Suns':'PHO',
'Portland Trail Blazers':'POR',
'Sacramento Kings':'SAC',
'San Antonio Spurs':'SAS',
'Seattle SuperSonics':'SEA',
'Toronto Raptors':'TOR',
'Utah Jazz':'UTA',
'Vancouver Grizzlies':'VAN',
'Washington Bullets':'WSB',
'Washington Wizards':'WAS'}

pd.set_option('precision', 3)
data = pd.DataFrame()   # this will be our dataset
directory = os.path.join(os.getcwd(),'scrapedata')

for year in range(1988,2020):
    # read data into DataFrames
    pergame, advanced, voting, mvp_voting, roy_voting, dpoy_voting, smoy_voting, mip_voting, east, west, all_nba, all_rookie, all_defensive, is_rookie  = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame() 
    for filename in os.listdir(directory):
        if re.match(f"per_game_stats_{year}.csv", filename):
            pergame = pd.read_csv(os.path.join(directory,filename))
            if year == 2017:
                if 'PS/G' in pergame.columns:
                    pergame=pergame.rename(index=str, columns={"PS/G": "PTS"})
        elif re.match(f"advanced_stats_{year}.csv", filename):
            advanced = pd.read_csv(os.path.join(directory,filename))    
        elif re.match(f"mvp_voting_{year}.csv", filename):
            mvp_voting = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"roy_voting_{year}.csv", filename):
            roy_voting = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"dpoy_voting_{year}.csv", filename):
            dpoy_voting = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"smoy_voting_{year}.csv", filename):
            smoy_voting = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"mip_voting_{year}.csv", filename):
            mip_voting = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"[a-z]+s_standings_E_{year}.csv", filename):
            east = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"[a-z]+s_standings_W_{year}.csv", filename):
            west = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"all_nba.csv", filename):
            all_nba = pd.read_csv(os.path.join(directory,filename))
        elif re.match("all_rookie.csv", filename):
            all_rookie = pd.read_csv(os.path.join(directory,filename))
        elif re.match("all_defense.csv", filename):
            all_defense = pd.read_csv(os.path.join(directory,filename))
        elif re.match(f"rookie_class_{year}.csv", filename):
            is_rookie=pd.read_csv(os.path.join(directory,filename))

    # assemble stats dataframe
    pergame = pergame.loc[:,['Player','Age','Tm','G','MP','TRB','AST','STL','BLK','PTS',"FGA",'FG%','3PA','3P%','eFG%','FT%','TOV','PF']]
    advanced = advanced.loc[:,['Player','Age','Tm','PER','TS%','USG%','VORP','WS','TRB%','AST%','STL%','BLK%','TOV%','WS/48','BPM']]
    if pergame["G"].max() != 82:
                advanced["WS"]=advanced["WS"]*(82/pergame["G"].max())  #ADJUST WIN SHARE FOR FULL SEASON in lockout years
                advanced["VORP"]=advanced["VORP"]*(82/pergame["G"].max())  #ADJUST WIN SHARE FOR FULL SEASON in lockout years #2011 and 1999 and 2019
    stats = pd.merge(pergame, advanced, on=['Player','Age','Tm'], how='left')
    stats['Year'] = year
    # remove asterisk symbol from player name (Hall of Famers)
    stats['Player'].replace(to_replace=r'\*', value=r'', regex=True, inplace=True)
    # drop all duplicate rows (i.e. players who played on multiple teams in same season)
    stats = stats.drop_duplicates(subset=['Player','Age'], keep=False)
    # only keep rows for players playing 25 minutes per game or more
    stats = stats[stats.MP >= 25.0]
    # box is sum of rebounds, assists, steals, blocks
    stats['box'] = stats['TRB'] + stats['AST'] + stats['STL'] + stats['BLK']

    # assemble standings dataframe. sort 2017 playoff teams can be easily determined
    east = east.rename(columns = {'Eastern Conference':'Tm'}).sort_values('W/L%', ascending=False).reset_index(drop=True)
    west = west.rename(columns = {'Western Conference':'Tm'}).sort_values('W/L%', ascending=False).reset_index(drop=True)
    # this will remove the Division/Conference header lines
    standings = pd.concat([east, west]).dropna()
    standings = standings.loc[:,['Tm','W','L','W/L%']]
    standings['playoffs'] = standings['Tm'].str.contains('\*').astype(int)
    standings['seed'] = standings.index + 1

    if year == 2019:
        standings['playoffs'][standings.index < 8] = 1   # assume playoffs for top 8 teams in each conference
    standings['games'] = standings['W'] + standings ['L']
    standings['Tm'] = standings['Tm'].str.replace('[^\w\s]+','').str.replace('\d+\s*$','').str.strip()
    standings = standings.replace({'Tm':teams}, regex=True)

    if year < 2003:   # if year is < 2003 replace CHA with CHH. ugly but it works!
        standings['Tm'].replace('CHO','CHH', inplace=True)
    standings = standings.drop(['W','L'], axis=1)
    df_merge = pd.merge(stats, standings, on='Tm', how='left')
    
    #All-NBA, All-Rookie, All Star
    
    if not mvp_voting.empty: #Load MVP data
        mvp_voting['Tm'] = mvp_voting['Tm'].str.strip()
        mvp_voting = mvp_voting.loc[:,['Player','Tm','Share','Rank']]
        mvp_voting = mvp_voting.rename(columns={"Share":"Share_mvp","Rank":"Rank_mvp"})
        df_merge = pd.merge(df_merge, mvp_voting, on=['Player','Tm'], how='left')
        df_merge['got_votes_mvp'] = (df_merge['Share_mvp'] > 0).astype(int)
        df_merge['is_mvp'] = (df_merge['Rank_mvp'] == 1).astype(int)
        df_merge['Share_mvp'].fillna(0, inplace=True)
    else:
        df_merge['is_mvp'] = np.nan
        df_merge['got_votes_mvp'] = np.nan
        df_merge['Share_mvp']= np.nan
    
    if not roy_voting.empty: #Load ROY data
        roy_voting['Tm'] = roy_voting['Tm'].str.strip()
        roy_voting = roy_voting.loc[:,['Player','Tm','Share','Rank']]
        roy_voting = roy_voting.rename(columns={"Share":"Share_roy","Rank":"Rank_roy"})
        df_merge = pd.merge(df_merge, roy_voting, on=['Player','Tm'], how='left')
        df_merge['got_votes_roy'] = (df_merge['Share_roy'] > 0).astype(int)
        df_merge['is_roy'] = (df_merge['Rank_roy'] == 1).astype(int)
        df_merge['Share_roy'].fillna(0, inplace=True)
    else:
        df_merge['is_roy'] = np.nan
        df_merge['got_votes_roy'] = np.nan
        df_merge['Share_roy']= np.nan
        
    if not dpoy_voting.empty: #Load DPOY data
        dpoy_voting['Tm'] = dpoy_voting['Tm'].str.strip()
        dpoy_voting = dpoy_voting.loc[:,['Player','Tm','Share','Rank']]
        dpoy_voting = dpoy_voting.rename(columns={"Share":"Share_dpoy","Rank":"Rank_dpoy"})
        df_merge = pd.merge(df_merge, dpoy_voting, on=['Player','Tm'], how='left')
        df_merge['got_votes_dpoy'] = (df_merge['Share_dpoy'] > 0).astype(int)
        df_merge['is_dpoy'] = (df_merge['Rank_dpoy'] == 1).astype(int)
        df_merge['Share_dpoy'].fillna(0, inplace=True)
    else:
        df_merge['is_dpoy'] = np.nan
        df_merge['got_votes_dpoy'] = np.nan
        df_merge['Share_dpoy']= np.nan
        
    if not smoy_voting.empty: #Load SMOY data
        smoy_voting['Tm'] = smoy_voting['Tm'].str.strip()
        smoy_voting = smoy_voting.loc[:,['Player','Tm','Share','Rank']]
        smoy_voting = smoy_voting.rename(columns={"Share":"Share_smoy","Rank":"Rank_smoy"})
        df_merge = pd.merge(df_merge, smoy_voting, on=['Player','Tm'], how='left')
        df_merge['got_votes_smoy'] = (df_merge['Share_smoy'] > 0).astype(int)
        df_merge['is_smoy'] = (df_merge['Rank_smoy'] == 1).astype(int)
        df_merge['Share_smoy'].fillna(0, inplace=True)
    else:
        df_merge['is_smoy'] = np.nan
        df_merge['got_votes_smoy'] = np.nan
        df_merge['Share_smoy']= np.nan 
        
    if not mip_voting.empty: #Load MIP data
        mip_voting['Tm'] = mip_voting['Tm'].str.strip()
        mip_voting = mip_voting.loc[:,['Player','Tm','Share','Rank']]
        mip_voting = mip_voting.rename(columns={"Share":"Share_mip","Rank":"Rank_mip"})
        df_merge = pd.merge(df_merge, mip_voting, on=['Player','Tm'], how='left')
        df_merge['got_votes_mip'] = (df_merge['Share_mip'] > 0).astype(int)
        df_merge['is_mip'] = (df_merge['Rank_mip'] == 1).astype(int)
        df_merge['Share_mip'].fillna(0, inplace=True)
    else:
        df_merge['is_mip'] = np.nan
        df_merge['got_votes_mip'] = np.nan
        df_merge['Share_mip']= np.nan 
    
    if not all_nba.empty:
        df_merge = pd.merge(df_merge, all_nba, on=["Player","Year"],how='left')
        df_merge['all_nba_1st'].fillna(0, inplace=True)
        df_merge['all_nba_2nd'].fillna(0, inplace=True)
        df_merge['all_nba_3rd'].fillna(0, inplace=True)
    else:
        df_merge["all_nba_1st"] = np.nan
        df_merge["all_nba_2nd"] = np.nan
        df_merge["all_nba_3rd"] = np.nan
        
    if not all_rookie.empty:
        df_merge = pd.merge(df_merge, all_rookie, on=["Player","Year"],how='left')
        df_merge['all_rookie_1st'].fillna(0, inplace=True)
        df_merge['all_rookie_2nd'].fillna(0, inplace=True)
        
    else:
        df_merge["all_rookie_1st"] = np.nan
        df_merge["all_rookie_2nd"] = np.nan
        
    if not all_defense.empty:
        df_merge = pd.merge(df_merge, all_defense, on=["Player","Year"],how='left')
        df_merge['all_defense_1st'].fillna(0, inplace=True)
        df_merge['all_defense_2nd'].fillna(0, inplace=True)
    else:
        df_merge["all_defense_1st"] = np.nan
        df_merge["all_defense_2nd"] = np.nan
        
    if not is_rookie.empty:
        rook=pd.DataFrame()
        rook['Player']=is_rookie.index.get_level_values(3).drop("Round 2").drop("Round 3",errors='ignore').drop("Round 4",errors='ignore').drop("Round 5",errors='ignore').drop("Round 6",errors='ignore').drop("Round 7",errors='ignore').drop(np.nan,errors='ignore')
        rook['rookie_pick']=is_rookie.index.levels[0]
        rook['is_rookie']=1
        rook['Year']=year
        df_merge=pd.merge(df_merge,rook,on=["Player","Year"],how='left')
        df_merge["rookie_pick"].fillna(0,inplace=True)
        df_merge["is_rookie"].fillna(0,inplace=True)
    else:
        print ("NO ROOKIE DATA FOR ",year)
        df_merge["rookie_pick"]=np.nan
        df_merge["is_rookie"]=np.nan
    
    df_merge['gp_pct'] = df_merge['G'] / df_merge['games']
    df_merge = df_merge.drop(['G','games'], axis=1)
    if data.empty:
        data = df_merge
    else:
        data = pd.concat([data,df_merge])

data['Share_mvp'] = data['Share_mvp'].map(lambda x: '{0:.3}'.format(x))
data['Share_roy'] = data['Share_roy'].map(lambda x: '{0:.3}'.format(x))
data['Share_dpoy'] = data['Share_dpoy'].map(lambda x: '{0:.3}'.format(x))
data['Share_smoy'] = data['Share_smoy'].map(lambda x: '{0:.3}'.format(x))
data['Share_mip'] = data['Share_mip'].map(lambda x: '{0:.3}'.format(x))

data['Share_mvp']=data.Share_mvp.astype('float')
data['Share_roy']=data.Share_roy.astype('float')
data['Share_dpoy']=data.Share_dpoy.astype('float')
data['Share_smoy']=data.Share_smoy.astype('float')
data['Share_mip']=data.Share_mip.astype('float')

data=data.set_index('Player')

data['TS%'] = data['TS%'].map(lambda x: '{0:.3}'.format(x))
data['TS%']=data['TS%'].astype('float')
data['W/L%'] = data['W/L%'].map(lambda x: '{0:.3}'.format(x))
data['W/L%']=data['W/L%'].astype('float')
data['gp_pct'] = data['gp_pct'].map(lambda x: '{0:.3}'.format(x))
data['gp_pct']=data['gp_pct'].astype('float')

data = data.sort_values(["Share_mvp","Year"], ascending=False)

all_data=data[data.columns]

# output to csv in output folder
outdir = os.path.join(os.getcwd(),'output')
if not os.path.exists(outdir):
    print(f"Creating directory \'{outdir}\'")
    os.makedirs(outdir)
all_data.to_csv(outdir + '/full_dataframe.csv', float_format='%.3f')

all_data



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.





Unnamed: 0_level_0,3P%,3PA,AST,AST%,Age,BLK,BLK%,BPM,FG%,FGA,...,gp_pct,is_dpoy,is_mip,is_mvp,is_rookie,is_roy,is_smoy,playoffs,rookie_pick,seed
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Stephen Curry,0.454,11.2,6.7,33.7,27,0.2,0.4,12.5,0.504,20.2,...,0.963,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
LeBron James,0.406,3.3,7.3,36.4,28,0.9,1.9,11.6,0.565,17.8,...,0.927,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
Shaquille O'Neal,0.000,0.0,3.8,19.3,27,3.0,5.3,9.7,0.574,21.1,...,0.963,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
Kevin Garnett,0.256,0.5,5.0,24.4,27,2.2,4.0,9.9,0.499,19.6,...,1.000,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
Kevin Durant,0.391,6.1,5.5,26.7,25,0.7,1.5,8.8,0.503,20.8,...,0.988,0.0,0.0,1,0.0,0.0,0.0,1,0.0,2
Michael Jordan,0.427,3.2,4.3,21.2,32,0.5,1.0,8.6,0.495,22.6,...,1.000,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
LeBron James,0.333,5.1,8.6,41.8,25,1.0,2.0,12.5,0.503,20.1,...,0.927,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
Derrick Rose,0.332,4.8,7.7,38.7,22,0.6,1.3,5.9,0.445,19.7,...,0.988,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
LeBron James,0.344,4.7,7.2,38.0,24,1.1,2.4,13.0,0.489,19.9,...,0.988,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1
James Harden,0.367,10.0,8.8,45.1,28,0.7,1.7,10.9,0.449,20.1,...,0.878,0.0,0.0,1,0.0,0.0,0.0,1,0.0,1


In [86]:
"""
THIS DATASET ABOVE HAS 31 YEARS OF NBA PLAYER STATS
(CURRENTLY FOR ALL NBA PLAYERS WHO PLAYED AN AVERAGE OF 25 MINUTES PER GAME, ~ TOP 150 PLAYERS FOR MINS PLAYED PER GAME EACH SEASON)
THE STATS INCLUDED ARE BOTH PER-GAME STATS, AS IN HOW MANY POINTS OR ASSISTS THE PLAYER SCORED ON AVERAGE PER GAME,
AS WELL AS ADVANCED STATS LIKE WIN SHARE (WS) AND VORP (VALUE OVER REPLACEMENT).
ALSO INCLUDED ARE THE PLAYER'S TEAM STATS FOR THAT SEASON: WIN/LOSE % AND SEED.
FINALLY, WE HAVE 3 VARIABLES OF MVP DATA: IS_MVP (BINARY, 1 IF MVP), GOT_VOTES (1 IF GOT VOTES), AND SHARE (VOTE SHARE)

I WILL TRAIN VARIOUS COMBINATIONS OF PERGAME, ADVANCED, TEAM STATS TO PREDICT THE 3 MVP OUTCOME VARIABLES.

I'LL TAKE 2019 OUT OF THE TRAIN DATA BECAUSE THERE IS NO OUTCOME YET.

BEFORE WE START ANY REGRESSIONS A GOOD WAY TO KICK OFF IS BY LOOKING AT CORRELATIONS BETWEEN EACH VARIABLE & VOTE SHARE

I WILL ADD ALL-STAR SELECTION

WILL CONDUCT RDD ANALYSIS FOR ALL PLAYERS THAT GOT VOTES WITH IS_MVP AS THE CUTOFF VALUES

"""

Index(['Player', 'Age', 'Tm', 'PER', 'TS%', 'USG%', 'VORP', 'WS'], dtype='object')

In [391]:
# Subset df 2000-2019 which includes player leader data (can only access 2000-2019 with bball reference web scraper)

directory = os.path.join(os.getcwd(),'season_leaders')
leaders = [pd.read_csv(f'./season_leaders/season_leaders_{year}.csv') for year in range(2000,2020)]
year=2000
for df in leaders:
    df["year"]=year
    year+=1
leaders=pd.concat(leaders).set_index('player').rename(index=str, columns={"year": "Year"})
leaders.index=leaders.index.rename("Player")
leaders["pra"]=leaders["point_lead_count_wins"]+leaders["rebound_lead_count_wins"]+leaders["assist_lead_count_wins"]
leaders["total_pra"]=leaders["point_lead_count_total"]+leaders["rebound_lead_count_total"]+leaders["rebound_lead_count_total"]
leaders["total_prank"]=leaders["total_pra"].rank(ascending=False)
leaders["pra_rank"] = leaders["pra"].rank(ascending=False) 

all_data_2000 = all_data[all_data["Year"]>=2000]
all_data_2000 = all_data_2000.merge(leaders,how='inner',on=['Player','Year'])

# output to csv
outdir = os.path.join(os.getcwd(),'output')
if not os.path.exists(outdir):
    print(f"Creating directory \'{outdir}\'")
    os.makedirs(outdir)
all_data_2000.to_csv(outdir + '/full_df_leaderdata.csv', float_format='%.3f')

all_data_2000


Unnamed: 0_level_0,3P%,3PA,AST,AST%,Age,BLK,BLK%,BPM,FG%,FGA,...,point_lead_count_losses,rebound_lead_count_losses,assist_lead_count_losses,point_lead_count_total,rebound_lead_count_total,assist_lead_count_total,pra,total_pra,total_prank,pra_rank
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Stephen Curry,0.454,11.2,6.7,33.7,27,0.2,0.4,12.5,0.504,20.2,...,6.0,0.0,3.0,55.0,33.0,9.0,88.0,121.0,208.0,20.0
LeBron James,0.406,3.3,7.3,36.4,28,0.9,1.9,11.6,0.565,17.8,...,12.0,9.0,11.0,53.0,55.0,44.0,120.0,163.0,50.0,3.0
Shaquille O'Neal,0.000,0.0,3.8,19.3,27,3.0,5.3,9.7,0.574,21.1,...,9.0,10.0,3.0,59.0,22.0,70.0,129.0,103.0,326.0,1.0
Kevin Garnett,0.256,0.5,5.0,24.4,27,2.2,4.0,9.9,0.499,19.6,...,17.0,21.0,10.0,50.0,26.0,74.0,102.0,102.0,334.5,11.5
Kevin Durant,0.391,6.1,5.5,26.7,25,0.7,1.5,8.8,0.503,20.8,...,19.0,6.0,10.0,69.0,34.0,22.0,90.0,137.0,119.0,16.5
LeBron James,0.333,5.1,8.6,41.8,25,1.0,2.0,12.5,0.503,20.1,...,16.0,6.0,14.0,68.0,64.0,23.0,119.0,196.0,7.0,4.0
Derrick Rose,0.332,4.8,7.7,38.7,22,0.6,1.3,5.9,0.445,19.7,...,12.0,0.0,16.0,54.0,67.0,2.0,95.0,188.0,14.5,14.5
LeBron James,0.344,4.7,7.2,38.0,24,1.1,2.4,13.0,0.489,19.9,...,11.0,5.0,11.0,63.0,59.0,26.0,121.0,181.0,22.0,2.0
James Harden,0.367,10.0,8.8,45.1,28,0.7,1.7,10.9,0.449,20.1,...,10.0,0.0,12.0,54.0,51.0,7.0,90.0,156.0,66.5,16.5
Stephen Curry,0.443,8.1,7.7,38.6,26,0.2,0.5,9.9,0.487,16.8,...,6.0,0.0,12.0,42.0,64.0,1.0,89.0,170.0,39.0,18.5
