In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import requests
from statsmodels.formula.api import ols
import statsmodels.api as sm
import itertools
import re
from bs4 import BeautifulSoup
from urllib.request import urlopen
# This imports the client
from basketball_reference_web_scraper import client
warnings.filterwarnings("ignore")

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))


In [119]:
############### Functions##################

def make_soup(url):
    """Download ``url`` and parse it into a BeautifulSoup tree.

    The HTML comment delimiters (``<!--`` and ``-->``) are stripped from the
    page text before parsing, so markup that the page wraps in comments
    becomes part of the parsed tree and can be found with ``find_all``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    bs4.BeautifulSoup
        The page parsed with the ``lxml`` parser.

    Raises
    ------
    urllib.error.URLError
        If the page cannot be fetched.
    UnicodeDecodeError
        If the response body is not valid UTF-8.
    """
    comment_delimiters = re.compile("<!--|-->")
    # Use a context manager so the HTTP response is closed deterministically,
    # even if reading or decoding fails.
    with urlopen(url) as page:
        html = page.read().decode('utf-8')
    soupdata = BeautifulSoup(comment_delimiters.sub("", html), 'lxml')
    return soupdata

def advanced_data(start_year, end_year):
    """Collect advanced season totals for every season in a year range.

    One request is made through the basketball-reference client per season
    ending in ``start_year`` .. ``end_year`` (both inclusive). Each season's
    rows are tagged with a ``year`` column and all seasons are stacked into
    one frame.

    Returns
    -------
    tuple
        ``(stats, slugs)`` where ``stats`` is the stacked DataFrame and
        ``slugs`` is the array of unique values of its ``slug`` column.
    """
    season_frames = [
        pd.DataFrame(
            client.players_advanced_season_totals(season_end_year=season)
        ).assign(year=season)
        for season in range(start_year, end_year + 1)
    ]
    stacked = pd.concat(season_frames, ignore_index=True)
    unique_slugs = stacked['slug'].unique()

    return stacked, unique_slugs

def player_salaries(name_list):
    """Scrape per-season salary data for each player slug in ``name_list``.

    For every slug the player's basketball-reference page is downloaded, the
    table(s) with id ``all_salaries`` are located, and the season, team and
    salary values are collected into a DataFrame with the columns
    ``slug``, ``year``, ``team`` and ``salary``.

    Any player for which a stage fails (download, table parsing, record
    building, DataFrame construction) is skipped after printing a short
    message; the remaining players are still processed.

    Parameters
    ----------
    name_list : iterable of str
        Player slugs as used in basketball-reference player URLs.

    Returns
    -------
    list of pandas.DataFrame
        One frame per successfully scraped player (empty list for empty input).
    """
    dfs = []
    name_count = 0  # number of players scraped successfully so far
    # Iterate over the argument itself; the previous implementation looped
    # over a global called `names` and silently ignored `name_list`.
    for name in name_list:
        salary_dict = {'slug': [], 'year': [], 'team': [], 'salary': []}
        try:
            url = f'https://www.basketball-reference.com/players/{name[0]}/{name}.html'
            # A single download via make_soup is enough. The former extra
            # requests.get(..., verify=False) call fetched the page a second
            # time, discarded the response and disabled TLS verification.
            soup = make_soup(url)
            tableStats = soup.find_all('table', {'id':'all_salaries'})
        except Exception:
            print(f'could not make soup for {name}')
            continue

        try:
            # For each matching table: first the non-empty <td> texts, then
            # the non-empty <th> texts, each appended as one flat list.
            res = []
            for table in tableStats:
                cell_texts = [cell.text.strip() for cell in table.find_all('td') if cell.text.strip()]
                if cell_texts:
                    res.append(cell_texts)
                header_texts = [cell.text.strip() for cell in table.find_all('th') if cell.text.strip()]
                if header_texts:
                    res.append(header_texts)
        except Exception:
            print(f'could not parse table for {name}')
            continue

        try:
            yrs = len(res[1])

            # Header entries 4 .. yrs-2 are used as season labels.
            # NOTE(review): presumably the first four headers are column
            # titles and the last one is a career-total row - confirm
            # against the live page layout.
            for i in range(4, yrs - 1):
                salary_dict['slug'].append(name)
                salary_dict['year'].append(res[1][i])

            # Data cells are consumed in groups of three per season: the team
            # is at offset 0 and the salary at offset 2 of each group.
            count = 0
            for i in range(0, yrs - 5):
                salary_dict['team'].append(res[0][0 + count])
                salary = int(res[0][2 + count].strip('$').replace(',', ''))
                salary_dict['salary'].append(salary)
                count += 3
        except Exception:
            print(f'could not create the dict for {name}')
            continue

        try:
            df = pd.DataFrame(salary_dict)
            dfs.append(df)
        except Exception:
            print(f'could not convert {name} to dataframe')
            continue

        # Lightweight progress indicator: one line per 100 successful players.
        if name_count % 100 == 0:
            print(name_count)
        name_count += 1
    return dfs

def positions(position_list, sort=True):
    """Extract the bare position name from each raw position value.

    Every element is converted with ``str()`` and searched with the pattern
    ``'<Position.(.*): '``; the captured group is the cleaned value.

    Parameters
    ----------
    position_list : iterable
        Raw position values (anything whose ``str()`` contains text such as
        ``<Position.CENTER: ...``).
    sort : bool, default True
        When True (the historical behaviour) the cleaned values are returned
        in alphabetical order. Sorted output no longer lines up with the rows
        the values came from, so pass ``sort=False`` when the result is
        written back as a column of the originating DataFrame.

    Returns
    -------
    pandas.DataFrame
        Single column ``positions`` holding the cleaned values.

    Raises
    ------
    ValueError
        If an element does not match the expected pattern.
    """
    cleaned = []
    for raw in position_list:
        text = str(raw)
        m = re.search('<Position.(.*): ', text)
        if m is None:
            # Fail with the offending value instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(f'could not parse a position from {text!r}')
        cleaned.append(m.group(1))
    if sort:
        cleaned = sorted(cleaned)
    position_list = pd.DataFrame({'positions': cleaned})

    return position_list

def clean_names(team_name_list, sort=True):
    """Strip the ``Team.`` prefix from a collection of team names.

    Every element is converted with ``str()`` and searched with the pattern
    ``'Team.(.*)'``; the captured group is the cleaned name. If any element
    does not match, the whole input is assumed to be clean already and is
    used unchanged.

    Parameters
    ----------
    team_name_list : iterable
        Raw team values, or names that are already clean.
    sort : bool, default True
        When True (the historical behaviour) the names are returned in
        alphabetical order. Sorted output no longer lines up with the rows
        the values came from, so pass ``sort=False`` when the result is
        written back as a column of the originating DataFrame.

    Returns
    -------
    pandas.DataFrame
        Single column ``team_df`` holding the names.
    """
    raw_names = list(team_name_list)
    matches = [re.search('Team.(.*)', str(raw)) for raw in raw_names]
    # An explicit check replaces the former bare `except:`, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if all(m is not None for m in matches):
        names = [m.group(1) for m in matches]
    else:
        names = raw_names
    if sort:
        names = sorted(names)
    team_list = pd.DataFrame({'team_df': names})

    return team_list

def clean_salary_names(team_name_list):
    """Return the alphabetically sorted team names without four legacy franchises.

    The input names are sorted, the hard-coded legacy names listed below are
    dropped, and the survivors are returned in a single-column DataFrame
    (``sal_team``) with a fresh 0-based index.
    """
    legacy_names = ['New Orleans/Oklahoma City Hornets', 'Seattle SuperSonics', 'Vancouver Grizzlies','Washington Bullets']
    ordered = pd.DataFrame({'sal_team': sorted(team_name_list)})
    is_legacy = ordered['sal_team'].isin(legacy_names)
    return ordered.loc[~is_legacy].reset_index(drop=True)

def experience(df, name_list):
    """Add ``min_year`` and ``exp`` columns for each player in ``name_list``.

    For every slug, ``min_year`` is one less than the smallest ``year`` that
    slug has in ``df`` and ``exp`` is ``year - min_year``. Experience is
    therefore counted from the first season present in ``df`` (starting at
    1), not from seasons that are absent from the frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``slug`` and ``year``. It is not modified.
    name_list : iterable
        Slugs to process. Rows are returned grouped in this order; slugs with
        no rows in ``df`` are skipped.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` for the requested slugs with the two extra columns and
        a fresh 0-based index. Empty (but with both extra columns) when no
        requested slug is present.
    """
    exp_dfs = []
    for name in name_list:
        # Work on an explicit copy so the new columns never touch `df`.
        subset = df.loc[df['slug'] == name].copy()
        if subset.empty:
            # min() of an empty selection would raise; nothing to compute.
            continue
        subset['min_year'] = subset['year'].min() - 1
        subset['exp'] = subset['year'] - subset['min_year']
        exp_dfs.append(subset)
    if not exp_dfs:
        # pd.concat refuses an empty list; return an empty frame that still
        # has the columns callers expect.
        return df.iloc[:0].assign(min_year=pd.Series(dtype='int64'),
                                  exp=pd.Series(dtype='int64'))
    exp_df = pd.concat(exp_dfs, ignore_index=True)
    return exp_df

def linear_regression_combos(fixed_y, fixed_X, variables_to_test, categorical_variables, dataframe, max_combo_size=5):
    """Fit OLS models for many combinations of candidate regressors.

    Every combination of 0 .. ``max_combo_size`` entries of
    ``variables_to_test`` is extended with ``fixed_X``. For each combination
    that contains at least one categorical variable, the categoricals are
    dummy-encoded (first level dropped), a constant is added and
    ``dataframe[fixed_y]`` is regressed on the result.

    Parameters
    ----------
    fixed_y : str
        Name of the response column in ``dataframe``.
    fixed_X : sequence of str
        Columns included in every model.
    variables_to_test : sequence of str
        Candidate columns that are combined.
    categorical_variables : collection of str
        Columns that must be dummy-encoded.
    dataframe : pandas.DataFrame
        Data holding all referenced columns.
    max_combo_size : int, default 5
        Largest number of candidate columns in one combination.

    Returns
    -------
    tuple
        ``(results, combos_with_fixed)``: ``results`` is a DataFrame with the
        columns ``combos``, ``R2``, ``R2_adj`` and ``RMSE`` (rounded to three
        decimals) sorted by ascending RMSE; ``combos_with_fixed`` is the list
        of every generated combination, including those that were not fitted.
    """
    fixed_X = list(fixed_X)
    combos_with_fixed = [
        list(subset) + fixed_X
        for length in range(0, max_combo_size + 1)
        for subset in itertools.combinations(variables_to_test, length)
    ]

    ##### Regression #####
    y = dataframe[fixed_y]
    # Use the number of observations of the data actually passed in; the
    # previous code read len(data['log_sal']) from a notebook global.
    n_obs = len(y)

    results = {'combos': [], 'R2': [], 'R2_adj': [], 'RMSE': []}
    for combo in combos_with_fixed:
        cat = [var for var in combo if var in categorical_variables]
        if not cat:
            # NOTE(review): combinations without any categorical variable are
            # skipped and never fitted. This is preserved as-is - confirm
            # that it is intentional.
            continue

        X = pd.get_dummies(dataframe[combo], columns=cat, drop_first=True)
        # Cast to float so boolean dummy columns (as produced by newer pandas
        # versions) do not turn the design matrix into an object array.
        X = sm.add_constant(X.values.astype(float))
        res = sm.OLS(y, X).fit()
        rmse = np.sqrt(np.sum(np.square(res.resid)) / n_obs)

        results['combos'].append(combo)
        results['R2'].append(np.round(res.rsquared, 3))
        results['R2_adj'].append(np.round(res.rsquared_adj, 3))
        results['RMSE'].append(np.round(rmse, 3))

    df = pd.DataFrame(results).sort_values(by='RMSE', ascending=True)
    return df, combos_with_fixed

In [110]:
#################### Misc stuff ################
# Lookup table from a season label ('1999-00' ... '2019-20') to the year in
# which that season ends (2000 ... 2020).
years_df = pd.DataFrame({
    'year_range': [f'{end - 1}-{end % 100:02d}' for end in range(2000, 2021)],
    'year': list(range(2000, 2021)),
})

# All-star table; the file's own header row is replaced by explicit column names.
all_stars = pd.read_csv(
    'nba_allstars_10-18.csv',
    header=0,
    names=['player', 'allstar', 'year'],
    engine='python',
)

In [7]:
adv_stats, names = advanced_data(2010,2018)

#Clean position info
# Parse every row's own value with a vectorised regex so that each cleaned
# position stays on the row of the player it belongs to. (Assigning an
# alphabetically sorted list of positions back to the column would give
# players positions that belong to other rows.)
position_list = list(adv_stats['positions'])
adv_stats['positions'] = adv_stats['positions'].map(str).str.extract('<Position.(.*): ', expand=False)
assert adv_stats['positions'].notna().all(), 'unparsed position value(s) in adv_stats'

#clean team info
# Same row-wise approach for the team column.
teams= list(adv_stats['team'])
adv_stats['team'] = adv_stats['team'].map(str).str.extract('Team.(.*)', expand=False)
assert adv_stats['team'].notna().all(), 'unparsed team value(s) in adv_stats'

#Create the join table for team info
adv_stats_team_names = list(adv_stats['team'].unique())
adv_team_names_cleaned = clean_names(adv_stats_team_names)

# Get unique team names from salary data df
salaries = player_salaries(names)
salary_df = pd.concat(salaries, ignore_index=True)
sal_team_names = salary_df['team'].unique()
sal_team_names_cleaned = clean_salary_names(sal_team_names)

# add columns together
# NOTE(review): the two name lists are paired purely by their alphabetical
# position; this only lines up if both contain the same franchises in the
# same sort order - verify before relying on this table.
adv_team_names_cleaned['sal_teams'] = sal_team_names_cleaned

#Create join dataframe
# Translate the scraped season label (e.g. '2010-11') into its ending year.
salary_df = salary_df.merge(years_df,how='inner',left_on='year',right_on='year_range')
salary_df = salary_df[['slug', 'year_y','team', 'salary']]
salary_df = salary_df.dropna()
salary_df['year'] = salary_df['year_y'].astype(int)

#Get experience
adv_stats_exp = experience(adv_stats, names)

#Add allstar data
# fillna(0) is applied to the whole merged frame, not only to `allstar`:
# rows without an all-star record get 0 in `player` as well, which keeps them
# from being removed by the dropna() in the final merge below.
adv_stats_exp = adv_stats_exp.merge(all_stars,how='left', left_on=['name','year'], right_on=['player','year']).fillna(0)
adv_stats_exp['allstar'] = adv_stats_exp['allstar'].astype(int)

#Merge into final df
# Left merge followed by dropna(): player-seasons without a salary are dropped.
adv_stats_final = adv_stats_exp.merge(salary_df[['slug','year','salary']], how='left', left_on=['slug','year'], right_on=['slug','year']).dropna()

#data df
data = adv_stats_final.drop(columns=['slug','name','year','player','min_year'])
data['log_sal'] = np.log(data['salary'])
data['exp^2']= np.square(data['exp'])

In [121]:
# Candidate regressors, in the order in which combinations are generated.
variables_to_test = [
    'positions',
    'age',
    'team',
    'games_played',
    'minutes_played',
    'player_efficiency_rating',
    'true_shooting_percentage',
    'three_point_attempt_rate',
    'free_throw_attempt_rate',
    'offensive_rebound_percentage',
    'defensive_rebound_percentage',
    'total_rebound_percentage',
    'assist_percentage',
    'steal_percentage',
    'block_percentage',
    'turnover_percentage',
    'usage_percentage',
    'offensive_win_shares',
    'defensive_win_shares',
    'win_shares',
    'win_shares_per_48_minutes',
    'offensive_box_plus_minus',
    'defensive_box_plus_minus',
    'box_plus_minus',
    'value_over_replacement_player',
    'allstar',
]

# NOTE(review): `fixed_y` and `fixed_X` are not assigned in any visible cell;
# they must already exist in the kernel for this cell to run.
results, combos = linear_regression_combos(
    fixed_y=fixed_y,
    fixed_X=fixed_X,
    variables_to_test=variables_to_test,
    categorical_variables=['positions', 'team'],
    dataframe=data,
)

"\nresults = {}\nresults['combos'] = []\nresults['R2'] = []\nresults['R2_adj'] = []\nresults['RMSE'] = []\ncount = 0\ncategorical_variables=['positions','team']\ndataframe=data\nfor combo in combos_with_fixed:\n\n    X = dataframe[combo]\n\n    cat =[]\n    for i in combo:\n        if i in categorical_variables:\n            cat.append(i)\n\n    if len(cat) >=1:\n        X = pd.get_dummies(X, columns=cat, drop_first=True)\n\n\n    X = sm.add_constant(X.values)\n    res = sm.OLS(y,X).fit()\n    mse = np.divide(np.sum(np.square(res.resid)),len(data['log_sal']))\n    rmse = np.sqrt(mse)\n"