# Position Estimates by Year

After manually reviewing some of the positions that were listed as "Primary Position", I realized they were mistaken in some (if not many) places. Thus, I need to scrape positions by year and percentage played (through play-by-play analysis).

This is quite easy through basketball-reference.com. 

Note: This data also includes :
- +- per 100 possessions, 
- BRef's Positions (total, not est), and 
- A row for each team a player played on during said season (i.e., a way to tell if a player is traded, etc)
    - Further, if a player plays on two teams in a year, they will also have a "TOT" column with their aggregate statistics



In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mtick
import sqlite3
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import requests   
import shutil      
import datetime
from scipy.stats import norm
import os
import winsound
import warnings
warnings.filterwarnings('ignore')

In [55]:
home_folder = 'C:\\Users\\Travis\\OneDrive\\Data Science\\Personal_Projects\\Sports\\NBA_Prediction_V3_1'
os.chdir(home_folder)

In [56]:
years = np.arange(2000,2024,1)

In [57]:
position_files = os.listdir('data/player/play_by_play/')

to_download = []
for year in years:
    for file in position_files:
        if str(year) in file:
            to_download.append(file)

to_download


['2000position_estimates.csv',
 '2001position_estimates.csv',
 '2002position_estimates.csv',
 '2003position_estimates.csv',
 '2004position_estimates.csv',
 '2005position_estimates.csv',
 '2006position_estimates.csv',
 '2007position_estimates.csv',
 '2008position_estimates.csv',
 '2009position_estimates.csv',
 '2010position_estimates.csv',
 '2011position_estimates.csv',
 '2012position_estimates.csv',
 '2013position_estimates.csv',
 '2014position_estimates.csv',
 '2015position_estimates.csv',
 '2016position_estimates.csv',
 '2017position_estimates.csv',
 '2018position_estimates.csv',
 '2019position_estimates.csv',
 '2020position_estimates.csv',
 '2021position_estimates.csv',
 '2022position_estimates.csv']

In [58]:
# check to_download files against position_files to see if any are in one but not the other
left_to_download = []
for file in to_download:
    if file not in position_files:
        left_to_download.append(file)

left_to_download

[]

In [59]:
if left_to_download == []:
    print('All files downloaded')
else:
    print('Files to download:',left_to_download)
    for year in years:
        df = pd.read_html('https://www.basketball-reference.com/leagues/NBA_'+str(year)+'_play-by-play.html')
        df = df[0]
        yar = year-1
        df['season'] = yar
        df.to_csv('data/player/play_by_play/'+str(yar)+'position_estimates.csv')

All files downloaded


In [60]:
appended_data = []

files = os.listdir('data/player/play_by_play/')
for file in files:
    df = pd.read_csv('data/player/play_by_play/'+file)[:]
    appended_data.append(df)

df = pd.concat(appended_data)
df.to_csv('data/player/aggregates/all_position_estimates.csv')

In [61]:
df = df.rename(columns={'Unnamed: 0':'na', 'Unnamed: 0_level_0' : 'rank', 'Unnamed: 1_level_0': 'player',
                        'Unnamed: 2_level_0': 'position', 'Unnamed: 3_level_0': 'age', 'Unnamed: 4_level_0': 'team',
                        'Totals': 'G', 'Totals.1': 'MP', 'Position Estimate': 'PG_est_%', 'Position Estimate.1': 'SG_est_%',
                        'Position Estimate.2': 'SF_est_%', 'Position Estimate.3': 'PF_est_%', 'Position Estimate.4': 'C_est_%',
                        })

df = df.rename(columns={'+/- Per 100 Poss.':'per100poss_+/-_ON_court', '+/- Per 100 Poss.':'per100poss_+/-_OFF_court'})

df = df.rename(columns={'Turnovers':'BadPass', 'Turnovers.1':'LostBall'})

df.columns

Index(['na', 'rank', 'player', 'position', 'age', 'team', 'G', 'MP',
       'PG_est_%', 'SG_est_%', 'SF_est_%', 'PF_est_%', 'C_est_%',
       'per100poss_+/-_OFF_court', '+/- Per 100 Poss..1', 'BadPass',
       'LostBall', 'Fouls Committed', 'Fouls Committed.1', 'Fouls Drawn',
       'Fouls Drawn.1', 'Misc.', 'Misc..1', 'Misc..2', 'season'],
      dtype='object')

In [62]:
# drop all unnamed cols
unnamed = df.columns[df.columns.str.contains('Unnamed')]
df = df.drop(columns=unnamed)

# drop na and rank if they are in the df
if 'na' in df.columns:
    to_drop = ['na']
    df = df.drop(columns=to_drop)
if 'rank' in df.columns:
    to_drop = ['rank']
    df = df.drop(columns=to_drop)

# drop na in season
df = df.dropna(subset = 'season')

# season to int
df['season'] = df['season'].astype(int)

In [63]:
# fix the % values
df['PG_est_%'] = df['PG_est_%'].str.replace('%', '')
df['SG_est_%'] = df['SG_est_%'].str.replace('%', '')
df['SF_est_%'] = df['SF_est_%'].str.replace('%', '')
df['PF_est_%'] = df['PF_est_%'].str.replace('%', '')
df['C_est_%'] = df['C_est_%'].str.replace('%', '')
df.head()

Unnamed: 0,player,position,age,team,G,MP,PG_est_%,SG_est_%,SF_est_%,PF_est_%,...,BadPass,LostBall,Fouls Committed,Fouls Committed.1,Fouls Drawn,Fouls Drawn.1,Misc.,Misc..1,Misc..2,season
1,Tariq Abdul-Wahad,SG,25,TOT,61,1578,1.0,96.0,3.0,,...,44,22,51,21,77,9,216,13,32,1999
2,Tariq Abdul-Wahad,SG,25,ORL,46,1205,,97.0,3.0,,...,36,18,39,17,59,6,158,11,24,1999
3,Tariq Abdul-Wahad,SG,25,DEN,15,373,4.0,93.0,3.0,,...,8,4,12,4,18,3,58,2,8,1999
4,Shareef Abdur-Rahim,SF,23,VAN,82,3223,,,63.0,35.0,...,85,71,102,51,212,17,597,42,138,1999
5,Cory Alexander,PG,26,DEN,29,329,97.0,3.0,,,...,16,6,14,2,4,2,127,0,11,1999


In [64]:
df['PG_est_%'] = df['PG_est_%'].fillna(0)
df['SG_est_%'] = df['SG_est_%'].fillna(0)
df['SF_est_%'] = df['SF_est_%'].fillna(0)
df['PF_est_%'] = df['PF_est_%'].fillna(0)
df['C_est_%'] = df['C_est_%'].fillna(0)
df.head(2)

Unnamed: 0,player,position,age,team,G,MP,PG_est_%,SG_est_%,SF_est_%,PF_est_%,...,BadPass,LostBall,Fouls Committed,Fouls Committed.1,Fouls Drawn,Fouls Drawn.1,Misc.,Misc..1,Misc..2,season
1,Tariq Abdul-Wahad,SG,25,TOT,61,1578,1,96,3,0,...,44,22,51,21,77,9,216,13,32,1999
2,Tariq Abdul-Wahad,SG,25,ORL,46,1205,0,97,3,0,...,36,18,39,17,59,6,158,11,24,1999


In [65]:
df['PG_est_%'] = df['PG_est_%'].fillna(0)
df['SG_est_%'] = df['SG_est_%'].fillna(0)
df['SF_est_%'] = df['SF_est_%'].fillna(0)
df['PF_est_%'] = df['PF_est_%'].fillna(0)
df['C_est_%'] = df['C_est_%'].fillna(0)
df.head(2)

Unnamed: 0,player,position,age,team,G,MP,PG_est_%,SG_est_%,SF_est_%,PF_est_%,...,BadPass,LostBall,Fouls Committed,Fouls Committed.1,Fouls Drawn,Fouls Drawn.1,Misc.,Misc..1,Misc..2,season
1,Tariq Abdul-Wahad,SG,25,TOT,61,1578,1,96,3,0,...,44,22,51,21,77,9,216,13,32,1999
2,Tariq Abdul-Wahad,SG,25,ORL,46,1205,0,97,3,0,...,36,18,39,17,59,6,158,11,24,1999


In [68]:
df.to_csv('data/player/aggregates_of_aggregates/all_position_estimates.csv')