In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options


In [2]:
def set_chrome_options() -> None:
    """Sets chrome options for Selenium.
    Chrome options for headless browser is enabled.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options


In [3]:
chrome_options = set_chrome_options()
driver = webdriver.Chrome(options=chrome_options)


In [4]:
url = 'https://www.basketball-reference.com/players/j/jamesle01.html'
driver.get(url)


In [5]:
meta = driver.find_element(By.ID, 'meta')
print(meta.text)


LeBron James
LeBron Raymone James ▪ Twitter: KingJames ▪ Instagram: kingjames
(King James, LBJ, Chosen One, Bron-Bron, The Little Emperor, The Akron Hammer, L-Train)
Position: Small Forward, Power Forward, Point Guard, and Shooting Guard ▪ Shoots: Right
6-9, 250lb (206cm, 113kg)
Born: December 30, 1984 (Age: 36-361d) in Akron, Ohio us
High School: Saint Vincent-Saint Mary in Akron, Ohio
More bio, uniform, draft, salary info


In [6]:
url = 'https://www.basketball-reference.com/players/j/jamesle01.html#all_totals-playoffs_totals'
driver.get(url)


In [50]:
totals = driver.find_element(By.ID, 'totals')

totals_tbl = totals.text

totals_tbl


In [14]:
from io import StringIO
import pandas as pd

In [15]:
TESTDATA = StringIO(totals_tbl)
TESTDATA

<_io.StringIO at 0x7f770cf040d0>

In [16]:
df = pd.read_csv(TESTDATA, sep=" ")

In [17]:
df

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,AST,STL,BLK,TOV,PF,PTS,Unnamed: 30,Unnamed: 31,Trp,Dbl
0,2003-04,19,CLE,NBA,SG,79,79,3122.0,622,1492.0,...,465,130,58,273,149.0,1654.0,0.0,,,
1,2004-05,20,CLE,NBA,SF,80,80,3388.0,795,1684.0,...,577,177,52,262,146.0,2175.0,4.0,,,
2,2005-06,21,CLE,NBA,SF,79,79,3361.0,875,1823.0,...,521,123,66,260,181.0,2478.0,5.0,,,
3,2006-07,22,CLE,NBA,SF,78,78,3190.0,772,1621.0,...,470,125,55,250,171.0,2132.0,1.0,,,
4,2007-08,23,CLE,NBA,SF,75,74,3027.0,794,1642.0,...,539,138,81,255,165.0,2250.0,7.0,,,
5,2008-09,24,CLE,NBA,SF,81,81,3054.0,789,1613.0,...,587,137,93,241,139.0,2304.0,7.0,,,
6,2009-10,25,CLE,NBA,SF,76,76,2966.0,768,1528.0,...,651,125,77,261,119.0,2258.0,4.0,,,
7,2010-11,26,MIA,NBA,SF,79,79,3063.0,758,1485.0,...,554,124,50,284,163.0,2111.0,4.0,,,
8,2011-12,27,MIA,NBA,SF,62,62,2326.0,621,1169.0,...,387,115,50,213,96.0,1683.0,0.0,,,
9,2012-13,28,MIA,NBA,PF,76,76,2877.0,765,1354.0,...,551,129,67,226,110.0,2036.0,4.0,,,


In [18]:
patternDel = "\d{4}-\d{2}"
seasons_df = df[df['Season'].str.contains(patternDel)]


In [19]:
seasons_df['tripple_doubles'] = seasons_df['Unnamed: 30']

seasons_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seasons_df['tripple_doubles'] = seasons_df['Unnamed: 30']


Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,STL,BLK,TOV,PF,PTS,Unnamed: 30,Unnamed: 31,Trp,Dbl,tripple_doubles
0,2003-04,19,CLE,NBA,SG,79,79,3122.0,622,1492.0,...,130,58,273,149.0,1654.0,0.0,,,,0.0
1,2004-05,20,CLE,NBA,SF,80,80,3388.0,795,1684.0,...,177,52,262,146.0,2175.0,4.0,,,,4.0
2,2005-06,21,CLE,NBA,SF,79,79,3361.0,875,1823.0,...,123,66,260,181.0,2478.0,5.0,,,,5.0
3,2006-07,22,CLE,NBA,SF,78,78,3190.0,772,1621.0,...,125,55,250,171.0,2132.0,1.0,,,,1.0
4,2007-08,23,CLE,NBA,SF,75,74,3027.0,794,1642.0,...,138,81,255,165.0,2250.0,7.0,,,,7.0
5,2008-09,24,CLE,NBA,SF,81,81,3054.0,789,1613.0,...,137,93,241,139.0,2304.0,7.0,,,,7.0
6,2009-10,25,CLE,NBA,SF,76,76,2966.0,768,1528.0,...,125,77,261,119.0,2258.0,4.0,,,,4.0
7,2010-11,26,MIA,NBA,SF,79,79,3063.0,758,1485.0,...,124,50,284,163.0,2111.0,4.0,,,,4.0
8,2011-12,27,MIA,NBA,SF,62,62,2326.0,621,1169.0,...,115,50,213,96.0,1683.0,0.0,,,,0.0
9,2012-13,28,MIA,NBA,PF,76,76,2877.0,765,1354.0,...,129,67,226,110.0,2036.0,4.0,,,,4.0


In [20]:
drop_cols = ['Unnamed: 30', 'Unnamed: 31', 'Trp', 'Dbl']

seasons_df.drop(axis=1, labels = drop_cols, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [21]:
seasons_df.describe().columns

Index(['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS', 'tripple_doubles'],
      dtype='object')

In [40]:
x

In [22]:
def create_accumulator_column(df, col):
    df[f'{col}-accum'] = df[col].cumsum()


In [41]:
seasons_df2 = seasons_df.copy()

In [23]:
accum_cols = seasons_df.describe().columns

for col in accum_cols:
    create_accumulator_column(seasons_df, col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}-accum'] = df[col].cumsum()


In [24]:
seasons_df.columns

Index(['Season', 'Age', 'Tm', 'Lg', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'tripple_doubles', 'G-accum', 'GS-accum', 'MP-accum', 'FG-accum',
       'FGA-accum', 'FG%-accum', '3P-accum', '3PA-accum', '3P%-accum',
       '2P-accum', '2PA-accum', '2P%-accum', 'eFG%-accum', 'FT-accum',
       'FTA-accum', 'FT%-accum', 'ORB-accum', 'DRB-accum', 'TRB-accum',
       'AST-accum', 'STL-accum', 'BLK-accum', 'TOV-accum', 'PF-accum',
       'PTS-accum', 'tripple_doubles-accum'],
      dtype='object')

In [25]:
def create_total_column(df, col):
    df[f'{col}-total'] = df[col].sum()

In [26]:
# accum_cols = seasons_df.describe().columns

for col in accum_cols:
    create_total_column(seasons_df, col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}-total'] = df[col].sum()


In [27]:
seasons_df

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,ORB-total,DRB-total,TRB-total,AST-total,STL-total,BLK-total,TOV-total,PF-total,PTS-total,tripple_doubles-total
0,2003-04,19,CLE,NBA,SG,79,79,3122.0,622,1492.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
1,2004-05,20,CLE,NBA,SF,80,80,3388.0,795,1684.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
2,2005-06,21,CLE,NBA,SF,79,79,3361.0,875,1823.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
3,2006-07,22,CLE,NBA,SF,78,78,3190.0,772,1621.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
4,2007-08,23,CLE,NBA,SF,75,74,3027.0,794,1642.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
5,2008-09,24,CLE,NBA,SF,81,81,3054.0,789,1613.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
6,2009-10,25,CLE,NBA,SF,76,76,2966.0,768,1528.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
7,2010-11,26,MIA,NBA,SF,79,79,3063.0,758,1485.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
8,2011-12,27,MIA,NBA,SF,62,62,2326.0,621,1169.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0
9,2012-13,28,MIA,NBA,PF,76,76,2877.0,765,1354.0,...,1553,8339,9892,9833,2098,1003,4667,2443.0,35930.0,101.0


In [28]:
def create_total_column(df, col):
#     df[f'{col}-total'] = df[col].sum()
    df[f'{col}-pct'] = df.apply(lambda x: (x[f'{col}'] / x[f'{col}-total']), axis=1)
  

In [29]:
for col in accum_cols:
    create_total_column(seasons_df, col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}-pct'] = df.apply(lambda x: (x[f'{col}'] / x[f'{col}-total']), axis=1)


In [30]:
seasons_df.to_csv('lebron-seasons.csv')

In [32]:
# pivot = pd.pivot_table(
#     data=seasons_df, 
#     index=['Season'],
# #     values=['AST-pct', 'PTS-pct', 'TRB-pct'], 
#     values=['AST', 'PTS', 'TRB'], 
# #     aggfunc=cumsum()
# )

# pivot

In [33]:
seasons_df

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,ORB-pct,DRB-pct,TRB-pct,AST-pct,STL-pct,BLK-pct,TOV-pct,PF-pct,PTS-pct,tripple_doubles-pct
0,2003-04,19,CLE,NBA,SG,79,79,3122.0,622,1492.0,...,0.063748,0.039933,0.043672,0.04729,0.061964,0.057827,0.058496,0.060991,0.046034,0.0
1,2004-05,20,CLE,NBA,SF,80,80,3388.0,795,1684.0,...,0.071475,0.057201,0.059442,0.05868,0.084366,0.051844,0.056139,0.059763,0.060534,0.039604
2,2005-06,21,CLE,NBA,SF,79,79,3361.0,875,1823.0,...,0.048294,0.057681,0.056207,0.052985,0.058627,0.065803,0.05571,0.074089,0.068967,0.049505
3,2006-07,22,CLE,NBA,SF,78,78,3190.0,772,1621.0,...,0.053445,0.053124,0.053174,0.047798,0.059581,0.054835,0.053568,0.069996,0.059338,0.009901
4,2007-08,23,CLE,NBA,SF,75,74,3027.0,794,1642.0,...,0.085641,0.055043,0.059846,0.054815,0.065777,0.080758,0.054639,0.06754,0.062622,0.069307
5,2008-09,24,CLE,NBA,SF,81,81,3054.0,789,1613.0,...,0.068255,0.060799,0.061969,0.059697,0.0653,0.092722,0.051639,0.056897,0.064125,0.069307
6,2009-10,25,CLE,NBA,SF,76,76,2966.0,768,1528.0,...,0.045718,0.057921,0.056005,0.066206,0.059581,0.07677,0.055925,0.048711,0.062844,0.039604
7,2010-11,26,MIA,NBA,SF,79,79,3063.0,758,1485.0,...,0.051513,0.061158,0.059644,0.056341,0.059104,0.04985,0.060853,0.066721,0.058753,0.039604
8,2011-12,27,MIA,NBA,SF,62,62,2326.0,621,1169.0,...,0.060528,0.047728,0.049737,0.039357,0.054814,0.04985,0.04564,0.039296,0.046841,0.0
9,2012-13,28,MIA,NBA,PF,76,76,2877.0,765,1354.0,...,0.06246,0.061518,0.061666,0.056036,0.061487,0.0668,0.048425,0.045027,0.056666,0.039604


In [34]:
def create_pct_accumulator_column(df, col):
    df[f'{col}-pct-accum'] = df[f'{col}-pct'].cumsum()

In [35]:
for col in accum_cols:
    create_pct_accumulator_column(seasons_df, col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{col}-pct-accum'] = df[f'{col}-pct'].cumsum()


In [36]:
seasons_df

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,ORB-pct-accum,DRB-pct-accum,TRB-pct-accum,AST-pct-accum,STL-pct-accum,BLK-pct-accum,TOV-pct-accum,PF-pct-accum,PTS-pct-accum,tripple_doubles-pct-accum
0,2003-04,19,CLE,NBA,SG,79,79,3122.0,622,1492.0,...,0.063748,0.039933,0.043672,0.04729,0.061964,0.057827,0.058496,0.060991,0.046034,0.0
1,2004-05,20,CLE,NBA,SF,80,80,3388.0,795,1684.0,...,0.135222,0.097134,0.103114,0.10597,0.14633,0.109671,0.114635,0.120753,0.106568,0.039604
2,2005-06,21,CLE,NBA,SF,79,79,3361.0,875,1823.0,...,0.183516,0.154815,0.159321,0.158955,0.204957,0.175474,0.170345,0.194842,0.175536,0.089109
3,2006-07,22,CLE,NBA,SF,78,78,3190.0,772,1621.0,...,0.236961,0.207939,0.212495,0.206753,0.264538,0.230309,0.223913,0.264838,0.234873,0.09901
4,2007-08,23,CLE,NBA,SF,75,74,3027.0,794,1642.0,...,0.322601,0.262981,0.272341,0.261568,0.330315,0.311067,0.278552,0.332378,0.297495,0.168317
5,2008-09,24,CLE,NBA,SF,81,81,3054.0,789,1613.0,...,0.390856,0.32378,0.334311,0.321265,0.395615,0.403789,0.330191,0.389275,0.36162,0.237624
6,2009-10,25,CLE,NBA,SF,76,76,2966.0,768,1528.0,...,0.436574,0.3817,0.390315,0.387471,0.455195,0.480558,0.386115,0.437986,0.424464,0.277228
7,2010-11,26,MIA,NBA,SF,79,79,3063.0,758,1485.0,...,0.488088,0.442859,0.44996,0.443812,0.514299,0.530409,0.446968,0.504707,0.483217,0.316832
8,2011-12,27,MIA,NBA,SF,62,62,2326.0,621,1169.0,...,0.548616,0.490586,0.499697,0.483169,0.569113,0.580259,0.492608,0.544003,0.530058,0.316832
9,2012-13,28,MIA,NBA,PF,76,76,2877.0,765,1354.0,...,0.611075,0.552105,0.561363,0.539205,0.630601,0.647059,0.541033,0.58903,0.586724,0.356436


In [46]:
dfs = []
df_out = None
for col in ['G', 'MP', 'FG', 'FGA', '3P', '2P', 'FT', 'TRB', 'AST', 'STL', 'BLK', 'PTS', 'tripple_doubles']:
    for val in ['', '-accum', '-total', '-pct', '-pct-accum']:
        df = pd.DataFrame(columns=['season', 'metric', 'value'])
        df['season'] = seasons_df['Season']
        df['metric'] = [f'{col}{val}']*len(seasons_df)
        df['value'] = seasons_df[f'{col}{val}']
    
        dfs.append(df)
    
df_out = pd.concat(dfs)

df_out


Unnamed: 0,season,metric,value
0,2003-04,G,79.000000
1,2004-05,G,80.000000
2,2005-06,G,79.000000
3,2006-07,G,78.000000
4,2007-08,G,75.000000
...,...,...,...
14,2017-18,tripple_doubles-pct-accum,0.722772
15,2018-19,tripple_doubles-pct-accum,0.801980
16,2019-20,tripple_doubles-pct-accum,0.930693
17,2020-21,tripple_doubles-pct-accum,0.980198


In [47]:
seasons_df['G']

0     79
1     80
2     79
3     78
4     75
5     81
6     76
7     79
8     62
9     76
10    77
11    69
12    76
13    74
14    82
15    55
16    67
17    45
18    21
Name: G, dtype: int64

In [74]:
df_out

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,TOV,PF,PTS,Unnamed: 30,Unnamed: 31,Trp,Dbl,season,metric,value
0,2003-04,19,CLE,NBA,SG,79,79,3122.000,622,1492.000,...,273,149.0,1654.0,0.0,,,,2003-04,tripple_doubles,0.0
1,2004-05,20,CLE,NBA,SF,80,80,3388.000,795,1684.000,...,262,146.0,2175.0,4.0,,,,2004-05,tripple_doubles,4.0
2,2005-06,21,CLE,NBA,SF,79,79,3361.000,875,1823.000,...,260,181.0,2478.0,5.0,,,,2005-06,tripple_doubles,5.0
3,2006-07,22,CLE,NBA,SF,78,78,3190.000,772,1621.000,...,250,171.0,2132.0,1.0,,,,2006-07,tripple_doubles,1.0
4,2007-08,23,CLE,NBA,SF,75,74,3027.000,794,1642.000,...,255,165.0,2250.0,7.0,,,,2007-08,tripple_doubles,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18,2021-22,37,LAL,NBA,SF,20,20,740.000,198,388.000,...,72,46.0,527.0,2.0,,,,2021-22,tripple_doubles,2.0
19,Career,NBA,1330,1329,50795,13101,25992,0.504,2032,5890.000,...,101,,,,,,,,tripple_doubles,
20,11,seasons,CLE,NBA,849,848,33130,8369.000,17022,0.492,...,1618,23119.0,64.0,,,,,,tripple_doubles,
21,4,seasons,MIA,NBA,294,294,11168,2911.000,5361,0.543,...,495,7919.0,9.0,,,,,,tripple_doubles,


In [48]:
df_out.reset_index(inplace=True)

df_out

Unnamed: 0,index,season,metric,value
0,0,2003-04,G,79.000000
1,1,2004-05,G,80.000000
2,2,2005-06,G,79.000000
3,3,2006-07,G,78.000000
4,4,2007-08,G,75.000000
...,...,...,...,...
1230,14,2017-18,tripple_doubles-pct-accum,0.722772
1231,15,2018-19,tripple_doubles-pct-accum,0.801980
1232,16,2019-20,tripple_doubles-pct-accum,0.930693
1233,17,2020-21,tripple_doubles-pct-accum,0.980198


In [49]:
df_out.to_csv('lebron-analytics-df-full.csv', index=False)