In [1]:
import re
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

### Testing whether web scraping works on a single player page to capture per-game stats (variables are named `kareem_*`, but the test URL below is the `reaveau01` page)

In [2]:
# Build the Chrome options and the driver service, then start the browser with both.
options = webdriver.ChromeOptions()
# NOTE(review): this path looks like a directory rather than the chromedriver binary itself — confirm.
service = Service(executable_path="/usr/local/bin")
driver = webdriver.Chrome(options=options, service=service)

In [3]:
# Load the player page in the Selenium-controlled browser and parse the rendered HTML.
kareem_url = 'https://www.basketball-reference.com/players/r/reaveau01.html'  # swap in the player page to scrape
driver.get(kareem_url)  # returns None, so there is nothing useful to bind to a name
html = driver.page_source
soup = BeautifulSoup(html, "lxml")

In [4]:
# Locate the per-game stats <table> by its HTML id (None when the page has no such table).
kareem_pergame = soup.find('table', attrs={'id': 'per_game'})

In [5]:
# Parse the per-game table markup into a DataFrame; read_html returns a list of frames, so take the first.
# The markup is wrapped in StringIO because passing a literal HTML string to read_html
# is deprecated since pandas 2.1 (file-like input also works on older versions).
kareem_pergame_table = pd.read_html(StringIO(str(kareem_pergame)))[0]

In [6]:
# Display the parsed per-game table for inspection.
kareem_pergame_table

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2021-22,23.0,LAL,NBA,SG,61,19,23.2,2.4,5.2,...,0.839,0.7,2.4,3.2,1.8,0.5,0.3,0.7,1.4,7.3
1,2022-23,24.0,LAL,NBA,SG,64,22,28.8,4.0,7.7,...,0.864,0.5,2.5,3.0,3.4,0.5,0.3,1.5,1.7,13.0
2,Career,,,NBA,,125,41,26.1,3.2,6.5,...,0.856,0.6,2.5,3.1,2.6,0.5,0.3,1.1,1.6,10.2


In [9]:
# Find the index label of the first row whose Season is 'Career' ...
career_row = (
    kareem_pergame_table
    .loc[kareem_pergame_table['Season'].eq('Career')]
    .index
    .tolist()[0]
)
# ... and keep only the rows positioned before it (the individual seasons).
kareem_pergame_table_processed = kareem_pergame_table.iloc[0:career_row]

In [10]:
# Display the per-game rows that precede the 'Career' summary row.
kareem_pergame_table_processed

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2021-22,23.0,LAL,NBA,SG,61,19,23.2,2.4,5.2,...,0.839,0.7,2.4,3.2,1.8,0.5,0.3,0.7,1.4,7.3
1,2022-23,24.0,LAL,NBA,SG,64,22,28.8,4.0,7.7,...,0.864,0.5,2.5,3.0,3.4,0.5,0.3,1.5,1.7,13.0


In [11]:
# Collect the distinct 'Tm' values from the unprocessed table; the 'Career' row
# is still included here, which is why NaN shows up in the result.
kareem_teams = pd.unique(kareem_pergame_table['Tm'])
kareem_teams

array(['LAL', nan], dtype=object)

In [12]:
# Keep only the non-null team codes.
kareem_teams = kareem_teams[pd.notnull(kareem_teams)]
kareem_teams

array(['LAL'], dtype=object)

In [13]:
# Collapse the season rows into a single row holding each column's maximum
# (nulls dropped first), then turn the resulting Series back into a one-row frame.
kareem_full = (
    kareem_pergame_table_processed
    .apply(lambda column: column.dropna().max())
    .to_frame()
    .T
)
kareem_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2022-23,24.0,LAL,NBA,SG,64,22,28.8,4.0,7.7,...,0.864,0.7,2.5,3.2,3.4,0.5,0.3,1.5,1.7,13.0


In [14]:
# Overwrite the 'Tm' entry of row 0 with the array of non-null team codes.
# NOTE(review): storing an array in a single cell through .loc may depend on the pandas
# version and on the array length — confirm it also works for players with several teams.
kareem_full.loc[0, 'Tm'] = kareem_teams
kareem_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,2022-23,24.0,[LAL],NBA,SG,64,22,28.8,4.0,7.7,...,0.864,0.7,2.5,3.2,3.4,0.5,0.3,1.5,1.7,13.0


## Result: a single processed per-game row that keeps the maximum of each column, with "Tm" holding an array of all teams played for

### Now to try game highs section

In [15]:
# Locate the career-highs <table> by its HTML id (None when the page has no such table).
kareem_highs = soup.find('table', attrs={'id': 'stathead_insights'})

In [16]:
# Parse the highs table markup into a DataFrame; read_html returns a list of frames, so take the first.
# The markup is wrapped in StringIO because passing a literal HTML string to read_html
# is deprecated since pandas 2.1 (file-like input also works on older versions).
kareem_highs_table = pd.read_html(StringIO(str(kareem_highs)))[0]
kareem_highs_table

Unnamed: 0,Highlight,Unnamed: 1,In Stathead
0,"Career high, Points",35.0,View full stats from top 20 games
1,"Career high, Rebounds",16.0,View full stats from top 20 games
2,"Career high, Assists",11.0,View full stats from top 20 games
3,"Career high, Steals",3.0,View full stats from top 20 games
4,"Career high, Blocks",2.0,View full stats from top 20 games
5,"Career high, Game Score",35.8,View full stats from top 20 games
6,Triple-Doubles,1.0,View all


In [17]:
# List the values held in the 'Unnamed: 1' column of the highs table.
kareem_highs_table['Unnamed: 1'].tolist()

[35.0, 16.0, 11.0, 3.0, 2.0, 35.8, 1.0]

In [18]:
# List the labels held in the "Highlight" column of the highs table.
kareem_highs_table["Highlight"].tolist()

['Career high, Points',
 'Career high, Rebounds',
 'Career high, Assists',
 'Career high, Steals',
 'Career high, Blocks',
 'Career high, Game Score',
 'Triple-Doubles']

In [19]:
# Add one column per "Highlight" label to kareem_full, filled with the value at the
# same position in 'Unnamed: 1'. This mutates kareem_full in place.
kareem_full[kareem_highs_table["Highlight"].tolist()] = kareem_highs_table['Unnamed: 1'].tolist()

In [20]:
# Display the row after the highlight columns were added.
kareem_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,TOV,PF,PTS,"Career high, Points","Career high, Rebounds","Career high, Assists","Career high, Steals","Career high, Blocks","Career high, Game Score",Triple-Doubles
0,2022-23,24.0,[LAL],NBA,SG,64,22,28.8,4.0,7.7,...,1.5,1.7,13.0,35.0,16.0,11.0,3.0,2.0,35.8,1.0


### Now college stats

In [21]:
# Locate the college stats <table> by its HTML id (None when the page has no such table).
kareem_college = soup.find('table', attrs={'id': 'all_college_stats'})

In [22]:
# Parse the college table markup into a DataFrame; read_html returns a list of frames, so take the first.
# The markup is wrapped in StringIO because passing a literal HTML string to read_html
# is deprecated since pandas 2.1 (file-like input also works on older versions).
kareem_college_table = pd.read_html(StringIO(str(kareem_college)))[0]

In [23]:
# Drop the outer level of the column header so columns are addressed by their plain names.
# The MultiIndex guard keeps the cell re-runnable: once the header is flat, a second
# droplevel() call would raise instead of being a no-op.
if isinstance(kareem_college_table.columns, pd.MultiIndex):
    kareem_college_table.columns = kareem_college_table.columns.droplevel()
kareem_college_table

Unnamed: 0,Season,Age,College,G,MP,FG,FGA,3P,3PA,FT,...,TOV,PF,PTS,FG%,3P%,FT%,MP.1,PTS.1,TRB,AST
0,2016-17,18.0,WICHITAST,33,390,39,87,28,55,28,...,18,26,134,0.448,0.509,0.757,11.8,4.1,1.8,1.1
1,2017-18,19.0,WICHITAST,33,708,85,189,54,127,43,...,38,53,267,0.45,0.425,0.827,21.5,8.1,3.1,2.0
2,2019-20,21.0,OKLAHOMA,31,1030,137,360,42,162,140,...,77,58,456,0.381,0.259,0.848,33.2,14.7,5.3,3.0
3,2020-21,22.0,OKLAHOMA,25,862,139,314,32,105,147,...,75,42,457,0.443,0.305,0.865,34.5,18.3,5.5,4.6
4,Career,,,122,2990,400,950,156,449,358,...,208,179,1314,0.421,0.347,0.844,24.5,10.8,3.8,2.6


In [24]:
# Distinct, non-null college names in order of first appearance.
kareem_college_teams = kareem_college_table['College'].dropna().unique()
kareem_college_teams

array(['WICHITAST', 'OKLAHOMA'], dtype=object)

In [25]:
# Store the array of college names under 'College' in row 0 of kareem_full.
# NOTE(review): relies on .loc accepting an array for a single cell — confirm across pandas versions.
kareem_full.loc[0, 'College'] = kareem_college_teams

In [26]:
# Display the row after the 'College' entry was set.
kareem_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,PF,PTS,"Career high, Points","Career high, Rebounds","Career high, Assists","Career high, Steals","Career high, Blocks","Career high, Game Score",Triple-Doubles,College
0,2022-23,24.0,[LAL],NBA,SG,64,22,28.8,4.0,7.7,...,1.7,13.0,35.0,16.0,11.0,3.0,2.0,35.8,1.0,"[WICHITAST, OKLAHOMA]"


### Now get all the awards

In [27]:
# Locate the All-Star leaderboard element by its HTML id (None when absent).
kareem_allstar = soup.find(attrs={'id': 'leaderboard_allstar'})

In [28]:
# Record the number of All-Star selections, defaulting to 0 when the page has no All-Star block.
if kareem_allstar is not None:
    # The count is read from the text of the block's second-to-last child.
    kareem_all_star_text = [button.text for button in kareem_allstar][-2]
    # Raw string: "\d" in a plain literal is an invalid escape sequence (SyntaxWarning on Python 3.12+).
    kareem_num_all_stars = int(re.findall(r"\d+", kareem_all_star_text)[0])
    kareem_full["All Star Games"] = kareem_num_all_stars
else:
    kareem_full["All Star Games"] = 0

In [29]:
# Locate the championships leaderboard element by its HTML id (None when absent).
kareem_champs = soup.find(attrs={'id': 'leaderboard_championships'})

In [30]:
# Record the number of championships, defaulting to 0 when the page has no championships block.
# Every step reads the championships element and its own text; the earlier version
# reused the All-Star variables and therefore stored the All-Star count here.
if kareem_champs is not None:
    # The count is read from the text of the block's second-to-last child.
    kareem_champs_text = [button.text for button in kareem_champs][-2]
    kareem_num_champs = int(re.findall(r"\d+", kareem_champs_text)[0])
    kareem_full["Championships"] = kareem_num_champs
else:
    kareem_full["Championships"] = 0

In [33]:
# Locate the Hall of Fame probability leaderboard element by its HTML id (None when absent).
kareem_hof = soup.find(attrs={'id': 'leaderboard_hof_prob'})

In [34]:
# Set the HOF flag: 1 only when the first integer in the HOF block's text is 100, else 0.
# The guard checks the HOF element itself; guarding on the All-Star element crashed when
# only the HOF block was missing and skipped the block when only the All-Star one was.
if kareem_hof is not None:
    # The figure is read from the text of the block's second-to-last child.
    kareem_hof_text = [button.text for button in kareem_hof][-2]
    # Only the leading run of digits is used, so any fractional part is ignored.
    kareem_hof_prob = int(re.findall(r"\d+", kareem_hof_text)[0])
    kareem_full["HOF"] = 1 if kareem_hof_prob == 100 else 0
else:
    kareem_full["HOF"] = 0

In [35]:
# Display the row after the All-Star, championship and HOF columns were added.
kareem_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,"Career high, Rebounds","Career high, Assists","Career high, Steals","Career high, Blocks","Career high, Game Score",Triple-Doubles,College,All Star Games,Championships,HOF
0,2022-23,24.0,[LAL],NBA,SG,64,22,28.8,4.0,7.7,...,16.0,11.0,3.0,2.0,35.8,1.0,"[WICHITAST, OKLAHOMA]",0,0,0


In [36]:
# Find the <p> inside the element with id "meta" whose <strong> label reads 'Draft:'.
# desired_p_tag stays None when there is no such paragraph.
kareem_drafted = soup.find(id="meta")
# find_all replaces the deprecated findAll alias; an absent "meta" element yields no paragraphs
# instead of raising AttributeError.
kareem_drafted_p = kareem_drafted.find_all('p') if kareem_drafted is not None else []
desired_p_tag = None
for tag in kareem_drafted_p:
    # Every element returned by find_all('p') is already a <p>, so only the label needs checking.
    if tag.strong and tag.strong.text.strip() == 'Draft:':
        desired_p_tag = tag
        break

In [41]:
# Parse the pick number and draft year from the 'Draft:' paragraph, if one was found.
# Each column is left as None when its pattern is not present in the text.
if desired_p_tag is not None:
    # The draft details are read from the second-to-last line of the paragraph text;
    # fall back to the only line when the text contains no newline.
    draft_lines = desired_p_tag.text.split('\n')
    kareem_draft_text = draft_lines[-2] if len(draft_lines) > 1 else draft_lines[0]

    # Pick number, e.g. the 1 in "1st pick". Check the search result before using it so a
    # missing pattern does not raise AttributeError.
    pick_found = re.search(r'\b\d+(?:st|nd|rd|th) pick\b', kareem_draft_text)
    if pick_found is not None:
        kareem_match = pick_found.group(0)
        kareem_pick = int(re.findall(r"\d+", kareem_match)[0])
        kareem_full["Pick #"] = kareem_pick
    else:
        kareem_full["Pick #"] = None

    # Draft year: the four digits that precede "NBA Draft".
    year_found = re.search(r'\d{4}\sNBA\sDraft', kareem_draft_text)
    if year_found is not None:
        kareem_draft_match = year_found.group(0)
        kareem_draft_year = int(re.findall(r"\d+", kareem_draft_match)[0])
        kareem_full["Draft Year"] = kareem_draft_year
    else:
        kareem_full["Draft Year"] = None
else:
    kareem_full["Pick #"] = None
    kareem_full["Draft Year"] = None

## Last thing is his image URL

In [42]:
# Locate the first element carrying the "media-item" CSS class (None when absent).
kareem_image = soup.find(attrs={"class": "media-item"})

In [43]:
# Read the src of the <img> inside the media item. The URL is None when the page has no
# media item, no <img> inside it, or no src attribute, instead of raising.
kareem_image_tag = kareem_image.find('img') if kareem_image is not None else None
kareem_image_url = kareem_image_tag.get('src') if kareem_image_tag is not None else None

In [44]:
# Store the image URL under 'Image URL' on kareem_full (mutates the frame in place).
kareem_full['Image URL'] = kareem_image_url

In [45]:
# Display the finished one-row frame with all scraped fields.
kareem_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,"Career high, Blocks","Career high, Game Score",Triple-Doubles,College,All Star Games,Championships,HOF,Pick #,Draft Year,Image URL
0,2022-23,24.0,[LAL],NBA,SG,64,22,28.8,4.0,7.7,...,2.0,35.8,1.0,"[WICHITAST, OKLAHOMA]",0,0,0,,,https://www.basketball-reference.com/req/20210...
