### Testing whether web scraping works with Kareem to capture per game stats

In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import time
import re
import numpy as np
import os

In [4]:
player_list = os.listdir("threadedPlayers/a")
player_0 = player_list[113]

In [5]:
player_0

'Dick Atha.html'

In [6]:
with open("threadedPlayers/a/{}".format(player_0)) as f:
    page = f.read()

In [7]:
soup = BeautifulSoup(page, "lxml")

In [8]:
player_pergame = soup.find('table', {'id':'per_game'})

In [9]:
player_pergame_table = pd.read_html(str(player_pergame))[0]

In [10]:
player_pergame_table

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,TRB,AST,PF,PTS
0,1955-56,24.0,NYK,NBA,SG,25,,11.5,1.4,3.5,.409,0.8,1.1,.778,1.7,1.3,1.6,3.7
1,1956-57,25.0,Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury)
2,1957-58,26.0,DET,NBA,SG,18,,8.9,0.9,2.6,.362,0.6,0.7,.833,1.3,1.1,1.3,2.4
3,Career,,,NBA,,43,,10.4,1.2,3.1,.393,0.7,0.9,.795,1.5,1.2,1.5,3.2
4,,,,,,,,,,,,,,,,,,
5,1 season,,NYK,NBA,,25,,11.5,1.4,3.5,.409,0.8,1.1,.778,1.7,1.3,1.6,3.7
6,1 season,,DET,NBA,,18,,8.9,0.9,2.6,.362,0.6,0.7,.833,1.3,1.1,1.3,2.4


#### Getting a condensed list of all the teams an individual player played for

In [11]:
career_row = player_pergame_table[player_pergame_table['Season'] == 'Career'].index.tolist()[0]
player_pergame_table_processed = player_pergame_table.iloc[:career_row]

In [12]:
player_pergame_table_processed

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,TRB,AST,PF,PTS
0,1955-56,24.0,NYK,NBA,SG,25,,11.5,1.4,3.5,.409,0.8,1.1,.778,1.7,1.3,1.6,3.7
1,1956-57,25.0,Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury),Did Not Play (injury)
2,1957-58,26.0,DET,NBA,SG,18,,8.9,0.9,2.6,.362,0.6,0.7,.833,1.3,1.1,1.3,2.4


In [13]:
player_teams = player_pergame_table['Tm'].unique()
player_teams

array(['NYK', 'Did Not Play\xa0(injury)', 'DET', nan], dtype=object)

In [15]:
player_teams = player_teams[~pd.isnull(player_teams)]
# Convert the ndarray to a list of strings
player_teams_list = player_teams.astype(str).tolist()

# Remove instances longer than 4 letters
player_teams_list = [team for team in player_teams_list if len(team) <= 4]

# Convert the list back to a NumPy ndarray
player_teams = np.array(player_teams_list, dtype=object)
player_teams

array(['NYK', 'DET'], dtype=object)

In [22]:
# Calculate the maximum numerical values for columns 4 onwards
max_numerical_values = player_pergame_table_processed.iloc[:, 5:].apply(pd.to_numeric, errors='coerce').max()

# Create a DataFrame with the maximum values as a single row
max_values_df = max_numerical_values.to_frame().T

# Update the first row of the DataFrame to keep the original values for columns 0 to 3
player_full = pd.concat([player_pergame_table_processed.iloc[:1, :5], max_values_df], axis=1)

player_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,TRB,AST,PF,PTS
0,1955-56,24.0,NYK,NBA,SG,25.0,,11.5,1.4,3.5,0.409,0.8,1.1,0.833,1.7,1.3,1.6,3.7


In [23]:
player_full.at[0, 'Tm'] = np.array(player_teams_list, dtype=object)
player_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,TRB,AST,PF,PTS
0,1955-56,24.0,"[NYK, DET]",NBA,SG,25.0,,11.5,1.4,3.5,0.409,0.8,1.1,0.833,1.7,1.3,1.6,3.7


In [24]:
player_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,TRB,AST,PF,PTS
0,1955-56,24.0,"[NYK, DET]",NBA,SG,25.0,,11.5,1.4,3.5,0.409,0.8,1.1,0.833,1.7,1.3,1.6,3.7


In [25]:
# Inserting their name to verify code
player_full.insert(loc=0, column="Name", value="Dick")
player_full

Unnamed: 0,Name,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,FG%,FT,FTA,FT%,TRB,AST,PF,PTS
0,Dick,1955-56,24.0,"[NYK, DET]",NBA,SG,25.0,,11.5,1.4,3.5,0.409,0.8,1.1,0.833,1.7,1.3,1.6,3.7


## Got a processed per game row with max of each category kept and "Tm" being an array of all teams played on

### Now to try game highs section

In [26]:
('DET') in player_full["Tm"][0]

True

In [27]:
player_highs = soup.find('table', {'id':'stathead_insights'})

In [28]:
player_highs_table = pd.read_html(str(player_highs))[0]
player_highs_table

Unnamed: 0,Highlight,Unnamed: 1,In Stathead
0,"Career high, Points",14,View full stats from top 20 games
1,"Career high, Rebounds",7,View full stats from top 20 games
2,"Career high, Assists",3,View full stats from top 20 games


In [29]:
# Convert the 'Highlight' and 'Unnamed: 1' columns to lists
highlight_list = player_highs_table['Highlight'].tolist()
unnamed_list = player_highs_table['Unnamed: 1'].tolist()

In [30]:
player_full.loc[0, highlight_list] = unnamed_list

In [31]:
player_full

Unnamed: 0,Name,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,FT,FTA,FT%,TRB,AST,PF,PTS,"Career high, Points","Career high, Rebounds","Career high, Assists"
0,Dick,1955-56,24.0,"[NYK, DET]",NBA,SG,25.0,,11.5,1.4,...,0.8,1.1,0.833,1.7,1.3,1.6,3.7,14.0,7.0,3.0


In [32]:
player_full[player_highs_table["Highlight"].tolist()] = player_highs_table['Unnamed: 1'].tolist()

In [33]:
player_full

Unnamed: 0,Name,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,FT,FTA,FT%,TRB,AST,PF,PTS,"Career high, Points","Career high, Rebounds","Career high, Assists"
0,Dick,1955-56,24.0,"[NYK, DET]",NBA,SG,25.0,,11.5,1.4,...,0.8,1.1,0.833,1.7,1.3,1.6,3.7,14,7,3


### Now college stats

In [34]:
player_college = soup.find('table', {'id':'all_college_stats'})

In [35]:
player_college_table = pd.read_html(str(player_college))[0]

ValueError: No tables found

In [36]:
player_college_table.columns = player_college_table.columns.droplevel()
player_college_table

NameError: name 'player_college_table' is not defined

In [37]:
player_college_teams = player_college_table['College'].unique()
player_college_teams = player_college_teams[~pd.isnull(player_college_teams)]
player_college_teams

NameError: name 'player_college_table' is not defined

In [450]:
player_full.at[0, 'College'] = player_college_teams

In [451]:
player_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,FTA,FT%,TRB,AST,PF,PTS,"Career high, Points","Career high, Rebounds","Career high, Assists",College
0,1955-56,24.0,"[NYK, DET]",NBA,SG,25.0,,11.5,1.4,3.5,...,1.1,0.833,1.7,1.3,1.6,3.7,14,7,3,[WVIRGINIA]


### Now get all the awards

In [452]:
player_allstar = soup.find(id='leaderboard_allstar')

In [453]:
if player_allstar is not None:
    player_all_star_text = [button.text for button in player_allstar][-2]
    player_num_all_stars = int(re.findall("\d+", player_all_star_text)[0])
    player_full["All Star Games"] = player_num_all_stars
else:
    player_full["All Star Games"] = 0

In [454]:
player_champs = soup.find(id='leaderboard_championships')

In [455]:
if player_champs is not None:
    if player_champs.find_all('button'):
    # If buttons exist, use the list comprehension to get the desired text
        player_champs_text = [button.text for button in player_allstar][-2]
    else:
    # If buttons don't exist, directly extract the text
        player_champs_text = player_allstar.text
    player_num_champs = int(re.findall("\d+", player_all_champs_text)[0])
    
    if player_num_champs > 1000:
        player_num_champs = 1
        
    player_full["Championships"] = player_num_champs
else:
    player_full["Championships"] = 0

In [456]:
player_hof = soup.find(id='leaderboard_hof_prob')

In [457]:
if player_hof is not None:
    player_hof_text = [button.text for button in player_hof][-2]
    player_hof_prob = int(re.findall("\d+", player_hof_text)[0])
    if (player_hof_prob) == 100:
        player_full["HOF"] = 1
    else:
        player_full["HOF"] = 0
else:
    player_full["HOF"] = 0

In [458]:
player_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,AST,PF,PTS,"Career high, Points","Career high, Rebounds","Career high, Assists",College,All Star Games,Championships,HOF
0,1955-56,24.0,"[NYK, DET]",NBA,SG,25.0,,11.5,1.4,3.5,...,1.3,1.6,3.7,14,7,3,[WVIRGINIA],0,0,0


In [459]:
player_drafted = soup.find(id="meta")
player_drafted_p = player_drafted.findAll('p')
desired_p_tag = None
for tag in player_drafted_p:
    if tag.name == 'p' and tag.strong and tag.strong.text.strip() == 'Draft:':
        desired_p_tag = tag
        break

In [460]:
if desired_p_tag is not None:
    player_draft_text = desired_p_tag.text.split('\n')[-2]
    player_match = re.search(r'\b\d+(?:st|nd|rd|th) pick\b', player_draft_text).group(0)
    player_pick = int(re.findall("\d+", player_match)[0])
    player_full["Pick #"] = player_pick
    player_draft_match = re.search(r'\d{4}\sNBA\sDraft', player_draft_text).group(0)
    player_draft_year = int(re.findall("\d+", player_draft_match)[0])
    player_full["Draft Year"] = player_draft_year
else:
    player_full["Pick #"] = None
    player_full["Draft Year"] = None

AttributeError: 'NoneType' object has no attribute 'group'

In [475]:
re.search(r'\d{4}\sNBA\sDraft', player_draft_text).group(0)

'1953 NBA Draft'

## Last thing is his image URL

In [335]:
player_image = soup.find(class_="media-item")

In [336]:
if player_image is not None:
    player_image_url = player_image.find('img')['src']
else:
    player_image_url = None

In [337]:
player_full['Image URL'] = player_image_url

In [338]:
player_full

Unnamed: 0,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,FGA,...,"Career high, Steals","Career high, Blocks","Career high, Game Score",College,All Star Games,Championships,HOF,Pick #,Draft Year,Image URL
0,2017-18,25.0,[ORL],NBA,SG,15.0,1.0,18.6,2.1,5.3,...,1.0,1.0,14.8,[PITT],0,0,0,,,https://www.basketball-reference.com/req/20210...


In [219]:
player_list[5:10]

['Ömer Aşık.html',
 'Josh Akognon.html',
 'Andy Anderson.html',
 'Pero Antić.html',
 'Stacey Augmon.html']

### Now try to put all the above code in a for loop

In [38]:
def perGame(soup):
    
    player_pergame = player_soup.find('table', {'id':'per_game'})
    player_pergame_table = pd.read_html(str(player_pergame))[0]
    career_row = player_pergame_table[player_pergame_table['Season'] == 'Career'].index.tolist()[0]
    player_pergame_table_processed = player_pergame_table.iloc[:career_row]
    player_teams = player_pergame_table['Tm'].unique()
    
    player_teams = player_teams[~pd.isnull(player_teams)]
    
    # Convert the ndarray to a list of strings
    player_teams_list = player_teams.astype(str).tolist()

    # Remove instances longer than 4 letters
    player_teams_list = [team for team in player_teams_list if len(team) <= 4]

    # Convert the list back to a NumPy ndarray
    player_teams = np.array(player_teams_list, dtype=object)
    
    # Calculate the maximum numerical values for columns 4 onwards
    max_numerical_values = player_pergame_table_processed.iloc[:, 5:].apply(pd.to_numeric, errors='coerce').max()

    # Create a DataFrame with the maximum values as a single row
    max_values_df = max_numerical_values.to_frame().T

    # Update the first row of the DataFrame to keep the original values for columns 0 to 3
    player_full = pd.concat([player_pergame_table_processed.iloc[:1, :5], max_values_df], axis=1)
    
    if "TOT" in player_teams:
        player_full["MidSeasonTrade"] = 1
        player_teams = np.delete(player_teams, np.where(player_teams == 12))
    else:
        player_full["MidSeasonTrade"] = 0
    
    # Place teams played on as array
    player_full.at[0, 'Tm'] = player_teams
    
    return player_full

In [39]:
def gameHighsCollegeURL(soup, df):
    new_df = df.copy(deep=True)
    player_highs = soup.find('table', {'id':'stathead_insights'})
    player_highs_table = pd.read_html(str(player_highs))[0]
    
    # Convert the 'Highlight' and 'Unnamed: 1' columns to lists
    highlight_list = player_highs_table['Highlight'].tolist()
    unnamed_list = player_highs_table['Unnamed: 1'].tolist()
    
    # Assign the values from unnamed_list to the corresponding columns in new_df
    new_df.loc[0, highlight_list] = unnamed_list
    
    # COLLEGE
    player_college = soup.find('table', {'id':'all_college_stats'})
    if player_college is not None:
        player_college_table = pd.read_html(str(player_college))[0]
        player_college_table.columns = player_college_table.columns.droplevel()

        player_college_teams = player_college_table['College'].unique()
        player_college_teams = player_college_teams[~pd.isnull(player_college_teams)]

        new_df.at[0, 'College'] = player_college_teams
    else:
        new_df.at[0, 'College'] = None
    
    # URL
    player_image = soup.find(class_="media-item")
    if player_image is not None:
        player_image_url = player_image.find('img')['src']
    else:
        player_image_url = None
    new_df['Image URL'] = player_image_url
    
    return new_df

In [40]:
def awardsAndDraft(soup, df):
    
    player_full = df.copy(deep=True)
    
    # All Star
    player_allstar = soup.find(id='leaderboard_allstar')
    if player_allstar is not None:
        if player_allstar.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_allstar_text = [button.text for button in player_allstar][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_allstar_text = player_allstar.text
        
        player_num_allstar = int(re.findall("\d+", player_allstar_text)[0])
        if player_num_allstar > 1000:
            player_num_allstar = 1

        player_full["All Star Games"] = player_num_allstar
    else:
        player_full["All Star Games"] = 0
        
    # Championships
    player_champs = soup.find(id='leaderboard_championships')
    if player_champs is not None:
        if player_champs.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_champs_text = [button.text for button in player_champs][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_champs_text = player_champs.text
        
        player_num_champs = int(re.findall("\d+", player_champs_text)[0])
        if player_num_champs > 1000:
            player_num_champs = 1

        player_full["Championships"] = player_num_champs
    else:
        player_full["Championships"] = 0
        
    # HOF
    player_hof = soup.find(id='leaderboard_hof_prob')
    if player_hof is not None:
        player_hof_text = [button.text for button in player_hof][-2]
        player_hof_prob = int(re.findall("\d+", player_hof_text)[0])
        if (player_hof_prob) == 100:
            player_full["HOF"] = 1
        else:
            player_full["HOF"] = 0
    else:
        player_full["HOF"] = 0
    
    # DRAFT INFO
    player_drafted = soup.find(id="meta")
    player_drafted_p = player_drafted.findAll('p')
    desired_p_tag = None
    for tag in player_drafted_p:
        if tag.name == 'p' and tag.strong and tag.strong.text.strip() == 'Draft:':
            desired_p_tag = tag
            break
    if desired_p_tag is not None:
        player_draft_text = desired_p_tag.text.split('\n')[-2]
        player_match = re.search(r'\b\d+(?:st|nd|rd|th) pick\b', player_draft_text)
        if player_match is not None: # dealing with cases where bball-reference does not have pick number
            player_match = player_match.group(0)
            player_pick = int(re.findall("\d+", player_match)[0])
            player_full["Pick #"] = player_pick
        else:
            player_full["Pick #"] = None
        player_draft_match = re.search(r'\d{4}\sNBA\sDraft', player_draft_text).group(0)
        player_draft_year = int(re.findall("\d+", player_draft_match)[0])
        player_full["Draft Year"] = player_draft_year
    else:
        player_full["Pick #"] = None
        player_full["Draft Year"] = None
        
    return player_full

In [41]:
player_entries = []
player_list = os.listdir("threadedPlayers/a")
for player in player_list:
    with open("threadedPlayers/a/{}".format(player)) as f:
        page = f.read()
    player_soup = BeautifulSoup(page, "lxml")
    
    player_df = perGame(player_soup)
    player_df = gameHighsCollegeURL(player_soup, player_df)
    player_df = awardsAndDraft(player_soup, player_df)
    
    player_name = player.replace(".html", "")
    player_df.insert(loc=0, column="Name", value=player_name)
    player_entries.append(player_df)

In [42]:
a_players_df = pd.concat(player_entries)

In [47]:
a_players_df.sort_values('Triple-Doubles', ascending=False)

Unnamed: 0,Name,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,"Career high, Blocks","Career high, Game Score",College,Image URL,All Star Games,Championships,HOF,Pick #,Draft Year,Triple-Doubles
0,Giannis Antetokounmpo,2013-14,19.0,[MIL],NBA,SF,81.0,80.0,36.7,11.2,...,7.0,50.4,,https://www.basketball-reference.com/req/20210...,7,1,0,15,2013,35.0
0,Kareem Abdul-Jabbar,1969-70,22.0,"[MIL, LAL]",NBA,C,82.0,80.0,44.2,14.3,...,11.0,42.5,[UCLA],https://www.basketball-reference.com/req/20210...,19,6,1,1,1969,21.0
0,Alvan Adams,1975-76,21.0,[PHO],NBA,C,82.0,75.0,33.2,7.9,...,7.0,37.2,[OKLAHOMA],https://www.basketball-reference.com/req/20210...,1,0,0,4,1975,12.0
0,Kenny Anderson,1991-92,21.0,"[NJN, TOT, CHH, POR, BOS, SEA, NOH, IND, ATL, ...",NBA,PG,82.0,82.0,38.2,7.0,...,2.0,44.0,[GATECH],https://www.basketball-reference.com/req/20210...,1,0,0,2,1991,5.0
0,Bam Adebayo,2017-18,20.0,[MIA],NBA,C,82.0,75.0,34.6,8.0,...,5.0,37.6,[KENTUCKY],https://www.basketball-reference.com/req/20210...,2,0,0,14,2017,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Mark Alarie,1986-87,23.0,"[DEN, WSB]",NBA,PF,82.0,25.0,23.1,4.5,...,4.0,20.4,[DUKE],,0,0,0,18,1986,
0,Vincent Askew,1987-88,21.0,"[PHI, GSW, TOT, SAC, SEA, NJN, IND, DEN, POR]",NBA,SG,80.0,10.0,25.0,3.5,...,2.0,20.4,[MEMPHIS],,0,0,0,16,1987,
0,Gary Alcorn,1959-60,23.0,"[DET, LAL]",NBA,C,58.0,,11.6,1.6,...,,,[FRESNOST],,0,0,0,1,1958,
0,Don Ackerman,1953-54,23.0,[NYK],NBA,PG,28.0,,7.9,0.5,...,,,[LONGISLAND],,0,0,0,2,1953,


In [43]:
a_players_df.columns

Index(['Name', 'Season', 'Age', 'Tm', 'Lg', 'Pos', 'G', 'GS', 'MP', 'FG',
       'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT',
       'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
       'PTS', 'MidSeasonTrade', 'Career high, Points', 'Career high, Rebounds',
       'Career high, Assists', 'Career high, Steals', 'Career high, Blocks',
       'Career high, Game Score', 'College', 'Image URL', 'All Star Games',
       'Championships', 'HOF', 'Pick #', 'Draft Year', 'Triple-Doubles'],
      dtype='object')

In [48]:
a_players_df.to_csv("letterDF/a")