### Testing whether web scraping works with Kareem to capture per game stats

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import os
import time
import re
import numpy as np

In [2]:
player_list = os.listdir("threadedPlayers/b")
player_0 = player_list[0]

In [4]:
player_0

'Chucky Brown.html'

### Now try to put all the above code in a for loop

In [5]:
def perGame(soup):
    
    player_pergame = player_soup.find('table', {'id':'per_game'})
    player_pergame_table = pd.read_html(str(player_pergame))[0]
    career_row = player_pergame_table[player_pergame_table['Season'] == 'Career'].index.tolist()[0]
    player_pergame_table_processed = player_pergame_table.iloc[:career_row]
    player_teams = player_pergame_table['Tm'].unique()
    
    player_teams = player_teams[~pd.isnull(player_teams)]
    
    # Convert the ndarray to a list of strings
    player_teams_list = player_teams.astype(str).tolist()

    # Remove instances longer than 4 letters
    player_teams_list = [team for team in player_teams_list if len(team) <= 4]

    # Convert the list back to a NumPy ndarray
    player_teams = np.array(player_teams_list, dtype=object)
    
    # Calculate the maximum numerical values for columns 4 onwards
    max_numerical_values = player_pergame_table_processed.iloc[:, 5:].apply(pd.to_numeric, errors='coerce').max()

    # Create a DataFrame with the maximum values as a single row
    max_values_df = max_numerical_values.to_frame().T

    # Update the first row of the DataFrame to keep the original values for columns 0 to 3
    player_full = pd.concat([player_pergame_table_processed.iloc[:1, :5], max_values_df], axis=1)
    
    if "TOT" in player_teams:
        player_full["MidSeasonTrade"] = 1
        player_teams = np.delete(player_teams, np.where(player_teams == 12))
    else:
        player_full["MidSeasonTrade"] = 0
    
    # Place teams played on as array
    player_full.at[0, 'Tm'] = player_teams
    
    return player_full

In [6]:
def gameHighsCollegeURL(soup, df):
    new_df = df.copy(deep=True)
    player_highs = soup.find('table', {'id':'stathead_insights'})
    player_highs_table = pd.read_html(str(player_highs))[0]
    
    # Convert the 'Highlight' and 'Unnamed: 1' columns to lists
    highlight_list = player_highs_table['Highlight'].tolist()
    unnamed_list = player_highs_table['Unnamed: 1'].tolist()
    
    # Assign the values from unnamed_list to the corresponding columns in new_df
    new_df.loc[0, highlight_list] = unnamed_list
    
    # COLLEGE
    player_college = soup.find('table', {'id':'all_college_stats'})
    if player_college is not None:
        player_college_table = pd.read_html(str(player_college))[0]
        player_college_table.columns = player_college_table.columns.droplevel()

        player_college_teams = player_college_table['College'].unique()
        player_college_teams = player_college_teams[~pd.isnull(player_college_teams)]

        new_df.at[0, 'College'] = player_college_teams
    else:
        new_df.at[0, 'College'] = None
    
    # URL
    player_image = soup.find(class_="media-item")
    if player_image is not None:
        player_image_url = player_image.find('img')['src']
    else:
        player_image_url = None
    new_df['Image URL'] = player_image_url
    
    return new_df

In [7]:
def awardsAndDraft(soup, df):
    
    player_full = df.copy(deep=True)
    
    # All Star
    player_allstar = soup.find(id='leaderboard_allstar')
    if player_allstar is not None:
        if player_allstar.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_allstar_text = [button.text for button in player_allstar][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_allstar_text = player_allstar.text
        
        player_num_allstar = int(re.findall("\d+", player_allstar_text)[0])
        if player_num_allstar > 1000:
            player_num_allstar = 1

        player_full["All Star Games"] = player_num_allstar
    else:
        player_full["All Star Games"] = 0
        
    # Championships
    player_champs = soup.find(id='leaderboard_championships')
    if player_champs is not None:
        if player_champs.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_champs_text = [button.text for button in player_champs][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_champs_text = player_champs.text
        
        player_num_champs = int(re.findall("\d+", player_champs_text)[0])
        if player_num_champs > 1000:
            player_num_champs = 1

        player_full["Championships"] = player_num_champs
    else:
        player_full["Championships"] = 0
        
    # HOF
    player_hof = soup.find(id='leaderboard_hof_prob')
    if player_hof is not None:
        player_hof_text = [button.text for button in player_hof][-2]
        player_hof_prob = int(re.findall("\d+", player_hof_text)[0])
        if (player_hof_prob) == 100:
            player_full["HOF"] = 1
        else:
            player_full["HOF"] = 0
    else:
        player_full["HOF"] = 0
    
    # DRAFT INFO
    player_drafted = soup.find(id="meta")
    player_drafted_p = player_drafted.findAll('p')
    desired_p_tag = None
    for tag in player_drafted_p:
        if tag.name == 'p' and tag.strong and tag.strong.text.strip() == 'Draft:':
            desired_p_tag = tag
            break
    if desired_p_tag is not None:
        player_draft_text = desired_p_tag.text.split('\n')[-2]
        player_match = re.search(r'\b\d+(?:st|nd|rd|th) pick\b', player_draft_text)
        if player_match is not None: # dealing with cases where bball-reference does not have pick number
            player_match = player_match.group(0)
            player_pick = int(re.findall("\d+", player_match)[0])
            player_full["Pick #"] = player_pick
        else:
            player_full["Pick #"] = None
        player_draft_match = re.search(r'\d{4}\sNBA\sDraft', player_draft_text).group(0)
        player_draft_year = int(re.findall("\d+", player_draft_match)[0])
        player_full["Draft Year"] = player_draft_year
    else:
        player_full["Pick #"] = None
        player_full["Draft Year"] = None
        
    return player_full

In [64]:
player_list[22:23]

['Jack Burmaster.html']

In [8]:
player_entries = []
player_list = os.listdir("threadedPlayers/b")
for player in player_list[22:23]:
    with open("threadedPlayers/b/{}".format(player)) as f:
        page = f.read()
    player_soup = BeautifulSoup(page, "lxml")
    
    player_df = perGame(player_soup)
    player_df = gameHighsCollegeURL(player_soup, player_df)
    player_df = awardsAndDraft(player_soup, player_df)
    
    player_name = player.replace(".html", "")
    player_df.insert(loc=0, column="Name", value=player_name)
    player_entries.append(player_df)

AttributeError: 'NoneType' object has no attribute 'group'

In [49]:
b_players_df = pd.concat(player_entries)

In [46]:
b_players_df

Unnamed: 0,Name,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,"Career high, Steals","Career high, Blocks","Career high, Game Score",College,Image URL,All Star Games,Championships,HOF,Pick #,Draft Year
0,Chucky Brown,1989-90,21.0,"[CLE, TOT, LAL, NJN, DAL, HOU, PHO, MIL, ATL, ...",NBA,SF,82.0,82.0,24.8,3.7,...,4.0,3.0,26.0,[NCSTATE],,0,1,0,16,1989


In [512]:
b_players_df.to_csv("letterDF/b")

# Functions to test

In [None]:
def awardsAndDraft(soup, df):
    
    player_full = df.copy(deep=True)
    
    # All Star
    player_allstar = soup.find(id='leaderboard_allstar')
    if player_allstar is not None:
        if player_allstar.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_allstar_text = [button.text for button in player_allstar][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_allstar_text = player_allstar.text
        
        player_num_allstar = int(re.findall("\d+", player_allstar_text)[0])
        if player_num_allstar > 1000:
            player_num_allstar = 1

        player_full["All Star Games"] = player_num_allstar
    else:
        player_full["All Star Games"] = 0
        
    # Championships
    player_champs = soup.find(id='leaderboard_championships')
    if player_champs is not None:
        if player_champs.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_champs_text = [button.text for button in player_champs][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_champs_text = player_champs.text
        
        player_num_champs = int(re.findall("\d+", player_champs_text)[0])
        if player_num_champs > 1000:
            player_num_champs = 1

        player_full["Championships"] = player_num_champs
    else:
        player_full["Championships"] = 0
        
    # HOF
    player_hof = soup.find(id='leaderboard_hof_prob')
    if player_hof is not None:
        player_hof_text = [button.text for button in player_hof][-2]
        player_hof_prob = int(re.findall("\d+", player_hof_text)[0])
        if (player_hof_prob) == 100:
            player_full["HOF"] = 1
        else:
            player_full["HOF"] = 0
    else:
        player_full["HOF"] = 0
    
    # DRAFT INFO
    player_drafted = soup.find(id="meta")
    player_drafted_p = player_drafted.findAll('p')
    desired_p_tag = None
    for tag in player_drafted_p:
        if tag.name == 'p' and tag.strong and tag.strong.text.strip() == 'Draft:':
            desired_p_tag = tag
            break
    if desired_p_tag is not None:
        player_draft_text = desired_p_tag.text.split('\n')[-2]
        player_match = re.search(r'\b\d+(?:st|nd|rd|th) pick\b', player_draft_text)
        if player_match is not None: # dealing with cases where bball-reference does not have pick number
            player_match = player_match.group(0)
            player_pick = int(re.findall("\d+", player_match)[0])
            player_full["Pick #"] = player_pick
        else:
            player_full["Pick #"] = None
        player_draft_match = re.search(r'\d{4}\sNBA\sDraft', player_draft_text).group(0)
        player_draft_year = int(re.findall("\d+", player_draft_match)[0])
        player_full["Draft Year"] = player_draft_year
    else:
        player_full["Pick #"] = None
        player_full["Draft Year"] = None
        
    return player_full