### Testing whether web scraping works with Kareem to capture per game stats

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
import os
import time
import re
import numpy as np

### player_list is tracking what letter we are converting to an HTML row

In [47]:
player_list = os.listdir("threadedPlayers/e")
player_0 = player_list[0]

In [48]:
player_0

'Earl Evans.html'

### Now try to put all the above code in a for loop

In [5]:
def perGame(soup):
    
    player_pergame = player_soup.find('table', {'id':'per_game'})
    player_pergame_table = pd.read_html(str(player_pergame))[0]
    career_row = player_pergame_table[player_pergame_table['Season'] == 'Career'].index.tolist()[0]
    player_pergame_table_processed = player_pergame_table.iloc[:career_row]
    player_teams = player_pergame_table['Tm'].unique()
    
    player_teams = player_teams[~pd.isnull(player_teams)]
    
    # Convert the ndarray to a list of strings
    player_teams_list = player_teams.astype(str).tolist()

    # Remove instances longer than 4 letters
    player_teams_list = [team for team in player_teams_list if len(team) <= 4]

    # Convert the list back to a NumPy ndarray
    player_teams = np.array(player_teams_list, dtype=object)
    
    # Calculate the maximum numerical values for columns 4 onwards
    max_numerical_values = player_pergame_table_processed.iloc[:, 5:].apply(pd.to_numeric, errors='coerce').max()

    # Create a DataFrame with the maximum values as a single row
    max_values_df = max_numerical_values.to_frame().T

    # Update the first row of the DataFrame to keep the original values for columns 0 to 3
    player_full = pd.concat([player_pergame_table_processed.iloc[:1, :5], max_values_df], axis=1)
    
    if "TOT" in player_teams:
        player_full["MidSeasonTrade"] = 1
        player_teams = np.delete(player_teams, np.where(player_teams == 12))
    else:
        player_full["MidSeasonTrade"] = 0
    
    # Place teams played on as array
    player_full.at[0, 'Tm'] = player_teams
    
    return player_full

In [6]:
def gameHighsCollegeURL(soup, df):
    new_df = df.copy(deep=True)
    player_highs = soup.find('table', {'id':'stathead_insights'})
    player_highs_table = pd.read_html(str(player_highs))[0]
    
    # Convert the 'Highlight' and 'Unnamed: 1' columns to lists
    highlight_list = player_highs_table['Highlight'].tolist()
    unnamed_list = player_highs_table['Unnamed: 1'].tolist()
    
    # Assign the values from unnamed_list to the corresponding columns in new_df
    new_df.loc[0, highlight_list] = unnamed_list
    
    # COLLEGE
    player_college = soup.find('table', {'id':'all_college_stats'})
    if player_college is not None:
        player_college_table = pd.read_html(str(player_college))[0]
        player_college_table.columns = player_college_table.columns.droplevel()

        player_college_teams = player_college_table['College'].unique()
        player_college_teams = player_college_teams[~pd.isnull(player_college_teams)]

        new_df.at[0, 'College'] = player_college_teams
    else:
        new_df.at[0, 'College'] = None
    
    # URL
    player_image = soup.find(class_="media-item")
    if player_image is not None:
        player_image_url = player_image.find('img')['src']
    else:
        player_image_url = None
    new_df['Image URL'] = player_image_url
    
    return new_df

In [34]:
def awardsAndDraft(soup, df):
    
    player_full = df.copy(deep=True)
    
    # All Star
    player_allstar = soup.find(id='leaderboard_allstar')
    if player_allstar is not None:
        if player_allstar.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_allstar_text = [button.text for button in player_allstar][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_allstar_text = player_allstar.text
        
        player_num_allstar = int(re.findall("\d+", player_allstar_text)[0])
        if player_num_allstar > 1000:
            player_num_allstar = 1

        player_full["All Star Games"] = player_num_allstar
    else:
        player_full["All Star Games"] = 0
        
    # Championships
    player_champs = soup.find(id='leaderboard_championships')
    if player_champs is not None:
        if player_champs.find_all('button'):
        # If buttons exist, use the list comprehension to get the desired text
            player_champs_text = [button.text for button in player_champs][-2]
        else:
        # If buttons don't exist, directly extract the text
            player_champs_text = player_champs.text
        
        player_num_champs = int(re.findall("\d+", player_champs_text)[0])
        if player_num_champs > 1000:
            player_num_champs = 1

        player_full["Championships"] = player_num_champs
    else:
        player_full["Championships"] = 0
        
    # HOF
    player_hof = soup.find(id='leaderboard_hof_prob')
    if player_hof is not None:
        player_hof_text = [button.text for button in player_hof][-2]
        player_hof_prob = int(re.findall("\d+", player_hof_text)[0])
        if (player_hof_prob) == 100:
            player_full["HOF"] = 1
        else:
            player_full["HOF"] = 0
    else:
        player_full["HOF"] = 0
    
    # DRAFT INFO
    player_drafted = soup.find(id="meta")
    player_drafted_p = player_drafted.findAll('p')
    desired_p_tag = None
    for tag in player_drafted_p:
        if tag.name == 'p' and tag.strong and tag.strong.text.strip() == 'Draft:':
            desired_p_tag = tag
            break
    if desired_p_tag is not None:
        player_draft_text = desired_p_tag.text.split('\n')[-2]
        player_match = re.search(r'\b\d+(?:st|nd|rd|th) pick\b', player_draft_text)
        if player_match is not None: # dealing with cases where bball-reference does not have pick number
            player_match = player_match.group(0)
            player_pick = int(re.findall("\d+", player_match)[0])
            player_full["Pick #"] = player_pick
        else:
            player_full["Pick #"] = None
        # Fixed a bug in this line so it picks up any type of draft instead of just NBA Draft
        player_draft_match = re.search(r'\b(\d{4}\s[A-Za-z\s]+Draft)\b', player_draft_text).group(0)
        player_draft_year = int(re.findall("\d+", player_draft_match)[0])
        player_full["Draft Year"] = player_draft_year
    else:
        player_full["Pick #"] = None
        player_full["Draft Year"] = None
        
    return player_full

## For loop to apply all previous functions and convert each letter to a dataframe extracting all necessary info from their respective reference page

In [151]:
player_entries = []
player_list = os.listdir("threadedPlayers/z")
for player in player_list:
    with open("threadedPlayers/z/{}".format(player)) as f:
        page = f.read()
    player_soup = BeautifulSoup(page, "lxml")
    
    player_df = perGame(player_soup)
    player_df = gameHighsCollegeURL(player_soup, player_df)
    player_df = awardsAndDraft(player_soup, player_df)
    
    player_name = player.replace(".html", "")
    player_df.insert(loc=0, column="Name", value=player_name)
    player_entries.append(player_df)

In [152]:
z_players_df = pd.concat(player_entries)

In [157]:
z_players_df["Triple-Doubles"] = np.nan

In [158]:
z_players_df.sort_values("Triple-Doubles", ascending=False)

Unnamed: 0,Name,Season,Age,Tm,Lg,Pos,G,GS,MP,FG,...,"Career high, Blocks","Career high, Game Score",College,Image URL,All Star Games,Championships,HOF,Pick #,Draft Year,Triple-Doubles
0,Phil Zevenbergen,1987-88,23.0,[SAS],NBA,C,8.0,0.0,7.3,1.9,...,1.0,8.9,"[SEATTLEPAC, WASHINGTON]",,0,0,0,4.0,1987.0,
0,Ivica Zubac,2016-17,19.0,"[LAL, TOT, LAC]",NBA,C,76.0,76.0,28.6,4.3,...,7.0,35.0,,https://www.basketball-reference.com/req/20210...,0,0,0,2.0,2016.0,
0,Matt Zunic,1948-49,29.0,[WSC],BAA,,56.0,,,1.8,...,,,,,0,0,0,,1947.0,
0,Luke Zeller,2012-13,25.0,[PHO],NBA,C,16.0,0.0,3.6,0.6,...,,6.4,[NOTREDAME],https://www.basketball-reference.com/req/20210...,0,0,0,,,
0,Dave Zeller,1961-62,22.0,[CIN],NBA,PG,61.0,,4.6,0.6,...,,,[MIAMIOH],,0,0,0,2.0,1961.0,
0,Max Zaslofsky,1946-47,21.0,"[CHS, NYK, TOT, BLB, MLH, FTW]",BAA,SG,70.0,,38.0,7.8,...,,,[STJOHNS],,1,0,0,,,
0,Jim Zoet,1982-83,29.0,[DET],NBA,C,7.0,0.0,4.3,0.1,...,3.0,1.0,[KENTST],,0,0,0,,,
0,Tyler Zeller,2012-13,23.0,"[CLE, BOS, TOT, BRK, MIL, ATL, MEM, SAS]",NBA,C,82.0,59.0,26.4,4.1,...,4.0,25.4,[UNC],https://www.basketball-reference.com/req/20210...,0,0,0,17.0,2012.0,
0,Zeke Zawoluk,1952-53,22.0,"[INO, PHW]",NBA,PF,71.0,,25.3,2.9,...,,,[STJOHNS],,0,0,0,,1952.0,
0,Stephen Zimmerman,2016-17,20.0,[ORL],NBA,C,19.0,0.0,5.7,0.5,...,2.0,6.7,[UNLV],https://www.basketball-reference.com/req/20210...,0,0,0,11.0,2016.0,


In [170]:
z_players_df.to_csv("letterDFs/z.csv")

In [161]:
player_list.format('Phil Zevenbergen.html')

AttributeError: 'list' object has no attribute 'format'

### Now create a function to do all the letters at once

In [171]:
def convertHTMLtoDF(letter):
    player_entries = []
    player_list = os.listdir(f"threadedPlayers/{letter}")
    for player in player_list:
        with open(f"threadedPlayers/{letter}/{player}") as f:
            page = f.read()
        player_soup = BeautifulSoup(page, "lxml")

        player_df = perGame(player_soup)
        player_df = gameHighsCollegeURL(player_soup, player_df)
        player_df = awardsAndDraft(player_soup, player_df)

        player_name = player.replace(".html", "")
        player_df.insert(loc=0, column="Name", value=player_name)
        player_entries.append(player_df)
    
    # Convert all player entry rows into a single dataframe for that letter
    final_df = pd.concat(player_entries)
    final_df.to_csv(f"letterDFs/{letter}.csv")

In [172]:
letters_array = [letter for letter in 'abcdefghijklmnopqrstuvwy']
letters_array

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y']

In [173]:
for letter in letters_array:
    convertHTMLtoDF(letter)