## Scrape Player Scores From Old Tournaments

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO


In [6]:
# Site-relative path of the team season page to scrape (path names Connecticut men, 2024).
# It is appended to the sports-reference.com host in the scraping cell.
team_link = '/cbb/schools/connecticut/men/2024.html'

In [None]:
print(f"Loading player data for team link: {team_link}")
url = "https://www.sports-reference.com" + team_link
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of trying to parse an error page.
response.raise_for_status()
html_content = response.text

soup = BeautifulSoup(html_content, 'html.parser')

# Locate the per-game roster table and make sure it is really there before
# walking it; otherwise the failure would be an opaque AttributeError on None.
table = soup.find('table', id='players_per_game')
if table is None:
    raise ValueError(f"Table 'players_per_game' not found for team link: {team_link}")
table_body = table.find('tbody')
if table_body is None:
    raise ValueError(f"Table 'players_per_game' has no <tbody> for team link: {team_link}")

# Walk the table body once, collecting both:
#   player_data - name/link pairs for rows that have a <td data-stat="name_display">
#   rows        - one dict per <tr>, keyed by each cell's data-stat attribute
player_data = []
rows = []
for tr in table_body.find_all('tr'):
    player_cell = tr.find('td', {'data-stat': 'name_display'})
    if player_cell is not None:
        link_tag = player_cell.find('a')
        player_data.append({
            'name': player_cell.text.strip(),
            'link': link_tag.get('href') if link_tag is not None else None,
        })

    row_data = {}
    for td in tr.find_all(['th', 'td']):
        stat_name = td.get('data-stat')
        value = td.text.strip()
        row_data[stat_name] = value

        # Always record a player_link next to the name (None when the cell has
        # no anchor) so the column exists even if no player is linked.
        if stat_name == 'name_display':
            name_anchor = td.find('a')
            row_data['player_link'] = name_anchor.get('href') if name_anchor is not None else None

    rows.append(row_data)

# Create DataFrame
df = pd.DataFrame(rows)
if df.empty:
    raise ValueError(f"No player rows found in 'players_per_game' for team link: {team_link}")

# Convert the stat columns from scraped text to numbers.
numeric_columns = ['games', 'games_started', 'mp_per_g', 'fg_per_g', 'fga_per_g', 'fg_pct',
                  'fg3_per_g', 'fg3a_per_g', 'fg3_pct', 'fg2_per_g', 'fg2a_per_g', 'fg2_pct',
                  'efg_pct', 'ft_per_g', 'fta_per_g', 'ft_pct', 'orb_per_g', 'drb_per_g',
                  'trb_per_g', 'ast_per_g', 'stl_per_g', 'blk_per_g', 'tov_per_g', 'pf_per_g', 'pts_per_g']

for col in numeric_columns:
    if col in df.columns:
        # Strip any non-numeric characters (like % or *). Going through str first
        # keeps the .str accessor valid whatever dtype the column ended up with;
        # blanks and leftovers that are not numbers become NaN via errors='coerce'.
        cleaned = df[col].astype(str).str.replace('[^0-9.-]', '', regex=True)
        df[col] = pd.to_numeric(cleaned, errors='coerce')

# Print the first few rows to verify
print(df[['name_display', 'player_link', 'pos', 'games', 'pts_per_g']].head())

# Create a dictionary mapping player names to their links for easy lookup
player_links = dict(zip(df['name_display'], df['player_link']))

# Build the per-player lookup used by later cells: season scoring average,
# a running total that starts at 0, and the player's page link.
raw_ppg = df.to_dict(orient='records')
player_ppg = {player['name_display']: {'avg': player['pts_per_g'], 'running_total': 0, 'link': player['player_link']} for player in raw_ppg}

Loading player data for team link: /cbb/schools/connecticut/men/2024.html
      name_display                          player_link pos  games  pts_per_g
0   Tristen Newton   /cbb/players/tristen-newton-1.html   G     40       15.1
1      Cam Spencer      /cbb/players/cam-spencer-1.html   G     40       14.3
2     Alex Karaban     /cbb/players/alex-karaban-1.html   F     39       13.3
3  Donovan Clingan  /cbb/players/donovan-clingan-1.html   C     35       13.0
4   Stephon Castle   /cbb/players/stephon-castle-2.html   G     34       11.1


In [45]:
# Inspect the per-player lookup (avg, running_total, link) built in the scraping cell.
player_ppg

{'Tristen Newton': {'avg': 15.1,
  'running_total': 0,
  'link': '/cbb/players/tristen-newton-1.html'},
 'Cam Spencer': {'avg': 14.3,
  'running_total': 0,
  'link': '/cbb/players/cam-spencer-1.html'},
 'Alex Karaban': {'avg': 13.3,
  'running_total': 0,
  'link': '/cbb/players/alex-karaban-1.html'},
 'Donovan Clingan': {'avg': 13.0,
  'running_total': 0,
  'link': '/cbb/players/donovan-clingan-1.html'},
 'Stephon Castle': {'avg': 11.1,
  'running_total': 0,
  'link': '/cbb/players/stephon-castle-2.html'},
 'Hassan Diarra': {'avg': 6.1,
  'running_total': 0,
  'link': '/cbb/players/hassan-diarra-1.html'},
 'Samson Johnson': {'avg': 5.4,
  'running_total': 0,
  'link': '/cbb/players/samson-johnson-1.html'},
 'Solomon Ball': {'avg': 3.3,
  'running_total': 0,
  'link': '/cbb/players/solomon-ball-2.html'},
 'Jaylin Stewart': {'avg': 2.5,
  'running_total': 0,
  'link': '/cbb/players/jaylin-stewart-3.html'},
 'Apostolos Roumoglou': {'avg': 1.0,
  'running_total': 0,
  'link': '/cbb/players

In [14]:
# Display the per-game stats DataFrame built in the scraping cell.
df

Unnamed: 0,ranker,name_display,player_link,pos,games,games_started,mp_per_g,fg_per_g,fga_per_g,fg_pct,...,orb_per_g,drb_per_g,trb_per_g,ast_per_g,stl_per_g,blk_per_g,tov_per_g,pf_per_g,pts_per_g,awards
0,1,Tristen Newton,/cbb/players/tristen-newton-1.html,G,40,40,33.2,4.6,11.0,0.415,...,1.4,5.2,6.6,6.2,0.9,0.3,2.5,1.8,15.1,"AA-1,AP-AA-1,NABC-AA-1,SN-AA-1,USBWA-AA-2"
1,2,Cam Spencer,/cbb/players/cam-spencer-1.html,G,40,40,33.0,4.9,10.1,0.484,...,1.1,3.8,4.9,3.6,1.5,0.3,1.0,2.2,14.3,
2,3,Alex Karaban,/cbb/players/alex-karaban-1.html,F,39,39,31.4,4.9,9.9,0.495,...,1.4,3.6,5.1,1.5,0.9,0.8,0.8,1.7,13.3,
3,4,Donovan Clingan,/cbb/players/donovan-clingan-1.html,C,35,33,22.5,5.3,8.3,0.639,...,2.5,4.9,7.4,1.5,0.5,2.5,0.8,2.0,13.0,
4,5,Stephon Castle,/cbb/players/stephon-castle-2.html,G,34,30,27.0,4.0,8.5,0.472,...,1.7,3.0,4.7,2.9,0.8,0.5,1.5,2.4,11.1,
5,6,Hassan Diarra,/cbb/players/hassan-diarra-1.html,G,40,1,19.4,2.2,4.5,0.483,...,0.8,2.2,3.0,2.4,0.8,0.3,1.1,1.9,6.1,
6,7,Samson Johnson,/cbb/players/samson-johnson-1.html,F,40,7,16.0,2.5,3.5,0.725,...,1.0,1.7,2.8,0.5,0.4,0.9,0.8,3.0,5.4,
7,8,Solomon Ball,/cbb/players/solomon-ball-2.html,G,39,10,11.5,1.2,3.0,0.385,...,0.4,0.6,1.0,0.3,0.2,0.0,0.4,0.7,3.3,
8,9,Jaylin Stewart,/cbb/players/jaylin-stewart-3.html,F,37,0,8.9,1.0,2.1,0.474,...,0.5,0.8,1.2,0.3,0.1,0.2,0.2,0.8,2.5,
9,10,Apostolos Roumoglou,/cbb/players/apostolos-roumoglou-1.html,G,24,0,2.1,0.3,0.5,0.583,...,0.0,0.4,0.5,0.2,0.1,0.1,0.0,0.2,1.0,


In [43]:
# Scrape actual pts in tournament for each player
def get_player_pts(player_link, year, timeout=30):
    """Fetch one season of a player's game log from sports-reference.com.

    Args:
        player_link: Site-relative path of the player page, e.g.
            '/cbb/players/donovan-clingan-1.html'. A trailing '.html' is
            removed before '/gamelog/<year>' is appended.
        year: Season identifier placed at the end of the gamelog URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with the columns 'Date', 'Opp', 'Type' and 'PTS'.
        'PTS' is numeric; cells that cannot be parsed as numbers are NaN.

    Raises:
        ValueError: If the request returns an HTTP error status, or the page
            has no table with id 'player_game_log'.
    """
    # Only strip the suffix when it is really there; slicing off five
    # characters unconditionally would corrupt links without '.html'.
    base_path = player_link[:-5] if player_link.endswith('.html') else player_link
    url = f'https://www.sports-reference.com{base_path}/gamelog/{year}'

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Report the real cause, but keep ValueError so callers that already
        # handle "gamelog unavailable" keep working.
        raise ValueError(
            f"Gamelog request failed (HTTP {response.status_code}) for player link: {player_link}"
        )

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the table with id 'player_game_log'
    table = soup.find('table', id='player_game_log')
    if table is None:
        raise ValueError(f"Gamelog not found for player link: {player_link}")

    # Let pandas parse just that table's HTML.
    df = pd.read_html(StringIO(str(table)))[0]

    # Keep only the columns the notebook uses.
    df = df[['Date', 'Opp', 'Type', 'PTS']].copy()
    # Guarantee numeric points so that summing never concatenates strings.
    df['PTS'] = pd.to_numeric(df['PTS'], errors='coerce')

    return df

def get_tournament_games(player_gamelog_df):
    """Return only the rows of a game log whose 'Type' is one of the listed tournament round labels.

    Args:
        player_gamelog_df: DataFrame with a 'Type' column, such as the one
            returned by get_player_pts.

    Returns:
        The subset of rows whose 'Type' is one of the round labels below,
        with the original index and columns preserved.
    """
    # Round labels, from the round of 64 through the national final.
    round_labels = (
        'ROUND-64',
        'ROUND-32',
        'ROUND-16',
        'ROUND-8',
        'NATIONAL-SEMI',
        'NATIONAL-FINAL',
    )
    is_tournament_row = player_gamelog_df['Type'].isin(round_labels)
    return player_gamelog_df[is_tournament_row]

In [41]:
# Try the gamelog scraper on a single player link.
test_player_link = '/cbb/players/donovan-clingan-1.html'

# get_player_pts does not filter by game type, so df2 holds every row of the
# 2024 game log table; df2 is reused by the tournament filter cell.
df2 = get_player_pts(test_player_link, 2024)
df2

Unnamed: 0,Date,Opp,Type,PTS
0,2023-11-06,Northern Arizona,REG (Non-Conf),12
1,2023-11-11,Stonehill,REG (Non-Conf),16
2,2023-11-14,Mississippi Valley State,REG (Non-Conf),17
3,2023-11-19,Indiana,REG (Non-Conf),7
4,2023-11-20,Texas,REG (Non-Conf),7
5,2023-11-24,Manhattan,REG (Non-Conf),17
6,2023-11-27,New Hampshire,REG (Non-Conf),29
7,2023-12-01,Kansas,REG (Non-Conf),8
8,2023-12-05,North Carolina,REG (Non-Conf),8
9,2023-12-09,Arkansas–Pine Bluff,REG (Non-Conf),11


In [44]:
# Keep only the tournament-round rows of the sample game log loaded into df2.
get_tournament_games(df2)

Unnamed: 0,Date,Opp,Type,PTS
31,2024-03-22,Stetson,ROUND-64,19
32,2024-03-24,Northwestern,ROUND-32,14
33,2024-03-28,San Diego State,ROUND-16,8
34,2024-03-30,Illinois,ROUND-8,22
35,2024-04-06,Alabama,NATIONAL-SEMI,18
36,2024-04-08,Purdue,NATIONAL-FINAL,11


In [15]:
# For every player in player_ppg, fetch the 2024 game log and store the total
# points scored in tournament games under 'running_total_actual'.
for player in player_ppg:
    player_link = player_links[player]

    # A roster row without a link yields a missing value (NaN/None) here;
    # passing that on would raise an uncaught TypeError and abort the loop.
    if not isinstance(player_link, str):
        print(f"No player link available for {player}; skipping")
        continue

    try:
        player_gamelog_df = get_player_pts(player_link, 2024)
        tournament_games_df = get_tournament_games(player_gamelog_df)

        # Calculate running total for points in tournament games
        player_ppg[player]['running_total_actual'] = tournament_games_df['PTS'].sum()
    except ValueError as e:
        # Best effort: report the player whose gamelog could not be loaded and
        # continue; that player simply gets no 'running_total_actual' entry.
        print(e)

{'Tristen Newton': {'avg': 15.1,
  'running_total': 0,
  'link': '/cbb/players/tristen-newton-1.html'},
 'Cam Spencer': {'avg': 14.3,
  'running_total': 0,
  'link': '/cbb/players/cam-spencer-1.html'},
 'Alex Karaban': {'avg': 13.3,
  'running_total': 0,
  'link': '/cbb/players/alex-karaban-1.html'},
 'Donovan Clingan': {'avg': 13.0,
  'running_total': 0,
  'link': '/cbb/players/donovan-clingan-1.html'},
 'Stephon Castle': {'avg': 11.1,
  'running_total': 0,
  'link': '/cbb/players/stephon-castle-2.html'},
 'Hassan Diarra': {'avg': 6.1,
  'running_total': 0,
  'link': '/cbb/players/hassan-diarra-1.html'},
 'Samson Johnson': {'avg': 5.4,
  'running_total': 0,
  'link': '/cbb/players/samson-johnson-1.html'},
 'Solomon Ball': {'avg': 3.3,
  'running_total': 0,
  'link': '/cbb/players/solomon-ball-2.html'},
 'Jaylin Stewart': {'avg': 2.5,
  'running_total': 0,
  'link': '/cbb/players/jaylin-stewart-3.html'},
 'Apostolos Roumoglou': {'avg': 1.0,
  'running_total': 0,
  'link': '/cbb/players