<a href="https://colab.research.google.com/github/zhenyisx/scoutoid/blob/main/scraper2db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Scrape the website and populate a database (CSV files) on Google Drive. The files can then be uploaded to Google Storage for clients (i.e., Streamlit) to consume.

# Import and Function Definitions

In [5]:
# Imports
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date, datetime, timedelta
import re

from google.colab import  drive
drive.mount('/drive')

%matplotlib inline

# TODO
# - home team and away team are reversed in the current table
# - team should add city and state

Mounted at /drive


In [None]:
def get_scores_from_soup(maxpreps_day_soup, this_day):
  """Extract the scores of all games listed on one MaxPreps scores page.

  Args:
      maxpreps_day_soup (soup): a soup parsed from MaxPreps HTML by BS.
      this_day: the date of the games; stored unchanged in the 'Date' column.

  Returns:
      schedule_df (dataframe): one row per game with the columns 'Date',
      'Home Team', 'Home Score', 'Away Team' and 'Away Score'. A score that
      is missing or not an integer is recorded as -1. Games without two team
      entries or without team names are skipped. The frame is empty when the
      page has no 'contests' section or lists no games.

  """
  columns = ['Date', 'Home Team', 'Home Score', 'Away Team', 'Away Score']

  # MaxPreps specific selectors (found with the Chrome DevTools inspector)
  matches = maxpreps_day_soup.find('div', {'class': 'contests'})

  rows = []
  if matches is not None:
    for m in matches.find_all('ul', {'class': 'teams'}):
      raw_record = m.find_all('li')
      if len(raw_record) < 2:
        print('game entry without two teams, skipping')
        continue

      # first <li> is recorded as the home team, second as the away team
      home_name, home_score = _read_team_entry(raw_record[0])
      away_name, away_score = _read_team_entry(raw_record[1])
      if home_name is None or away_name is None:
        print('team name not found, skipping game')
        continue

      rows.append({'Date': this_day,
                   'Home Team': home_name, 'Home Score': home_score,
                   'Away Team': away_name, 'Away Score': away_score})

  if not rows:
    # keep the declared column dtypes for the "no games" result
    return pd.DataFrame({'Date': pd.Series(dtype='datetime64[ns]'),
                         'Home Team': pd.Series(dtype='str'),
                         'Home Score': pd.Series(dtype='int'),
                         'Away Team': pd.Series(dtype='str'),
                         'Away Score': pd.Series(dtype='int')})

  # build the frame once instead of concatenating row by row
  schedule_df = pd.DataFrame(rows, columns=columns)
  return schedule_df.drop_duplicates(keep='last')


def _read_team_entry(team_li):
  """Return (name, score) for one team <li>; name is None when absent and
  score is -1 when it is absent or not an integer."""
  name_div = team_li.find('div', {'class': 'name'})
  name = name_div.text.strip() if name_div is not None else None

  score_div = team_li.find('div', {'class': 'score'})
  try:
    score = int(score_div.text.strip())
  except (AttributeError, ValueError):
    # AttributeError: no score element; ValueError: text is not an integer
    print('score not valid, using -1 instead')
    score = -1
  return name, score


def get_scores_from_maxpreps_for_one_day(today):
  """Get the scores of games from MaxPreps for one day.

  Example:
  df = get_scores_from_maxpreps_for_one_day("12/1/2022")
  print(df)

  Args:
      today (str): the date str in '%m/%d/%Y' format.

  Returns:
      schedule_df (dataframe): the dataframe of all game results, or None
      when the request fails, the page title cannot be parsed, or the page
      returned is for a different date than the one requested.

  """
  # scores page of the Central Coast Section girls soccer for the given date
  maxpreps_url = 'https://www.maxpreps.com/ca/central-coast-section/soccer/girls/scores/?date={}'.format(today)  # please change the date if needed

  # Use requests to retrieve data from a given URL; the timeout keeps one
  # stalled request from hanging a whole date-range scrape.
  try:
    maxpreps_response = requests.get(
        maxpreps_url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=30)
  except requests.RequestException as err:
    print('Request failed for {}: {}'.format(today, err))
    return None

  # Parse the whole HTML page using BeautifulSoup
  maxpreps_soup = BeautifulSoup(maxpreps_response.text, 'html.parser')

  # The title looks like "<section> Scores | <m/d/Y> Results"; the date in it
  # is compared with the requested one because the page served can be for
  # another day.
  try:
    page_title = maxpreps_soup.title.text
    print(page_title)
    content_date = datetime.strptime(page_title.split('|')[1].strip().split()[0], '%m/%d/%Y')
    requested_date = datetime.strptime(today, '%m/%d/%Y')
  except (AttributeError, IndexError, TypeError, ValueError):
    # no <title>, no '|' separator, or no parsable date
    print('No Games found for {}'.format(today))
    return None

  if content_date != requested_date:
    print('No Games found for {}'.format(today))
    return None

  print('Found Games for {}'.format(content_date))
  return get_scores_from_soup(maxpreps_soup, content_date)


def get_scores_from_maxpreps_for_range(start_date, end_date):
  """Get the scores of games from MaxPreps for a range of dates.

  Example:
  start_date = date(2022, 11, 10)
  end_date = date(2023, 2, 10)
  scores = get_scores_from_maxpreps_for_range(start_date, end_date)

  Args:
      start_date (date): the first day to scrape (inclusive).
      end_date (date): the day after the last day to scrape (exclusive).

  Returns:
      schedule_df (dataframe): the de-duplicated dataframe of all game
      results. When the range is empty or no day has games, an empty
      dataframe with the score columns is returned.

  """
  dfs = []
  for offset in range((end_date - start_date).days):
    today = (start_date + timedelta(offset)).strftime("%m/%d/%Y")
    scores_df = get_scores_from_maxpreps_for_one_day(today)
    if scores_df is None:
      # days without games yield None; they must not reach pd.concat
      print('No games found on {}'.format(today))
      continue
    print('Found {} games for {}'.format(len(scores_df), today))
    dfs.append(scores_df)

  if not dfs:
    # pd.concat raises ValueError on an empty list
    return pd.DataFrame(columns=['Date', 'Home Team', 'Home Score',
                                 'Away Team', 'Away Score'])

  # see pd.concat documentation for more info
  return pd.concat(dfs).drop_duplicates(keep='last')


# Running Script

In [None]:
# demo: fetch and print the scores of a single day
# (the date string must be in '%m/%d/%Y' format)
df = get_scores_from_maxpreps_for_one_day("3/8/2025")
print(df)


	Central Coast Section High School Girls Soccer Scores | 3/8/2025 Results

Found Games for 2025-03-08 00:00:00


AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
# get all scores
# please note that game info of previous seasons/years is not accessible;
# only the current season is accessible.

# load historical data
# scores.csv was generated between 11/10/2022 and 2/1/2023
hist_scores = pd.read_csv('/drive/My Drive/scores.csv')

# scrape the new date range; end_date itself is not scraped because the range
# helper iterates over (end_date - start_date).days days starting at start_date
start_date = date(2025, 2, 1)
end_date = date(2025, 3, 12)
new_scores = get_scores_from_maxpreps_for_range(start_date, end_date)

# merge old and new rows; 'Date' is parsed so that dates loaded from the CSV
# and freshly scraped dates share one type before duplicates are dropped
scores = pd.concat([hist_scores, new_scores])
scores['Date'] = pd.to_datetime(scores['Date'])
scores = scores.drop_duplicates(keep='last')
scores.to_csv('/drive/My Drive/scores_2425_03122025.csv', index=False)


	Central Coast Section High School Girls Soccer Scores | 2/1/2025 Results

Found Games for 2025-02-01 00:00:00
Found 6 games for 02/01/2025

	Central Coast Section High School Girls Soccer Scores | 3/8/2025 Results

No Games found for 02/02/2025
No games found on 02/02/2025

	Central Coast Section High School Girls Soccer Scores | 2/3/2025 Results

Found Games for 2025-02-03 00:00:00
score not valid, using -1 instead
Found 11 games for 02/03/2025

	Central Coast Section High School Girls Soccer Scores | 2/4/2025 Results

Found Games for 2025-02-04 00:00:00
score not valid, using -1 instead
score not valid, using -1 instead
score not valid, using -1 instead
score not valid, using -1 instead
Found 29 games for 02/04/2025

	Central Coast Section High School Girls Soccer Scores | 2/5/2025 Results

Found Games for 2025-02-05 00:00:00
score not valid, using -1 instead
score not valid, using -1 instead
Found 18 games for 02/05/2025

	Central Coast Section High School Girls Soccer Scores | 

In [None]:
# get all rankings from the merged scores and save them to Google Drive
# NOTE: get_rankings_from_scores is defined in a later cell ("Test of Script");
# that cell has to be run before this one.
rankings = get_rankings_from_scores(scores)
rankings.to_csv('/drive/My Drive/rankings_2425_03122025.csv', index=False)

# New Section

# Advanced Method

In [None]:
# create a new dataset for the scores and rankings
# one shot (not combining)

# combine with historical data

# Test of Script

In [None]:
def get_rankings_from_scores(schedule_df):
  """Compute ranking statistics of teams from the schedules and scores.

  A win earns 3 points, a tie 1 point each and a loss 0 points. Scores are
  converted to numbers first, so scores stored as text (e.g. '10') are
  compared numerically. A game is ignored when either score is missing, not
  numeric or negative (-1 marks an invalid score in the scraped data), or
  when a team name is missing.

  Example:
  scores = pd.read_csv('/drive/My Drive/scores.csv')
  rankings = get_rankings_from_scores(scores)

  Args:
      schedule_df (dataframe): the scores dataframe with the columns
      'Home Team', 'Home Score', 'Away Team' and 'Away Score'.

  Returns:
      ranking_df (dataframe): one row per team with 'Team', 'Total Points',
      'Total Games', 'Total Wins', 'Total Losses', 'Total Ties' and 'PPG'
      (points per game, 0 for a team without a counted game), sorted by
      'PPG' and then 'Total Points', both descending.

  """
  count_columns = ['Total Points', 'Total Games', 'Total Wins',
                   'Total Losses', 'Total Ties']

  home_teams = schedule_df['Home Team']
  away_teams = schedule_df['Away Team']
  # unparsable scores become NaN and are skipped below
  home_scores = pd.to_numeric(schedule_df['Home Score'], errors='coerce')
  away_scores = pd.to_numeric(schedule_df['Away Score'], errors='coerce')

  # sorted so that the order of tied teams is reproducible between runs
  teams = sorted(set(home_teams.dropna()) | set(away_teams.dropna()), key=str)
  stats = {team: dict.fromkeys(count_columns, 0) for team in teams}

  # accumulate in plain dicts; updating the dataframe per game is slow
  for home_team, away_team, home_score, away_score in zip(
      home_teams, away_teams, home_scores, away_scores):
    if pd.isna(home_team) or pd.isna(away_team):
      continue
    if pd.isna(home_score) or pd.isna(away_score):
      continue
    if home_score < 0 or away_score < 0:
      continue

    home = stats[home_team]
    away = stats[away_team]
    home['Total Games'] += 1
    away['Total Games'] += 1

    if home_score > away_score:  # home win
      home['Total Wins'] += 1
      home['Total Points'] += 3
      away['Total Losses'] += 1
    elif home_score < away_score:  # away win
      away['Total Wins'] += 1
      away['Total Points'] += 3
      home['Total Losses'] += 1
    else:  # tie
      home['Total Ties'] += 1
      away['Total Ties'] += 1
      home['Total Points'] += 1
      away['Total Points'] += 1

  ranking_df = pd.DataFrame(
      [{'Team': team, **counts} for team, counts in stats.items()],
      columns=['Team'] + count_columns)
  ranking_df = ranking_df.astype({column: 'int64' for column in count_columns})

  # avoid 0/0 for teams whose games were all ignored
  games = ranking_df['Total Games']
  ranking_df['PPG'] = (ranking_df['Total Points'] / games.where(games > 0)).fillna(0.0)

  ranking_df = ranking_df.sort_values(by=['PPG', 'Total Points'],
                                      ascending=False)
  return ranking_df


### analyze game summary page

In [None]:
# date whose scores page is analyzed in the following cells
single_date =  date(2023, 2, 28)
today = single_date.strftime("%m/%d/%Y")

# MaxPreps scores page (Central Coast Section, girls soccer) for that date
maxpreps_url = 'https://www.maxpreps.com/ca/central-coast-section/soccer/girls/scores/?date={}'.format(today)  # please change the date if needed

# Use requests to retrieve data from a given URL (a browser-like User-Agent header is sent)
maxpreps_response = requests.get(maxpreps_url, headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

# Parse the whole HTML page using BeautifulSoup; later cells read maxpreps_soup
maxpreps_soup = BeautifulSoup(maxpreps_response.text, 'html.parser')


In [None]:
# find all the div (each div is a game) and print the link in each div
for gamediv in maxpreps_soup.find_all("div", {"class": "contest-box-item"}):
    game_link = gamediv.find('a')
    # a game box may have no anchor at all, or an anchor without an href
    href = game_link.get('href') if game_link is not None else None
    if href is None:
        print("no href")
        continue
    print(href)

In [None]:
# find details of one game using the link to its details page
# game_detail_page = "https://www.maxpreps.com/games/2-28-2023/girls-soccer-winter-22-23/presentation-vs-windsor.htm?c=kWgQYflpLk60D3-K60z9pA" # simple example
game_detail_page = "https://www.maxpreps.com/games/2-22-2023/girls-soccer-winter-22-23/menlo-atherton-vs-mountain-view.htm?c=JPH2i5aDH0yYvdk1bipRfg" # complete example

# Use requests to retrieve data from a given URL (a browser-like User-Agent header is sent)
gamedetail_response = requests.get(game_detail_page, headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

# Parse the whole HTML page using BeautifulSoup
gamedetail_soup = BeautifulSoup(gamedetail_response.text, 'html.parser')

In [None]:
# print('Classes of each table:')
# for table in gamedetail_soup.find_all('table'):
#     print(table.get('class'))

def get_text(ele):
  """Return the text of an element, or None when no element was found."""
  if ele is None:
    return None
  return ele.text


def find_city(team_name, summary):
  """Find the city of a team from a game summary.

  The city is the text in parentheses that directly follows the team name,
  i.e. "<team_name> (<city>)"; the team name is matched case-insensitively
  and literally (characters such as '.' or '+' have no regex meaning).

  Args:
      team_name (str): the team name to look for.
      summary (str): the game summary text; may be None.

  Returns:
      the city (str), or None when the name or summary is missing or no
      "<team_name> (...)" occurs in the summary.

  """
  if team_name is None or summary is None:
    return None
  res = re.search(r'{} \((.*?)\)'.format(re.escape(team_name)), summary, re.IGNORECASE)
  return res.group(1) if res else None




# need to record home/away information in one row

## create a dataframe from statistics page

In [None]:
# find info for a game from its details url
def extract_game_info_from_details_page(page_url):
  """Extract team names, cities and scores from a game details page.

  Args:
      page_url (str): the URL of the game details page.

  Returns:
      a tuple (team_names, team_cities, first_half_scores,
      second_half_scores, total_scores, shootout_scores, is_winner,
      game_summary). The score lists hold one text entry (or None) per row
      of the box-score table and are empty when the page has no box score.
      team_names come from the 'school-names' links; team_cities are looked
      up in the game summary. When the request fails, all lists are empty
      and game_summary is None.

  """
  # Use requests to retrieve data from a given URL; the timeout keeps one
  # stalled request from hanging a whole scrape.
  try:
    page_response = requests.get(
        page_url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=30)
  except requests.RequestException as err:
    print(err)
    return [], [], [], [], [], [], [], None

  # Parse the whole HTML page using BeautifulSoup
  page_soup = BeautifulSoup(page_response.text, 'html.parser')

  # find summary
  game_summary = get_text(page_soup.find('p', attrs={'class' : 'contest-description'}))

  # find school names (names in table are not reliable)
  names_div = page_soup.find('div', attrs={'class' : 'school-names'})
  school_names = [a.text for a in names_div.find_all('a')] if names_div is not None else []

  # locate the box-score table; pages without a reported result may lack it
  box_score = page_soup.find('div', {"data-l-s-c": "box-score"})
  table = None
  if box_score is not None:
    table = box_score.find('table', class_='mx-grid boxscore d-b-s post soccer')
  if table is not None and table.tbody is not None:
    score_rows = table.tbody.find_all('tr')
  else:
    print('box score not found for {}'.format(page_url))
    score_rows = []

  first_half_scores = []
  second_half_scores = []
  total_scores = []
  shootout_scores = []
  is_winner = []
  for r in score_rows:
    first_half_scores.append(get_text(r.find('td', class_='firsthalf score dw')))
    second_half_scores.append(get_text(r.find('td', class_='secondhalf score dw')))
    total_scores.append(get_text(r.find('td', class_='score total score')))
    shootout_scores.append(get_text(r.find('td', class_='shootout stat tiebreaker dw')))
    is_winner.append(get_text(r.find('td', class_='winner last')))

  team_names = school_names
  # without a summary there is nothing to search the city in
  team_cities = [find_city(name, game_summary) if game_summary is not None else None
                 for name in team_names]
  return team_names, team_cities, first_half_scores, second_half_scores, total_scores, shootout_scores, is_winner, game_summary

# sample game details pages covering the different cases seen on MaxPreps
game_detail_page1 = "https://www.maxpreps.com/games/2-28-2023/girls-soccer-winter-22-23/presentation-vs-windsor.htm?c=kWgQYflpLk60D3-K60z9pA" # simple example
game_detail_page2 = "https://www.maxpreps.com/games/2-22-2023/girls-soccer-winter-22-23/menlo-atherton-vs-mountain-view.htm?c=JPH2i5aDH0yYvdk1bipRfg" # complete example with shootout
game_detail_page3 = "https://www.maxpreps.com/games/3-2-2023/girls-soccer-winter-22-23/hollister-vs-stone-ridge-christian.htm?c=tv8S33dtGkSS-2ZuVO0vLg"
game_detail_page4 = "https://www.maxpreps.com/games/11-29-2022/girls-soccer-winter-22-23/gateway-vs-oceana.htm?c=70Hrb_EAd0mWPDTpdhiXoQ" # example of missing scores
game_detail_page5 = "https://www.maxpreps.com/games/1-6-2023/girls-soccer-winter-22-23/king-city-vs-st-francis.htm?c=ImFn1XMo2EGTXl-9PYtm0A" # no game result is reported

# try the extraction on some of the samples (each call makes a live request)
# print(extract_game_info_from_details_page(game_detail_page1))
# print(extract_game_info_from_details_page(game_detail_page2))
print(extract_game_info_from_details_page(game_detail_page3))
print(extract_game_info_from_details_page(game_detail_page5))

In [None]:
# extract team info from school profile page
def extract_team_info_from_profile_page(school_profile_url):
  """Extract the school information from a school profile page.

  Args:
      school_profile_url (str): the URL of the school profile page.

  Returns:
      a tuple whose first item is the string of the page's <h1> (None when
      the page has no <h1>), followed by the text of every <dd> element in
      page order. The intended order is school name, address, mascot,
      color, school type, athletic director, phone. When the request fails
      the tuple is (None,).

  """
  # Use requests to retrieve data from a given URL; the timeout keeps one
  # stalled request from hanging a whole scrape.
  try:
    page_response = requests.get(
        school_profile_url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=30)
  except requests.RequestException as err:
    print(err)
    return (None,)

  # Parse the whole HTML page using BeautifulSoup
  page_soup = BeautifulSoup(page_response.text, 'html.parser')

  heading = page_soup.find("h1")
  school_name = heading.string if heading is not None else None

  info_list = [dd.text for dd in page_soup.find_all("dd")]

  return tuple([school_name] + info_list)
# sample school profile page used to try the extraction (makes a live request)
team_profile_page1 = "https://www.maxpreps.com/ca/mountain-view/mountain-view-spartans"

print(extract_team_info_from_profile_page(team_profile_page1))



In [None]:
# find team profile page url from game page url
def find_team_profile_pages(game_page_url):
  """Find the team profile page URLs linked from a game details page.

  Args:
      game_page_url (str): the URL of the game details page.

  Returns:
      a list of absolute URLs, one for each link with an href inside the
      'school-names' divs, in page order. The list is empty when the
      request fails or no such link exists.

  """
  from urllib.parse import urljoin

  # Use requests to retrieve data from a given URL; the timeout keeps one
  # stalled request from hanging a whole scrape.
  try:
    page_response = requests.get(
        game_page_url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=30)
  except requests.RequestException as err:
    print(err)
    return []

  # Parse the whole HTML page using BeautifulSoup
  page_soup = BeautifulSoup(page_response.text, 'html.parser')

  # href=True skips anchors without a link; urljoin resolves site-relative
  # hrefs against the MaxPreps host
  return [urljoin("https://www.maxpreps.com", link['href'])
          for names_div in page_soup.find_all("div", {"class": "school-names"})
          for link in names_div.find_all('a', href=True)]

# try the lookup on a sample game details page (makes a live request)
game_detail_page5 = "https://www.maxpreps.com/games/1-6-2023/girls-soccer-winter-22-23/king-city-vs-st-francis.htm?c=ImFn1XMo2EGTXl-9PYtm0A" # no game result is reported
print(find_team_profile_pages(game_detail_page5))

In [None]:
from tqdm.notebook import tqdm

# create dataframe of teams
# NOTE(review): 'Mascot' is declared with dtype 'int' but is filled with text
# taken from the profile page below — confirm the intended dtype.
teams_df = pd.DataFrame({'School Name': pd.Series(dtype='str'),
                         'School Alias': pd.Series(dtype='str'),
                        'School Address': pd.Series(dtype='str'),
                        'Mascot': pd.Series(dtype='int'),
                        'School Link': pd.Series(dtype='str'),
                        'Team Link': pd.Series(dtype='str')})

# load the schedule df, then
# step 1: find team link from each game details page
# step 2: add a record of team info
# step 3: dedupe
# note: moving forward we don't need to parse school info from game details page

hist_scores_df = pd.read_csv('/drive/My Drive/scores_2223_03182023.csv')
for index, row in tqdm(hist_scores_df.iterrows(), total=hist_scores_df.shape[0]):
    game_url = row['Web Link']
    team_profile_urls = find_team_profile_pages(game_url)
    for u in team_profile_urls:
        # the school profile URL is the team URL without the schedule suffix
        # (str.removesuffix requires Python 3.9+)
        school_profile_url = u.removesuffix("soccer/girls/winter/schedule/")
        info_list = extract_team_info_from_profile_page(school_profile_url)
        try:
          # pd.concat([new_row,df.loc[:]]).reset_index(drop=True)
          # info_list[0] is the school name; [1] and [2] are used as address
          # and mascot. A profile page with fewer entries raises IndexError.
          m_result = pd.DataFrame({'School Name':info_list[0],
                      'School Alias':"",
                      'School Address':info_list[1],
                      'Mascot':info_list[2],
                      'School Link':school_profile_url,
                      'Team Link':u }, index=[0])
          # the new row is placed in front of the existing rows
          teams_df = pd.concat([m_result, teams_df.loc[:]]).reset_index(drop=True)
          # teams_df = teams_df.append(m_result, ignore_index=True)
        except IndexError as err:
          # report the pages that could not be parsed and carry on
          print(err)
          print(school_profile_url)
          print(u)

# every team appears once per game it played, so drop the repeated rows
teams_df = teams_df.drop_duplicates(keep='last')
# teams_df.to_csv('/drive/My Drive/teams_2223_03182023.csv', index=False)
print(teams_df)


In [None]:
# create dateframe of game information
def create_game_info_dataframe_from_stats_page(page_url, today):
  """Create the game info dataframe for one day from a scores page.

  Every game box on the page links to a game details page, which is
  fetched to read the teams, cities, scores and summary. The second team of
  the details page is recorded as the home team and the first as the away
  team. When both teams have a shootout score, the shootout scores are
  recorded instead of the total scores. Games whose details page does not
  list exactly two total scores are skipped.

  Args:
      page_url (str): the URL of the scores page.
      today (date): the date the page is expected to show; it is checked
      against the date in the page title because the details pages have no
      date.

  Returns:
      schedule_df (dataframe): one row per game with 'Date', 'Home Team',
      'Home City', 'Home Score', 'Away Team', 'Away City', 'Away Score',
      'Game Summary' and 'Web Link'. Scores are integers; a score that is
      missing or not a number is left empty (<NA>). Returns None when the
      request fails or the page is not for the requested date.

  """
  # Use requests to retrieve data from a given URL; the timeout keeps one
  # stalled request from hanging a whole scrape.
  try:
    page_response = requests.get(
        page_url,
        headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        timeout=30)
  except requests.RequestException as err:
    print(err)
    return None

  # Parse the whole HTML page using BeautifulSoup
  page_soup = BeautifulSoup(page_response.text, 'html.parser')

  # verify the date: the title looks like "<section> Scores | <m/d/Y> Results"
  try:
    title_date = datetime.strptime(page_soup.title.text.split('|')[1].strip().split()[0], '%m/%d/%Y').date()
  except (AttributeError, IndexError, ValueError):
    # no <title>, no '|' separator, or no parsable date
    title_date = None

  if title_date != today:
    print('No Games for {}'.format(today))
    return None

  print('Found Games for {}'.format(title_date))
  rows = []
  # each div is one game and holds the link to its details page
  for gamediv in page_soup.find_all("div", {"class": "contest-box-item"}):
    try:
      game_detail_url = gamediv.find('a')['href']
      print(game_detail_url)
      (team_names, team_cities, _first_half, _second_half, total_scores,
       shootout_scores, _is_winner, game_summary) = extract_game_info_from_details_page(game_detail_url)
      if len(total_scores) != 2:
        continue

      # a shootout decides the game only when both teams have a shootout score
      if shootout_scores[0] is None or shootout_scores[1] is None:
        final_scores = total_scores
      else:
        final_scores = shootout_scores

      rows.append({'Date': today,
                   'Home Team': team_names[1],
                   'Home City': team_cities[1],
                   'Home Score': _parse_score(final_scores[1]),
                   'Away Team': team_names[0],
                   'Away City': team_cities[0],
                   'Away Score': _parse_score(final_scores[0]),
                   'Game Summary': game_summary,
                   'Web Link': game_detail_url})
    except (KeyError, TypeError, IndexError) as err:
      # no link in the game box, or incomplete details: skip this game
      print(err)

  if not rows:
    # keep the declared column dtypes for the "no usable games" result
    return pd.DataFrame({'Date': pd.Series(dtype='datetime64[ns]'),
                         'Home Team': pd.Series(dtype='str'),
                         'Home City': pd.Series(dtype='str'),
                         'Home Score': pd.Series(dtype='int'),
                         'Away Team': pd.Series(dtype='str'),
                         'Away City': pd.Series(dtype='str'),
                         'Away Score': pd.Series(dtype='int'),
                         'Game Summary': pd.Series(dtype='str'),
                         'Web Link': pd.Series(dtype='str')})

  # build the frame once instead of concatenating row by row
  schedule_df = pd.DataFrame(rows)
  # nullable integers: real numbers for comparisons, empty cells for missing
  schedule_df = schedule_df.astype({'Home Score': 'Int64', 'Away Score': 'Int64'})
  return schedule_df


def _parse_score(score_text):
  """Return the score text as an int, or None when it is missing or not a number."""
  try:
    return int(score_text.strip())
  except (AttributeError, ValueError):
    return None




In [None]:
# try the game info extraction for a single day (makes live requests)
today = date(2022, 11, 29)
# the scores page takes the date in '%m/%d/%Y' format
maxpreps_url = 'https://www.maxpreps.com/ca/central-coast-section/soccer/girls/scores/?date={}'.format(today.strftime("%m/%d/%Y"))
print(maxpreps_url)


print(create_game_info_dataframe_from_stats_page(maxpreps_url, today))

In [None]:
# reprocess all season data
from tqdm.notebook import tqdm

start_date = date(2022, 11, 1)
end_date = date(2023, 3, 18)  # exclusive: the last day scraped is the day before

def daterange(start_date, end_date):
    """Yield every date from start_date up to, but not including, end_date."""
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

dfs = []
for today in tqdm(daterange(start_date, end_date)):
    scores_df = create_game_info_dataframe_from_stats_page(
        'https://www.maxpreps.com/ca/central-coast-section/soccer/girls/scores/?date={}'.format(today.strftime("%m/%d/%Y")),
        today)
    # None means "no games on that day"; only real frames may reach pd.concat
    if scores_df is None:
        continue
    print('Found {} games for {}'.format(len(scores_df), today))
    dfs.append(scores_df)

# see pd.concat documentation for more info
if dfs:
    scores_df = pd.concat(dfs)
    scores_df = scores_df.drop_duplicates(keep='last')
    print(scores_df)
    scores_df.to_csv('/drive/My Drive/scores_2223_03182023.csv', index=False)
else:
    # nothing was scraped: leave an existing CSV untouched
    scores_df = pd.DataFrame()
    print('No games found between {} and {}'.format(start_date, end_date))

In [None]:
# get all rankings from the reprocessed season scores (scores_df of the
# previous cell) and save them to Google Drive
rankings = get_rankings_from_scores(scores_df)
rankings.to_csv('/drive/My Drive/rankings_2223_03182023.csv', index=False)

In [None]:
pip install maxpreps_scraper

Collecting maxpreps_scraper
  Downloading maxpreps_scraper-0.1.2-py3-none-any.whl.metadata (3.7 kB)
Downloading maxpreps_scraper-0.1.2-py3-none-any.whl (7.1 kB)
Installing collected packages: maxpreps_scraper
Successfully installed maxpreps_scraper-0.1.2


# Scrape Data

In [13]:
# from maxpreps_scraper import MaxPrepsScraper
# NOTE: MaxPrepsScraperV1 is defined in the "Utility Function" cell further
# down; that cell has to be run before this one.
scraper = MaxPrepsScraperV1()

# what to scrape; the same values are used in the output file name
state = 'tx'
sport = 'soccer'
year = '21-22'
# Get Team Rankings
# rankings_df = scraper.get_rankings(state = 'de', sport = 'football', year = '23-24')

# Get Contest Data
contests_df = scraper.get_contests(state=state, sport=sport, year=year, boys=False)

# output dataframes
# rankings_df.head(10)
# NOTE(review): this head() is not the last expression of the cell, so the
# notebook presumably does not display it — confirm whether it is still needed.
contests_df.head(10)
contests_df.to_csv(f'/drive/My Drive/games_{state}_{sport}_{year}_girls.csv', index=False)

Scraping Schools for  tx:  11%|█▏        | 80/709 [00:09<00:55, 11.43 schools/s]

Failed to fetch /tx/san-antonio/madison-mavericks/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/temple/lake-belton-broncos/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  12%|█▏        | 83/709 [00:09<00:45, 13.73 schools/s]

Failed to fetch /tx/mesquite/horn-jaguars/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  12%|█▏        | 85/709 [00:10<01:11,  8.72 schools/s]

Failed to fetch /tx/el-paso/franklin-cougars/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/melissa/melissa-cardinals/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/spring/grand-oaks-grizzlies/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  14%|█▎        | 96/709 [00:10<00:55, 11.11 schools/s]

Failed to fetch /tx/austin/austin-maroons/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  14%|█▍        | 102/709 [00:11<00:41, 14.61 schools/s]

Failed to fetch /tx/arlington/martin-warriors/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/dallas/wilson-wildcats/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  15%|█▌        | 107/709 [00:11<00:38, 15.45 schools/s]

Failed to fetch /tx/richmond/foster-falcons/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/hurst/bell-blue-raiders/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/austin/anderson-trojans/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  16%|█▌        | 115/709 [00:12<00:52, 11.24 schools/s]

Failed to fetch /tx/frisco/lebanon-trail-trail-blazers/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/buda/johnson-jaguars/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/mcallen/mcallen-bulldogs/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  18%|█▊        | 125/709 [00:13<00:45, 12.88 schools/s]

Failed to fetch /tx/new-braunfels/new-braunfels-unicorns/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/austin/mcneil-mavericks/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/harlingen/harlingen-cardinals/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/corinth/lake-dallas-falcons/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  18%|█▊        | 131/709 [00:13<00:37, 15.54 schools/s]

Failed to fetch /tx/wichita-falls/wichita-falls-coyotes/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  19%|█▉        | 133/709 [00:13<00:41, 13.74 schools/s]

Failed to fetch /tx/stephenville/stephenville-yellow-jackets-honeybees/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/bay-city/bay-city-blackcats/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  19%|█▉        | 138/709 [00:14<00:38, 14.73 schools/s]

Failed to fetch /tx/kingwood/kingwood-park-panthers/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/plano/john-paul-ii-cardinals/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  20%|█▉        | 141/709 [00:14<00:34, 16.57 schools/s]

Failed to fetch /tx/tomball/tomball-cougars/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  20%|██        | 145/709 [00:14<00:41, 13.67 schools/s]

Failed to fetch /tx/san-antonio/san-antonio-christian-lions/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/fulshear/jordan-warriors/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  21%|██        | 150/709 [00:15<00:50, 11.11 schools/s]

Failed to fetch /tx/edinburg/vela-sabercats/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/corpus-christi/calallen-wildcats/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/pflugerville/weiss-wolves/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  22%|██▏       | 154/709 [00:15<00:47, 11.81 schools/s]

Failed to fetch /tx/houston/klein-cain-hurricanes/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/kingwood/kingwood-mustangs/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  23%|██▎       | 162/709 [00:16<00:45, 11.95 schools/s]

Failed to fetch /tx/jacksonville/jacksonville-fightin-indians/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  23%|██▎       | 166/709 [00:16<01:04,  8.38 schools/s]

Failed to fetch /tx/spring/klein-collins-tigers/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/longview/pine-tree-pirates/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  25%|██▌       | 178/709 [00:17<00:48, 10.86 schools/s]

Failed to fetch /tx/the-woodlands/college-park-cavaliers/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  27%|██▋       | 191/709 [00:19<00:49, 10.38 schools/s]

Failed to fetch /tx/paris/paris-wildcats/soccer/girls/winter/21-22/schedule/ (status code: 500)
Failed to fetch /tx/montgomery/montgomery-bears/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  27%|██▋       | 194/709 [00:19<00:55,  9.31 schools/s]

Failed to fetch /tx/brownsville/porter-cowboys/soccer/girls/winter/21-22/schedule/ (status code: 500)


Scraping Schools for  tx:  31%|███       | 219/709 [00:25<02:53,  2.82 schools/s]

Failed to fetch /tx/bellaire/bellaire-cardinals/soccer/girls/winter/21-22/schedule/ (status code: 504)
Failed to fetch /tx/brownsville/hanna-golden-eagles/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  31%|███▏      | 223/709 [00:25<01:26,  5.64 schools/s]

Failed to fetch /tx/palestine/palestine-wildcats/soccer/girls/winter/21-22/schedule/ (status code: 504)
Failed to fetch /tx/port-neches/port-neches-groves-indians/soccer/girls/winter/21-22/schedule/ (status code: 504)
Failed to fetch /tx/san-antonio/antonian-prep-apaches/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  32%|███▏      | 225/709 [00:25<01:25,  5.67 schools/s]

Failed to fetch /tx/fort-worth/paschal-panthers/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  33%|███▎      | 232/709 [00:26<01:01,  7.76 schools/s]

Failed to fetch /tx/el-paso/jefferson-silver-foxes/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  34%|███▎      | 239/709 [00:27<00:40, 11.63 schools/s]

Failed to fetch /tx/deer-park/deer-park-deer/soccer/girls/winter/21-22/schedule/ (status code: 504)
Failed to fetch /tx/el-paso/bel-air-highlanders/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  36%|███▋      | 258/709 [00:30<00:58,  7.77 schools/s]

Failed to fetch /tx/copperas-cove/copperas-cove-bulldawgs/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  37%|███▋      | 262/709 [00:31<01:35,  4.66 schools/s]

Failed to fetch /tx/san-elizario/san-elizario-eagles/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  39%|███▉      | 276/709 [00:32<00:36, 12.00 schools/s]

Failed to fetch /tx/plano/plano-east-panthers/soccer/girls/winter/21-22/schedule/ (status code: 504)
Failed to fetch /tx/forney/north-forney-falcons/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx:  41%|████      | 288/709 [00:33<00:41, 10.27 schools/s]

Failed to fetch /tx/houston/langham-creek-lobos/soccer/girls/winter/21-22/schedule/ (status code: 504)


Scraping Schools for  tx: 100%|██████████| 709/709 [01:20<00:00,  8.81 schools/s]


data collected before cleaning


In [None]:
# list the columns of the scraped contests dataframe
print(contests_df.columns)

Index(['Date', 'Team 1', 'Team 2', 'Team 1 Score', 'Team 2 Score', 'Outcome',
       'Forfeit', 'Venue', 'Game Type', 'Team 1 Address', 'Team 1 City',
       'Team 1 State', 'Team 1 Zipcode', 'Team 1 URL', 'Team 2 URL'],
      dtype='object')


In [None]:
!pip show maxpreps-scraper

Name: maxpreps-scraper
Version: 0.1.2
Summary: A Python scraper for MaxPreps high school sports data
Home-page: https://github.com/raghavdhir03/maxpreps_scraper
Author: Raghav Dhir
Author-email: dhir.raghav@gmail.com
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: beautifulsoup4, html5lib, lxml, pandas, requests, tqdm
Required-by: 


# Utility Function

In [11]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
from io import StringIO
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
import re

class MaxPrepsScraperV1():
    """Scrape MaxPreps ranking pages and team schedule pages into DataFrames.

    ``get_rankings`` returns the paginated state ranking table.
    ``get_contests`` follows the school links found on the ranking pages,
    downloads each school's schedule page and returns one cleaned row per game.
    """

    BASE_URL = 'https://www.maxpreps.com'

    # Sports accepted by get_rankings() / get_contests().
    SUPPORTED_SPORTS = ('basketball', 'football', 'baseball', 'soccer', 'volleyball', 'lacrosse', 'softball')
    # Sports rejected when boys=False / boys=True respectively.
    BOYS_ONLY_SPORTS = ('football', 'baseball')
    GIRLS_ONLY_SPORTS = ('softball', 'volleyball')
    # States for which soccer requests are accepted.
    SOCCER_STATES = ('tx', 'la', 'ms', 'hi', 'ca', 'fl', 'az')
    # Season whose ranking links get "/<year>/schedule" appended in get_contests().
    # NOTE(review): presumably links for this season point at the team page
    # rather than the schedule page - confirm against the site each season.
    CURRENT_SEASON = '24-25'
    # Seconds to wait for any single HTTP request.
    REQUEST_TIMEOUT = 10
    # Number of schedule pages downloaded concurrently.
    MAX_WORKERS = 10
    # Column order of the DataFrame returned by get_contests().
    CONTEST_COLUMNS = ('Date', 'Team 1', 'Team 2', 'Team 1 Score', 'Team 2 Score', 'Outcome', 'Forfeit', 'Venue', 'Game Type', 'Team 1 Address', 'Team 1 City', 'Team 1 State', 'Team 1 Zipcode', 'Team 1 URL', 'Team 2 URL')

    def __init__(self):
        """Create the HTTP session used for the sequential ranking-page requests."""
        self.session = requests.Session()

    def get_rankings(self, state: str, sport: str, year: str, boys=True):
        """Download every ranking page for a state/sport/season.

        Args:
            state: State path segment used in the URL (e.g. ``'tx'``).
            sport: One of ``SUPPORTED_SPORTS``.
            year: Season path segment used in the URL (e.g. ``'21-22'``).
            boys: ``True`` for the boys' rankings, ``False`` for the girls'.

        Returns:
            DataFrame: the ranking tables of all pages stacked in page order,
            with columns exactly as parsed from the site. When a ``'Team'``
            column is present, a doubled leading capital is collapsed
            ("AAustin" -> "Austin"). Empty DataFrame when no page was found.

        Raises:
            ValueError: if the sport / gender / state combination is rejected.
        """
        state_url = self._build_rankings_url(state, sport, year, boys)

        page_frames = [self._scrape_table(soup) for soup in self._iter_ranking_pages(state_url)]
        if not page_frames:
            # Nothing was fetched: return an empty frame instead of failing on
            # the 'Team' lookup below.
            return pd.DataFrame()

        # Concatenate once instead of growing a DataFrame page by page.
        full_df = pd.concat(page_frames).reset_index(drop=True)

        if 'Team' in full_df.columns:
            # Take care of schools with no mascot. Turns AAustin into Austin.
            full_df['Team'] = full_df['Team'].str.replace(r'^([A-Z])\1', r'\1', regex=True)

        return full_df

    # Sports that this function supports: basketball, football, baseball, soccer, volleyball, lacrosse, softball.
    def get_contests(self, state: str, sport: str, year: str, boys: bool = True, cities=None):
        """Scrape the schedule page of every ranked school into one game table.

        Args:
            state: State path segment used in the URL (e.g. ``'tx'``).
            sport: One of ``SUPPORTED_SPORTS``.
            year: Season path segment used in the URL (e.g. ``'21-22'``).
            boys: ``True`` for boys' teams, ``False`` for girls' teams.
            cities: Optional iterable of city names; when given, only schools
                whose link has a matching city segment are scraped.

        Returns:
            DataFrame: one row per scraped game with the columns listed in
            ``CONTEST_COLUMNS``. Schools whose page could not be fetched or
            parsed are reported on stdout and skipped.

        Raises:
            ValueError: if the sport / gender / state combination is rejected.
        """
        state_url = self._build_rankings_url(state, sport, year, boys)

        # Step 1: Collect all school links from the ranking pages.
        school_list = []
        for soup in self._iter_ranking_pages(state_url):
            school_list += self._get_school_list(soup, cities, sport=sport)

        if year == self.CURRENT_SEASON:
            school_list = [(name, url+f"/{year}/schedule") for name, url in school_list]

        # Step 2: Worker run in the thread pool; never raises, returns None on failure.
        def _fetch_and_scrape(school, url, base_url=self.BASE_URL):
            try:
                # A plain requests.get is kept for the worker threads, as in the
                # original code, so self.session is not shared across threads.
                response = requests.get(base_url + url, timeout=self.REQUEST_TIMEOUT)
                if response.status_code != 200:
                    print(f"Failed to fetch {url} (status code: {response.status_code})")
                    return None

                soup = BeautifulSoup(response.text, 'html.parser')
                table = self._scrape_table(soup)

                opponent_urls = self._extract_opponent_urls(soup)
                if len(opponent_urls) != len(table):
                    # The link list cannot be aligned with the parsed rows; keep
                    # the games and leave the links empty rather than losing the
                    # whole school to a length-mismatch error.
                    opponent_urls = [None] * len(table)
                table['Team 2 URL'] = opponent_urls

                location_info = self._extract_location_info(soup)
                table['Team 1'] = school
                table['Team 1 Address'] = location_info['address']
                table['Team 1 City'] = location_info['city']
                table['Team 1 State'] = location_info['state']
                table['Team 1 Zipcode'] = location_info['zipcode']
                table['Team 1 URL'] = url
                return table
            except Exception as e:
                # Best-effort scraping: one bad page must not abort the run.
                print(f"Error scraping {school} at {url}: {e}")
            return None

        # Step 3: Run threads
        frames = []
        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
            futures = [executor.submit(_fetch_and_scrape, school, url) for school, url in school_list]

            with tqdm(total=len(futures), desc=f"Scraping Schools for {', '.join(cities) if cities else ''} {state}", unit=" schools") as pbar:
                for future in as_completed(futures):
                    result_df = future.result()
                    if result_df is not None and not result_df.empty:
                        frames.append(result_df)
                    pbar.update(1)

        print("data collected before cleaning")
        # Concatenate once; an empty frame is handled by _clean_contest_data.
        full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        return self._clean_contest_data(full_df)

    def _build_rankings_url(self, state, sport, year, boys):
        """Validate the request and return the rankings URL without page suffix.

        Raises:
            ValueError: unsupported sport, unsupported gender for the sport, or
                soccer requested for a state outside ``SOCCER_STATES``.
        """
        if sport not in self.SUPPORTED_SPORTS:
            raise ValueError(f"Sport '{sport}' is not supported. Please choose from: basketball, football, baseball, soccer, volleyball, lacrosse, softball")

        if (not boys and sport in self.BOYS_ONLY_SPORTS) or (boys and sport in self.GIRLS_ONLY_SPORTS):
            raise ValueError(f"{'boys' if boys else 'girls'} {sport} is not supported")

        if sport == 'soccer' and state not in self.SOCCER_STATES:
            raise ValueError(f"Soccer is not supported in {state}")

        # The URL carries a 'girls/' segment only for girls' teams of sports
        # that are not in GIRLS_ONLY_SPORTS, and a 'winter/' segment for soccer.
        gender_part = '' if (boys or sport in self.GIRLS_ONLY_SPORTS) else 'girls/'
        season_part = 'winter/' if sport == 'soccer' else ''
        return f"{self.BASE_URL}/{state}/{sport}/{gender_part}{season_part}{year}/rankings"

    def _iter_ranking_pages(self, state_url):
        """Yield the parsed soup of ranking pages 1, 2, ... in order.

        Iteration stops at the first page that does not answer with HTTP 200
        or that contains no ``<table>`` element.
        """
        page = 1
        while True:
            response = self.session.get(f'{state_url}/{page}/', timeout=self.REQUEST_TIMEOUT)
            if response.status_code != 200:
                return

            soup = BeautifulSoup(response.text, 'html.parser')
            if soup.find('table') is None:
                return

            yield soup
            page += 1

    def _scrape_table(self, soup):
        """Return the first ``<table>`` of ``soup`` as a DataFrame.

        Raises:
            ValueError: if the page contains no ``<table>`` element.
        """
        table = soup.find('table')
        if table is None:
            raise ValueError("No <table> element found in page")
        return pd.read_html(StringIO(str(table)))[0]

    def _get_school_list(self, soup, cities=None, base_url='https://www.maxpreps.com', sport='soccer'):
        """Extract ``(school name, link)`` pairs from a ranking page.

        Args:
            soup: Parsed ranking page.
            cities: Optional iterable of city names. A link is kept only when
                its third ``/``-separated segment equals one of the names
                lower-cased with spaces replaced by hyphens.
            base_url: Unused; kept for backward compatibility.
            sport: Only links whose href contains this text are kept.

        Returns:
            list[tuple[str, str]]: link text and href of the first anchor of
            every matching table cell, in document order.
        """
        city_slugs = None
        if cities is not None:
            city_slugs = {city.lower().replace(" ", "-") for city in cities}

        school_data = []
        for td in soup.find_all('td'):
            a_tag = td.find('a', href=True)
            if not a_tag or sport not in a_tag['href']:
                continue

            href = a_tag['href']
            if city_slugs is not None:
                parts = href.lower().split('/')
                # Links too short to carry a city segment cannot match a filter.
                if len(parts) < 3 or parts[2] not in city_slugs:
                    continue

            school_data.append((a_tag.get_text(strip=True), href))

        return school_data

    def _extract_location_info(self, soup):
        """Read the school's postal address from the page's ``<address>`` element.

        Note: the first ``<span>`` inside the address element is removed from
        ``soup`` as a side effect.

        Returns:
            dict: keys ``address``, ``city``, ``state``, ``zipcode``; a value is
            ``None`` when it is missing or could not be parsed.
        """
        address_element = soup.select_one('address')
        if not address_element:
            return {
                "address": None,
                "city": None,
                "state": None,
                "zipcode": None
            }

        # Get city/state/zip from <span>, then drop the span so that the text
        # remaining in the address block is the street address only.
        city_state_text = ""
        city_state_span = address_element.find('span')
        if city_state_span:
            city_state_text = city_state_span.get_text(strip=True)
            city_state_span.decompose()

        street_address = address_element.get_text(strip=True)

        # Expected shape: "<city>, <2-letter state> <5-digit zip>[-<4 digits>]".
        city, state, zipcode = None, None, None
        match = re.match(r'^(.*?),\s*([A-Z]{2})\s*(\d{5})(?:-\d{4})?$', city_state_text)
        if match:
            city, state, zipcode = match.groups()

        return {
            "address": street_address or None,
            "city": city,
            "state": state,
            "zipcode": zipcode
        }

    def _extract_opponent_urls(self, soup):
        """Return the opponent link of each schedule row (``None`` if absent).

        The 'Opponent' column is located through the ``<th>`` cells of the
        first table row. Rows with too few ``<td>`` cells are skipped. Only
        hrefs ending in ``/schedule/`` are kept; other rows contribute ``None``.

        Returns:
            list: one entry per considered row; empty when the page has no
            table, no rows, or no 'Opponent' header.
        """
        table = soup.find('table')
        if not table:
            return []

        # Step 1: Get header row and find the column index for 'Opponent'
        header_row = table.find('tr')
        if header_row is None:
            return []
        headers = [th.get_text(strip=True).lower() for th in header_row.find_all('th')]

        try:
            opponent_idx = headers.index('opponent')
        except ValueError:
            return []

        # Step 2: Go through all remaining rows and get href from the opponent column
        opponent_urls = []
        for row in table.find_all('tr')[1:]:  # skip header row
            cells = row.find_all('td')
            if len(cells) <= opponent_idx:
                continue

            a_tag = cells[opponent_idx].find('a', href=True)
            if a_tag and a_tag['href'].endswith('/schedule/'):
                opponent_urls.append(a_tag['href'])
            else:
                opponent_urls.append(None)

        return opponent_urls

    def _clean_contest_data(self, df):
        """Turn raw schedule rows into the final game table.

        Splits 'Opponent' into 'Team 2', 'Venue' and 'Game Type', splits
        'Result' into 'Outcome', the two scores and 'Forfeit', drops the raw
        columns and reorders the rest to ``CONTEST_COLUMNS``. The input frame
        is not modified.

        Args:
            df: Raw rows holding at least 'Opponent', 'Result', 'Team 1' and
                the remaining ``CONTEST_COLUMNS`` that are not derived here.

        Returns:
            DataFrame: cleaned rows; an empty frame with ``CONTEST_COLUMNS``
            when ``df`` is empty.
        """
        new_order = list(self.CONTEST_COLUMNS)
        if df.empty:
            # Nothing was scraped: return the expected schema with no rows.
            return pd.DataFrame(columns=new_order)

        df = df.copy()  # do not mutate the caller's frame

        # --- 1. Clean 'Opponent' column ---
        # Optional "vs"/"vs."/"@" prefix, opponent name, up to three trailing '*'.
        opponent_pattern = r'(?P<VenueRaw>vs\.?|@)?\s*(?P<Team2>.+?)(?P<Star>\*{0,3})$'
        opponent_info = df['Opponent'].str.extract(opponent_pattern)

        # Venue: no recognised prefix means neutral.
        df['Venue'] = opponent_info['VenueRaw'].map({
            'vs': 'Home',
            'vs.': 'Home',
            '@': 'Away'
        }).fillna('Neutral')

        # Game Type: encoded by the number of trailing stars.
        df['Game Type'] = opponent_info['Star'].map({
            '': 'Regular Season',
            '*': 'District',
            '**': 'Playoff',
            '***': 'Tournament'
        }).fillna('Regular Season')

        df['Team 2'] = opponent_info['Team2'].str.strip()

        # --- 2. Clean 'Result' column ---
        # Either "<W|L> <n>-<m>" or "<W|L>(FF)".
        # NOTE(review): any other result text (e.g. a tie, if the site reports
        # one) matches nothing and leaves Outcome and both scores missing.
        result_pattern = r'(?P<Outcome>[WL])(?: (?P<Team1_Score>\d+)-(?P<Team2_Score>\d+)|\((?P<Forfeit>FF)\))'
        result_info = df['Result'].str.extract(result_pattern)

        first_score = pd.to_numeric(result_info['Team1_Score'], errors='coerce')
        second_score = pd.to_numeric(result_info['Team2_Score'], errors='coerce')
        # Swap scores if team 1 lost - presumably the result text lists the
        # winner's score first; verify against the site.
        lost = result_info['Outcome'] == 'L'

        df['Outcome'] = result_info['Outcome']
        df['Team 1 Score'] = first_score.where(~lost, second_score)
        df['Team 2 Score'] = second_score.where(~lost, first_score)
        df['Forfeit'] = result_info['Forfeit'].notna()

        cols_to_drop = ['Opponent', 'Result', 'Game Info', 'Match Info']
        df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

        # Take care of schools with no mascot. Turns AAustin into Austin.
        # The replacement must keep one copy of the doubled letter.
        df['Team 1'] = df['Team 1'].str.replace(r'^([A-Z])\1', r'\1', regex=True)

        # Reorder columns
        return df[new_order]
