<a href="https://colab.research.google.com/github/zhenyisx/scoutoid/blob/main/scraper2db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Scrapes a website and populates a database (CSV files) on Google Drive. The files can then be uploaded to Google Storage for clients (i.e., Streamlit) to consume.

# Imports and Function Definitions

In [5]:
# Imports
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date, datetime, timedelta

# Mount Google Drive at /drive; the script cells below read and write
# CSV files under '/drive/My Drive/'.
from google.colab import  drive
drive.mount('/drive')

%matplotlib inline

Mounted at /drive


In [6]:
def _parse_score(team_item):
  """Return the integer score shown in one team's `li`, or -1 if unavailable."""
  score_div = team_item.find('div', {'class': 'score'})
  try:
    return int(score_div.text.strip())
  except (AttributeError, ValueError):
    # AttributeError: no score element was found (find returned None).
    # ValueError: the score text is not an integer.
    print('score not valid, using -1 instead')
    return -1


def _parse_team_name(team_item):
  """Return the stripped team name from one team's `li`, or None if missing."""
  name_div = team_item.find('div', {'class': 'name'})
  if name_div is None:
    return None
  return name_div.text.strip()


def get_scores_from_soup(maxpreps_day_soup, this_day):
  """Get the scores of all games listed on one MaxPreps day page.

  Args:
      maxpreps_day_soup (soup): a soup parsed from MaxPreps HTML by BS.
      this_day: the date of the games; stored as-is in the 'Date' column.

  Returns:
      schedule_df (dataframe): one row per game with the columns 'Date',
      'Home Team', 'Home Score', 'Away Team' and 'Away Score'. A score
      that is missing or not an integer is stored as -1. The dataframe is
      empty when the page has no 'contests' section or no usable games.

  """
  columns = ['Date', 'Home Team', 'Home Score', 'Away Team', 'Away Score']

  # The selectors below are MaxPreps specific (found with Chrome DevTools).
  contests = maxpreps_day_soup.find('div', {'class': 'contests'})
  games = [] if contests is None else contests.find_all('ul', {'class': 'teams'})

  # Collect plain records and build the dataframe once: DataFrame.append was
  # removed in pandas 2.0 and growing a frame row by row is quadratic.
  records = []
  for game in games:
    team_items = game.find_all('li')
    if len(team_items) < 2:
      print('game entry incomplete, skipping')
      continue
    home_item, away_item = team_items[0], team_items[1]

    home_name = _parse_team_name(home_item)
    away_name = _parse_team_name(away_item)
    if home_name is None or away_name is None:
      print('team name not found, skipping game')
      continue

    records.append({'Date': this_day,
                    'Home Team': home_name,
                    'Home Score': _parse_score(home_item),
                    'Away Team': away_name,
                    'Away Score': _parse_score(away_item)})

  if not records:
    # Typed empty frame so callers can still concatenate / select columns.
    return pd.DataFrame({'Date': pd.Series(dtype='datetime64[ns]'),
                         'Home Team': pd.Series(dtype='str'),
                         'Home Score': pd.Series(dtype='int'),
                         'Away Team': pd.Series(dtype='str'),
                         'Away Score': pd.Series(dtype='int')})

  schedule_df = pd.DataFrame(records, columns=columns)
  return schedule_df.drop_duplicates(keep='last')
  

def get_scores_from_maxpreps_for_one_day(today):
  """Get the scores of games from MaxPreps for one day.

  Example:
  df = get_scores_from_maxpreps_for_one_day("12/1/2022")
  print(df)

  Args:
      today (str): the date str in '%m/%d/%Y' format.

  Returns:
      schedule_df (dataframe): the dataframe of all game results, or None
      when the page title is missing, its date cannot be parsed, or its
      date does not match `today`.

  """
  # MaxPreps scores page for the requested date
  # (Central Coast Section, girls soccer).
  maxpreps_url = 'https://www.maxpreps.com/ca/central-coast-section/soccer/girls/scores/?date={}'.format(today)

  # Use requests to retrieve the page. The timeout keeps a stalled
  # connection from hanging the notebook forever.
  maxpreps_response = requests.get(
      maxpreps_url,
      headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
      timeout=30)

  # Parse the whole HTML page using BeautifulSoup
  maxpreps_soup = BeautifulSoup(maxpreps_response.text, 'html.parser')

  no_games_message = 'No Games found for {}'.format(today)

  # A page without a <title> cannot be matched against the requested date.
  if maxpreps_soup.title is None:
    print(no_games_message)
    return None

  # Title of the parsed page
  title_text = maxpreps_soup.title.text
  print(title_text)

  # Check that the page really shows the requested date. The title has the
  # form '<section> | <m/d/Y> Results | MaxPreps'; the date is the first
  # word of the second '|'-separated field.
  # (An alternative source for the shown date would be the active button of
  # the page's calendar widget.)
  try:
    requested_date = datetime.strptime(today, '%m/%d/%Y')
    content_date = datetime.strptime(
        title_text.split('|')[1].strip().split()[0], '%m/%d/%Y')
  except (IndexError, TypeError, ValueError):
    # IndexError: the title has no second field or that field is empty.
    # TypeError / ValueError: `today` or the title date is not a valid date.
    print(no_games_message)
    return None

  if content_date != requested_date:
    print(no_games_message)
    return None

  print('Found Games for {}'.format(content_date))
  return get_scores_from_soup(maxpreps_soup, content_date)


def get_scores_from_maxpreps_for_range(start_date, end_date):
  """Get the scores of games from MaxPreps for a date range.

  Example:
  start_date = date(2022, 11, 10)
  end_date = date(2023, 2, 10)
  scores = get_scores_from_maxpreps_for_range(start_date, end_date)

  Args:
      start_date (date): the first day to scrape (inclusive).
      end_date (date): the day after the last day to scrape (exclusive).

  Returns:
      schedule_df (dataframe): the dataframe of all game results. It is
      empty (with the usual score columns) when the range contains no days
      or no games were found on any day.

  """
  daily_frames = []
  for offset in range((end_date - start_date).days):
    today = (start_date + timedelta(offset)).strftime("%m/%d/%Y")
    scores_df = get_scores_from_maxpreps_for_one_day(today)
    if scores_df is None:
      print('No games found on {}'.format(today))
      continue
    print('Found {} games for {}'.format(len(scores_df), today))
    # Only keep days that actually have rows; None / empty entries add
    # nothing and make pd.concat fail when they are all it receives.
    if len(scores_df) > 0:
      daily_frames.append(scores_df)

  if not daily_frames:
    # pd.concat raises ValueError on an empty list, so return a typed empty
    # frame instead.
    return pd.DataFrame({'Date': pd.Series(dtype='datetime64[ns]'),
                         'Home Team': pd.Series(dtype='str'),
                         'Home Score': pd.Series(dtype='int'),
                         'Away Team': pd.Series(dtype='str'),
                         'Away Score': pd.Series(dtype='int')})

  concat_scores_df = pd.concat(daily_frames)
  return concat_scores_df.drop_duplicates(keep='last')


In [7]:
def get_rankings_from_scores(schedule_df):
  """Compute ranking statistics of teams from the schedules and scores.

  A win is worth 3 points, a tie 1 point and a loss 0 points. Games where
  either score is negative (the -1 'score not valid' placeholder) or not a
  number are ignored, so an unplayed game is not counted as a tie. Teams
  that appear only in ignored games are left out of the table.

  Example:
  scores = get_scores_from_maxpreps_for_one_day("12/1/2022")
  rankings = get_rankings_from_scores(scores)
  print(rankings)

  Args:
      schedule_df (dataframe): the scores dataframe with the columns
      'Home Team', 'Home Score', 'Away Team' and 'Away Score'.

  Returns:
      ranking_df (dataframe): one row per team with 'Team',
      'Total Points', 'Total Games', 'Total Wins', 'Total Losses',
      'Total Ties' and 'PPG' (points per game), sorted by 'PPG' and then
      'Total Points', both descending.

  """
  stat_columns = ['Total Points', 'Total Games', 'Total Wins',
                  'Total Losses', 'Total Ties']
  columns = ['Team'] + stat_columns + ['PPG']

  # Work on plain arrays so the result does not depend on the (possibly
  # duplicated) index of schedule_df. Non-numeric scores become NaN.
  home_scores = pd.to_numeric(schedule_df['Home Score'], errors='coerce').to_numpy(dtype=float)
  away_scores = pd.to_numeric(schedule_df['Away Score'], errors='coerce').to_numpy(dtype=float)
  home_teams = schedule_df['Home Team'].to_numpy()
  away_teams = schedule_df['Away Team'].to_numpy()

  # NaN >= 0 is False, so missing scores are excluded together with -1.
  played = (home_scores >= 0) & (away_scores >= 0)

  # Accumulate per-team counters in a dict (insertion order = order of first
  # appearance) instead of scanning the ranking frame for every game.
  table = {}

  def _stats(team):
    return table.setdefault(team, dict.fromkeys(stat_columns, 0))

  for home_team, away_team, home_score, away_score in zip(
      home_teams[played], away_teams[played],
      home_scores[played], away_scores[played]):
    home = _stats(home_team)
    away = _stats(away_team)
    home['Total Games'] += 1
    away['Total Games'] += 1

    if home_score > away_score:  # home win
      home['Total Wins'] += 1
      home['Total Points'] += 3
      away['Total Losses'] += 1
    elif home_score < away_score:  # away win
      away['Total Wins'] += 1
      away['Total Points'] += 3
      home['Total Losses'] += 1
    else:  # tie
      home['Total Ties'] += 1
      away['Total Ties'] += 1
      home['Total Points'] += 1
      away['Total Points'] += 1

  if not table:
    return pd.DataFrame({'Team': pd.Series(dtype='str'),
                         'Total Points': pd.Series(dtype='int'),
                         'Total Games': pd.Series(dtype='int'),
                         'Total Wins': pd.Series(dtype='int'),
                         'Total Losses': pd.Series(dtype='int'),
                         'Total Ties': pd.Series(dtype='int'),
                         'PPG': pd.Series(dtype='float')})

  # Every team in the table has at least one counted game, so the PPG
  # division never divides by zero.
  records = [{'Team': team, **stats,
              'PPG': stats['Total Points'] / stats['Total Games']}
             for team, stats in table.items()]

  ranking_df = pd.DataFrame(records, columns=columns)
  ranking_df = ranking_df.sort_values(by=['PPG', 'Total Points'],
                                      ascending=False)
  return ranking_df.reset_index(drop=True)


# Running Script

In [13]:
# Demo: scrape the scores of a single day and print the resulting dataframe.
demo_day = "2/15/2023"
df = get_scores_from_maxpreps_for_one_day(demo_day)
print(df)


	Central Coast Section High School Girls Soccer Scores | 2/15/2023 Results | MaxPreps

Found Games for 2023-02-15 00:00:00
score not valid, using -1 instead
score not valid, using -1 instead
        Date      Home Team  Home Score                Away Team  Away Score
0 2023-02-15   Presentation           1         Archbishop Mitty           4
1 2023-02-15         Marina          -1                  Oakwood          -1
2 2023-02-15         Harker           3  Crystal Springs Uplands           1
3 2023-02-15         Aragon           1                  Sequoia           4
4 2023-02-15           Gunn           1                Homestead           2
5 2023-02-15  Saint Francis           4         Valley Christian           0
6 2023-02-15    Santa Clara           2                 Saratoga           1
7 2023-02-15      Los Altos           6                Palo Alto           1
8 2023-02-15       Westmont           1                 Overfelt           1


In [None]:
# get all scores
# please note game info of previous seasons/years is not accessible,
# only the current season is accessible.

# load historical data
# scores.csv was generated between 11/10/2022 and 2/1/2023
# Parse 'Date' while reading so it has the same dtype as the scraped rows.
hist_scores = pd.read_csv('/drive/My Drive/scores.csv', parse_dates=['Date'])

# end_date is exclusive: the range function scrapes start_date up to the
# day before end_date.
start_date = date(2023, 2, 1)
end_date = date(2023, 2, 18)
new_scores = get_scores_from_maxpreps_for_range(start_date, end_date)

scores = pd.concat([hist_scores, new_scores], ignore_index=True)
scores['Date'] = pd.to_datetime(scores['Date'])
# A game is identified by its date and the two teams. Keeping the last
# occurrence lets a freshly scraped result replace an older row for the same
# game (e.g. one saved with the -1 placeholder before the game was played);
# de-duplicating on whole rows would keep both.
scores = scores.drop_duplicates(subset=['Date', 'Home Team', 'Away Team'],
                                keep='last')
scores.to_csv('/drive/My Drive/scores_2223.csv', index=False)

In [40]:
# Build the ranking table from the combined scores and save it to Drive.
rankings_csv_path = '/drive/My Drive/rankings_2223.csv'
rankings = get_rankings_from_scores(scores)
rankings.to_csv(rankings_csv_path, index=False)