<a href="https://colab.research.google.com/github/zhenyisx/scoutoid/blob/main/scraper2db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To scrape website and populate to database (csv file) on Google Drive. The files can then be uploaded to Goolge Storage for clients (i.e., stremlit) to consume.

In [None]:
# Imports
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date, datetime, timedelta

from google.colab import  drive
drive.mount('/drive')

%matplotlib inline

In [148]:
def get_scores_from_soup(maxpreps_day_soup, this_day):
  """to get scores of games in one day.

  Args:
      maxpreps_day_soup (soup): a soup parsed from MaxPreps HTML by BS.
      this_day (str): the date.

  Returns:
      schedulf_df (dataframe): the dataframe of all game results.

  """

  # create dateframe of game schedules and scores
  schedule_df = pd.DataFrame({'Date': pd.Series(dtype='datetime64[ns]'),
                    'Home Team': pd.Series(dtype='str'),
                    'Home Score': pd.Series(dtype='int'),
                    'Away Team': pd.Series(dtype='str'),
                    'Away Score': pd.Series(dtype='int')})

  # populate the datafrme by analyzing the soup (MaxPreps specific)
  # to use ChromeDev Tool to get the following rules of find 
  matches = maxpreps_day_soup.find('div', {'class': 'contests'})

  for m in matches.find_all('ul', {'class': 'teams'}):
    raw_record = m.find_all("li")

    # home score
    try:
      home_score = int(raw_record[0].find('div', {'class': 'score'}).text.strip())
    except ValueError:
        # Handle the exception
        print('score not valid, using -1 instead')
        home_score = -1
    except AttributeError:
        # Handle the exception
        print('score not valid, using -1 instead')
        home_score = -1
    # print(home_score)

    # home name
    home_name = raw_record[0].find('div', {'class': 'name'}).text.strip()
    # print(home_name)

    # away score
    try:
      away_score = int(raw_record[1].find('div', {'class': 'score'}).text.strip())
    except ValueError:
        # Handle the exception
        print('score not valid, using -1 instead')
        away_score = -1
    except AttributeError:
        # Handle the exception
        print('score not valid, using -1 instead')
        away_score = -1
    # print(away_score)

    # away name
    away_name = raw_record[1].find('div', {'class': 'name'}).text.strip()
    # print(away_name)

    m_result = {'Date':this_day, 'Home Team':home_name, 'Home Score':home_score, 
                'Away Team':away_name, 'Away Score':away_score}
    schedule_df = schedule_df.append(m_result, ignore_index=True)

  schedule_df = schedule_df.drop_duplicates(keep='last')

  return schedule_df
  


def get_info_for_one_day(today):
  # maxpreps's homepage
  maxpreps_url = 'https://www.maxpreps.com/ca/central-coast-section/soccer/girls/scores/?date={}'.format(today)  # please change the date if needed

  # Use requests to retrieve data from a given URL
  maxpreps_response = requests.get(maxpreps_url, headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

  # Parse the whole HTML page using BeautifulSoup
  maxpreps_soup = BeautifulSoup(maxpreps_response.text, 'html.parser')

  # Title of the parsed page
  print(maxpreps_soup.title.text)

  # check if the dates match
  try:
    print(maxpreps_soup.title.text.split('|')[1].strip().split()[0])
    content_date = datetime.strptime(maxpreps_soup.title.text.split('|')[1].strip().split()[0], '%m/%d/%Y')
    if content_date == datetime.strptime(today, '%m/%d/%Y'):
      print('Found Games for {}'.format(content_date))
      scores_df = get_scores_from_soup(maxpreps_soup, content_date)
      return scores_df
    else:
      print('No')
  except TypeError:
    print('No Games found for {}'.format(today))
  except ValueError:
    print('No Games found for {}'.format(today))


def get_info_for_range(start_date, end_date):
  def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

  dfs = []
  for single_date in daterange(start_date, end_date):
      today = single_date.strftime("%m/%d/%Y")
      print(today)
      scores_df = get_info_for_one_day(today)
      if scores_df is None:
        print('Found 0 games for {}'.format(today))
      else:
        print('Found {} games for {}'.format(len(scores_df), today))
        
      # print(scores_df)
      dfs.append(scores_df)
  # see pd.concat documentation for more info
  scores_dfs = pd.concat(dfs)
  return scores_dfs


In [None]:
df = get_info_for_one_day("12/1/2022")
print(df)

In [149]:
# get all scores
start_date = date(2022, 11, 10)
end_date = date(2023, 2, 10)
scores = get_info_for_range(start_date, end_date)
# print(len(scores))
scores.to_csv('/drive/My Drive/scores.csv', index=False)

11/10/2022

	Central Coast Section High School Girls Soccer Scores | Live Scoreboard | MaxPreps

Live
No Games found for 11/10/2022
Found 0 games for 11/10/2022
11/11/2022

	Central Coast Section High School Girls Soccer Scores | Live Scoreboard | MaxPreps

Live
No Games found for 11/11/2022
Found 0 games for 11/11/2022
11/12/2022

	Central Coast Section High School Girls Soccer Scores | Live Scoreboard | MaxPreps

Live
No Games found for 11/12/2022
Found 0 games for 11/12/2022
11/13/2022

	Central Coast Section High School Girls Soccer Scores | Live Scoreboard | MaxPreps

Live
No Games found for 11/13/2022
Found 0 games for 11/13/2022
11/14/2022

	Central Coast Section High School Girls Soccer Scores | 11/14/2022 Results | MaxPreps

11/14/2022
Found Games for 2022-11-14 00:00:00
Found 1 games for 11/14/2022
11/15/2022

	Central Coast Section High School Girls Soccer Scores | 11/15/2022 Results | MaxPreps

11/15/2022
Found Games for 2022-11-15 00:00:00
Found 1 games for 11/15/2022
11

In [36]:
# find date

# <a href="/ca/central-coast-section/soccer/girls/scores/?date=2/13/2023" class="btn btn-default active">Feb 13, 2023</a>
today = maxpreps_soup.find('div', {'class': 'calendar'}).find('ol', {'class': 'week'}).find('a', {'class': 'btn btn-default active'})
print(today.text)

Feb 13, 2023


In [None]:
# create dateframe for this website
schedule_df = pd.DataFrame({'Date': pd.Series(dtype='datetime64[ns]'),
                   'Home Team': pd.Series(dtype='str'),
                   'Home Score': pd.Series(dtype='int'),
                   'Away Team': pd.Series(dtype='str'),
                   'Away Score': pd.Series(dtype='int')})

# populate the datafrme
def convert_to_score(raw_input):
  try:
      return int(raw_input)
  except ValueError:
      # Handle the exception
      print('score not valid, using -1 instead')
      return -1
  except AttributeError:
      # Handle the exception
      print('score not valid, using -1 instead')
      return -1

matches = maxpreps_soup.find('div', {'class': 'contests'})

for m in  matches.find_all('ul', {'class': 'teams'}):
  raw_record = m.find_all("li")

  # home score
  try:
    home_score = convert_to_score(raw_record[0].find('div', {'class': 'score'}).text.strip())
  except ValueError:
      # Handle the exception
      print('score not valid, using -1 instead')
      home_score = -1
  except AttributeError:
      # Handle the exception
      print('score not valid, using -1 instead')
      home_score = -1
  print(home_score)

  # home name
  home_name = raw_record[0].find('div', {'class': 'name'}).text.strip()
  print(home_name)

  # away score
  try:
    away_score = convert_to_score(raw_record[1].find('div', {'class': 'score'}).text.strip())
  except ValueError:
      # Handle the exception
      print('score not valid, using -1 instead')
      away_score = -1
  except AttributeError:
      # Handle the exception
      print('score not valid, using -1 instead')
      away_score = -1
  print(away_score)

  # away name
  away_name = raw_record[1].find('div', {'class': 'name'}).text.strip()
  print(away_name)

  m_result = {'Date':today.text, 'Home Team':home_name, 'Home Score':home_score, 'Away Team':away_name, 'Away Score':away_score}
  schedule_df = schedule_df.append(m_result, ignore_index=True)

schedule_df = schedule_df.drop_duplicates(keep='last')
# show schedules
print(schedule_df)

In [66]:

schedule_df.to_csv('/drive/My Drive/schedule.csv', index=False)

In [150]:
def get_rankings_from_scores(schedule_df):
  teams = list(set(list(schedule_df["Home Team"].unique())+ list(schedule_df["Away Team"].unique())))
  print(teams)

  # initialize the ranking dataframe
  ranking_df = pd.DataFrame({
                    'Team': pd.Series(dtype='str'),
                    'Total Points': pd.Series(dtype='int'),
                    'Total Games': pd.Series(dtype='int'),
                    'Total Wins': pd.Series(dtype='int'),
                    'Total Losses': pd.Series(dtype='int'),
                    'Total Ties': pd.Series(dtype='int'),
                    'PPG': pd.Series(dtype='float'),})



  for t in teams:
    t_result = {'Team':t, 'Total Points': 0,
                    'Total Games': 0,
                    'Total Wins': 0,
                    'Total Losses': 0,
                    'Total Ties': 0,
                    'PPG':0}
    ranking_df = ranking_df.append(t_result, ignore_index=True)
  print(ranking_df)


  # populate ranking dataframe based on schedule
  for index,row in schedule_df.iterrows():
    home_team = row['Home Team']
    away_team = row['Away Team']
    home_score = row['Home Score']
    away_score = row['Away Score']

    ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Games'] += 1
    ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Games'] += 1

    if home_score > away_score: # home win  
      ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Wins'] += 1
      ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Points'] += 3
      ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Losses'] += 1
    elif home_score < away_score: # away win
      ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Wins'] += 1
      ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Points'] += 3
      ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Losses'] += 1
    elif home_score == away_score: # tie
      ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Ties'] += 1
      ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Ties'] += 1
      ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Points'] += 1
      ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Points'] += 1

  ranking_df['PPG'] = ranking_df['Total Points'] / ranking_df['Total Games']

  ranking_df = ranking_df.sort_values(by=['PPG', 'Total Points'], ascending=False)
  return ranking_df


In [151]:
rankings = get_rankings_from_scores(scores)
rankings.to_csv('/drive/My Drive/rankings.csv', index=False)

['Thacher', 'La Jolla Country Day', 'Presentation', 'Trinity Christian', 'Carmel', 'Gonzales', 'Cristo Rey', 'Marina', 'Everett Alvarez', 'Pajaro Valley', 'ACE Charter', 'Prospect', 'Beacon Hill', 'Soledad', 'Hill', 'Pacific Ridge', 'Greenfield', 'Arizona College Prep', 'Campo Verde', 'Higley', 'Mercy', 'Milpitas', 'Christopher', 'KIPP San Jose Collegiate', 'James Lick', 'North Salinas', 'Wilcox', 'Terra Nova', 'Summit Tahoma', 'Palo Alto', 'Yerba Buena', 'Washington', 'Sacred Heart Prep', 'Cindy Avitia', "Bishop's", 'Kathleen MacDonald', 'Atascadero', 'KIPP Esperanza', 'Rancho San Juan', 'Evergreen Valley', 'Leland', 'Drew', 'Watsonville', 'Mira Mesa', 'Santa Cruz', 'Fairfield', 'Oceana', 'Santa Fe Christian', 'Willow Glen', 'Gunderson', 'Burlingame', 'East Palo Alto Academy', 'Liberty', 'Leigh', 'Francis Parker', 'Monte Vista Christian', 'The Academy - San Francisco', 'Escondido', 'Napa', 'Santa Teresa', 'Archbishop Riordan', 'Mt. Pleasant', 'Fremont', 'Hollister', 'Santa Clara', 'Wo