<a href="https://colab.research.google.com/github/zhenyisx/scoutoid/blob/main/scraper2db.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

To scrape website and populate to database (csv file)

In [2]:
# Imports
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

from google.colab import  drive
drive.mount('/drive')

%matplotlib inline

In [3]:
# maxpreps's homepage
maxpreps_url = 'https://www.maxpreps.com/ca/central-coast-section/soccer/girls/scores/?date=2/13/2023'  # please change the date if needed

# Use requests to retrieve data from a given URL
maxpreps_response = requests.get(maxpreps_url, headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})

# Parse the whole HTML page using BeautifulSoup
maxpreps_soup = BeautifulSoup(maxpreps_response.text, 'html.parser')

# Title of the parsed page
maxpreps_soup.title

<title>
	Central Coast Section High School Girls Soccer Scores | 2/13/2023 Results | MaxPreps
</title>

In [None]:
# step 1: find date

In [36]:
# find date

# <a href="/ca/central-coast-section/soccer/girls/scores/?date=2/13/2023" class="btn btn-default active">Feb 13, 2023</a>
today = maxpreps_soup.find('div', {'class': 'calendar'}).find('ol', {'class': 'week'}).find('a', {'class': 'btn btn-default active'})
print(today.text)

Feb 13, 2023


In [64]:
# create dateframe for this date
schedule_df = pd.DataFrame({'Date': pd.Series(dtype='datetime64[ns]'),
                   'Home Team': pd.Series(dtype='str'),
                   'Home Score': pd.Series(dtype='int'),
                   'Away Team': pd.Series(dtype='str'),
                   'Away Score': pd.Series(dtype='int')})

# populate the datafrme
def convert_to_score(raw_input):
  try:
      return int(raw_input)
  except ValueError:
      # Handle the exception
      print('score not valid, using -1 instead')
      return -1
  except AttributeError:
      # Handle the exception
      print('score not valid, using -1 instead')
      return -1

matches = maxpreps_soup.find('div', {'class': 'contests'})

for m in  matches.find_all('ul', {'class': 'teams'}):
  raw_record = m.find_all("li")

  # home score
  try:
    home_score = convert_to_score(raw_record[0].find('div', {'class': 'score'}).text.strip())
  except ValueError:
      # Handle the exception
      print('score not valid, using -1 instead')
      home_score = -1
  except AttributeError:
      # Handle the exception
      print('score not valid, using -1 instead')
      home_score = -1
  print(home_score)

  # home name
  home_name = raw_record[0].find('div', {'class': 'name'}).text.strip()
  print(home_name)

  # away score
  try:
    away_score = convert_to_score(raw_record[1].find('div', {'class': 'score'}).text.strip())
  except ValueError:
      # Handle the exception
      print('score not valid, using -1 instead')
      away_score = -1
  except AttributeError:
      # Handle the exception
      print('score not valid, using -1 instead')
      away_score = -1
  print(away_score)

  # away name
  away_name = raw_record[1].find('div', {'class': 'name'}).text.strip()
  print(away_name)

  m_result = {'Date':today.text, 'Home Team':home_name, 'Home Score':home_score, 'Away Team':away_name, 'Away Score':away_score}
  schedule_df = schedule_df.append(m_result, ignore_index=True)


0
Seaside
1
St. Francis
7
Santa Cruz
0
San Lorenzo Valley
0
Anzar
0
Oakwood
score not valid, using -1 instead
-1
Pacific Grove
1
Gonzales
score not valid, using -1 instead
-1
York
0
Santa Catalina
2
Homestead
1
Woodside
score not valid, using -1 instead
-1
Carmel
score not valid, using -1 instead
-1
North Monterey County
4
Prospect
0
Overfelt
10
The Nueva School
0
Summit Preparatory


In [65]:
# show schedules
print(schedule_df)
  # print(matches.prettify())

           Date         Home Team  Home Score              Away Team  \
0  Feb 13, 2023           Seaside           0            St. Francis   
1  Feb 13, 2023        Santa Cruz           7     San Lorenzo Valley   
2  Feb 13, 2023             Anzar           0                Oakwood   
3  Feb 13, 2023     Pacific Grove          -1               Gonzales   
4  Feb 13, 2023              York          -1         Santa Catalina   
5  Feb 13, 2023         Homestead           2               Woodside   
6  Feb 13, 2023            Carmel          -1  North Monterey County   
7  Feb 13, 2023          Prospect           4               Overfelt   
8  Feb 13, 2023  The Nueva School          10     Summit Preparatory   

   Away Score  
0           1  
1           0  
2           0  
3           1  
4           0  
5           1  
6          -1  
7           0  
8           0  


In [66]:

schedule_df.to_csv('/drive/My Drive/schedule.csv', index=False)

In [67]:
teams = list(set(list(schedule_df["Home Team"].unique())+ list(schedule_df["Away Team"].unique())))
print(teams)

# initialize the ranking dataframe
ranking_df = pd.DataFrame({
                   'Team': pd.Series(dtype='str'),
                   'Total Points': pd.Series(dtype='int'),
                   'Total Games': pd.Series(dtype='int'),
                   'Total Wins': pd.Series(dtype='int'),
                   'Total Losses': pd.Series(dtype='int'),
                   'Total Ties': pd.Series(dtype='int'),
                   'PPG': pd.Series(dtype='float'),})



for t in teams:
  t_result = {'Team':t, 'Total Points': 0,
                   'Total Games': 0,
                   'Total Wins': 0,
                   'Total Losses': 0,
                   'Total Ties': 0,
                   'PPG':0}
  ranking_df = ranking_df.append(t_result, ignore_index=True)
print(ranking_df)


# populate ranking dataframe based on schedule
for index,row in schedule_df.iterrows():
  home_team = row['Home Team']
  away_team = row['Away Team']
  home_score = row['Home Score']
  away_score = row['Away Score']

  ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Games'] += 1
  ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Games'] += 1

  if home_score > away_score: # home win  
    ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Wins'] += 1
    ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Points'] += 3
    ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Losses'] += 1
  elif home_score < away_score: # away win
    ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Wins'] += 1
    ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Points'] += 3
    ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Losses'] += 1
  elif home_score == away_score: # tie
    ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Ties'] += 1
    ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Ties'] += 1
    ranking_df.loc[(ranking_df['Team'] == home_team), 'Total Points'] += 1
    ranking_df.loc[(ranking_df['Team'] == away_team), 'Total Points'] += 1

ranking_df['PPG'] = ranking_df['Total Points'] / ranking_df['Total Games']

ranking_df = ranking_df.sort_values(by=['PPG', 'Total Points'], ascending=False)
print(ranking_df)

#     print(row['c1'], row['c2'])


['Anzar', 'York', 'San Lorenzo Valley', 'Seaside', 'Carmel', 'Santa Catalina', 'Gonzales', 'Pacific Grove', 'Santa Cruz', 'Overfelt', 'Summit Preparatory', 'Prospect', 'Woodside', 'The Nueva School', 'Oakwood', 'North Monterey County', 'Homestead', 'St. Francis']
                     Team  Total Points  Total Games  Total Wins  \
0                   Anzar             0            0           0   
1                    York             0            0           0   
2      San Lorenzo Valley             0            0           0   
3                 Seaside             0            0           0   
4                  Carmel             0            0           0   
5          Santa Catalina             0            0           0   
6                Gonzales             0            0           0   
7           Pacific Grove             0            0           0   
8              Santa Cruz             0            0           0   
9                Overfelt             0            0    

In [68]:
ranking_df.to_csv('/drive/My Drive/ranking.csv', index=False)

                     Team  Total Points  Total Games  Total Wins  \
0                   Anzar             0            0           0   
1                    York             0            0           0   
2      San Lorenzo Valley             0            0           0   
3                 Seaside             0            0           0   
4                  Carmel             0            0           0   
5          Santa Catalina             0            0           0   
6                Gonzales             0            0           0   
7           Pacific Grove             0            0           0   
8              Santa Cruz             0            5           5   
9                Overfelt             0            0           0   
10     Summit Preparatory             0            0           0   
11               Prospect             0            1           1   
12               Woodside             0            0           0   
13       The Nueva School             0         

In [21]:
# create a record and insert to dataframe

raw_record = m.find_all("li")

# home score
home_socre = int(raw_record[0].find('div', {'class': 'score'}).text.strip())
print(home_socre)

# home name
home_name = raw_record[0].find('div', {'class': 'name'}).text.strip()
print(home_name)

# away score
away_socre = int(raw_record[1].find('div', {'class': 'score'}).text.strip())
print(away_socre)

# away name
away_name = raw_record[1].find('div', {'class': 'name'}).text.strip()
print(away_name)


0
Seaside
1
St. Francis


In [9]:
for li in m.find_all("li"):
  # for each match, extract a record and save it to a dataframe
    print(li.text.strip(), end=" ")

0
Seaside 1
St. Francis 

In [None]:
# extract match scores

for match in maxpreps_soup.find_all('div', {'class': 'article'}):
    if article.h3:
        # Title of the article
        print(article.h3.string)
        # Text
        print(article.p.text)
        print()


           Date Home Team  Home Score Away Team  Away Score
0  Feb 13, 2023         a           0         b           0


In [35]:
from google.colab import  drive
drive.mount('/drive')
schedule_df2.to_csv('/drive/My Drive/schedule_tmp.csv', index=False)

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
