# XC Result Collector

This script collects events and event results from https://www.athletic.net/

It is focused on Massachusetts High School XC

# TODO

* Improve logic for integrating new events into existing events
* Improve logic so we only pull event details for new events (or ones we want to force updates for)
* Pull single event results

# Config

In [None]:
import os
import requests
import datetime
import random
import json
import shutil

from typing import Dict, Type, Union
from datetime import datetime

import pandas as pd
import numpy as np

# Web scraping tools
from bs4 import BeautifulSoup
import asyncio
from playwright.async_api import async_playwright
from playwright.async_api import TimeoutError as PlaywrightTimeoutError



In [2]:
# Directory (relative to the working directory) that holds the CSV tables
path_data = os.path.join('..', 'data')
# File names of the individual tables stored under path_data
fn_events = 'db_events.csv'
fn_event_details = 'db_event_details.csv'
# NOTE(review): fn_athletes is not referenced anywhere else in the visible notebook
fn_athletes = 'db_athletes.csv'
fn_results = 'db_results.csv'

In [3]:
# When True, the "Save data" cell backs up and then overwrites the CSV tables
FLAG_SAVE = True

# NOTE(review): not read anywhere in the visible notebook; presumably meant to
# force a refresh of already-stored events — confirm before relying on it
FLAG_UPDATE_EXISTING = False

# When True, the create_df_* helpers and the results loop print raw payloads
# and per-row progress information
FLAG_DEBUG = True

# Get pre-existing and new data

## Define schemas

In [4]:
# Schema
#
# Column name -> target type for the events table. The mapping is passed to
# df_fix_types() (where `int` becomes the nullable Int64 dtype) and is also
# used to build an empty df_events when no CSV exists yet.

dict_schema_events = {
    'IDMeet': int,
    'Sport': str,
    'MeetName': str,
    'SeasonId': int,
    'StartDate': 'datetime64[ns]',
    'EndDate': 'datetime64[ns]',
    'Virtual': bool,
    'LocationName': str,
    'StreetAddress': str,  # nullable
    'City': str,
    'PostalCode': str,
    'State': str,
    'OwnerID': int,
    'CalendarLock': int,
    'UCalendarLock': int,
    'RegEnd': 'datetime64[ns]',  # nullable
    'HasResults': int,
    'LevelMask': int,
    'HostName': str,  # nullable
    'MascotUrl': str,  # nullable
    'Lat': float,
    'Long': float,
    'score': float,
    'CalCount': int,
    'OffDays': str,  # nullable, appears to be JSON string
    'Data': str,  # nullable
    'rsUrl': str,  # nullable
    'LiveID': int,  # nullable
    'LivePublished': bool,
    'VideoURL': str,  # nullable
    'Website': str,
    'Country': str,
    
    # Additional columns (added by this notebook, not part of the API payload):
    
    'dtRetrieved': 'datetime64[ns]',  # set to the retrieval time in create_df_events
    'flagValidEvent': bool  # set to False for past events for which no details were found
    #'flagXCEvent': bool
    # 'flag_results_fetched': bool
    
}

# Event Details
#
# Column name -> target type for the per-division rows built by
# create_df_event_details() from the 'xcDivisions' list of a meet payload.
dict_schema_event_details = {

    'CourseName': str,         # String, can be None
    'IDMeetDiv': int,          # Integer ID for division
    'HyTekId': int,            # Integer ID from HyTek
    'CourseId': int,           # Integer ID for course
    'LevelMask': int,          # Integer representing competition level
    'Gender': str,             # String ('M' or 'F')
    'DivName': str,            # Full division name with distance
    'Division': str,           # Short division name
    'Meters': int,             # Race distance in meters
    'Result': float,           # Numeric result, can be NaN
    'RaceTime': str,           # ISO format datetime string, can be None
    'Day': str,                # String representing day, can be None
    'PlaceDepth': int,         # Integer for place depth
    'ScoreDepth': int,         # Integer for score depth
    'results': object,         # Can be None
    'WarnScrollTo': str,       # String, can be None
    'TeamScores': object,      # Can be None
    # NOTE(review): same key as 'WarnScrollTo' above except for capitalization;
    # confirm whether the API really returns both spellings
    'warnScrollTo': str,       # String, can be None
    'warnSummaryString': str,  # String, can be None

    # Additional columns (added by create_df_event_details, not part of 'xcDivisions'):

    'dtRetrieved': 'datetime64[ns]',  # Timestamp of data retrieval
    'IDLocation': int,         # Location ID
    'IDMeet': int              # Meet ID
}

# Results
#
# Column name -> target type for the results table. In the visible notebook
# this mapping is only used to create an empty df_results when no CSV exists.
# NOTE(review): get_results_from_html_athletic() emits 'Athlete ID', 'Team ID'
# and 'Year', which do not match 'AthleteID', 'TeamID' and 'Grade' below —
# confirm which naming is intended.
# NOTE(review): df_fix_types() has no branch for 'timedelta64[ns]', so it
# would leave 'Time Dt' unconverted.

dict_schema_results = {
    'Date': 'datetime64[ns]',
    'IDMeetDiv': int, 
    'Gender': str, 
    'IDMeet': int, 
    'Location': int,
    'Grade': int,
    'AthleteID': int,
    'First Name': str,
    'Last Name': str,
    'Name': str,
    'Time': str,
    'Time Dt': 'timedelta64[ns]',
    'Seconds': float, 
    'Minutes': float,
    'Team': str,
    'TeamID': int,
    'Team Count': int, 
    'Team Position': int, 
    'Place': int, 
    'Points': int
}

## Functions    

In [5]:

def df_fix_types(df: pd.DataFrame, dict_schema: Dict[str, Union[Type, str]]) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose columns are cast as described by ``dict_schema``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is never modified; a copy is converted instead.
    dict_schema : Dict[str, Union[Type, str]]
        Column name -> requested type. Only columns present in ``df`` are
        touched. Recognised types:
        - str: pandas 'string' dtype
        - int: numeric with unparseable values coerced, then nullable 'Int64'
        - float: numeric with unparseable values coerced to NaN
        - bool: plain ``astype(bool)``
        - 'datetime64[ns]': ``pd.to_datetime`` with ``format='ISO8601'``
        Any other requested type leaves the column as it is.

    Returns
    -------
    pandas.DataFrame
        The converted copy.

    Raises
    ------
    TypeError
        When a conversion fails; the message names the column and the
        requested type, and the original exception is chained as the cause.

    Examples
    --------
    >>> schema = {'id': int, 'name': str, 'timestamp': 'datetime64[ns]'}
    >>> df_fixed = df_fix_types(df, schema)
    """

    def _cast(series, wanted):
        # Returns the converted series, or None when the type is not supported.
        if wanted == str:
            return series.astype('string')
        if wanted == int:
            return pd.to_numeric(series, errors='coerce').astype('Int64')
        if wanted == float:
            return pd.to_numeric(series, errors='coerce')
        if wanted == bool:
            return series.astype(bool)
        if wanted == 'datetime64[ns]':
            return pd.to_datetime(series, format='ISO8601')
        return None

    converted = df.copy()

    for col, dtype in dict_schema.items():
        if col not in converted.columns:
            continue
        try:
            new_values = _cast(converted[col], dtype)
            if new_values is not None:
                converted[col] = new_values
        except Exception as e:
            raise TypeError(f"Error converting column {col} to {dtype}: {str(e)}") from e

    return converted

In [6]:

def create_df_events(response, schema):
    """
    Build the typed events DataFrame from an events API response.

    Args:
        response: requests.Response object from the API call; its JSON body
            must contain an 'events' list
        schema: dict mapping column names to their intended data types

    Returns:
        pandas.DataFrame: one row per event, stamped with a 'dtRetrieved'
        column and cast through df_fix_types()
    """
    payload = response.json()

    # 'dtRetrieved' is a hard-coded extra field; its type must be declared in
    # the table schema so df_fix_types() can cast it.
    events_frame = pd.DataFrame(payload['events']).assign(dtRetrieved=pd.Timestamp.now())

    return df_fix_types(events_frame, schema)


In [7]:
def create_df_event_details(meet_id, response, schema):
    """
    Build the typed event-details DataFrame for one meet.

    Args:
        meet_id: ID of the requested meet (used only for log messages)
        response: requests.Response object from the meet-data API call
        schema: dict mapping column names to their intended data types

    Returns:
        pandas.DataFrame or None: one row per entry of the payload's
        'xcDivisions' list, with 'IDLocation', 'dtRetrieved' and 'IDMeet'
        columns added; None when the JSON body is null.
    """
    data = response.json()

    # Check for a null body before touching it: the debug dump below iterates
    # over the payload and would otherwise fail on None.
    if data is None:
        print('No valid response for {}'.format(meet_id))
        return None

    if FLAG_DEBUG:
        print('Meet ID: {}'.format(meet_id))
        print(data)
        for key, value in data.items():
            print(f"{key}: {value}")

    meet = data['meet']
    location_id = meet['Location']['ID']
    meet_id_response = meet['ID']

    df = pd.DataFrame(data['xcDivisions'])

    df['IDLocation'] = location_id
    df['dtRetrieved'] = pd.Timestamp.now()
    # Use the ID reported by the API rather than the requested one
    df['IDMeet'] = meet_id_response

    return df_fix_types(df, schema)

In [8]:
def create_df_results(meet_div_id, response, schema):
    """
    Placeholder for building a results DataFrame from an API response.

    Parsing is not implemented yet: the function decodes the JSON body,
    optionally dumps it for inspection, and always returns None.

    Args:
        meet_div_id: ID of the meet division (used only for log messages)
        response: requests.Response object from the API call
        schema: dict mapping column names to data types (currently unused)

    Returns:
        None
    """
    data = response.json()

    # Check for a null body before the debug dump, which iterates the payload.
    if data is None:
        print('No valid response for {}'.format(meet_div_id))
        return None

    if FLAG_DEBUG:
        print('Meet Div ID: {}'.format(meet_div_id))

        print(data)

        for key, value in data.items():
            print(f"{key}: {value}")

    # TODO: build the DataFrame from the payload and cast it with
    # df_fix_types(df, schema), mirroring create_df_event_details().
    return None
    

In [9]:
def get_events(dt_start, dt_end, state_2, country_2='US', level=4, timeout=30):
    """
    Retrieve events from the athletic.net API for a state and date range.

    Args:
        dt_start (str): Start date in 'YYYY-MM-DD' format
        dt_end (str): End date in 'YYYY-MM-DD' format
        state_2 (str): Two-letter state code (e.g., 'MA' for Massachusetts)
        country_2 (str, optional): Two-letter country code. Defaults to 'US'
        level (int, optional): Currently unused; the request always sends
            levelMask 0. Kept so existing callers keep working.
        timeout (float, optional): Seconds to wait for the server before the
            request is abandoned. Defaults to 30.

    Returns:
        pandas.DataFrame or None: DataFrame built by create_df_events() with
        dict_schema_events when the server answers 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: If the HTTP request itself
            fails (connection error, timeout, ...)

    Example:
        >>> events_df = get_events('2024-11-01', '2024-11-30', 'MA')
        >>> if events_df is not None:
        ...     print(events_df.shape)

    Notes:
        - Makes a POST request to https://www.athletic.net/api/v1/Event/Events
        - Only a minimal set of headers is sent
    """
    url = 'https://www.athletic.net/api/v1/Event/Events'

    params = {
        "start": dt_start,
        "end": dt_end,
        "levelMask": 0,
        "sportMask": 0,
        "country": country_2,
        "state": state_2,
        "location": "",
        "distanceKM": 0,
        "filterTerm": "",
    }

    headers = {
        'content-type': 'application/json',
        'authority': 'www.athletic.net',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9',
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.post(url, json=params, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    print("Request successful!")

    # TODO: Make the schema a parameter
    return create_df_events(response, dict_schema_events)


In [10]:
def get_event_details(meet_id, timeout=30):
    """
    Retrieve the XC division details of one meet from the athletic.net API.

    Args:
        meet_id: ID of the meet to look up
        timeout (float, optional): Seconds to wait for the server before the
            request is abandoned. Defaults to 30.

    Returns:
        pandas.DataFrame or None: result of create_df_event_details() with
        dict_schema_event_details when the server answers 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: If the HTTP request itself
            fails (connection error, timeout, ...)
    """
    url = 'https://www.athletic.net/api/v1/Meet/GetMeetData?meetId={}&sport=xc'.format(meet_id)

    headers = {
        'content-type': 'application/json',
        'authority': 'www.athletic.net',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9'
    }

    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Response: {response.text}")
        return None

    print("Request successful!")

    return create_df_event_details(meet_id, response, dict_schema_event_details)



In [11]:
# Function to start a persistent browser
async def start_browser():
    """
    Starts Playwright and launches a headless Chromium browser.

    Returns:
        tuple: (browser, playwright) — the launched browser and the
        Playwright driver object. Both are needed by close_browser().
    """
    playwright = await async_playwright().start()
    try:
        browser = await playwright.chromium.launch(headless=True)
    except Exception:
        # Do not leave the Playwright driver running if the launch fails
        await playwright.stop()
        raise
    return browser, playwright

# Fetch a page using the persistent browser
# async def fetch_page(browser, url):
#     """
#     Fetches a webpage using an existing Playwright browser instance and returns a BeautifulSoup object.

#     Parameters:
#         browser (Browser): A Playwright browser instance.
#         url (str): The URL of the webpage to fetch.

#     Returns:
#         BeautifulSoup: Parsed BeautifulSoup object containing the rendered HTML.
#     """
#     page = await browser.new_page()
#     await page.goto(url)
#     await page.wait_for_load_state("networkidle")
#     rendered_html = await page.content()
#     await page.close()
#     return BeautifulSoup(rendered_html, 'html.parser')

async def fetch_page(browser, url, max_retries=3, timeout=60000):
    """
    Fetches a rendered page with an existing Playwright browser, with retries.

    Each attempt opens a new page, navigates to ``url`` (waiting for
    "domcontentloaded"), then waits up to 30000 for "networkidle". A network
    idle timeout is tolerated and the partially loaded page is used. Failed
    attempts are retried after a pause of 2 * attempt-number seconds.

    Parameters:
        browser (Browser): A Playwright browser instance.
        url (str): The URL of the webpage to fetch.
        max_retries (int): Maximum number of attempts; must be at least 1.
        timeout (int): Default timeout applied to the page (Playwright
            timeouts are expressed in milliseconds).

    Returns:
        BeautifulSoup: Parsed BeautifulSoup object containing the rendered HTML.

    Raises:
        ValueError: If max_retries is smaller than 1.
        Exception: If every attempt failed; the last error is chained as the cause.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(1, max_retries + 1):
        try:
            page = await browser.new_page()
            try:
                page.set_default_timeout(timeout)  # synchronous call, no await

                # Navigate to the page with a more lenient load condition
                response = await page.goto(url, wait_until="domcontentloaded")
                if not response or not response.ok:
                    raise Exception(f"Failed to load page: {response.status if response else 'No response'}")

                try:
                    # Try to wait for network idle, but don't fail if it times out
                    await page.wait_for_load_state("networkidle", timeout=30000)
                except PlaywrightTimeoutError:
                    print(f"Network idle timeout on attempt {attempt}, proceeding with partial load")

                rendered_html = await page.content()
                return BeautifulSoup(rendered_html, 'html.parser')
            finally:
                # Close the page exactly once per attempt. Closing is best
                # effort: a failure here must not mask the real outcome.
                try:
                    await page.close()
                except Exception:
                    pass

        except Exception as e:
            if attempt == max_retries:
                raise Exception(f"Failed to fetch page after {max_retries} attempts: {str(e)}") from e

            print(f"Attempt {attempt} failed: {str(e)}, retrying...")
            await asyncio.sleep(2 * attempt)
            
            

# Function to close the persistent browser
async def close_browser(browser, playwright):
    """
    Closes the persistent Playwright browser instance and stops Playwright.

    This is a coroutine: it must be awaited for anything to happen.

    Parameters:
        browser (Browser): A Playwright browser instance.
        playwright: The Playwright object instance.
    """
    try:
        await browser.close()
    finally:
        # Stop the driver even when closing the browser raised
        await playwright.stop()


In [12]:
def _id_from_href(href):
    """Return the third '/'-separated segment of ``href`` (e.g. the id in "/team/{id}/cross-country"), or None."""
    parts = href.split("/") if href else []
    return parts[2] if len(parts) > 2 else None


def get_results_from_html_athletic(soup):
    """
    Parses the HTML content (soup) and extracts race results into a structured DataFrame.
    This function works specifically with results from athletic.net

    Every ``div.result-row`` element yields one output row. Any piece that
    cannot be found in a row is reported as None instead of aborting the
    whole parse.

    Parameters:
        soup (BeautifulSoup): The BeautifulSoup object of the rendered HTML.

    Returns:
        pd.DataFrame: One row per result with the columns "Place", "Name",
        "Athlete ID", "Team", "Team ID", "Time", "Year" and "Points" (all
        values are strings or None). Empty when no result rows are found.
    """
    data = []

    for row in soup.find_all("div", class_="result-row"):
        # Place
        place_tag = row.find("div", class_="place-column")
        place = place_tag.text.strip() if place_tag is not None else None

        # Full name and athlete ID (taken from the athlete link)
        name_tag = row.find("a", ashrefonweb=True, class_="ng-star-inserted")
        name = name_tag.text.strip() if name_tag is not None else None
        athlete_id = _id_from_href(name_tag.get("href")) if name_tag is not None else None

        # Team name and team ID: the second link inside the "subtitle team" section
        team = None
        team_id = None
        team_tag = row.find("div", class_="subtitle team")
        if team_tag is not None:
            team_links = team_tag.find_all("a", ashrefonweb=True)
            if len(team_links) > 1:
                team = team_links[1].text.strip()
                team_id = _id_from_href(team_links[1].get("href"))

        # Time
        secondary = row.find("div", class_="secondary")
        time_tag = secondary.find("a", class_="ng-star-inserted") if secondary is not None else None
        time = time_tag.text.strip() if time_tag is not None else None

        # Year and points both live in the tertiary stats element
        stats = row.find("shared-tertiary-stats")
        year_tag = None
        points_tag = None
        if stats is not None:
            year_tag = stats.find("span", string=lambda t: t and "Yr:" in t)
            points_tag = stats.find("span", string=lambda t: t and "pts" in t)

        # Year: at most 2 characters after the "Yr:" label
        year = year_tag.text.replace("Yr:", "").strip()[:2] if year_tag is not None else None

        # Points: text after the last "•", without "pts" and "+"
        points = (
            points_tag.text.split("•")[-1].replace("pts", "").replace("+", "").strip()
            if points_tag is not None
            else None
        )

        data.append({
            "Place": place,
            "Name": name,
            "Athlete ID": athlete_id,
            "Team": team,
            "Team ID": team_id,
            "Time": time,
            "Year": year,
            "Points": points,
        })

    return pd.DataFrame(data)


In [48]:
def backup_if_exists(filepath):
    """
    Copy an existing file to a timestamped sibling; do nothing if it is missing.

    The copy is written next to the original as
    ``<name>-BAK-<HH-MM-SS><ext>`` and the timestamp is printed.

    Args:
        filepath (str): Path to the file to backup
    """
    if os.path.exists(filepath):
        # Separate the folder from the file name, then the stem from the extension
        directory, filename = os.path.split(filepath)
        name, ext = os.path.splitext(filename)

        # Time of day only; the date is not part of the backup name
        timestamp = datetime.now().strftime('%H-%M-%S')
        print('Time: {}'.format(timestamp))

        # copy2 also carries over the file metadata
        shutil.copy2(filepath, os.path.join(directory, f"{name}-BAK-{timestamp}{ext}"))



## DEV AREA FOR RESULTS

In [13]:

# TEST FOR EXTRACTING RESULTS FOR A SINGLE PAGE

# Set to True to run this code

if False:
    
    async def fetch_page():
        async with async_playwright() as p:
            # Launch browser in headless mode
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # Navigate to the page
            #await page.goto("https://www.athletic.net/CrossCountry/meet/250591/results/1001120")
            
            await page.goto("https://www.athletic.net/CrossCountry/meet/250562/results/997016")

            # Wait for network activity to finish
            await page.wait_for_load_state("networkidle")

            # Get the rendered HTML
            rendered_html = await page.content()

            # Parse with BeautifulSoup
            soup = BeautifulSoup(rendered_html, 'html.parser')
            print(soup.prettify())
            with open(os.path.join(path_data, 'test_results.html'), 'w', encoding='utf-8') as file:
                file.write(soup.prettify())
            
            # Close the browser
            await browser.close()

    # Use 'await' directly in Jupyter Notebook
    await fetch_page()



In [14]:
if False:

    # Start the browser
    browser, playwright = await start_browser()

    try:
        # Fetch multiple pages
        urls = [
            "https://www.athletic.net/CrossCountry/meet/250562/results/997016",
            #"https://www.athletic.net/CrossCountry/meet/250591/results/1001120"
        ]
        for url in urls:
            soup = await fetch_page(browser, url)
            print(soup.title.string)  # Example: Print the page title
    finally:
        # Close the browser
        print('OK!')
        
    #print(soup)

In [15]:
# print(soup)

## Load data

In [None]:

def _blank_frame(schema):
    """Return an empty DataFrame with one typed column per schema entry."""
    return pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in schema.items()})


# Events: read the stored table and cast it, or start from an empty one

pe = os.path.join(path_data, fn_events)

if not os.path.isfile(pe):
    df_events = _blank_frame(dict_schema_events)
    print('Initializing blank df_events')
else:
    df_events = df_fix_types(pd.read_csv(pe), dict_schema_events)
    print('Loaded df_events: {} rows'.format(len(df_events)))


# Event Details: same procedure with the event-details schema

ped = os.path.join(path_data, fn_event_details)

if not os.path.isfile(ped):
    df_event_details = _blank_frame(dict_schema_event_details)
    print('Initializing blank df_event_details')
else:
    df_event_details = df_fix_types(pd.read_csv(ped), dict_schema_event_details)
    print('Loaded df_event_details: {} rows'.format(len(df_event_details)))


# Results: the stored table is used as read (no type casting is applied)

pr = os.path.join(path_data, fn_results)

if not os.path.isfile(pr):
    df_results = _blank_frame(dict_schema_results)
    print('Initializing blank df_results')
else:
    df_results = pd.read_csv(pr)
    print('Loaded df_results: {} rows'.format(len(df_results)))

    


## Get new events

In [None]:
# Hard-coded dates
# Use this to back-fill older events

df_events_new = get_events('2024-11-01', '2024-12-31', 'MA')

# get_events() returns None when the request fails, and an empty 'events' list
# produces a frame without an 'IDMeet' column. Fall back to an empty, typed
# frame in both cases so the following cells still run.
if df_events_new is None or 'IDMeet' not in df_events_new.columns:
    print('No events retrieved; continuing with an empty df_events_new')
    df_events_new = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in dict_schema_events.items()})

set_new_events = set(df_events_new['IDMeet'])

print(len(df_events_new))

In [None]:
# Merge the newly retrieved events into df_events. When a meet appears more
# than once, only the most recently retrieved row is kept.

# TODO: Add logic to make updating more flexible

set_existing_events = set(df_events['IDMeet'])

cnt_events_existing, cnt_events_new = len(df_events), len(df_events_new)

df_events = (
    pd.concat([df_events, df_events_new], axis=0)
    .dropna(how='all')
    .sort_values('dtRetrieved', ascending=True)
    .drop_duplicates(subset=['IDMeet'], keep='last')
)

# Re-apply the schema after the merge
df_events = df_fix_types(df_events, dict_schema_events)

cnt_events_total = len(df_events)
cnt_dupes = (cnt_events_existing + cnt_events_new) - cnt_events_total

print('Merged {} events into existing set of {}; final count is {}. {} duplicates'.format(cnt_events_new, cnt_events_existing, cnt_events_total, cnt_dupes))

In [19]:
# Currently, this block is controlled with comment/uncomments

# Get event details for all newly retrieved events
# (the "Get new event details" loop iterates over df_get_these_event_details)

df_get_these_event_details = df_events_new

# Get event details for a sample of N events in df_events that are not in df_event_details

# max_events = 150

# set_events = set(df_events.loc[df_events['Sport']=='XC', 'IDMeet'])

# set_existing_event_details = set(df_event_details['IDMeet'])

# set_events_to_get = set_events - set_existing_event_details

# df_get_these_event_details = df_events[df_events['IDMeet'].isin(set_events_to_get)]

# if len(df_get_these_event_details) < max_events:
#     max_events = len(df_get_these_event_details)
    
# df_get_these_event_details = df_get_these_event_details.sample(max_events)

# print(len(set_events), len(set_existing_event_details), len(set_events_to_get), len(df_get_these_event_details))


In [20]:
# One-time to initialize flagValidEvent

# df_events['flagValidEvent'] = True
# df_events.loc[df_events['IDMeet'].isin(set_events_to_get), 'flagValidEvent'] = False

## Get new event details

In [None]:
# Get new event details
#
# For every selected event: fetch the division details of XC meets and collect
# the IDs of events that are not XC or for which nothing could be retrieved.

list_df_event_details = []
list_inactive_events = []

for i, r in df_get_these_event_details.iterrows():

    meet_id = r['IDMeet']

    print(i, meet_id)

    if r['Sport'] != 'XC':
        print('Not an XC event')
        list_inactive_events.append(meet_id)
        continue

    _df = get_event_details(meet_id)

    if _df is None:
        print('Failed to get details for {}'.format(r['MeetName']))
        list_inactive_events.append(meet_id)
    elif len(_df) == 0:
        print('No event details for {}'.format(r['MeetName']))
        list_inactive_events.append(meet_id)
    else:
        print('Got {} event details'.format(len(_df)))
        list_df_event_details.append(_df)

if len(list_df_event_details) > 0:
    df_new_event_details = pd.concat(list_df_event_details, axis=0)
else:
    print('No new valid events found')
    # Always define the frame so the statements and cells below do not fail
    # with a NameError when nothing was retrieved.
    df_new_event_details = pd.DataFrame(
        {col: pd.Series(dtype=dtype) for col, dtype in dict_schema_event_details.items()}
    )

# Tag any past event for which there was no info as invalid.
# pd.Timestamp.now() is used because the name `datetime` is bound to the
# datetime class in this notebook, so `datetime.datetime.now()` would raise.
dt_now = pd.Timestamp.now()
df_events.loc[df_events['IDMeet'].isin(list_inactive_events) & (df_events['EndDate'] < dt_now), 'flagValidEvent'] = False

print(len(df_new_event_details))
df_new_event_details = df_new_event_details.drop_duplicates()
print(len(df_new_event_details))

# TODO: Fix errors here

In [None]:
# Row count before, after appending the new details, and after de-duplication
print(len(df_event_details))

if len(df_new_event_details) == 0:
    print('No new event details')
else:
    df_event_details = pd.concat((df_event_details, df_new_event_details), axis=0)

print(len(df_event_details))

df_event_details = df_event_details.drop_duplicates()
print(len(df_event_details))


In [None]:
## Checks
# Compare the XC meets in the events table with the meets that have details.

_mask_xc = df_events['Sport'] == 'XC'
_mask_valid = df_events['flagValidEvent'] == True

set_events = set(df_events.loc[_mask_xc, 'IDMeet'])
set_events_valid = set(df_events.loc[_mask_xc & _mask_valid, 'IDMeet'])
set_events_in_event_details = set(df_event_details['IDMeet'])

# XC events (all / valid only) that have no details yet
events_in_e_not_ed = set_events.difference(set_events_in_event_details)
events_valid_in_e_not_ed = set_events_valid.difference(set_events_in_event_details)

# Details that point to a meet missing from the events table
events_in_ed_not_e = set_events_in_event_details.difference(set_events)

print('There are {} events without event details'.format(len(events_in_e_not_ed)))

# This should be zero
print('There are {} valid events without event details'.format(len(events_valid_in_e_not_ed)))

# This should be zero
print('There are {} events with event details that are not in the events table'.format(len(events_in_ed_not_e)))


## Get new results

* Start browser
* Get a list of event details with no results
* Get some results
* Save

2 modes:
* Build historical results
* Get recent results

In [None]:
# Get new results

# Figure out what events to get:

# At event level

# set_existing_results_div = set(df_results['IDMeetDiv'])
# set_results_div_to_get = set(df_event_details['IDMeetDiv']) - set_existing_results_div

# At meet level: meets that have event details but no stored results yet

set_existing_meets_with_results = set(df_results['IDMeet'])
set_meet_results_to_get = set(df_event_details['IDMeet']) - set_existing_meets_with_results

#set_results_div_to_get = set(df_event_details.loc[df_event_details['IDMeet'].isin(set_meet_results_to_get), 'IDMeetDiv'])

# Currently we just randomly grab a few meets (at most max_results):

max_results = 5

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample size at the number of meets that are still missing.
_candidates = list(set_meet_results_to_get)
sample_set = set(random.sample(_candidates, min(max_results, len(_candidates))))

df_event_details_for_new_results = df_event_details[df_event_details['IDMeet'].isin(sample_set)]

cnt_new_event_details = len(df_event_details_for_new_results)

# Report the number of meets actually sampled, which may be below max_results
print('Preparing to fetch results for {} events in {} meets'.format(cnt_new_event_details, len(sample_set)))
print(list(sample_set))

In [50]:
# Warm up the browser...
# start_browser() returns (browser, playwright); both handles are kept so the
# close_browser(browser, playwright) cell can shut them down afterwards.
browser, playwright = await start_browser()

In [None]:

# Get the results!

list_df_event_results = []
list_bad_results = []

for i, r in df_event_details_for_new_results.iterrows():
    
    meet_div_id = r['IDMeetDiv']
    meet_id = r['IDMeet']
    meet_name = df_events.loc[df_events['IDMeet'] == meet_id, 'MeetName']
    
    if FLAG_DEBUG:
        print(i, meet_id, meet_div_id)
    
    url = 'https://www.athletic.net/CrossCountry/meet/{}/results/{}'.format(meet_id, meet_div_id)
    
    #if FLAG_DEBUG:
    print(i, url)
    
    results_soup = await fetch_page(browser, url)
    
    if FLAG_DEBUG:
        print(type(results_soup))
        
    _df = get_results_from_html_athletic(results_soup)
    
    if _df is None:
        print('Failed to get results for {} at {}'.format(meet_div_id, meet_name))
        list_bad_results.append(meet_div_id)
    else:
        if len(_df) == 0:
            print('No results for {} at {}'.format(meet_div_id, meet_name))
            list_bad_results.append(meet_div_id)
        else:
            print('Got {} results'.format(len(_df)))
            _df['IDMeet'] = meet_id
            _df['IDMeetDiv'] = meet_div_id
            list_df_event_results.append(_df)
                
    # else:
    #     print('Not an XC event')
    #     list_bad_results.append(meet_id)

if len(list_df_event_results) > 0:
    df_new_event_results = pd.concat(list_df_event_results, axis=0)
else:
    print('No new valid results found')


print('DONE')


In [None]:
# Close the browser

close_browser(browser, playwright)

In [None]:
# Append the newly scraped results (if any) and drop exact duplicate rows

if len(df_new_event_results) > 0:
    df_results = pd.concat([df_results, df_new_event_results], axis=0).drop_duplicates()

print(df_results.shape)

## Save data

In [None]:

# Distinct meets / divisions in the meet and event tables ...
cnt_meets_all = df_events['IDMeet'].nunique()
cnt_events_all = df_event_details['IDMeetDiv'].nunique()

# ... and in the results table
cnt_meets_results, cnt_events_results = (
    df_results[_key].nunique() for _key in ('IDMeet', 'IDMeetDiv')
)

print('There are {} meets covering {} events in the meet/event data'.format(cnt_meets_all, cnt_events_all))
print('There are results for {} meets covering {} events in the results data'.format(cnt_meets_results, cnt_events_results))


In [None]:
# Save data!
# Every target file is backed up first; only then are the tables written.

if FLAG_SAVE:

    _tables = ((pe, df_events), (ped, df_event_details), (pr, df_results))

    # Create backups if files exist
    print('Backing files up...')
    for _path, _ in _tables:
        backup_if_exists(_path)

    # Save the new files
    print('Saving files...')
    for _path, _frame in _tables:
        _frame.to_csv(_path, index=False)

    print('Done!')

In [None]:
# .columns   <- incomplete scratch expression (no DataFrame named), which is a
# syntax error as written; prefix it with the frame to inspect, e.g. df_results.columns

# Ad-Hoc

In [57]:
# https://www.athletic.net/api/v1/public/GetStatesCountries2


In [58]:
# Show all events at Wrentham

#df_events[df_events['LocationName'].str.contains('wrentham', case=False)]