# XC Result Collector

This script collects events and event results from https://www.athletic.net/

It is focused on Massachusetts High School XC.

# Config

In [139]:
import os
import requests
import datetime

import pandas as pd
import numpy as np


In [140]:
# Data directory (relative to the notebook) and the CSV files kept inside it
path_data = os.path.join('..', 'data')
fn_events, fn_event_details = 'db_events.csv', 'db_event_details.csv'
fn_athletes, fn_results = 'db_athletes.csv', 'db_results.csv'

In [141]:
# When True, the events and event-details DataFrames are written back to
# their CSV files by the save cell.
FLAG_SAVE = True

# NOTE(review): not referenced anywhere in the visible part of this notebook;
# presumably meant to control re-fetching of already-stored events -- confirm.
FLAG_UPDATE_EXISTING = False

# When True, create_df_event_details() prints the meet ID and the raw JSON
# payload it received.
FLAG_DEBUG = True

# Load preexisting data

## Define schemas

In [142]:
# Schema
#
# Column name -> target type for the events table. The values are the type
# markers that df_fix_types() dispatches on (str, int, float, bool or the
# string 'datetime64[ns]'). The same mapping is used to build an empty, typed
# DataFrame when no events CSV exists yet, so key order is the column order
# of that empty frame.

dict_schema_events = {
    'IDMeet': int,
    'Sport': str,
    'MeetName': str,
    'SeasonId': int,
    'StartDate': 'datetime64[ns]',
    'EndDate': 'datetime64[ns]',
    'Virtual': bool,
    'LocationName': str,
    'StreetAddress': str,  # nullable
    'City': str,
    'PostalCode': str,
    'State': str,
    'OwnerID': int,
    'CalendarLock': int,
    'UCalendarLock': int,
    'RegEnd': 'datetime64[ns]',  # nullable
    'HasResults': int,
    'LevelMask': int,
    'HostName': str,  # nullable
    'MascotUrl': str,  # nullable
    'Lat': float,
    'Long': float,
    'score': float,
    'CalCount': int,
    'OffDays': str,  # nullable, appears to be JSON string
    'Data': str,  # nullable
    'rsUrl': str,  # nullable
    'LiveID': int,  # nullable
    'LivePublished': bool,
    'VideoURL': str,  # nullable
    'Website': str,
    'Country': str,
    
    # Additional columns (not part of the API payload; added by this notebook):
    
    'dtRetrieved': 'datetime64[ns]',  # set in create_df_events() at fetch time
    # 'flag_results_fetched': bool
    
}

# Event Details
#
# Column name -> target type for the per-meet division table that
# create_df_event_details() builds from the 'xcDivisions' list of a
# GetMeetData response. Also used to build an empty, typed DataFrame when no
# event-details CSV exists yet.
dict_schema_event_details = {

    'CourseName': str,         # String, can be None
    'IDMeetDiv': int,          # Integer ID for division
    'HyTekId': int,            # Integer ID from HyTek
    'CourseId': int,           # Integer ID for course
    'LevelMask': int,          # Integer representing competition level
    'Gender': str,             # String ('M' or 'F')
    'DivName': str,            # Full division name with distance
    'Division': str,           # Short division name
    'Meters': int,             # Race distance in meters
    'Result': float,           # Numeric result, can be NaN
    'RaceTime': str,           # ISO format datetime string, can be None
    'Day': str,                # String representing day, can be None
    'PlaceDepth': int,         # Integer for place depth
    'ScoreDepth': int,         # Integer for score depth
    'results': object,         # Can be None
    'WarnScrollTo': str,       # String, can be None
    'TeamScores': object,      # Can be None
    # NOTE(review): differs from 'WarnScrollTo' above only by case -- confirm
    # that the API really returns both spellings.
    'warnScrollTo': str,       # String, can be None
    'warnSummaryString': str,  # String, can be None

    # Additional columns (added by create_df_event_details(), not part of
    # the 'xcDivisions' entries themselves):

    'dtRetrieved': 'datetime64[ns]',  # Timestamp of data retrieval
    'IDLocation': int,         # Location ID
    'IDMeet': int              # Meet ID
}

# Results
#
# Column name -> target type for the results table. In the visible part of
# this notebook it is only used to build an empty, typed DataFrame when no
# results CSV exists yet.

dict_schema_results = {
    'Date': 'datetime64[ns]',
    'EventID': int, 
    'Gender': str, 
    'RaceID': int, 
    'Location': int,
    'Grade': int,
    'AthleteID': int,
    'First Name': str,
    'Last Name': str,
    'Name': str,
    'Time': 'timedelta64[ns]',
    'School': str,
    # NOTE(review): 'Seconds' and 'Minutes' presumably express 'Time' as
    # floats -- confirm against the code that fills this table.
    'Seconds': float, 
    'Minutes': float,
    'Team Count': int, 
    'Team Position': int, 
    'Place': int, 
    'Points': int
}

## Functions    

In [122]:
def df_fix_types(df, dict_schema):
    """
    Coerce the columns of ``df`` to the types declared in ``dict_schema``.

    Schema entries whose column is absent from ``df`` are skipped, and
    columns of ``df`` that the schema does not mention are left untouched.
    ``df`` is modified in place and also returned.

    Conversion rules, keyed by the schema value:
        str                -> pandas nullable 'string'
        int                -> pd.to_numeric(errors='coerce'), then nullable 'Int64'
        float              -> pd.to_numeric(errors='coerce')
        bool               -> pandas nullable 'boolean'
        'datetime64[ns]'   -> pd.to_datetime(format='ISO8601')
        'timedelta64[ns]'  -> pd.to_timedelta
        anything else      -> column left as is (e.g. ``object``)

    Args:
        df: pandas.DataFrame to convert
        dict_schema: dict mapping column names to one of the markers above

    Returns:
        pandas.DataFrame: the same ``df`` object, with converted columns

    Raises:
        Exception: any error raised by a conversion is re-raised after the
            offending column and target type have been printed
    """

    # Convert each column according to its type
    for col, dtype in dict_schema.items():
        if col not in df.columns:
            continue

        try:
            if dtype == str:
                df[col] = df[col].astype('string')
            elif dtype == int:
                # Nullable Int64 so that unparsable / missing values become <NA>
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
            elif dtype == float:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            elif dtype == bool:
                # Plain astype(bool) turns NaN into True; the nullable
                # 'boolean' dtype keeps missing values as <NA>, consistent
                # with the 'string' / 'Int64' choices above.
                df[col] = df[col].astype('boolean')
            elif dtype == 'datetime64[ns]':
                df[col] = pd.to_datetime(df[col], format='ISO8601')
            elif dtype == 'timedelta64[ns]':
                df[col] = pd.to_timedelta(df[col])
        except Exception as e:
            print(f"Error converting column {col} to {dtype}: {str(e)}")
            raise

    return df

In [123]:

def create_df_events(response, schema):
    """
    Build a typed events DataFrame from an events API response.

    Args:
        response: requests.Response (or any object with ``.json()``) whose
            decoded body contains an 'events' list
        schema: dict mapping column names to their intended data types;
            forwarded to df_fix_types()

    Returns:
        pandas.DataFrame: one row per entry of 'events', plus a 'dtRetrieved'
        column holding the time this function ran, with columns converted
        by df_fix_types()
    """
    payload = response.json()

    events = pd.DataFrame(payload['events'])

    # Hard-coded additional field. Make sure its type is defined in the
    # table schema so that df_fix_types() converts it as well.
    events['dtRetrieved'] = pd.Timestamp.now()

    return df_fix_types(events, schema)


In [152]:
def create_df_event_details(meet_id, response, schema):
    """
    Build a typed per-division DataFrame from a GetMeetData API response.

    Args:
        meet_id: meet ID that was requested; only used in printed messages
        response: requests.Response (or any object with ``.json()``)
        schema: dict mapping column names to their intended data types;
            forwarded to df_fix_types()

    Returns:
        pandas.DataFrame or None: one row per entry of the payload's
        'xcDivisions', extended with 'IDLocation', 'dtRetrieved' and
        'IDMeet' (the latter taken from the payload, not from ``meet_id``);
        None when the decoded payload is None.
    """
    payload = response.json()

    if FLAG_DEBUG:
        print('Meet ID: {}'.format(meet_id))
        print(payload)

    # The API answers some meets with a JSON null body
    if payload is None:
        print('No valid response for {}'.format(meet_id))
        return None

    meet = payload['meet']
    location_id = meet['Location']['ID']
    meet_id_response = meet['ID']

    details = pd.DataFrame(payload['xcDivisions']).assign(
        IDLocation=location_id,
        dtRetrieved=pd.Timestamp.now(),
        IDMeet=meet_id_response,
    )

    return df_fix_types(details, schema)

In [146]:
def get_events(dt_start, dt_end, state_2, country_2='US', level=4, timeout=30):
    """
    Retrieves athletic events data from athletic.net API for a specified state and date range.

    Args:
        dt_start (str): Start date in 'YYYY-MM-DD' format
        dt_end (str): End date in 'YYYY-MM-DD' format
        state_2 (str): Two-letter state code (e.g., 'MA' for Massachusetts)
        country_2 (str, optional): Two-letter country code. Defaults to 'US'
        level (int, optional): Competition level filter. Defaults to 4.
            NOTE: currently not forwarded to the API; the request always
            sends ``levelMask`` 0, so events of every level are returned.
        timeout (float or tuple, optional): Seconds to wait for the server,
            passed to ``requests.post``. Defaults to 30. Without a timeout
            a stalled connection would block forever.

    Returns:
        pandas.DataFrame or None: DataFrame containing event information if the server
                                answers with HTTP 200, None for any other status code.
                                DataFrame structure is determined by
                                create_df_events() function using dict_schema_events

    Raises:
        requests.exceptions.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...)

    Example:
        >>> events_df = get_events('2024-11-01', '2024-11-30', 'MA')
        >>> if events_df is not None:
        ...     print(events_df.shape)

    Notes:
        - Makes a POST request to athletic.net's API endpoint
        - Requires the create_df_events() function and dict_schema_events schema
        - Only a minimal header set is sent; browser headers such as
          'anet-appinfo', 'origin', 'referer' and 'user-agent' are omitted
          but may be needed if the API starts rejecting requests
        - API endpoint: https://www.athletic.net/api/v1/Event/Events
    """

    url = 'https://www.athletic.net/api/v1/Event/Events'

    params = {
        "start": dt_start,
        "end": dt_end,
        "levelMask": 0,
        "sportMask": 0,
        "country": country_2,
        "state": state_2,
        "location": "",
        "distanceKM": 0,
        "filterTerm": "",
    }

    headers = {
        'content-type': 'application/json',
        'authority': 'www.athletic.net',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9',
    }

    # Make the POST request
    response = requests.post(url, json=params, headers=headers, timeout=timeout)

    # Check the response
    if response.status_code == 200:
        print("Request successful!")

        # TODO: Make the schema a parameter
        df_events_new = create_df_events(response, dict_schema_events)

    else:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Response: {response.text}")

        df_events_new = None

    return df_events_new


In [149]:
def get_event_details(meet_id, timeout=30):
    """
    Fetch the XC meet data for one meet from athletic.net.

    Args:
        meet_id: meet ID, inserted into the GetMeetData URL as ``meetId``
        timeout (float or tuple, optional): Seconds to wait for the server,
            passed to ``requests.get``. Defaults to 30. This function is
            called once per meet in a loop, so a single stalled connection
            would otherwise block the whole run.

    Returns:
        pandas.DataFrame or None: result of create_df_event_details() using
        dict_schema_event_details when the server answers with HTTP 200
        (which may itself be None for an empty payload); None for any other
        status code.

    Raises:
        requests.exceptions.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...)
    """

    url = 'https://www.athletic.net/api/v1/Meet/GetMeetData?meetId={}&sport=xc'.format(meet_id)

    headers = {
        'content-type': 'application/json',
        'authority': 'www.athletic.net',
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9'
    }

    # Make the GET request
    response = requests.get(url, headers=headers, timeout=timeout)

    # Check if request was successful
    if response.status_code == 200:
        print("Request successful!")

        df_event_details_new = create_df_event_details(meet_id, response, dict_schema_event_details)

    else:
        print(f"Request failed with status code: {response.status_code}")
        print(f"Response: {response.text}")

        df_event_details_new = None

    return df_event_details_new



In [150]:
    # IDMeet =  243293
    
    # url = 'https://www.athletic.net/api/v1/Meet/GetMeetData?meetId={}&sport=xc'.format(IDMeet)

    # headers = {
    #     'content-type': 'application/json',
    #     'authority': 'www.athletic.net',
    #     'accept': 'application/json, text/plain, */*',
    #     'accept-encoding': 'gzip, deflate, br, zstd',
    #     'accept-language': 'en-US,en;q=0.9'
    # }

    # # Make the GET request
    # response = requests.get(url, headers=headers)

    # # Check if request was successful
    # if response.status_code == 200:
    #     print("Request successful!")
        
    #     #data = response.json()  # Parse JSON response
    #     # Process your data here
        
    #     df_event_details_new = create_df_event_details(response, dict_schema_event_details)
        
    # else:
    #     print(f"Request failed with status code: {response.status_code}")
    #     print(f"Response: {response.text}")
        
    #     df_event_details_new = None

## Load data

In [128]:
# Events: read the saved CSV if there is one, otherwise start from an empty
# frame whose columns follow dict_schema_events.

pe = os.path.join(path_data, fn_events)

if not os.path.isfile(pe):
    df_events = pd.DataFrame(
        {name: pd.Series(dtype=kind) for name, kind in dict_schema_events.items()}
    )
else:
    df_events = df_fix_types(pd.read_csv(pe), dict_schema_events)


# Event Details: read the saved CSV if there is one, otherwise start from an
# empty frame whose columns follow dict_schema_event_details.

ped = os.path.join(path_data, fn_event_details)

if not os.path.isfile(ped):
    df_event_details = pd.DataFrame(
        {name: pd.Series(dtype=kind) for name, kind in dict_schema_event_details.items()}
    )
else:
    df_event_details = df_fix_types(pd.read_csv(ped), dict_schema_event_details)


# Results: read the saved CSV if there is one, otherwise start from an empty
# frame whose columns follow dict_schema_results.

pr = os.path.join(path_data, fn_results)

if os.path.isfile(pr):
    # Must bind df_results: assigning to df_events here would replace the
    # events table with results and leave df_results undefined.
    df_results = pd.read_csv(pr)
else:
    df_results = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in dict_schema_results.items()})
    


## Load new events

In [133]:
# Hard-coded dates
# Use this to back-fill older events

df_events_new = get_events('2024-09-01', '2024-12-31', 'MA')

# get_events() returns None when the server does not answer with HTTP 200;
# stop here with a clear message instead of failing on None['IDMeet'].
if df_events_new is None:
    raise RuntimeError('Could not retrieve events from athletic.net; see the request output above')

set_new_events = set(df_events_new['IDMeet'])

print(len(df_events_new))

Request successful!
444


In [134]:
# Merge the freshly fetched events into df_events. When a meet appears more
# than once, only the most recently retrieved row is kept.

# TODO: Add logic to make updating more flexible

set_existing_events = set(df_events['IDMeet'])

cnt_events_existing, cnt_events_new = len(df_events), len(df_events_new)

df_events = (
    pd.concat([df_events, df_events_new], axis=0)
    .dropna(how='all')
    .sort_values('dtRetrieved', ascending=True)
    .drop_duplicates(subset=['IDMeet'], keep='last')
)

# Re-apply the schema: concatenation may have changed column dtypes
df_events = df_fix_types(df_events, dict_schema_events)

cnt_events_total = len(df_events)

# Rows that went into the merge but did not survive it
cnt_dupes = (cnt_events_existing + cnt_events_new) - cnt_events_total

print('Merged {} events into existing set of {}; final count is {}. {} duplicates'.format(cnt_events_new, cnt_events_existing, cnt_events_total, cnt_dupes))

Merged 444 events into existing set of 1470; final count is 1497. 417 duplicates


In [153]:
# Get new event details: one GetMeetData request per newly fetched meet.

list_df_event_details = []

# Only the meet ID and name are needed, so iterate those two columns
# instead of materialising every row with iterrows().
for meet_id, meet_name in zip(df_events_new['IDMeet'], df_events_new['MeetName']):

    print(meet_id)

    _df = get_event_details(meet_id)

    if _df is None:
        print('Failed to get details for {}'.format(meet_name))
    elif not _df.empty:
        # Meets without any division rows are skipped: they add nothing to
        # the result, and concatenating empty frames is deprecated in pandas.
        list_df_event_details.append(_df)

if list_df_event_details:
    df_new_events = pd.concat(list_df_event_details, axis=0)
else:
    # pd.concat([]) raises ValueError; fall back to an empty, typed frame
    df_new_events = pd.DataFrame({col: pd.Series(dtype=dtype) for col, dtype in dict_schema_event_details.items()})

print(len(df_new_events))
df_new_events = df_new_events.drop_duplicates()
print(len(df_new_events))

# NOTE(review): df_new_events is not merged into df_event_details in this
# cell -- confirm whether that is intended before saving.
# TODO: Fix errors here

571447
Request successful!
Meet ID: 571447
None
No valid response for 571447
Failed to get details for Vaultarama #6
244471
Request successful!
Meet ID: 244471
{'sport': 'xc', 'sport1': 'xc', 'sport2': 'xc', 'meet': {'AcceptOverrides': 0, 'Location': {'Courses': None, 'Meets': [], 'ID': 45019, 'Name': 'Hampshire Regional High School', 'Address': '19 Stage Rd', 'City': 'Westhampton', 'State': 'MA', 'PostalCode': '01027', 'Country': 'US', 'Country3': 'USA', 'Venue': None, 'Verified': False, 'Notes': None, 'Altitude': None, 'Lat': 42.3027621, 'Long': -72.7717766, 'GoogleData': {'address_components': [{'types': ['street_number'], 'short_name': '19', 'long_name': '19'}, {'types': ['route'], 'short_name': 'Stage Rd', 'long_name': 'Stage Road'}, {'types': ['locality', 'political'], 'short_name': 'Westhampton', 'long_name': 'Westhampton'}, {'types': ['administrative_area_level_2', 'political'], 'short_name': 'Hampshire County', 'long_name': 'Hampshire County'}, {'types': ['administrative_area_

  df_new_events = pd.concat(list_df_event_details, axis=0)


In [157]:
if FLAG_SAVE:
    # to_csv() does not create missing directories. The load cells accept a
    # missing data file, so a first run may also lack the data directory.
    os.makedirs(path_data, exist_ok=True)

    df_events.to_csv(pe, index=False)
    # NOTE(review): only df_event_details is written here; the freshly
    # fetched df_new_events is not -- confirm whether it should be merged in
    # first.
    df_event_details.to_csv(ped, index=False)

In [58]:
# https://www.athletic.net/api/v1/public/GetStatesCountries2


In [158]:
# Show all events whose location name mentions Wrentham (case-insensitive)

mask_wrentham = df_events['LocationName'].str.contains('wrentham', case=False)
df_events[mask_wrentham]

Unnamed: 0,IDMeet,Sport,MeetName,SeasonId,StartDate,EndDate,Virtual,LocationName,StreetAddress,City,...,CalCount,OffDays,Data,rsUrl,LiveID,LivePublished,VideoURL,Website,Country,dtRetrieved
707,230367,XC,Hockomock League Championship,2023,2023-10-28,2023-10-28,False,Wrentham Development Center,Emerald Street,Wrentham,...,1,,,,,True,,,US,2024-11-19 19:37:45.380727
703,233659,XC,Hockomock League Championships,2023,2023-10-28,2023-10-28,False,Wrentham Development Center,131 Emerald St,Wrentham,...,14,,,,28731,True,,,US,2024-11-19 19:37:45.380727
701,233660,XC,South Coast Conference Meet,2023,2023-10-28,2023-10-28,False,Wrentham Development Center,Emerald Street,Wrentham,...,10,{},,,28732,True,,,US,2024-11-19 19:37:45.380727
693,233663,XC,Cape Ann League Championships,2023,2023-10-28,2023-10-28,False,Wrentham Development Center,131 Emerald St,Wrentham,...,13,{},,,28735,True,,,US,2024-11-19 19:37:45.380727
686,233657,XC,Southeast Conference Meet,2023,2023-10-25,2023-10-25,False,Wrentham Developmental Center,131 Emerald St,Wrentham,...,7,,,,28725,True,,,US,2024-11-19 19:37:45.380727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,250198,XC,The Big Three XC Championships,2024,2024-10-23,2024-10-23,False,Wrentham Developmental Center,131 Emerald Street,Wrentham,...,4,,,,41399,True,,,US,2024-11-19 21:45:54.541469
373,250201,XC,Southeastern Conference XC Championships,2024,2024-10-25,2024-10-25,False,Wrentham Development Center,,Wrentham,...,3,,,,41400,True,,,US,2024-11-19 21:45:54.541469
192,248137,XC,Bay,2024,2024-09-28,2024-09-28,False,Wrentham Developmental Center,131 Emerald Street,Wrentham,...,1,,,,,True,,,US,2024-11-19 21:45:54.541469
194,242588,XC,MSTCA Bay State XC Invitational,2024,2024-09-28,2024-09-28,False,Wrentham Developmental Center,131 Emerald Street,Wrentham,...,70,,,,40580,True,,https://mstca.org/meet/2024-bay-state-invitati...,US,2024-11-19 21:45:54.541469
