In [55]:
# Autoreload so we're using most recent modules
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd

# Add the project root to the Python path so the `src` package can be imported.
# A notebook has no usable __file__, so the notebook directory is taken to be
# the kernel's current working directory (this is what the previous
# abspath('__file__') string-literal trick resolved to).
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Re-running this cell must not keep stacking duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import from src
from src.data.collection import (
    fetch_games,
    fetch_team_game_stats,
    fetch_advanced_team_game_stats_v1,
    fetch_advanced_team_game_stats_v2,
    convert_to_dataframe
)
from pprint import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get Games

In [None]:
## Get Games
# Fetch every game in the season range below, summarise it, and cache it as parquet.

# Fetch games for 2022 to 2023
start_year = 2022
end_year = 2023

try:
    games = fetch_games(start_year, end_year)

    # Convert to pandas dataframe
    df = convert_to_dataframe(games)

    # Display basic information about the dataframe.
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    print(f"Fetched {len(df)} games from {start_year} to {end_year}")
    df.info()

    # Save to parquet; create the target directory first so a fresh checkout
    # without a data/ folder does not fail at the very last step.
    output_path = "../data/games.parquet"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_parquet(output_path, index=False)
    print(f"Games data saved to {output_path}")

except Exception as e:
    # Best-effort cell: report the failure (fetch, conversion or save) and let
    # the rest of the notebook continue.
    print(f"An error occurred while fetching games: {str(e)}")

Successfully fetched games for 2022, conference: SEC
Successfully fetched games for 2022, conference: B1G
Successfully fetched games for 2022, conference: ACC
Successfully fetched games for 2022, conference: B12
Successfully fetched games for 2022, conference: PAC
Successfully fetched games for 2023, conference: SEC
Successfully fetched games for 2023, conference: B1G
Successfully fetched games for 2023, conference: ACC
Successfully fetched games for 2023, conference: B12
Successfully fetched games for 2023, conference: PAC
[{'attendance': 9346,
 'away_conference': 'SEC',
 'away_division': 'fbs',
 'away_id': 238,
 'away_line_scores': [14, 7, 35, 7],
 'away_points': 63,
 'away_post_win_prob': 0.9996410399011048,
 'away_postgame_elo': 1431,
 'away_pregame_elo': 1222,
 'away_team': 'Vanderbilt',
 'completed': True,
 'conference_game': False,
 'excitement_index': 2.4199370723,
 'highlights': None,
 'home_conference': 'Mountain West',
 'home_division': 'fbs',
 'home_id': 62,
 'home_line_sco

## Team Game Stats

In [38]:
def process_stats(stats):
    """Flatten one team's stat list into a ``{category: stat}`` mapping.

    Items that are not dicts, or that lack a 'category' or 'stat' key, are
    skipped. Any input that is not a list (e.g. a missing value) yields ``{}``.
    """
    if isinstance(stats, list):
        return {
            item['category']: item['stat']
            for item in stats
            if isinstance(item, dict) and 'category' in item and 'stat' in item
        }
    return {}


def _to_numeric_or_keep(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if it cannot be parsed.

    Explicit replacement for ``pd.to_numeric(..., errors='ignore')``, which is
    deprecated since pandas 2.2.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Fetch team game stats for 2022 and 2023
team_game_stats = fetch_team_game_stats(2022, 2023)

if team_game_stats:
    # Convert to DataFrame
    team_stats_df = convert_to_dataframe(team_game_stats)

    # Expand the 'teams' column: one row per entry of each game's 'teams' list.
    # ignore_index=True gives every resulting row a unique label instead of
    # repeating the game's original label once per team.
    team_stats_df = team_stats_df.explode('teams', ignore_index=True)
    team_stats_df = pd.concat(
        [team_stats_df.drop(['teams'], axis=1), team_stats_df['teams'].apply(pd.Series)],
        axis=1,
    )

    # Rename columns for clarity
    team_stats_df = team_stats_df.rename(columns={
        'school_id': 'team_id',
        'school': 'team_name',
        'conference': 'team_conference'
    })

    # Explode the 'stats' column into one column per stat category
    stats_df = team_stats_df['stats'].apply(process_stats).apply(pd.Series)

    # Merge the exploded stats back into the main dataframe
    team_stats_df = pd.concat([team_stats_df.drop('stats', axis=1), stats_df], axis=1)

    # Convert numeric columns to appropriate types; identifier/label columns
    # are excluded, and columns that do not parse cleanly are left as they are.
    numeric_columns = team_stats_df.columns.drop(['id', 'team_id', 'team_name', 'team_conference', 'home_away'])
    team_stats_df[numeric_columns] = team_stats_df[numeric_columns].apply(_to_numeric_or_keep)

    # Display the first few rows
    print("Sample team game stats:")
    display(team_stats_df.head())

    # Save to parquet (creating the data directory if it is missing)
    os.makedirs("../data", exist_ok=True)
    team_stats_df.to_parquet("../data/team_game_stats.parquet", index=False)
    print("Improved team game stats saved to parquet file.")
else:
    print("Failed to fetch team game stats.")

Successfully fetched team game stats for 2022, conference: SEC
Successfully fetched team game stats for 2022, conference: B1G
Successfully fetched team game stats for 2022, conference: ACC
Successfully fetched team game stats for 2022, conference: B12
Successfully fetched team game stats for 2022, conference: PAC
Successfully fetched team game stats for 2023, conference: SEC
Successfully fetched team game stats for 2023, conference: B1G
Successfully fetched team game stats for 2023, conference: ACC
Successfully fetched team game stats for 2023, conference: B12
Successfully fetched team game stats for 2023, conference: PAC
Sample team game stats:


Unnamed: 0,id,teams
0,401403853,"[{'school_id': 62, 'school': 'Hawai'i', 'confe..."
1,401403856,"[{'school_id': 2, 'school': 'Auburn', 'confere..."
2,401403854,"[{'school_id': 328, 'school': 'Utah State', 'c..."
3,401403865,"[{'school_id': 245, 'school': 'Texas A&M', 'co..."
4,401403866,"[{'school_id': 238, 'school': 'Vanderbilt', 'c..."


Team game stats saved to parquet file.


## Advanced Team Stats

In [56]:
# Parameters for the V1 vs V2 comparison run
start_year = 2022
end_year = 2023
max_teams = 5  # For V1 only, set to None to use all teams
max_years = 1  # Set to None to use all years in the range

# Version 1 (filter before API call) runs first, then Version 2 (filter after API call).
# Each call returns the fetched stats together with its execution time.
v1_stats, v1_time = fetch_advanced_team_game_stats_v1(start_year, end_year, team_stats_df, max_teams, max_years)
v2_stats, v2_time = fetch_advanced_team_game_stats_v2(start_year, end_year, team_stats_df, max_years)

if not (v1_stats and v2_stats):
    print("Failed to fetch advanced team game stats for one or both versions.")
else:
    # Convert to DataFrames
    v1_df = convert_to_dataframe(v1_stats)
    v2_df = convert_to_dataframe(v2_stats)

    # Size and throughput summary, one pass per version
    for heading, frame, elapsed in (
        ("\nVersion 1 Results (Filter Before):", v1_df, v1_time),
        ("\nVersion 2 Results (Filter After):", v2_df, v2_time),
    ):
        print(heading)
        print(f"Rows: {len(frame)}")
        print(f"Execution time: {elapsed:.2f} seconds")
        print(f"Rows per second: {len(frame) / elapsed:.2f}")

    # Compare data consistency via the sets of game ids each version returned
    v1_games = set(v1_df['game_id'])
    v2_games = set(v2_df['game_id'])
    common_games = v1_games & v2_games

    print(f"\nCommon games between versions: {len(common_games)}")
    print(f"Games only in V1: {len(v1_games.difference(v2_games))}")
    print(f"Games only in V2: {len(v2_games.difference(v1_games))}")

    # Display sample data from each version
    for heading, frame in (
        ("\nSample data from Version 1:", v1_df),
        ("\nSample data from Version 2:", v2_df),
    ):
        print(heading)
        display(frame.head())

    # Save to parquet (optional)
    # v1_df.to_parquet("../data/advanced_team_game_stats_v1.parquet", index=False)
    # v2_df.to_parquet("../data/advanced_team_game_stats_v2.parquet", index=False)

Exception when calling StatsApi->get_advanced_team_game_stats for year 2022, team Arkansas-Pine Bluff: (401)
Reason: Unauthorized
HTTP response headers: HTTPHeaderDict({'Date': 'Fri, 30 Aug 2024 01:05:42 GMT', 'Content-Type': 'text/html; charset=utf-8', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'X-DNS-Prefetch-Control': 'off', 'Expect-CT': 'max-age=0', 'X-Frame-Options': 'SAMEORIGIN', 'Strict-Transport-Security': 'max-age=15552000; includeSubDomains', 'X-Download-Options': 'noopen', 'X-Content-Type-Options': 'nosniff', 'X-Permitted-Cross-Domain-Policies': 'none', 'Referrer-Policy': 'no-referrer', 'X-XSS-Protection': '0', 'Vary': 'Origin', 'CF-Cache-Status': 'DYNAMIC', 'Report-To': '{"endpoints":[{"url":"https:\\/\\/a.nel.cloudflare.com\\/report\\/v4?s=HBIXTtHNXiFm%2F9qxUg9X7HymD9ZT0gw1nsdn8fyDxgdHwGezATA5bn%2FGiBkI2SXS7kdckfjM2dHkDT8APvtD5a2iQXHL9NbPcNmJ1skg3h9tzgIsPGRMt3mzu2GilcP53bRAJQ14Nh6IhPK99vwVgEzwj6ZeN%2FRms1o%3D"}],"group":"cf-nel","max_age":604800}', 'NEL': 

## Get Player Game Stats

In [35]:
## Get Player Game Stats

# NOTE(review): fetch_player_stats is not among the names imported by the
# notebook's import cell, so this cell raises NameError on a fresh kernel.
# It is presumably defined in src.data.collection next to the other fetch_*
# helpers - confirm, then consider moving this import into the import cell.
from src.data.collection import fetch_player_stats

# Fetch player game stats for 2022 and 2023
start_year = 2022
end_year = 2023

# Root directory of the year-partitioned player stats dataset
player_stats_root = "../data/player_game_stats"

for year in range(start_year, end_year + 1):
    player_stats = fetch_player_stats(year)

    if player_stats:
        # Convert to dataframe using the existing function
        player_stats_df = convert_to_dataframe(player_stats)

        # Display the first few rows (rich display, like the team stats cell)
        print(f"Sample data for {year}:")
        display(player_stats_df.head())

        # Save to parquet, partitioned by year. Each year gets its own
        # `year=<YYYY>/` directory holding one parquet file, instead of an
        # extension-less file literally named `year=<YYYY>`. The directory is
        # created first because to_parquet does not create missing parents.
        partition_dir = os.path.join(player_stats_root, f"year={year}")
        os.makedirs(partition_dir, exist_ok=True)
        output_path = os.path.join(partition_dir, "data.parquet")
        player_stats_df.to_parquet(output_path, index=False)

        print(f"Successfully fetched and saved player stats for year {year}.")
    else:
        print(f"Failed to fetch player stats for year {year}.")

Successfully fetched player stats for year 2022, week 1
Successfully fetched player stats for year 2022, week 2


KeyboardInterrupt: 

## Next

-   Betting Lines
-   Team Rankings
-   Drive Data
-   Team Talent Composite
-   Advanced Team Stats
-   Coach Information