In [70]:
# Autoreload so we're using most recent modules
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd

# Add the project root to the Python path so the `src` package is importable.
# `__file__` is not defined inside a notebook, so the notebook directory is
# taken to be the kernel's current working directory; the project root is its
# parent directory.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Guard the append: this cell gets re-run, and an unconditional append would
# push another duplicate entry onto sys.path every time.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import from src
from src.data.collection import (
    fetch_games,
    fetch_team_game_stats,
    fetch_advanced_team_game_stats,
    convert_to_dataframe
)
from pprint import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get Games

In [None]:
## Get Games

# Fetch games for 2022 to 2023
start_year = 2022
end_year = 2023

try:
    games = fetch_games(start_year, end_year)

    # Convert to pandas dataframe. The name `df` is kept because later cells
    # in this notebook read it.
    df = convert_to_dataframe(games)

    # Display basic information about the dataframe
    print(f"Fetched {len(df)} games from {start_year} to {end_year}")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the summary.
    df.info()

    # save to parquet
    output_path = "../data/games.parquet"
    df.to_parquet(output_path, index=False)
    print(f"Games data saved to {output_path}")

except Exception as e:
    print(f"An error occurred while fetching games: {str(e)}")
    # Re-raise so the traceback is preserved and execution stops here; later
    # cells read `df` and must not run against a missing or stale frame.
    raise

Successfully fetched games for 2022, conference: SEC
Successfully fetched games for 2022, conference: B1G
Successfully fetched games for 2022, conference: ACC
Successfully fetched games for 2022, conference: B12
Successfully fetched games for 2022, conference: PAC
Successfully fetched games for 2023, conference: SEC
Successfully fetched games for 2023, conference: B1G
Successfully fetched games for 2023, conference: ACC
Successfully fetched games for 2023, conference: B12
Successfully fetched games for 2023, conference: PAC
[{'attendance': 9346,
 'away_conference': 'SEC',
 'away_division': 'fbs',
 'away_id': 238,
 'away_line_scores': [14, 7, 35, 7],
 'away_points': 63,
 'away_post_win_prob': 0.9996410399011048,
 'away_postgame_elo': 1431,
 'away_pregame_elo': 1222,
 'away_team': 'Vanderbilt',
 'completed': True,
 'conference_game': False,
 'excitement_index': 2.4199370723,
 'highlights': None,
 'home_conference': 'Mountain West',
 'home_division': 'fbs',
 'home_id': 62,
 'home_line_sco

## Team Game Stats

In [38]:
# Fetch team game stats for 2022 and 2023
team_game_stats = fetch_team_game_stats(2022, 2023)


# Function to safely process stats
def process_stats(stats):
    """Flatten one row's stat list into a ``{category: stat}`` dict.

    Args:
        stats: Ideally a list of dicts that each carry a 'category' and a
            'stat' key. Any other value (for example NaN in a missing cell)
            is tolerated.

    Returns:
        A dict mapping each category to its stat. List items that are not
        dicts or lack either key are skipped; a non-list input gives ``{}``.
    """
    if not isinstance(stats, list):
        return {}
    return {
        item['category']: item['stat']
        for item in stats
        if isinstance(item, dict) and 'category' in item and 'stat' in item
    }


def _to_numeric_if_possible(column):
    """Return ``column`` converted with pd.to_numeric, or unchanged on failure.

    This is the explicit form of ``pd.to_numeric(..., errors='ignore')``,
    which pandas has deprecated: if any value cannot be parsed, the whole
    column is handed back untouched.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


if team_game_stats:
    # Convert to DataFrame
    team_stats_df = convert_to_dataframe(team_game_stats)

    # Expand the 'teams' column: one row per element of each 'teams' list.
    # explode() repeats the source row's index label for every element, so the
    # index is reset to keep labels unique for the column-wise concats below.
    team_stats_df = team_stats_df.explode('teams').reset_index(drop=True)
    team_stats_df = pd.concat([team_stats_df.drop(['teams'], axis=1),
                               team_stats_df['teams'].apply(pd.Series)], axis=1)

    # Rename columns for clarity
    team_stats_df = team_stats_df.rename(columns={
        'school_id': 'team_id',
        'school': 'team_name',
        'conference': 'team_conference'
    })

    # Explode the 'stats' column: one dict per row, then a single DataFrame
    # construction instead of building a pd.Series for every row.
    stats_records = team_stats_df['stats'].apply(process_stats)
    stats_df = pd.DataFrame(stats_records.tolist(), index=team_stats_df.index)

    # Merge the exploded stats back into the main dataframe
    team_stats_df = pd.concat([team_stats_df.drop('stats', axis=1), stats_df], axis=1)

    # Convert numeric columns to appropriate types. Identifier columns are
    # excluded; errors='ignore' on Index.drop keeps this from raising KeyError
    # if one of them is absent from the fetched data.
    numeric_columns = team_stats_df.columns.drop(
        ['id', 'team_id', 'team_name', 'team_conference', 'home_away'],
        errors='ignore',
    )
    team_stats_df[numeric_columns] = team_stats_df[numeric_columns].apply(_to_numeric_if_possible)

    # Display the first few rows
    print("Sample team game stats:")
    display(team_stats_df.head())

    # Save to parquet
    team_stats_df.to_parquet("../data/team_game_stats.parquet", index=False)
    print("Improved team game stats saved to parquet file.")
else:
    print("Failed to fetch team game stats.")

Successfully fetched team game stats for 2022, conference: SEC
Successfully fetched team game stats for 2022, conference: B1G
Successfully fetched team game stats for 2022, conference: ACC
Successfully fetched team game stats for 2022, conference: B12
Successfully fetched team game stats for 2022, conference: PAC
Successfully fetched team game stats for 2023, conference: SEC
Successfully fetched team game stats for 2023, conference: B1G
Successfully fetched team game stats for 2023, conference: ACC
Successfully fetched team game stats for 2023, conference: B12
Successfully fetched team game stats for 2023, conference: PAC
Sample team game stats:


Unnamed: 0,id,teams
0,401403853,"[{'school_id': 62, 'school': 'Hawai'i', 'confe..."
1,401403856,"[{'school_id': 2, 'school': 'Auburn', 'confere..."
2,401403854,"[{'school_id': 328, 'school': 'Utah State', 'c..."
3,401403865,"[{'school_id': 245, 'school': 'Texas A&M', 'co..."
4,401403866,"[{'school_id': 238, 'school': 'Vanderbilt', 'c..."


Team game stats saved to parquet file.


## Advanced Team Stats

In [71]:
## Advanced Team Stats

# Fetch advanced team game stats for 2022 to 2023
start_year = 2022
end_year = 2023
max_teams = 5  # For testing, set to None to use all teams

try:
    # `df` is the games dataframe built in the "Get Games" cell; that cell
    # must have been run first. The call returns the stats frame together
    # with the elapsed time that is reported below.
    advanced_df, execution_time = fetch_advanced_team_game_stats(start_year, end_year, df, max_teams)

    # Display basic information about the dataframe
    print(f"Fetched {len(advanced_df)} advanced team game stats in {execution_time:.2f} seconds")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the summary.
    advanced_df.info()

    # Save to parquet
    output_path = "../data/advanced_team_game_stats.parquet"
    advanced_df.to_parquet(output_path, index=False)
    print(f"Advanced team game stats data saved to {output_path}")

except Exception as e:
    print(f"An error occurred while fetching advanced team game stats: {str(e)}")
    # Re-raise so the traceback is preserved and a failed fetch or save is
    # not mistaken for a cell that completed successfully.
    raise

Progress: 9/9 teams processed
Fetched 108 advanced team game stats in 26.23 seconds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 61 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   game_id                           108 non-null    int64  
 1   season                            0 non-null      object 
 2   week                              108 non-null    int64  
 3   team                              108 non-null    object 
 4   opponent                          108 non-null    object 
 5   off_plays                         108 non-null    int64  
 6   off_drives                        108 non-null    int64  
 7   off_ppa                           108 non-null    float64
 8   off_total_ppa                     108 non-null    float64
 9   off_success_rate                  108 non-null    float64
 10  off_explosiveness                 108 non-null    

In [65]:
# Count the distinct values in the games dataframe's `home_team` column
df["home_team"].nunique()

98