In [2]:
# Autoreload so we're using most recent modules
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd

# Add the project root to the Python path so that `src` can be imported.
# The notebook directory is taken from the current working directory (this
# assumes the kernel was started in the notebook's own folder); the project
# root is its parent directory.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import from src
from src.data.collection import (
    fetch_games,
    fetch_team_game_stats,
    fetch_advanced_team_game_stats,
    convert_to_dataframe,
    fetch_team_talent
)
from pprint import pprint

## Inputs

In [1]:
# Season range passed to every fetch_* call in the cells below.
start_year, end_year = 2021, 2023

## Get Games

In [None]:
## Get Games
# Calls fetch_games for the configured year range, passes the result through
# convert_to_dataframe, and writes the frame to parquet.
# NOTE: `df` is consumed by the advanced team stats cell further down. If this
# cell fails, `df` is left undefined (or stale from an earlier run).

try:
    games = fetch_games(start_year, end_year)

    # Convert to pandas dataframe
    df = convert_to_dataframe(games)

    # Display basic information about the dataframe.
    # DataFrame.info() prints its own summary and returns None, so it must not
    # be wrapped in print() (that would emit a stray "None" line).
    print(f"Fetched {len(df)} games from {start_year} to {end_year}")
    df.info()

    # Save to parquet
    output_path = "../data/games.parquet"
    df.to_parquet(output_path, index=False)
    print(f"Games data saved to {output_path}")

except Exception as e:
    # Best effort: report the failure instead of aborting the notebook run.
    print(f"An error occurred while fetching games: {str(e)}")

Successfully fetched games for 2022, conference: SEC
Successfully fetched games for 2022, conference: B1G
Successfully fetched games for 2022, conference: ACC
Successfully fetched games for 2022, conference: B12
Successfully fetched games for 2022, conference: PAC
Successfully fetched games for 2023, conference: SEC
Successfully fetched games for 2023, conference: B1G
Successfully fetched games for 2023, conference: ACC
Successfully fetched games for 2023, conference: B12
Successfully fetched games for 2023, conference: PAC
[{'attendance': 9346,
 'away_conference': 'SEC',
 'away_division': 'fbs',
 'away_id': 238,
 'away_line_scores': [14, 7, 35, 7],
 'away_points': 63,
 'away_post_win_prob': 0.9996410399011048,
 'away_postgame_elo': 1431,
 'away_pregame_elo': 1222,
 'away_team': 'Vanderbilt',
 'completed': True,
 'conference_game': False,
 'excitement_index': 2.4199370723,
 'highlights': None,
 'home_conference': 'Mountain West',
 'home_division': 'fbs',
 'home_id': 62,
 'home_line_sco

## Team Game Stats

In [73]:
# Fetch and process team game stats for the configured year range
team_stats_df = fetch_team_game_stats(start_year, end_year)

# A None result is treated as a failed fetch; handle that case first.
if team_stats_df is None:
    print("Failed to fetch or process team game stats.")
else:
    # Preview the first few rows
    print("Sample team game stats:")
    display(team_stats_df.head())

    # Persist to parquet
    team_stats_df.to_parquet("../data/team_game_stats.parquet", index=False)
    print("Improved team game stats saved to parquet file.")

Successfully fetched team game stats for 2022, conference: SEC
Successfully fetched team game stats for 2022, conference: B1G


KeyboardInterrupt: 

## Advanced Team Stats

In [75]:
## Advanced Team Stats
# Calls fetch_advanced_team_game_stats with the year range and the games
# frame `df` produced by the "Get Games" cell, then writes the result to
# parquet. The call returns the stats frame and an execution time, which is
# reported in seconds below.
# NOTE(review): a previous run's info() output showed `season` with 0 non-null
# values -- verify that the fetch helper populates that column.

max_teams = None  # Cap on teams for quick test runs; None uses all teams

try:
    advanced_df, execution_time = fetch_advanced_team_game_stats(start_year, end_year, df, max_teams)

    # Display basic information about the dataframe.
    # DataFrame.info() prints its own summary and returns None, so it must not
    # be wrapped in print() (that would emit a stray "None" line).
    print(f"Fetched {len(advanced_df)} advanced game stats in {execution_time:.2f} seconds")
    advanced_df.info()

    # Save to parquet
    output_path = "../data/advanced_team_game_stats.parquet"
    advanced_df.to_parquet(output_path, index=False)
    print(f"Advanced team game stats data saved to {output_path}")

except Exception as e:
    # Best effort: report the failure instead of aborting the notebook run.
    print(f"An error occurred while fetching advanced team game stats: {str(e)}")

Progress: 8/8 teams processed
Fetched 89 advanced team game stats in 21.75 seconds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   game_id   89 non-null     int64 
 1   season    0 non-null      object
 2   week      89 non-null     int64 
 3   team      89 non-null     object
 4   opponent  89 non-null     object
 5   offense   89 non-null     object
 6   defense   89 non-null     object
dtypes: int64(2), object(5)
memory usage: 5.0+ KB
None
Advanced team game stats data saved to ../data/advanced_team_game_stats.parquet


## Team Talent Composite

In [3]:
## Team Talent Composite
# Calls fetch_team_talent for the configured year range, previews the
# result, and writes it to parquet. A None result is reported as "no data".

try:
    talent_df = fetch_team_talent(start_year, end_year)

    if talent_df is not None:
        # Display basic information about the dataframe.
        # DataFrame.info() prints its own summary and returns None, so it must
        # not be wrapped in print() (that would emit a stray "None" line).
        print(f"Fetched team talent data from {start_year} to {end_year}")
        talent_df.info()

        # Display the first few rows
        print("\nSample team talent data:")
        display(talent_df.head())

        # Save to parquet
        output_path = "../data/team_talent.parquet"
        talent_df.to_parquet(output_path, index=False)
        print(f"Team talent data saved to {output_path}")
    else:
        print("No team talent data was fetched.")

except Exception as e:
    # Best effort: report the failure instead of aborting the notebook run.
    print(f"An error occurred while fetching team talent data: {str(e)}")

Successfully fetched team talent data for 2021
Successfully fetched team talent data for 2022
Successfully fetched team talent data for 2023
Fetched team talent data from 2021 to 2023
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    697 non-null    int64  
 1   school  697 non-null    object 
 2   talent  697 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 16.5+ KB
None

Sample team talent data:


Unnamed: 0,year,school,talent
0,2021,Alabama,1004.04
1,2021,Georgia,1001.79
2,2021,Ohio State,985.09
3,2021,Clemson,935.08
4,2021,LSU,903.72


Team talent data saved to ../data/team_talent.parquet
