In [4]:
# Autoreload so we're using most recent modules.
# `%autoreload 2` asks IPython to reload imported modules before each cell
# runs, so edits to the project's own modules take effect without a kernel
# restart. Re-running this cell prints an "already loaded" notice from
# `%load_ext`.
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd

# Add the project root to the Python path so the `from src...` import resolves.
# `__file__` is not defined inside a notebook, so the kernel's working
# directory stands in for the notebook's location.
# NOTE: this assumes the kernel was started in the notebook's own directory.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Guard the append: re-running this cell must not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import from src
from src.data.collection import (
    fetch_games,
    fetch_team_game_stats,
    fetch_advanced_team_game_stats,
    convert_to_dataframe,
    fetch_team_talent
)
from pprint import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Inputs

In [5]:
# Season range passed to every fetch call below.
start_year, end_year = 2021, 2023

## Get Games

In [6]:
## Get Games
# Fetch games for start_year..end_year, summarise them, and save them as
# parquet. `df` is reused by the Advanced Team Stats cell further down.

try:
    games = fetch_games(start_year, end_year)

    # Convert to pandas dataframe
    df = convert_to_dataframe(games)

    # Display basic information about the dataframe.
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the summary.
    print(f"Fetched {len(df)} games from {start_year} to {end_year}")
    df.info()

    # Save to parquet, creating the output directory first so a fresh checkout
    # without a data/ folder does not lose the fetched result.
    output_path = "../data/games.parquet"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_parquet(output_path, index=False)
    print(f"Games data saved to {output_path}")

except Exception as e:
    print(f"An error occurred while fetching games: {str(e)}")
    # Re-raise: a later cell needs `df`, so continuing after a failure would
    # only surface as a confusing error (or stale data) downstream.
    raise

Successfully fetched games for 2021, conference: SEC
Successfully fetched games for 2021, conference: B1G
Successfully fetched games for 2021, conference: ACC
Successfully fetched games for 2021, conference: B12
Successfully fetched games for 2021, conference: PAC
Successfully fetched games for 2022, conference: SEC
Successfully fetched games for 2022, conference: B1G
Successfully fetched games for 2022, conference: ACC
Successfully fetched games for 2022, conference: B12
Successfully fetched games for 2022, conference: PAC
Successfully fetched games for 2023, conference: SEC
Successfully fetched games for 2023, conference: B1G
Successfully fetched games for 2023, conference: ACC
Successfully fetched games for 2023, conference: B12
Successfully fetched games for 2023, conference: PAC
Fetched 1524 games from 2021 to 2023
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1524 entries, 0 to 1523
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------ 

## Team Game Stats

In [7]:
# Pull team game stats for the configured season range
team_stats_df = fetch_team_game_stats(start_year, end_year)

if team_stats_df is None:
    # The fetch helper signals failure by returning None
    print("Failed to fetch or process team game stats.")
else:
    # Preview the leading rows
    print("Sample team game stats:")
    display(team_stats_df.head())

    # Persist the result as parquet
    team_stats_df.to_parquet("../data/team_game_stats.parquet", index=False)
    print("Improved team game stats saved to parquet file.")

Successfully fetched team game stats for 2021, conference: SEC
Successfully fetched team game stats for 2021, conference: B1G
Successfully fetched team game stats for 2021, conference: ACC
Successfully fetched team game stats for 2021, conference: B12
Successfully fetched team game stats for 2021, conference: PAC
Successfully fetched team game stats for 2022, conference: SEC
Successfully fetched team game stats for 2022, conference: B1G
Successfully fetched team game stats for 2022, conference: ACC
Successfully fetched team game stats for 2022, conference: B12
Successfully fetched team game stats for 2022, conference: PAC
Successfully fetched team game stats for 2023, conference: SEC
Successfully fetched team game stats for 2023, conference: B1G
Successfully fetched team game stats for 2023, conference: ACC
Successfully fetched team game stats for 2023, conference: B12
Successfully fetched team game stats for 2023, conference: PAC
Sample team game stats:


  team_stats_df[numeric_columns] = team_stats_df[numeric_columns].apply(pd.to_numeric, errors='ignore')


Unnamed: 0,id,team_id,team_name,team_conference,home_away,points,rushingTDs,puntReturnYards,puntReturnTDs,puntReturns,...,yardsPerPass,completionAttempts,netPassingYards,totalYards,fourthDownEff,thirdDownEff,firstDowns,interceptionYards,interceptionTDs,passesIntercepted
0,401282144,245,Texas A&M,SEC,home,52,4,42.0,0.0,4.0,...,8.8,12-19,168,447,2-2,6-10,24,,,
0,401282144,2504,Prairie View,SWAC,away,3,0,14.0,0.0,1.0,...,0.6,2-12,7,154,0-2,6-16,9,16.0,0.0,1.0
1,401282055,97,Louisville,ACC,away,24,2,1.0,0.0,1.0,...,5.3,23-38,200,355,3-5,6-16,22,,,
1,401282055,145,Ole Miss,SEC,home,43,4,,,,...,11.9,22-32,381,569,3-3,2-9,31,15.0,0.0,1.0
2,401282113,99,LSU,SEC,away,17,1,-1.0,0.0,1.0,...,7.5,20-33,249,326,0-1,4-12,20,,,


Improved team game stats saved to parquet file.


## Advanced Team Stats

In [8]:
## Advanced Team Stats
# Fetch advanced per-game team stats, summarise them, and save them as
# parquet. Requires `df`, the games dataframe built in the Get Games cell.

max_teams = None  # Limit used for test runs; None means use all teams

try:
    advanced_df, execution_time = fetch_advanced_team_game_stats(start_year, end_year, df, max_teams)

    # Display basic information about the dataframe.
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the summary.
    print(f"Fetched {len(advanced_df)} advanced game stats in {execution_time:.2f} seconds")
    advanced_df.info()

    # Save to parquet, creating the output directory first so this
    # long-running fetch is not wasted on a missing data/ folder.
    output_path = "../data/advanced_team_game_stats.parquet"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    advanced_df.to_parquet(output_path, index=False)
    print(f"Advanced team game stats data saved to {output_path}")

except Exception as e:
    # Best-effort: report the failure and let the remaining cells run.
    print(f"An error occurred while fetching advanced team game stats: {str(e)}")

Progress: 10/535 teams processed
Progress: 20/535 teams processed
Progress: 30/535 teams processed
Progress: 40/535 teams processed
Progress: 50/535 teams processed
Progress: 60/535 teams processed
Progress: 70/535 teams processed
Progress: 80/535 teams processed
Progress: 90/535 teams processed
Progress: 100/535 teams processed
Progress: 110/535 teams processed
Progress: 120/535 teams processed
Progress: 130/535 teams processed
Progress: 140/535 teams processed
Progress: 150/535 teams processed
Progress: 160/535 teams processed
Progress: 170/535 teams processed
Progress: 180/535 teams processed
Progress: 190/535 teams processed
Progress: 200/535 teams processed
Progress: 210/535 teams processed
Progress: 220/535 teams processed
Progress: 230/535 teams processed
Progress: 240/535 teams processed
Progress: 250/535 teams processed
Progress: 260/535 teams processed
Progress: 270/535 teams processed
Progress: 280/535 teams processed
Progress: 290/535 teams processed
Progress: 300/535 teams

## Team Talent Composite

In [9]:
## Team Talent Composite
# Fetch team talent data for start_year..end_year, preview it, and save it
# as parquet.

try:
    talent_df = fetch_team_talent(start_year, end_year)

    if talent_df is not None:
        # Display basic information about the dataframe.
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would emit a stray "None" line.
        print(f"Fetched team talent data from {start_year} to {end_year}")
        talent_df.info()

        # Display the first few rows
        print("\nSample team talent data:")
        display(talent_df.head())

        # Save to parquet, creating the output directory first if needed
        output_path = "../data/team_talent.parquet"
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        talent_df.to_parquet(output_path, index=False)
        print(f"Team talent data saved to {output_path}")
    else:
        # The fetch helper returned None instead of a dataframe
        print("No team talent data was fetched.")

except Exception as e:
    # Best-effort: report the failure without stopping the notebook.
    print(f"An error occurred while fetching team talent data: {str(e)}")

Successfully fetched team talent data for 2021
Successfully fetched team talent data for 2022
Successfully fetched team talent data for 2023
Fetched team talent data from 2021 to 2023
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 697 entries, 0 to 696
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    697 non-null    int64  
 1   school  697 non-null    object 
 2   talent  697 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 16.5+ KB
None

Sample team talent data:


Unnamed: 0,year,school,talent
0,2021,Alabama,1004.04
1,2021,Georgia,1001.79
2,2021,Ohio State,985.09
3,2021,Clemson,935.08
4,2021,LSU,903.72


Team talent data saved to ../data/team_talent.parquet
