In [2]:
# Load IPython's autoreload extension so that edits to the project modules
# imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import random

# Add the project root to the Python path so the `src` package can be imported.
# A notebook has no usable __file__, so the kernel's working directory stands in
# for the notebook's own directory (assumes the kernel was started there); the
# previous abspath('__file__') string trick resolved to this same directory.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Guard the append: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import from src
from src.data.collection import (
    initialize_games_api,
    initialize_stats_api,
    initialize_teams_api,
    fetch_games,
    fetch_team_game_stats,
    fetch_advanced_team_game_stats,
    convert_to_dataframe,
    fetch_team_talent
)
from pprint import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Inputs

In [2]:
# First and last season passed to the fetch cells below.
start_year, end_year = 2001, 2024

## Get Games

In [3]:
## Get Games
# Fetch the games for the configured season range, convert them to a
# DataFrame and save that frame as the raw games parquet file.
# `df` is read again by the "Advanced Team Stats" cell further down.

games_api = initialize_games_api()

try:
    games = fetch_games(start_year, end_year, games_api)

    # Convert to pandas dataframe
    df = convert_to_dataframe(games)

    # Display basic information about the dataframe
    print(f"Fetched {len(df)} games from {start_year} to {end_year}")
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    df.info()

    # Save to parquet
    output_path = "../data/01_raw/games.parquet"
    df.to_parquet(output_path, index=False)
    print(f"Games data saved to {output_path}")

except Exception as e:
    # NOTE(review): the failure is only reported, not re-raised, so `df` may be
    # undefined (or stale from an earlier run) when later cells read it.
    print(f"An error occurred while fetching games: {str(e)}")

Successfully fetched games for 2001, conference: SEC
Successfully fetched games for 2001, conference: B1G
Successfully fetched games for 2001, conference: ACC
Successfully fetched games for 2001, conference: B12
Successfully fetched games for 2001, conference: PAC
Successfully fetched games for 2002, conference: SEC
Successfully fetched games for 2002, conference: B1G
Successfully fetched games for 2002, conference: ACC
Successfully fetched games for 2002, conference: B12
Successfully fetched games for 2002, conference: PAC
Successfully fetched games for 2003, conference: SEC
Successfully fetched games for 2003, conference: B1G
Successfully fetched games for 2003, conference: ACC
Successfully fetched games for 2003, conference: B12
Successfully fetched games for 2003, conference: PAC
Successfully fetched games for 2004, conference: SEC
Successfully fetched games for 2004, conference: B1G
Successfully fetched games for 2004, conference: ACC
Successfully fetched games for 2004, conferenc

In [None]:
from src.data.collection import fetch_games, fetch_team_game_stats, fetch_advanced_team_game_stats, fetch_team_talent
from src.data.transformations import process_team_game_stats

# Two-season trial run of the fetch + processing steps.
# The trial range lives in its own variables: reassigning start_year / end_year
# here would silently shrink the range used by every later cell when the
# notebook is run top to bottom.
sample_start_year = 2020
sample_end_year = 2021

# Fetch raw data (nothing is written to disk in this cell).
# NOTE(review): the "Get Games" cell passes the fetch_games() result through
# convert_to_dataframe() before giving it to fetch_advanced_team_game_stats();
# here the fetch_games() result is passed directly -- confirm which is expected.
games_df = fetch_games(sample_start_year, sample_end_year, games_api)
team_stats_df = fetch_team_game_stats(sample_start_year, sample_end_year)
advanced_df, execution_time = fetch_advanced_team_game_stats(sample_start_year, sample_end_year, games_df)
talent_df = fetch_team_talent(max(2015, sample_start_year), sample_end_year)

# Process data
# NOTE(review): called without arguments, so presumably it reads previously
# saved raw files and not the frames fetched above -- confirm.
processed_team_stats = process_team_game_stats()

# Display results
print(processed_team_stats.head())
# info() prints its own summary and returns None; print() around it would add
# a stray "None" line.
processed_team_stats.info()

## Team Game Stats

In [4]:
# Fetch the team game stats for the configured season range.
team_stats_df = fetch_team_game_stats(start_year, end_year)

if team_stats_df is None:
    # A None result is treated as a failed fetch: report it and save nothing.
    print("Failed to fetch or process team game stats.")
else:
    # Preview the first few rows
    print("Sample team game stats:")
    display(team_stats_df.head())

    # Persist the raw stats as a parquet file
    team_stats_df.to_parquet("../data/01_raw/team_game_stats.parquet", index=False)
    print("Improved team game stats saved to parquet file.")

Successfully fetched team game stats for 2001, conference: SEC
Successfully fetched team game stats for 2001, conference: B1G
Successfully fetched team game stats for 2001, conference: ACC
Successfully fetched team game stats for 2001, conference: B12
Successfully fetched team game stats for 2001, conference: PAC
Successfully fetched team game stats for 2002, conference: SEC
Successfully fetched team game stats for 2002, conference: B1G
Successfully fetched team game stats for 2002, conference: ACC
Successfully fetched team game stats for 2002, conference: B12
Successfully fetched team game stats for 2002, conference: PAC
Successfully fetched team game stats for 2003, conference: SEC
Successfully fetched team game stats for 2003, conference: B1G
Successfully fetched team game stats for 2003, conference: ACC
Successfully fetched team game stats for 2003, conference: B12
Successfully fetched team game stats for 2003, conference: PAC
Successfully fetched team game stats for 2004, conferen

  team_stats_df[numeric_columns] = team_stats_df[numeric_columns].apply(pd.to_numeric, errors='ignore')


Unnamed: 0,id,team_id,team_name,team_conference,home_away,points,fumblesRecovered,rushingTDs,puntReturnYards,puntReturnTDs,...,kickReturnYards,kickReturnTDs,kickReturns,totalFumbles,tacklesForLoss,defensiveTDs,tackles,sacks,qbHurries,passesDeflected
0,242692426,2426,Navy,FBS Independents,home,29,2,3.0,18.0,0.0,...,,,,,,,,,,
0,242692426,238,Vanderbilt,SEC,away,26,0,1.0,28.0,0.0,...,,,,,,,,,,
1,243250228,228,Clemson,ACC,home,29,1,3.0,21.0,0.0,...,,,,,,,,,,
1,243250228,2579,South Carolina,SEC,away,7,0,0.0,-5.0,0.0,...,,,,,,,,,,
2,242620096,84,Indiana,Big Ten,away,32,2,1.0,-1.0,0.0,...,,,,,,,,,,


Improved team game stats saved to parquet file.


## Advanced Team Stats

In [5]:
## Advanced Team Stats
# Fetch the advanced per-team game stats for the games in `df` (built by the
# "Get Games" cell) and save them as a raw parquet file.

max_teams = None  # For testing, set to None to use all teams

try:
    advanced_df, execution_time = fetch_advanced_team_game_stats(start_year, end_year, df, max_teams)

    # Display basic information about the dataframe
    print(f"Fetched {len(advanced_df)} advanced game stats in {execution_time:.2f} seconds")
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line to the output.
    advanced_df.info()

    # Save to parquet
    output_path = "../data/01_raw/advanced_team_game_stats.parquet"
    advanced_df.to_parquet(output_path, index=False)
    print(f"Advanced team game stats data saved to {output_path}")

except Exception as e:
    # The failure is reported but not re-raised, so the notebook keeps running.
    print(f"An error occurred while fetching advanced team game stats: {str(e)}")

Progress: 10/3704 teams processed
Progress: 20/3704 teams processed
Progress: 30/3704 teams processed
Progress: 40/3704 teams processed
Progress: 50/3704 teams processed
Progress: 60/3704 teams processed
Progress: 70/3704 teams processed
Progress: 80/3704 teams processed
Progress: 90/3704 teams processed
Progress: 100/3704 teams processed
Progress: 110/3704 teams processed
Progress: 120/3704 teams processed
Progress: 130/3704 teams processed
Progress: 140/3704 teams processed
Progress: 150/3704 teams processed
Progress: 160/3704 teams processed
Progress: 170/3704 teams processed
Progress: 180/3704 teams processed
Progress: 190/3704 teams processed
Progress: 200/3704 teams processed
Progress: 210/3704 teams processed
Progress: 220/3704 teams processed
Progress: 230/3704 teams processed
Progress: 240/3704 teams processed
Progress: 250/3704 teams processed
Progress: 260/3704 teams processed
Progress: 270/3704 teams processed
Progress: 280/3704 teams processed
Progress: 290/3704 teams proc

## Team Talent Composite

In [7]:
## Team Talent Composite
# Fetch the team talent data and save it as a raw parquet file.

# The request never starts before 2015, even when start_year is earlier.
# NOTE(review): presumably 2015 is the first season with talent data -- confirm.
talent_start_year = max(2015, start_year)

try:
    talent_df = fetch_team_talent(talent_start_year, end_year)

    if talent_df is not None:
        # Display basic information about the dataframe.
        # Report the year range actually requested: the message used to print
        # start_year, which overstated the range (e.g. "from 2001" for data
        # that begins in 2015).
        print(f"Fetched team talent data from {talent_start_year} to {end_year}")
        # DataFrame.info() prints its summary itself and returns None, so
        # wrapping it in print() only appended a stray "None" line.
        talent_df.info()

        # Display the first few rows
        print("\nSample team talent data:")
        display(talent_df.head())

        # Save to parquet
        output_path = "../data/01_raw/team_talent.parquet"
        talent_df.to_parquet(output_path, index=False)
        print(f"Team talent data saved to {output_path}")
    else:
        print("No team talent data was fetched.")

except Exception as e:
    # The failure is reported but not re-raised, so the notebook keeps running.
    print(f"An error occurred while fetching team talent data: {str(e)}")

Successfully fetched team talent data for 2015
Successfully fetched team talent data for 2016
Successfully fetched team talent data for 2017
Successfully fetched team talent data for 2018
Successfully fetched team talent data for 2019
Successfully fetched team talent data for 2020
Successfully fetched team talent data for 2021
Successfully fetched team talent data for 2022
Successfully fetched team talent data for 2023
Successfully fetched team talent data for 2024
Fetched team talent data from 2001 to 2024
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2144 entries, 0 to 2143
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    2144 non-null   int64  
 1   school  2144 non-null   object 
 2   talent  2144 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 50.4+ KB
None

Sample team talent data:


Unnamed: 0,year,school,talent
0,2015,Alabama,981.9
1,2015,USC,926.71
2,2015,Ohio State,907.32
3,2015,Florida State,889.92
4,2015,LSU,889.37


Team talent data saved to ../data/team_talent.parquet
