In [2]:
# Autoreload so we're using most recent modules
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import random

# Add the project root to the Python path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.dirname(notebook_dir)
sys.path.append(project_root)

# Import from src
from src.data.collection import (
    initialize_games_api,
    initialize_stats_api,
    initialize_teams_api,
    fetch_games,
    fetch_team_game_stats,
    get_games_df,
    get_team_game_stats_df,
    fetch_advanced_team_game_stats,
    fetch_team_talent
)
from pprint import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Inputs

In [2]:
start_year = 2001
end_year = 2024

## Games API

In [15]:
start_year = 2020
end_year = 2020

# Initialize the API
games_api = initialize_games_api()

# Fetch and store games data
print("Fetching games data...")
fetch_games(start_year, end_year, games_api)

# Fetch and store team game stats data
print("Fetching team game stats data...")
fetch_team_game_stats(start_year, end_year)

Fetching games data...
Table 'games' does not exist yet.
Successfully fetched regular season games for 2020, conference: SEC
Successfully fetched postseason games for 2020, conference: SEC
Successfully fetched regular season games for 2020, conference: B1G
Successfully fetched postseason games for 2020, conference: B1G
Successfully fetched regular season games for 2020, conference: ACC
Successfully fetched postseason games for 2020, conference: ACC
Successfully fetched regular season games for 2020, conference: B12
Successfully fetched postseason games for 2020, conference: B12
Successfully fetched regular season games for 2020, conference: PAC
Successfully fetched postseason games for 2020, conference: PAC
Created table games
Data appended in games
Appended data for year 2020
Finished fetching games data
Fetching team game stats data...
Table 'team_game_stats' does not exist yet.
Successfully fetched team game stats for 2020, conference: SEC
Successfully fetched team game stats for 20

## Process Games

## Stats API

In [17]:
## Advanced Team Stats
stats_api = initialize_stats_api()
fetch_advanced_team_game_stats(2020, 2020, stats_api)

Successfully fetched advanced team game stats for 2020 regular season
Successfully fetched advanced team game stats for 2020 postseason season
Data appended in advanced_team_game_stats
Appended advanced team game stats data for year 2020
Finished fetching advanced team game stats data


## Team Talent Composite

In [7]:
## Team Talent Composite

try:
    talent_df = fetch_team_talent(max(2015, start_year), end_year)

    if talent_df is not None:
        # Display basic information about the dataframe
        print(f"Fetched team talent data from {start_year} to {end_year}")
        print(talent_df.info())

        # Display the first few rows
        print("\nSample team talent data:")
        display(talent_df.head())

        # Save to parquet
        output_path = "../data/01_raw/team_talent.parquet"
        talent_df.to_parquet(output_path, index=False)
        print(f"Team talent data saved to {output_path}")
    else:
        print("No team talent data was fetched.")

except Exception as e:
    print(f"An error occurred while fetching team talent data: {str(e)}")

Successfully fetched team talent data for 2015
Successfully fetched team talent data for 2016
Successfully fetched team talent data for 2017
Successfully fetched team talent data for 2018
Successfully fetched team talent data for 2019
Successfully fetched team talent data for 2020
Successfully fetched team talent data for 2021
Successfully fetched team talent data for 2022
Successfully fetched team talent data for 2023
Successfully fetched team talent data for 2024
Fetched team talent data from 2001 to 2024
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2144 entries, 0 to 2143
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    2144 non-null   int64  
 1   school  2144 non-null   object 
 2   talent  2144 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 50.4+ KB
None

Sample team talent data:


Unnamed: 0,year,school,talent
0,2015,Alabama,981.9
1,2015,USC,926.71
2,2015,Ohio State,907.32
3,2015,Florida State,889.92
4,2015,LSU,889.37


Team talent data saved to ../data/team_talent.parquet
