# Data Collection and Transformation

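This notebook fetches college football data (games, team game stats, advanced stats, ratings, team talent, pre-game win probabilities, recruiting, and betting lines) with the helpers in `src.data.collection`, runs the transformations in `src.data.transformations`, and then inspects the resulting `team_game_stats` table.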

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before each
# cell executes, so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import random
import sqlite3

# Add the project root to the Python path so the `src` package can be imported.
# `__file__` is not defined in a notebook kernel, so the notebook directory is taken
# to be the current working directory and the project root is its parent.
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import from src
from src.data.collection import (
    initialize_games_api,
    initialize_stats_api,
    initialize_teams_api,
    initialize_ratings_api,
    initialize_metrics_api,
    initialize_recruiting_api,
    fetch_games,
    fetch_team_game_stats,
    fetch_advanced_team_game_stats,
    fetch_team_talent,
    fetch_all_ratings,
    fetch_pregame_win_probabilities,
    fetch_team_recruiting,
    initialize_betting_api,
    fetch_betting_lines
)
from src.data.transformations import(
    main as transform_data
)
from src.data.warehouse import drop_table

from pprint import pprint

# Data Collection

## Inputs

In [2]:
# Season bounds handed to every fetch_* call in this notebook.
start_year, end_year = 2001, 2024
# Also handed to every fetch_* call.
# NOTE(review): the captured output only mentions 2024, so this flag presumably limits
# the fetch to the most recent season — confirm in src.data.collection.
use_last_season = True

## Games API

In [3]:
# Fetch team game stats for the configured seasons (the captured output reports them
# being inserted/updated in team_game_stats). Unlike the other fetch_* calls in this
# notebook, no API client is passed in here.
print("Fetching team game stats data...")
fetch_team_game_stats(start_year, end_year, use_last_season)

Fetching team game stats data...
Successfully fetched team game stats for 2024, conference: SEC, season type: regular
Successfully fetched team game stats for 2024, conference: SEC, season type: postseason
Successfully fetched team game stats for 2024, conference: B1G, season type: regular
Successfully fetched team game stats for 2024, conference: B1G, season type: postseason
Successfully fetched team game stats for 2024, conference: ACC, season type: regular
Successfully fetched team game stats for 2024, conference: ACC, season type: postseason
Successfully fetched team game stats for 2024, conference: B12, season type: regular
Successfully fetched team game stats for 2024, conference: B12, season type: postseason
Successfully fetched team game stats for 2024, conference: PAC, season type: regular
Successfully fetched team game stats for 2024, conference: PAC, season type: postseason
Inserted/Updated data in team_game_stats
Updated/Appended team game stats data for year 2024
Finished 

In [4]:
# Create the games API client used by fetch_games below.
games_api = initialize_games_api()

# Fetch games for the configured seasons; the captured output reports the `games`
# table being updated.
print("Fetching games data...")
fetch_games(start_year, end_year, games_api, use_last_season)

Fetching games data...
Successfully fetched regular season games for 2024, conference: SEC
No postseason games found for 2024, conference: SEC
Successfully fetched regular season games for 2024, conference: B1G
No postseason games found for 2024, conference: B1G
Successfully fetched regular season games for 2024, conference: ACC
No postseason games found for 2024, conference: ACC
Successfully fetched regular season games for 2024, conference: B12
No postseason games found for 2024, conference: B12
Successfully fetched regular season games for 2024, conference: PAC
No postseason games found for 2024, conference: PAC
Updated data for year 2024 in games
Updated/Appended data for year 2024
Finished fetching games data


## Stats API

In [5]:
# Advanced team game stats: create the stats API client, then fetch for the
# configured seasons.
stats_api = initialize_stats_api()
print("Fetching advanced team game stats...")
fetch_advanced_team_game_stats(start_year, end_year, stats_api, use_last_season)

Fetching advanced team game stats...
Successfully fetched advanced team game stats for 2024 regular season
Successfully fetched advanced team game stats for 2024 postseason season
Inserted/Updated data in advanced_team_game_stats
Updated/Appended advanced team game stats data for year 2024
Finished fetching advanced team game stats data


## Ratings API

In [6]:
# Ratings: create the ratings API client and fetch every rating system in one call
# (the captured output lists ELO, FPI, SP and SRS).
ratings_api = initialize_ratings_api()
print("Fetching all ratings data...")
fetch_all_ratings(start_year, end_year, ratings_api, use_last_season)

Fetching all ratings data...
Successfully fetched ELO ratings for 2024
Updated data for year 2024 in elo_ratings
Successfully stored ELO ratings data for year 2024
Successfully fetched FPI ratings for 2024
Updated data for year 2024 in fpi_ratings
Successfully stored FPI ratings data for year 2024
Successfully fetched SP ratings for 2024
Updated data for year 2024 in sp_ratings
Successfully stored SP ratings data for year 2024
Successfully fetched SRS ratings for 2024
Updated data for year 2024 in srs_ratings
Successfully stored SRS ratings data for year 2024
Finished fetching all ratings data


## Teams API

In [7]:
# Team talent composite: create the teams API client, then fetch for the
# configured seasons.
teams_api = initialize_teams_api()
print("Fetching team talent data...")
fetch_team_talent(start_year, end_year, teams_api, use_last_season)

Fetching team talent data...
Updated data for year 2024 in team_talent
Replaced team talent data for year 2024
Successfully fetched team talent data for 2024
Finished fetching team talent data


## Metrics API

In [8]:
# Pre-game win probabilities: create the metrics API client, then fetch for the
# configured seasons.
metrics_api = initialize_metrics_api()
print("Fetching pre-game win probabilities...")
fetch_pregame_win_probabilities(start_year, end_year, metrics_api, use_last_season)

Fetching pre-game win probabilities...
Successfully fetched pregame win probabilities for 2024 regular season
Successfully fetched pregame win probabilities for 2024 postseason season
Updated data for year 2024 in pregame_win_probabilities
Updated/Appended pregame win probabilities data for year 2024
Finished fetching pregame win probabilities data


## Recruiting API

In [9]:
# Team recruiting: create the recruiting API client, then fetch for the
# configured seasons.
recruiting_api = initialize_recruiting_api()
print("Fetching recruiting data...")
fetch_team_recruiting(start_year, end_year, recruiting_api, use_last_season)

Fetching recruiting data...
Appended data in team_recruiting
Replaced team recruiting data for year 2024
Successfully fetched team recruiting data for 2024
Finished fetching team recruiting data


## Betting API

In [10]:
# Betting lines: create the betting API client, then fetch for the configured seasons.
betting_api = initialize_betting_api()
print("Fetching betting lines...")
fetch_betting_lines(start_year, end_year, betting_api, use_last_season)

Fetching betting lines...
Successfully fetched betting lines for 2024 regular season
Successfully fetched betting lines for 2024 postseason season
Updated data for year 2024 in betting_lines
Updated/Appended betting lines data for year 2024
Finished fetching betting lines data


# Data Transformation

In [11]:
# Run the transformation entry point (src.data.transformations.main, imported above as
# transform_data); the captured output shows it logging each table as it is transformed.
transform_data()

Transforming table: team_game_stats
Transforming table: games
Transforming table: elo_ratings
Transforming table: fpi_ratings
Transforming table: sp_ratings
Transforming table: srs_ratings
Transforming table: team_talent
Transforming table: pregame_win_probabilities
Transforming table: team_recruiting
Transforming table: betting_lines
Transforming table: advanced_team_game_stats
Transformation complete.


In [1]:
import sqlite3
import pandas as pd

# SQLite database file; the relative path is resolved against the kernel's working directory.
DB_FILE = '../data/02_interim/college_football.db'

# Load the whole team_game_stats table for inspection. try/finally guarantees the
# connection is closed even if the query raises (for example, if the table is missing);
# a plain `with sqlite3.connect(...)` would only manage the transaction, not close it.
conn = sqlite3.connect(DB_FILE)
try:
    team_games_df = pd.read_sql_query("SELECT * FROM team_game_stats", conn)
finally:
    conn.close()

# Summarize column dtypes and non-null counts.
team_games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17686 entries, 0 to 17685
Data columns (total 41 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   17686 non-null  int64 
 1   school_id            17686 non-null  int64 
 2   school               17686 non-null  object
 3   conference           17682 non-null  object
 4   home_away            17686 non-null  object
 5   points               17686 non-null  int64 
 6   fumblesRecovered     17686 non-null  object
 7   rushingTDs           17684 non-null  object
 8   puntReturnYards      13889 non-null  object
 9   puntReturnTDs        13889 non-null  object
 10  puntReturns          13889 non-null  object
 11  passingTDs           17649 non-null  object
 12  kickingPoints        17274 non-null  object
 13  firstDowns           17686 non-null  object
 14  thirdDownEff         17686 non-null  object
 15  fourthDownEff        17686 non-null  object
 16  tota