# Data Collection and Transformation

...

In [12]:
# Autoreload so we're using most recent modules
%load_ext autoreload
%autoreload 2

import sys
import os
import pandas as pd
import random
import sqlite3

# Add the project root to the Python path
notebook_dir = os.path.dirname(os.path.abspath('__file__'))
project_root = os.path.dirname(notebook_dir)
sys.path.append(project_root)

# Import from src
from src.data.collection import (
    initialize_games_api,
    initialize_stats_api,
    initialize_teams_api,
    initialize_ratings_api,
    initialize_metrics_api,
    initialize_recruiting_api,
    fetch_games,
    fetch_team_game_stats,
    # get_games_df,
    # get_team_game_stats_df,
    fetch_advanced_team_game_stats,
    fetch_team_talent,
    get_calendar,
    fetch_all_ratings,
    fetch_pregame_win_probabilities,
    fetch_team_recruiting,
    initialize_betting_api,
    fetch_betting_lines
)
from src.data.transformations import(
    main as transform_data
)

from pprint import pprint

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data Collection

## Inputs

In [2]:
start_year = 2001
end_year = 2024

## Games API

In [4]:
# Initialize the API
games_api = initialize_games_api()

# Fetch and store games data
print("Fetching games data...")
fetch_games(start_year, end_year, games_api)

# Fetch and store team game stats data
print("Fetching team game stats data...")
fetch_team_game_stats(start_year, end_year)

# Fetch and store calendar data
print("Fetching calendar data...")
calendar_data = get_calendar(2024, games_api)

Fetching games data...
Table 'games' does not exist yet.
Successfully fetched regular season games for 2001, conference: SEC
Successfully fetched postseason games for 2001, conference: SEC
Successfully fetched regular season games for 2001, conference: B1G
Successfully fetched postseason games for 2001, conference: B1G
Successfully fetched regular season games for 2001, conference: ACC
Successfully fetched postseason games for 2001, conference: ACC
Successfully fetched regular season games for 2001, conference: B12
Successfully fetched postseason games for 2001, conference: B12
No regular season games found for 2001, conference: PAC
No postseason games found for 2001, conference: PAC
Created table games
Data appended in games
Appended data for year 2001
Successfully fetched regular season games for 2002, conference: SEC
Successfully fetched postseason games for 2002, conference: SEC
Successfully fetched regular season games for 2002, conference: B1G
Successfully fetched postseason game

## Stats API

In [5]:
## Advanced Team Stats
stats_api = initialize_stats_api()
print("Fetching advanced team game stats...")
fetch_advanced_team_game_stats(start_year, end_year, stats_api)

Fetching advanced team game stats...
Table 'advanced_team_game_stats' does not exist yet.
Successfully fetched advanced team game stats for 2001 regular season
Successfully fetched advanced team game stats for 2001 postseason season
Created table advanced_team_game_stats
Data appended in advanced_team_game_stats
Appended advanced team game stats data for year 2001
Successfully fetched advanced team game stats for 2002 regular season
Successfully fetched advanced team game stats for 2002 postseason season
Data appended in advanced_team_game_stats
Appended advanced team game stats data for year 2002
Successfully fetched advanced team game stats for 2003 regular season
Successfully fetched advanced team game stats for 2003 postseason season
Data appended in advanced_team_game_stats
Appended advanced team game stats data for year 2003
Successfully fetched advanced team game stats for 2004 regular season
Successfully fetched advanced team game stats for 2004 postseason season
Data appended 

## Ratings API

In [22]:
ratings_api = initialize_ratings_api()
print("Fetching all ratings data...")
fetch_all_ratings(start_year, end_year, ratings_api)

Fetching all ratings data...
Successfully fetched ELO ratings for 2001
Successfully fetched ELO ratings for 2002
Successfully fetched ELO ratings for 2003
Successfully fetched ELO ratings for 2004
Successfully fetched ELO ratings for 2005
Successfully fetched ELO ratings for 2006
Successfully fetched ELO ratings for 2007
Successfully fetched ELO ratings for 2008
Successfully fetched ELO ratings for 2009
Successfully fetched ELO ratings for 2010
Successfully fetched ELO ratings for 2011
Successfully fetched ELO ratings for 2012
Successfully fetched ELO ratings for 2013
Successfully fetched ELO ratings for 2014
Successfully fetched ELO ratings for 2015
Successfully fetched ELO ratings for 2016
Successfully fetched ELO ratings for 2017
Successfully fetched ELO ratings for 2018
Successfully fetched ELO ratings for 2019
Successfully fetched ELO ratings for 2020
Successfully fetched ELO ratings for 2021
Successfully fetched ELO ratings for 2022
Successfully fetched ELO ratings for 2023
Succe

## Teams API

In [7]:
## Team Talent Composite
teams_api = initialize_teams_api()
print("Fetching team talent data...")
fetch_team_talent(start_year, end_year, teams_api)

Fetching team talent data...
Table 'team_talent' does not exist yet.
Created table team_talent
Data appended in team_talent
Appended team talent data for year 2015
Successfully fetched team talent data for 2015
Data appended in team_talent
Appended team talent data for year 2016
Successfully fetched team talent data for 2016
Data appended in team_talent
Appended team talent data for year 2017
Successfully fetched team talent data for 2017
Data appended in team_talent
Appended team talent data for year 2018
Successfully fetched team talent data for 2018
Data appended in team_talent
Appended team talent data for year 2019
Successfully fetched team talent data for 2019
Data appended in team_talent
Appended team talent data for year 2020
Successfully fetched team talent data for 2020
Data appended in team_talent
Appended team talent data for year 2021
Successfully fetched team talent data for 2021
Data appended in team_talent
Appended team talent data for year 2022
Successfully fetched tea

## Metrics API

In [8]:
# Pre-game Win Probabilities
metrics_api = initialize_metrics_api()
print("Fetching pre-game win probabilities...")
fetch_pregame_win_probabilities(start_year, end_year, metrics_api)

Fetching pre-game win probabilities...
Table 'pregame_win_probabilities' does not exist yet.
Successfully fetched pregame win probabilities for 2001 regular season
Successfully fetched pregame win probabilities for 2001 postseason season
Successfully fetched pregame win probabilities for 2002 regular season
Successfully fetched pregame win probabilities for 2002 postseason season
Successfully fetched pregame win probabilities for 2003 regular season
Successfully fetched pregame win probabilities for 2003 postseason season
Successfully fetched pregame win probabilities for 2004 regular season
Successfully fetched pregame win probabilities for 2004 postseason season
Successfully fetched pregame win probabilities for 2005 regular season
Successfully fetched pregame win probabilities for 2005 postseason season
Successfully fetched pregame win probabilities for 2006 regular season
Successfully fetched pregame win probabilities for 2006 postseason season
Successfully fetched pregame win prob

## Recruiting API

In [9]:
recruiting_api = initialize_recruiting_api()
print("Fetching recruiting data...")
fetch_team_recruiting(start_year, end_year, recruiting_api)

Fetching recruiting data...
Table 'team_recruiting' does not exist yet.
Created table team_recruiting
Data appended in team_recruiting
Appended team recruiting data for year 2001
Successfully fetched team recruiting data for 2001
Data appended in team_recruiting
Appended team recruiting data for year 2002
Successfully fetched team recruiting data for 2002
Data appended in team_recruiting
Appended team recruiting data for year 2003
Successfully fetched team recruiting data for 2003
Data appended in team_recruiting
Appended team recruiting data for year 2004
Successfully fetched team recruiting data for 2004
Data appended in team_recruiting
Appended team recruiting data for year 2005
Successfully fetched team recruiting data for 2005
Data appended in team_recruiting
Appended team recruiting data for year 2006
Successfully fetched team recruiting data for 2006
Data appended in team_recruiting
Appended team recruiting data for year 2007
Successfully fetched team recruiting data for 2007
Da

## Betting API

In [10]:
betting_api = initialize_betting_api()
print("Fetching betting lines...")
fetch_betting_lines(start_year, end_year, betting_api)

Fetching betting lines...
Table 'betting_lines' does not exist yet.
Successfully fetched betting lines for 2013 regular season
Successfully fetched betting lines for 2013 postseason season
Created table betting_lines
Data appended in betting_lines
Appended betting lines data for year 2013
Successfully fetched betting lines for 2014 regular season
Successfully fetched betting lines for 2014 postseason season
Data appended in betting_lines
Appended betting lines data for year 2014
Successfully fetched betting lines for 2015 regular season
Successfully fetched betting lines for 2015 postseason season
Data appended in betting_lines
Appended betting lines data for year 2015
Successfully fetched betting lines for 2016 regular season
Successfully fetched betting lines for 2016 postseason season
Data appended in betting_lines
Appended betting lines data for year 2016
Successfully fetched betting lines for 2017 regular season
Successfully fetched betting lines for 2017 postseason season
Data ap

# Data Transformation

In [25]:
transform_data()

Transforming table: games
Transforming table: team_game_stats
Transforming table: calendar
Transforming table: advanced_team_game_stats
Transforming table: team_talent
Transforming table: pregame_win_probabilities
Transforming table: team_recruiting
Transforming table: betting_lines
Transforming table: elo_ratings
Transforming table: fpi_ratings
Transforming table: sp_ratings
Transforming table: srs_ratings
Transformation complete.


In [13]:
new_db_path = '../data/02_interim/college_football.db'
conn = sqlite3.connect(new_db_path)

# Get table names
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
table_names = [table[0] for table in cursor.fetchall()]

print("Tables in the new database:")
for table in table_names:
    print(table)

# Example: Load a table and display its structure
example_table = table_names[0]
df = pd.read_sql(f"SELECT * FROM {example_table} LIMIT 5", conn)
print(f"\nStructure of {example_table}:")
print(df.info())

conn.close()

Tables in the new database:


IndexError: list index out of range