In [21]:
import os
import sys
import numpy as np
import pandas as pd
import psycopg2 

from dotenv import load_dotenv
load_dotenv('../.env')

sys.path.append('../')

from sqlalchemy import create_engine
from db_utils import create_table, populate_table, insert_into_table

In [22]:
# Read the Postgres connection settings from the process environment.
# Every variable is mandatory: a missing one raises KeyError here.
user, password, host, database, port = (
    os.environ[var_name]
    for var_name in ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_NAME', 'DB_PORT')
)

# Connection string handed to every db_utils helper in this notebook.
URI = f'postgresql://{user}:{password}@{host}:{port}/{database}'

In [None]:
# getting the directory path
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
nfl_pbp_dir = 'data/pbp'
nfl_depth_chart_dir = 'data/depth_charts'

# loading all of the parquet files at once using the directory path
df = pd.read_parquet(f'{parent_dir}/{nfl_pbp_dir}')

# stripping backslashes out of the play descriptions to avoid csv errors.
# regex=False makes this a literal replacement: a lone backslash is not a
# valid regular expression, so regex=True raises re.error on pandas >= 2.0
# (older releases only worked because single-character patterns were
# special-cased as literals).
df['desc'] = df['desc'].str.replace('\\', '', regex=False)

# the season is the integer in front of the first underscore of game_id
df['season'] = [int(x.split('_')[0]) for x in df.game_id]

print("the shape of the pbp df is:", df.shape)
print("the memory usage of the pbp df is :", df.memory_usage(deep=True).sum() / 1024**2, "MB")

# dropping columns to reduce the memory required in the hopes of populating the db
# all at once, but it didn't work. I'll have to do it in chunks.
unneeded_cols = ['nfl_api_id', 'old_game_id', 'home_coach', 'away_coach', 'game_stadium', 'weather']

# every column whose name contains one of these substrings is dropped as well
dropped_name_parts = ('lateral', 'player_2', 'total')
pattern_cols = [x for x in df.columns if any(part in x for part in dropped_name_parts)]

# a single drop call instead of four in-place ones; a missing name in
# unneeded_cols still raises KeyError, exactly as before
df = df.drop(columns=pattern_cols + unneeded_cols)

print("the memory usage of the pbp after dropping is :", df.memory_usage(deep=True).sum() / 1024**2, "MB")

In [None]:
# Both helpers target the same table over the same connection string.
pbp_target = {'table_name': 'pbp', 'URI': URI}

# create_table receives the full pbp frame
create_table(df=df, **pbp_target)

# only the first 10,000 rows are loaded here, as a trial run of populate_table
testdf = df.head(10000)
populate_table(df=testdf, **pbp_target)

In [None]:
# using the insert function to get the pbp into the database
# quickly - estimated time with 50k chunk sizes is 2.5 minutes
# the print statements keep track of the progress and
# enable error identification

# rows [0, 10000) are skipped: the earlier test cell already loads
# df.iloc[:10000] through populate_table
rows_already_loaded = 10000
chunk_size = 50000

chunks = list(range(rows_already_loaded, len(df), chunk_size))
failed_chunks = []

for i, start in enumerate(chunks):
    is_last_chunk = i == len(chunks) - 1
    # iloc clamps the stop bound, so the final (shorter) chunk needs no
    # separate slicing branch
    stop = min(start + chunk_size, len(df))

    print(i)
    df_ = df.iloc[start:stop].copy()
    print("last chunk final index:" if is_last_chunk else "chunk final index:", df_.index[-1])
    try:
        insert_into_table(df=df_, table_name='pbp', URI=URI)
    except Exception as exc:
        # Still best-effort (one bad chunk does not abort the load), but the
        # real error is now shown. The previous bare `except:` hid it and also
        # swallowed KeyboardInterrupt, so the loop could not be interrupted.
        if is_last_chunk:
            print("error for some other reason", i)
        else:
            print("error in insert", i)
            print(df_.index[-1])
        print("   ", repr(exc))
        failed_chunks.append((start, stop))

# summary so that a failed chunk is not lost in the progress output
if failed_chunks:
    print("chunks that failed to insert (iloc start, stop):", failed_chunks)

In [None]:
# Depth charts: read the parquet directory, remove the line breaks embedded
# in depth_position, then create and fill the depth_charts table.
depth_charts = pd.read_parquet(f'{parent_dir}/{nfl_depth_chart_dir}').assign(
    depth_position=lambda frame: frame['depth_position'].str.replace("\n", '')
)

for load_step in (create_table, populate_table):
    load_step(df=depth_charts, table_name='depth_charts', URI=URI)

In [None]:
# Player stats: read the parquet directory, then create and fill the
# player_stats table from it.
player_stats = pd.read_parquet(f'{parent_dir}/data/player_stats')

for load_step in (create_table, populate_table):
    load_step(df=player_stats, table_name='player_stats', URI=URI)

In [None]:
# Nextgen data: read the parquet directory, then create and fill the
# nextgen table with the same arguments.
nextgen = pd.read_parquet(f'{parent_dir}/data/nextgen')
nextgen_load_args = {'df': nextgen, 'table_name': 'nextgen', 'URI': URI}

create_table(**nextgen_load_args)
populate_table(**nextgen_load_args)

In [None]:
# Misc data: read the parquet directory, then create and fill the misc table.
# NOTE(review): data/misc also holds stadium_locations_07_17.csv (read by the
# stadium cell) - confirm read_parquet on the directory tolerates that file.
misc = pd.read_parquet(f'{parent_dir}/data/misc')

for load_step in (create_table, populate_table):
    load_step(df=misc, table_name='misc', URI=URI)

In [None]:
# Snap counts: read the parquet directory, then create and fill the snaps
# table with the same arguments.
snaps = pd.read_parquet(f'{parent_dir}/data/snap_counts')
snaps_load_args = {'df': snaps, 'table_name': 'snaps', 'URI': URI}

create_table(**snaps_load_args)
populate_table(**snaps_load_args)

In [4]:
# Lee Sharpe's games file: downloaded as CSV from the nflverse/nfldata
# repository (master branch), ordered by game_id, then written to the
# lee_sharpe table.
lee_sharpe = 'https://raw.githubusercontent.com/nflverse/nfldata/master/data/games.csv'

ls = pd.read_csv(lee_sharpe).sort_values(by='game_id')

for load_step in (create_table, populate_table):
    load_step(df=ls, table_name='lee_sharpe', URI=URI)

connected to the db..
connected to the database..
creating the cursor..
writing the csv to file..
data inserted into table successfully!


In [None]:
# One row per game (season >= 2016) with rushing / passing yards and EPA
# summed separately for the home and the away offense, read from the pbp
# table. The `season` column it filters on is the one derived from game_id
# when the pbp frame is prepared.
# Sack columns count SACK plays while the *other* team has possession:
# home_team_sacks counts plays where posteam = away_team, and vice versa
# (presumably sacks credited to that team's defense - confirm).
game_stat_query = """
    select 
        game_id,
        home_team,
        away_team,
        sum(case when posteam = home_team and play_type_nfl = 'RUSH' then yards_gained else 0 end) as home_rush_yards,
        sum(case when posteam = home_team and play_type_nfl = 'RUSH' then epa else 0 end) as home_rush_epa,
        sum(case when posteam = home_team and play_type_nfl = 'PASS' then yards_gained else 0 end) as home_pass_yards,
        sum(case when posteam = home_team and play_type_nfl = 'PASS' then epa else 0 end) as home_pass_epa,
        sum(case when posteam = away_team and play_type_nfl = 'SACK' then 1 else 0 end) as home_team_sacks,
        sum(case when posteam = away_team and play_type_nfl = 'RUSH' then yards_gained else 0 end) as away_rush_yards,
        sum(case when posteam = away_team and play_type_nfl = 'RUSH' then epa else 0 end) as away_rush_epa,
        sum(case when posteam = away_team and play_type_nfl = 'PASS' then yards_gained else 0 end) as away_pass_yards,
        sum(case when posteam = away_team and play_type_nfl = 'PASS' then epa else 0 end) as away_pass_epa,
        sum(case when posteam = home_team and play_type_nfl = 'SACK' then 1 else 0 end) as away_team_sacks
    from pbp
    where season >=2016
    group by game_id, home_team, away_team
    order by game_id asc
"""

# The notebook never defines `engine`, so build it here from the shared
# connection string; without this the read_sql call raises NameError on a
# fresh kernel.
engine = create_engine(URI)

game_stats = pd.read_sql(game_stat_query, con=engine)

In [24]:
# Stadium locations: a single CSV under data/misc, read into a frame and
# written to the stadium_locations table.
stadium_file = os.path.join(parent_dir, 'data', 'misc', 'stadium_locations_07_17.csv')
stadium_df = pd.read_csv(stadium_file)

stadium_load_args = {'df': stadium_df, 'table_name': 'stadium_locations', 'URI': URI}
create_table(**stadium_load_args)
populate_table(**stadium_load_args)

connected to the db..
connected to the database..
creating the cursor..
writing the csv to file..
data inserted into table successfully!
