In [11]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import psycopg2 
import io

from sqlalchemy import create_engine


In [9]:
# gettin the directory path
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
nfl_pbp_dir = 'data/pbp'

# loading all of the parquets files at once using the directory path
df = pd.read_parquet(f'{parent_dir}/{nfl_pbp_dir}')

print(df.shape)

(1148717, 372)
   play_id          game_id old_game_id home_team away_team season_type  week  \
0     35.0  1999_01_ARI_PHI  1999091200       PHI       ARI         REG     1   
1     60.0  1999_01_ARI_PHI  1999091200       PHI       ARI         REG     1   
2     82.0  1999_01_ARI_PHI  1999091200       PHI       ARI         REG     1   
3    103.0  1999_01_ARI_PHI  1999091200       PHI       ARI         REG     1   
4    126.0  1999_01_ARI_PHI  1999091200       PHI       ARI         REG     1   

  posteam posteam_type defteam  ... out_of_bounds  home_opening_kickoff  \
0     PHI         home     ARI  ...           0.0                   1.0   
1     PHI         home     ARI  ...           0.0                   1.0   
2     PHI         home     ARI  ...           0.0                   1.0   
3     PHI         home     ARI  ...           0.0                   1.0   
4     PHI         home     ARI  ...           0.0                   1.0   

     qb_epa  xyac_epa  xyac_mean_yardage  xyac_

In [10]:
lee_sharpe = 'https://raw.githubusercontent.com/nflverse/nfldata/master/data/games.csv'

ls_cols = ['game_id', 'overtime', 'home_rest', 'away_rest', 'div_game','roof', 
           'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'referee']

ls = pd.read_csv(lee_sharpe, usecols=ls_cols)
ls = ls.sort_values(by='game_id')

In [None]:
df['year'] = pd.to_datetime(df['game_date']).dt.year
df['two_point_conv_result'] = (
    df['two_point_conv_result']
    .map(
        {'success' : 1,
         'failure' : 0,
         }
    )
    .fillna('None')
)

df['game_date'] = pd.to_datetime(df['game_date'])
df['spread_line'] = df['spread_line'] * -1
df['field_goal_result'] = np.where(df['field_goal_result'] == 'made', 1, 0)
df['time_between'] = df.groupby(['game_id'])['game_seconds_remaining'].transform(lambda x: x.sub(x.shift(-1)).fillna(0))
df['play_type'] = np.where(df['two_point_attempt'] > 0.5, 'two_point_att', df['play_type'])
df['air_yards_to_sticks'] = df['air_yards'].sub(df['ydstogo'])
df['season'] = [int(x.split('_')[0]) for x in df.game_id]
df['blocked_player_name'] = np.where(df['blocked_player_name'].notnull(), 1, 0)
df['fg_0_39'] = np.where(((df['play_type'] == 'field_goal') & (df['kick_distance'].between(0,39))), 1, 0)
df['fg_40_49'] = np.where(((df['play_type'] == 'field_goal') & (df['kick_distance'].between(40,49))), 1, 0)
df['fg_50_on'] = np.where(((df['play_type'] == 'field_goal') & (df['kick_distance'].between(50,100))), 1, 0)
df['extra_point_result'] = np.where(df['extra_point_result'] == 'good', 1, 0)

In [None]:
success = []

for a, b, c in zip(df['down'], df['ydstogo'], df['yards_gained']):
  frac = 0.4 * b
  
  if b == 0.0:
    success.append(np.nan)
  
  elif ((a==1) | (a==2)):
    if c >= frac:
      success.append(1)
    else:
      success.append(0)
  elif ((a==3) | (a==4)):
    if c >= b:
      success.append(1)
    else:
      success.append(0)

df['success'] = success

In [None]:
# getting the receiving box score basics

def get_receiving(df=df): 
  
  rec_df = (
      df.loc[df['qb_dropback'] == 1]
      .groupby(['game_id', 'receiver_player_id','posteam'])
      .agg({
          'receiver_player_name' : lambda x: x.unique()[0],
          'pass_attempt' : 'count',
          'complete_pass' : 'sum',
          'air_yards' : 'sum',
          'yards_after_catch' : 'sum',
          'yards_gained' : 'sum',
          'touchdown' : 'sum'
      })
      .sort_index()
      .reset_index()
      )

  rec_df.rename(
      columns={
      'receiver_player_id' : 'player_id',
      'receiver_player_name' : 'player',
      'pass_attempt' : 'targets',
      'complete_pass' : 'rec',
      'posteam' : 'team',
      'yards_after_catch' : 'yac',
      'yards_gained' : 'rec_yards',
      'touchdown' : 'td'},
      inplace=True)

  rec_df['team_targets'] = rec_df.groupby(['game_id', 'team'])['targets'].transform('sum')
  rec_df['target_share'] = rec_df['targets'].div(rec_df['team_targets']).round(3) * 100
  rec_df['aDOT'] = rec_df['air_yards'].div(rec_df['targets']).fillna(np.nan)
  rec_df['yrds_per_rec'] = rec_df['rec_yards'].div(rec_df['rec']).round(1)

  rec_df['team_air_yards'] = rec_df.groupby(['game_id', 'team'])['air_yards'].transform('sum')
  rec_df['air_yards_share'] = rec_df['air_yards'].div(rec_df['team_air_yards']).round(3) * 100
  rec_df['yac_per_rec'] = rec_df['yac'].div(rec_df['rec']).round(1)

  rec_df.drop(columns='team_targets', inplace=True)

  return rec_df

In [None]:
# creating the core table of quarterback results

def get_qb_pass(df=df):
  qb_df = (
      df.loc[((df['pass_attempt'] == 1) & (~df['play_type'].isin(['two_point_att']) & (df['sack'] ==0)))]
      .groupby(['game_id', 'passer_player_name', 'posteam'], as_index=False)
      .agg({
          'passer_player_id' : lambda x: x.unique()[0],
          'season_type' : lambda x: x.unique()[0],
          'pass_attempt' : 'sum',
          'complete_pass' : 'sum',
          'yards_gained' : 'sum',
          'air_yards' : 'sum',
          'yards_after_catch' : 'sum',
          'air_yards_to_sticks' : 'sum',
          'interception' : 'sum',
          'total_line' : 'max',
          'home_team' : lambda x: x.unique()[0],
          'away_team' : lambda x: x.unique()[0],
          'success' : 'sum',
          'temp' : 'max',
          'wind' : 'max',
          'epa' : 'sum',
          'cpoe' : 'sum'
      })
  )

  qb_df.rename(
      columns={
          'passer_player_name' : 'player',
          'passer_player_id' : 'player_id',
          'pass_attempt' : 'att',
          'complete_pass' : 'com',
          'yards_gained' : 'pass_yards',
          'posteam' : 'team',
          'air_yards_to_sticks' : 'AYTS'},
      inplace=True)
  
  sacks = (
    df.groupby(['game_id', 'posteam', 'passer_player_name'], as_index=False)['sack']
    .sum()
    .rename(columns={'passer_player_name' : 'player',
                     'posteam' : 'team'})
  )

  
  qb_df = qb_df.merge(sacks, how='left', on=['game_id', 'team', 'player'])
  qb_df['att'] = qb_df['att'].add(qb_df['sack'])
  qb_df['comp_perc'] = qb_df['com'].div(qb_df['att']).round(3) * 100


  qb_td = (
      df.loc[((df['pass_attempt'] == 1) & (~df['play_type'].isin(['two_point_att']) & (df['sack'] ==0) & (df['interception'] == 0)))]
      .groupby(['game_id', 'passer_player_id'], as_index=False)['touchdown']
      .sum()
      .rename(columns={'passer_player_id' : 'player_id'})
      )

  qb_df = qb_df.merge(qb_td, how='left', on=['game_id', 'player_id']) 

  ay_complete = (
      df[df['complete_pass'] == 1]
      .groupby(['game_id', 'posteam', 'passer_player_name'], as_index=False)['air_yards']
      .sum()
      .rename(columns={
          'air_yards' : 'ay_completions',
          'passer_player_name' : 'player',
          'posteam' : 'team'})
  )


  ay_incomplete = (
      df[((df['complete_pass'] == 0) & (df['play_type'] != 'two_point_att'))]
      .groupby(['game_id', 'posteam', 'passer_player_name', 'complete_pass'], as_index=False)['air_yards']
      .sum()
      .rename(columns={
          'passer_player_name' : 'player',
          'air_yards' : 'ay_incompletions',
          'posteam' : 'team'
      })
  )

  qb_df = qb_df.merge(ay_complete, how='left', on=['game_id', 'team', 'player'])
  qb_df['avg_ay_comp'] = qb_df['ay_completions'].div(qb_df['com']).round(1)
  qb_df = qb_df.merge(ay_incomplete, how='left', on=['game_id', 'team', 'player'])
  qb_df['avg_ay_incomp'] = qb_df['ay_incompletions'].div(qb_df['att'] - qb_df['com']).round(1)
  qb_df['att'] = qb_df['att'].sub(qb_df['sack'])
  qb_df['cpoe'] = qb_df['cpoe'].div(100)
  qb_df['epa_per_dropback'] = qb_df['epa'].div(qb_df['att'] + qb_df['sack']).round(3)

  # fixing qb names

  qb_df['player'].replace('Ty.Taylor', 'T.Taylor', inplace=True)
  qb_df['player'].replace('Aa.Rodgers', 'A.Rodgers', inplace=True)
  qb_df['player'].replace('Alex Smith', 'A.Smith', inplace=True)
  qb_df['player'].replace('Jos.Smith', 'J.Smith', inplace=True)

  qb_dict = {x:x.split('.') for x in qb_df['player'].unique()}

  new_values = []

  for values in qb_dict.values():
    new_last = values[1].lstrip()
    new_values.append('.'.join([values[0], new_last]))

  new_qb_dict = {k:v for k, v in zip(qb_dict.keys(), new_values)}
  qb_df['player'] = qb_df['player'].map(new_qb_dict).fillna(qb_df['player'])

  return qb_df

In [None]:
# creating the core table of rushing results

def get_rushing(df=df):
    
  run_df = (
      df[df['rush_attempt'] == 1]
      .groupby(['game_id', 'rusher_player_id', 'posteam'])
      .agg({
          'rush_attempt' : 'sum',
          'rusher_player_name' : lambda x: x.unique()[0],
          'yards_gained' : 'sum',
          'success' : 'sum',
          'touchdown' : 'sum',
          'total_line' : 'mean',
          'epa' : 'sum',
          'fumble_lost' : 'sum',
          'success' : 'sum',
          'home_team' : lambda x : x.unique()[0],
          'away_team' : lambda x : x.unique()[0]
      })
      .sort_index()
      .reset_index()
  )

  run_df.rename(
      columns={
          'rusher_player_id' : 'player_id',
          'rusher_player_name' : 'player',
          'rush_attempt' : 'rush_att',
          'posteam' : 'team',
          'fumble_lost' : 'fumbles',
          'touchdown' : 'rush_td',
          'yards_gained' : 'rush_yds',
          'epa' : 'rush_epa'}, inplace=True)
  
  run_df['rush_yds_per_att'] = run_df['rush_yds'].div(run_df['rush_att']).round(1)
  run_df['success_perc'] = run_df['success'].div(run_df['rush_att']).round(3)
  run_df['team_rush_atts'] = run_df.groupby(['game_id', 'team'])['rush_att'].transform('sum')
  run_df['rush_att_share'] = run_df['rush_att'].div(run_df['team_rush_atts']).round(2)
  
  return run_df

In [None]:
def get_opp_pass(df=df):

  opp_pass = (
      df[df['pass_attempt'] == 1]
      .groupby(['game_id', 'defteam'], as_index=False)['yards_gained']
      .sum()
      .rename(columns={'defteam' : 'team',
                      'yards_gained' : 'opp_pass_yds'})
  )

  return opp_pass

In [None]:
def get_opp_rush(df=df):

  opp_rush = (
      df[df['rush_attempt'] == 1]
      .groupby(['game_id', 'defteam'], as_index=False)['yards_gained']
      .sum()
      .rename(columns={'defteam' : 'team',
                      'yards_gained' : 'opp_rush_yds'})
  )

  return opp_rush

In [None]:
def get_def_stats(df=df):

  def_cols = ['interception', 'season', 'return_touchdown', 'fumble', 
              'sack', 'epa']

  def_stats = (
      df[~df['desc'].str.contains('Aborted')].copy()
      .groupby(['game_id', 'defteam'], as_index=False)
      .agg({
          'interception' : 'sum',
          'season' : lambda x: x.unique()[0],
          'return_touchdown' : 'sum',
          'fumble_lost' : 'sum',
          'sack' : 'sum',
          'safety' : 'sum',
          'blocked_player_name' : 'sum'
      })
      .rename(columns={
          'defteam' : 'team',
          'interception' : 'def_int',
          'return_touchdown' : 'def_td',
          'sack' : 'def_sack',
          'fumble' : 'def_fumble',
          'blocked_player_name' : 'kick_blocked'})
  )
  
  return def_stats

In [None]:
def get_kicker_stats(df=df):

  kicks = ['field_goal', 'extra_point']

  df = df[df['desc'].str.contains('GOOD')].copy()

  kick_df = (
      
      df[df['play_type'].isin(kicks)]
      .groupby(['game_id', 'kicker_player_id'], as_index=False)
      .agg({
          'kicker_player_name' : lambda x: x.unique()[0],
          'posteam' : lambda x: x.unique()[0],
          'field_goal_result' : 'sum',
          'extra_point_result' : 'sum',
          'fg_0_39' : 'sum',
          'fg_40_49' : 'sum',
          'fg_50_on' : 'sum'

      })
      .rename(columns={
          'kicker_player_name' : 'player',
          'field_goal_result' : 'fgs',
          'extra_point_result' : 'pats',
          'posteam' : 'team'
          })
  )

  return kick_df

In [None]:
kicker_df = get_kicker_stats()

In [None]:
def get_team_adjusted_epa(df=df):

  def_epa_cols = ['game_id', 'season', 'posteam', 'defteam', 'epa', 'play_type']

  season_epa_def = (
      df[df['play_type'].isin(['pass', 'run'])][def_epa_cols]
  )

  season_epa_def['season_epa_play'] = season_epa_def.groupby(['season', 'play_type'])['epa'].transform(lambda x: x.shift().expanding().mean())
  season_epa_def['season_epa_def'] = season_epa_def.groupby(['season', 'defteam', 'play_type'])['epa'].transform(lambda x: x.shift().expanding().mean())
  season_epa_def['season_epa_off'] = season_epa_def.groupby(['season', 'posteam', 'play_type'])['epa'].transform(lambda x: x.shift().expanding().mean())

  season_epa_def['team_adjusted_off_epa'] = season_epa_def['season_epa_off'].sub(season_epa_def['season_epa_play']).round(3)
  season_epa_def['team_adjusted_def_epa'] = season_epa_def['season_epa_def'].sub(season_epa_def['season_epa_play']).round(3)
  

  return season_epa_def

In [None]:
def get_team_pass_yds(df=df):

  team_pass_yds = (
      df.loc[df['pass_attempt'] == 1]
      .groupby(['game_id', 'posteam'], as_index=False)['yards_gained']
      .sum()
      .rename(columns={'posteam' : 'team',
                      'yards_gained' : 'pass_yards'})
  )

  return team_pass_yds

In [None]:
def get_team_rush_yds(df=df):

  team_rush_yds = (
      df.loc[df['rush_attempt'] == 1]
      .groupby(['game_id', 'posteam'], as_index=False)['yards_gained']
      .sum()
      .rename(columns={'posteam' : 'team',
                      'yards_gained' : 'rush_yards'})
  )

  return team_rush_yds

In [None]:
def get_team_scores(df=df):

  condition = df['td_team'] == df['posteam']

  team_scores = (
      df[condition]
      .groupby(['game_id', 'posteam'], as_index=False)[['touchdown', 'field_goal_result', 'two_point_conv_result']]
      .sum()
      .rename(columns={
          'touchdown' : 'off_td',
          'posteam' : 'team',
          'two_point_conv_result' : 'two_pts_conv',
          'field_goal_result' : 'fgs'
      })
      .fillna(0)
  )

  return team_scores

In [None]:
# creating the core table of game results

def get_game_results(df=df):

  game_results_cols = ['year', 'week', 'season_type', 'home_team', 'away_team',
                       'home_score', 'away_score', 'spread_line', 'total_line']
  game_results = (
      df
      .groupby(['game_id', 'posteam'], as_index=False)[game_results_cols]
      .max()
  )

  game_results['home'] = (game_results['posteam'] == game_results['home_team']).astype(int)
  game_results['spread_line'] = [x if y == 1 else (x * -1) for x, y in zip(game_results['spread_line'], game_results['home'])]
  game_results['actual_spread'] = (game_results['home_score'] - game_results['away_score']) * -1
  game_results['points'] = np.where(game_results['home'] == 1, game_results['home_score'], game_results['away_score'])
  game_results['opp_points'] = np.where(game_results['home'] == 0, game_results['home_score'], game_results['away_score'])

  game_results.rename(columns={'posteam' : 'team'}, inplace=True)
  # game_results.drop(columns={'home_score', 'away_score', 'home_team', 'away_team'}, inplace=True)

  game_results = game_results.merge(team_rush_yds, how='left', on=['game_id', 'team'])
  game_results = game_results.merge(team_pass_yds, how='left', on=['game_id', 'team'])
  game_results = game_results.merge(opp_rush, how='left', on=['game_id', 'team'])
  game_results = game_results.merge(opp_pass, how='left', on=['game_id', 'team'])
  game_results = game_results.merge(team_scores, how='left', on=['game_id', 'team'])
  game_results['actual_total'] = game_results['points'].add(game_results['opp_points'])
  game_results['over'] = (game_results['actual_total'] > game_results['total_line']).astype(int)

  game_results = game_results.merge(ls, how='left', on=['game_id'])
  game_results['rest'] = np.where(game_results['home'] == 1, game_results['home_rest'], game_results['away_rest'])
  game_results['opp_rest'] = np.where(game_results['home'] == 1, game_results['away_rest'], game_results['home_rest'])
  game_results['coach'] = np.where(game_results['home'] == 1, game_results['home_coach'], game_results['away_coach'])
  game_results['opp_coach'] = np.where(game_results['home'] == 1, game_results['away_coach'], game_results['home_coach'])
  # game_results.drop(columns=['home_rest', 'away_rest', 'home_coach', 'away_coach'], inplace=True)

  game_results['season'] = [int(x.split('_')[0]) for x in game_results.game_id]
  game_results['season'] = game_results['season'].astype('category')
  game_results = game_results[game_results['team'] != ''].copy()

  return game_results

In [None]:
def create_ref_dict():

  names = games_df['referee'].unique()
  ref_dict = {}

  for i in range(len(names)-1):
    sim = process.extractOne(names[i], names[i+1:])
    
    if sim[1] > 85:
      ref_dict[names[i]] = sim[0]

  # del ref_dict['Jim Sprenger']
  ref_rev_dict = {y : x  for x, y in ref_dict.items()}

  return ref_rev_dict

In [None]:
def get_drive_stats(df=df):

  drive_cols = ['time_between', 'score_differential', 'score_differential_post', 'rush_attempt', 
                'pass_attempt', 'yards_gained', 'interception', 'fumble', 'sack', 'epa',
                'success']


  drive_details = (
      df
      .groupby(['game_id', 'posteam', 'drive'], as_index=False)[drive_cols]
      .agg({
          'time_between' : 'sum',
          'score_differential' : 'first',
          'score_differential_post' : 'last',
          'rush_attempt' : 'sum',
          'pass_attempt' : 'sum',
          'yards_gained' : 'sum',
          'interception' : 'sum',
          'fumble' : 'sum',
          'sack' : 'sum',
          'success' : 'sum',
          'epa' : 'sum'
      }
      )
      .rename(
          columns=
          {'time_between' : 'poss_time',
          'score_differential' : 'score_diff_start',
          'score_differential_post' : 'score_diff_end',
          'interception' : 'int',
          'fumble' : 'fumble',
          'posteam' : 'team'}
      )
  )

  drive_details['score_gain'] = drive_details['score_diff_end'].sub(drive_details['score_diff_start'])
  drive_details['td'] = (drive_details['score_gain'] >=6).astype(int)
  drive_details['fg'] = (drive_details['score_gain'] == 3).astype(int)
  drive_details['total_plays'] = drive_details['rush_attempt'].add(drive_details['pass_attempt']).astype(int)
  drive_details.drop(columns=['score_gain'], inplace=True)

  return drive_details

In [None]:
def get_rolling_qb_epa(df=df):

  qb_epa_dropback = df[df['qb_dropback'] == 1][['game_id', 'passer_player_name', 'epa', 'wp']].copy()
  qb_epa_db = qb_epa_dropback[qb_epa_dropback['passer_player_name'].notnull()].copy()
  qb_epa_db.replace('Jos.Allen', 'J.Allen', inplace=True)
  qb_epa_db['1000_rolling_mean'] = qb_epa_db.groupby('passer_player_name')['epa'].transform(lambda x: x.shift().rolling(1000, min_periods=200).mean())
  qb_epa_db['exp_career_mean'] = qb_epa_db.groupby('passer_player_name')['epa'].transform(lambda x: x.shift().expanding().mean())

  return qb_epa_db.reset_index(drop=True)

In [None]:
# getting red zone statistics

rz_df = df[df['yardline_100'] <= 20].copy()

rz_rushing = get_rushing(df=rz_df)
rz_receiving = get_receiving(df=rz_df)
rz_qbs = get_qb_pass(df=rz_df)

rz_qbs = (
    rz_qbs
    .merge(rz_rushing.drop(columns='success'), how='left', on=['game_id', 'player', 'player_id', 'team', 'total_line', 'home_team', 'away_team'])
    .fillna(0)
)

In [None]:
# creating tables for rushing, passing, and receiving. merging the rushing
# and passing dataframes because sometimes quarterbacks run!

rush_df = get_rushing()
rec_df = get_receiving()
qbs = get_qb_pass()

qb_df = qbs.merge(rush_df.drop(columns='success'), how='left', on=['game_id', 'player', 'player_id', 'team', 'total_line', 'home_team', 'away_team'])
qb_df.fillna(0, inplace=True)
qb_df['total_epa'] = qb_df['rush_epa'].add(qb_df['epa'])

print(qb_df.shape)
print(rec_df.shape)
print(rush_df.shape)

(7686, 39)
(51751, 16)
(26009, 17)


In [None]:
team_pass_yds = get_team_pass_yds()
team_rush_yds = get_team_rush_yds()

opp_rush = get_opp_rush()
opp_pass = get_opp_pass()

def_stats = get_def_stats()
team_scores = get_team_scores()

  .sum()


In [None]:
games_df = get_game_results()

In [None]:
ref_rev_dict = create_ref_dict()
games_df['referee'] = games_df['referee'].map(ref_rev_dict).fillna(games_df['referee'])

NameError: name 'process' is not defined

In [None]:
drive_stats = get_drive_stats()
drive_summary = drive_stats.groupby(['game_id', 'team'], as_index=False).agg({'poss_time' : 'sum',
                                                                              'rush_attempt' : 'sum',
                                                                              'pass_attempt' : 'sum',
                                                                              'total_plays' : 'sum'})

In [None]:
drive_stats.groupby(['team', 'score_diff_start']).apply(lambda x: x['pass_attempt'].sum() / x['total_plays'].sum()).head(60)

  """Entry point for launching an IPython kernel.


team  score_diff_start
ARI   -58.0               1.000000
      -51.0               0.666667
      -48.0               0.500000
      -45.0               0.500000
      -38.0                    NaN
      -35.0               0.727273
      -34.0               0.760000
      -33.0               0.809524
      -32.0               0.514286
      -31.0               0.960000
      -30.0               0.900000
      -29.0               0.916667
      -28.0               0.558140
      -27.0               0.759259
      -26.0               0.722222
      -25.0               0.767857
      -24.0               0.800000
      -23.0               0.764706
      -22.0               0.800000
      -21.0               0.798742
      -20.0               0.692308
      -19.0               0.816327
      -18.0               0.862319
      -17.0               0.670782
      -16.0               0.716814
      -15.0               0.815789
      -14.0               0.698361
      -13.0               0.7388

In [None]:
games_df = games_df.merge(drive_summary, how='left', on=['game_id', 'team'])

In [None]:
games_df['pass_per_attempt'] = games_df['pass_yards'].div(games_df['pass_attempt']).round(1)
games_df['perc_pass'] = games_df['pass_attempt'].div(games_df['total_plays']).round(3)
games_df['sec_per_play'] = games_df['poss_time'].div(games_df['total_plays']).round(1)

In [None]:
def get_starting_qb():

  throwers = df.groupby('passer_player_name')['pass_attempt'].sum().reset_index()
  qbs = [qb for qb in throwers[throwers['pass_attempt'] > 18]['passer_player_name'].unique()]

  starting_qb = (
      df[df['passer_player_name'].isin(qbs)]
      .groupby(['game_id', 'posteam'], as_index=False)['passer_player_name']
      .first()
      .rename(columns={
          'posteam' : 'team',
          'passer_player_name' : 'starting_qb'}
          )
  )

  return starting_qb

starting_qb = get_starting_qb()
games_df = games_df.merge(starting_qb, how='left', on=['game_id', 'team'])

def get_home_away_qb():

  home_away_qbs = (
      games_df
      .groupby(['game_id', 'home'])['starting_qb']
      .apply(lambda x: x.unique()[0])
      .reset_index()
      .set_index(['game_id', 'home'])
      .unstack()

  )
  
  home_away_qbs.columns = ['away_start_qb', 'home_start_qb']
  home_away_qbs = home_away_qbs.reset_index()
  
  return home_away_qbs


home_away_qbs = get_home_away_qb()
games_df = games_df.merge(home_away_qbs, how='left', on=['game_id'])

In [None]:
adj_epa = get_team_adjusted_epa()

off_epa_game = (
    adj_epa
    .groupby(['game_id', 'posteam', 'play_type'], as_index=False)['epa']
    .agg(['sum'])
    .reset_index()
    .pivot_table(index=['game_id', 'posteam'],
                 columns='play_type',
                 values='sum')
    .reset_index()
    .rename(columns={
        'pass' : 'off_pass_epa',
        'run' : 'off_run_epa',
        'posteam' : 'team'
    })
    .round(2)
)

def_epa_game = (
    adj_epa
    .groupby(['game_id', 'defteam', 'play_type'], as_index=False)['epa']
    .agg(['sum'])
    .reset_index()
    .pivot_table(index=['game_id', 'defteam'],
                 columns='play_type',
                 values='sum')
    .reset_index()
    .rename(columns={
        'pass' : 'def_pass_epa',
        'run' : 'def_run_epa',
        'defteam' : 'team'
    })
    .round(2)
)

In [None]:
games_df = games_df.merge(off_epa_game, how='left', on=['game_id', 'team'])
games_df = games_df.merge(def_epa_game, how='left', on=['game_id', 'team'])

In [None]:
def_stats = get_def_stats()

games_df = games_df.merge(def_stats, how='left', on=['game_id', 'team', 'season'])

In [None]:
rolling_qb = get_rolling_qb_epa()

In [None]:


def populate_table(df, table_name, URI=URI):

    engine = create_engine(URI)
    print('connected to the database..')
    
    df.head(0).to_sql(table_name, engine, if_exists='replace',index=False) 

    conn = engine.raw_connection()
    cur = conn.cursor()
    print('creating the cursor..')

    output = io.StringIO()
    
    print('writing the csv to file..')
    
    df.to_csv(output, sep='\t', header=False, index=False)
    output.seek(0)
    
    
    contents = output.getvalue()
    
    cur.copy_from(output, table_name, null="") # null values become ''
    
    cur.close()
    conn.commit()
    conn.close()

In [None]:
def create_table(df, table_name, URI=URI):

  engine = create_engine(URI)
  print('connected to the db..')

  df.head(0).to_sql(table_name, engine, if_exists='replace', index=False)

In [None]:
create_table(df, 'pbp', URI=URI)

connected to the db..


In [None]:
def insert_into_table(df, table_name, URI=URI):

  engine = create_engine(URI)

  conn = engine.raw_connection()
  cur = conn.cursor()

  output = io.StringIO()

  df.to_csv(output, sep='\t', header=False, index=False)
  output.seek(0)

  contents = output.getvalue()

  cur.copy_from(output, table_name, null="")

  conn.commit()
  cur.close()
  conn.close()

In [None]:
date_threshold = '2021-10-16'
pbp_df = df[df['game_date'] >= date_threshold].copy()

for chunk in np.array_split(pbp_df, 3):
  df_ = pd.DataFrame(chunk)

  insert_into_table(df_, 'pbp', URI)

In [None]:
def drop_table(table_name, URI):
  
  psyco_conn = psycopg2.connect(URI)
  cursor = psyco_conn.cursor()
  psyco_conn.autocommit = True

  cursor.execute("""DROP TABLE %s;"""%table_name)
  cursor.close()
  psyco_conn.close()

In [None]:
populate_table(qb_df, 'qb_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(games_df, 'games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rec_df, 'receiving_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rush_df, 'rush_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(def_stats, 'defense_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(kicker_df, 'kicker_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rz_rushing, 'red_zone_rush', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rz_receiving, 'red_zone_rec', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rz_qbs, 'red_zone_qb', URI)

connected to the database..
creating the cursor..
writing the csv to file..
