In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import psycopg2 
import io
import sys

sys.path.append('../')

from sqlalchemy import create_engine

from db_utils import create_table, populate_table, insert_into_table
from pbp_utils import (get_qb_pass, get_game_results, get_receiving, get_rushing,
                       get_team_pass_yds, get_team_rush_yds, get_opp_pass,
                       get_def_stats)

In [45]:
# loading up the postgres credentials in a separate file. we could use environment variables
# but doing it this way to mix it up!

with open('../env.txt', 'r') as file:
    env = file.read().splitlines()
    user = env[0]
    password = env[1]
    host = env[2]
    database = env[3]
    port = env[4]
    

URI = f'postgresql://{user}:{password}@{host}:{port}/{database}'

In [3]:
# gettin the directory path
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
nfl_pbp_dir = 'data/pbp'

# loading all of the parquets files at once using the directory path
df = pd.read_parquet(f'{parent_dir}/{nfl_pbp_dir}')

print(df.shape)

(1148717, 372)


In [4]:
lee_sharpe = 'https://raw.githubusercontent.com/nflverse/nfldata/master/data/games.csv'

ls_cols = ['game_id', 'overtime', 'home_rest', 'away_rest', 'div_game','roof', 
           'surface', 'temp', 'wind', 'home_coach', 'away_coach', 'referee']

ls = pd.read_csv(lee_sharpe, usecols=ls_cols)
ls = ls.sort_values(by='game_id')

In [5]:
df['year'] = pd.to_datetime(df['game_date']).dt.year
df['two_point_conv_result'] = (
    df['two_point_conv_result']
    .map(
        {'success' : 1,
         'failure' : 0,
         }
    )
    .fillna('None')
)

df['game_date'] = pd.to_datetime(df['game_date'])
df['spread_line'] = df['spread_line'] * -1
df['field_goal_result'] = np.where(df['field_goal_result'] == 'made', 1, 0)
df['time_between'] = df.groupby(['game_id'])['game_seconds_remaining'].transform(lambda x: x.sub(x.shift(-1)).fillna(0))
df['play_type'] = np.where(df['two_point_attempt'] > 0.5, 'two_point_att', df['play_type'])
df['air_yards_to_sticks'] = df['air_yards'].sub(df['ydstogo'])
df['season'] = [int(x.split('_')[0]) for x in df.game_id]
df['blocked_player_name'] = np.where(df['blocked_player_name'].notnull(), 1, 0)
df['fg_0_39'] = np.where(((df['play_type'] == 'field_goal') & (df['kick_distance'].between(0,39))), 1, 0)
df['fg_40_49'] = np.where(((df['play_type'] == 'field_goal') & (df['kick_distance'].between(40,49))), 1, 0)
df['fg_50_on'] = np.where(((df['play_type'] == 'field_goal') & (df['kick_distance'].between(50,100))), 1, 0)
df['extra_point_result'] = np.where(df['extra_point_result'] == 'good', 1, 0)

In [6]:
success = []

for a, b, c in zip(df['down'], df['ydstogo'], df['yards_gained']):
  frac = 0.4 * b
  
  if b == 0.0:
    success.append(np.nan)
  
  elif ((a==1) | (a==2)):
    if c >= frac:
      success.append(1)
    else:
      success.append(0)
  elif ((a==3) | (a==4)):
    if c >= b:
      success.append(1)
    else:
      success.append(0)

df['success'] = success

In [10]:
def get_opp_pass(df=df):

  opp_pass = (
      df[df['pass_attempt'] == 1]
      .groupby(['game_id', 'defteam'], as_index=False)['yards_gained']
      .sum()
      .rename(columns={'defteam' : 'team',
                      'yards_gained' : 'opp_pass_yds'})
  )

  return opp_pass

In [11]:
def get_opp_rush(df=df):

  opp_rush = (
      df[df['rush_attempt'] == 1]
      .groupby(['game_id', 'defteam'], as_index=False)['yards_gained']
      .sum()
      .rename(columns={'defteam' : 'team',
                      'yards_gained' : 'opp_rush_yds'})
  )

  return opp_rush

In [12]:
def get_def_stats(df=df):

  def_cols = ['interception', 'season', 'return_touchdown', 'fumble', 
              'sack', 'epa']

  def_stats = (
      df[~df['desc'].str.contains('Aborted')].copy()
      .groupby(['game_id', 'defteam'], as_index=False)
      .agg({
          'interception' : 'sum',
          'season' : lambda x: x.unique()[0],
          'return_touchdown' : 'sum',
          'fumble_lost' : 'sum',
          'sack' : 'sum',
          'safety' : 'sum',
          'blocked_player_name' : 'sum'
      })
      .rename(columns={
          'defteam' : 'team',
          'interception' : 'def_int',
          'return_touchdown' : 'def_td',
          'sack' : 'def_sack',
          'fumble' : 'def_fumble',
          'blocked_player_name' : 'kick_blocked'})
  )
  
  return def_stats

In [14]:
kicker_df = get_kicker_stats()

In [20]:
def create_ref_dict():

  names = games_df['referee'].unique()
  ref_dict = {}

  for i in range(len(names)-1):
    sim = process.extractOne(names[i], names[i+1:])
    
    if sim[1] > 85:
      ref_dict[names[i]] = sim[0]

  # del ref_dict['Jim Sprenger']
  ref_rev_dict = {y : x  for x, y in ref_dict.items()}

  return ref_rev_dict

In [22]:
def get_rolling_qb_epa(df=df):

  qb_epa_dropback = df[df['qb_dropback'] == 1][['game_id', 'passer_player_name', 'epa', 'wp']].copy()
  qb_epa_db = qb_epa_dropback[qb_epa_dropback['passer_player_name'].notnull()].copy()
  qb_epa_db.replace('Jos.Allen', 'J.Allen', inplace=True)
  qb_epa_db['1000_rolling_mean'] = qb_epa_db.groupby('passer_player_name')['epa'].transform(lambda x: x.shift().rolling(1000, min_periods=200).mean())
  qb_epa_db['exp_career_mean'] = qb_epa_db.groupby('passer_player_name')['epa'].transform(lambda x: x.shift().expanding().mean())

  return qb_epa_db.reset_index(drop=True)

In [23]:
# getting red zone statistics

rz_df = df[df['yardline_100'] <= 20].copy()

rz_rushing = get_rushing(df=rz_df)
rz_receiving = get_receiving(df=rz_df)
rz_qbs = get_qb_pass(df=rz_df)

rz_qbs = (
    rz_qbs
    .merge(rz_rushing.drop(columns='success'), how='left', on=['game_id', 'player', 'player_id', 'team', 'total_line', 'home_team', 'away_team'])
    .fillna(0)
)

In [24]:
# creating tables for rushing, passing, and receiving. merging the rushing
# and passing dataframes because sometimes quarterbacks run!

rush_df = get_rushing()
rec_df = get_receiving()
qbs = get_qb_pass()

qb_df = qbs.merge(rush_df.drop(columns='success'), how='left', on=['game_id', 'player', 'player_id', 'team', 'total_line', 'home_team', 'away_team'])
qb_df.fillna(0, inplace=True)
qb_df['total_epa'] = qb_df['rush_epa'].add(qb_df['epa'])

print(qb_df.shape)
print(rec_df.shape)
print(rush_df.shape)

(15634, 39)
(99175, 16)
(51035, 17)


In [25]:
team_pass_yds = get_team_pass_yds()
team_rush_yds = get_team_rush_yds()

opp_rush = get_opp_rush()
opp_pass = get_opp_pass()

def_stats = get_def_stats()
team_scores = get_team_scores()

  .sum()


In [26]:
games_df = get_game_results()

In [27]:
ref_rev_dict = create_ref_dict()
games_df['referee'] = games_df['referee'].map(ref_rev_dict).fillna(games_df['referee'])

NameError: name 'process' is not defined

In [28]:
drive_stats = get_drive_stats()
drive_summary = drive_stats.groupby(['game_id', 'team'], as_index=False).agg({'poss_time' : 'sum',
                                                                              'rush_attempt' : 'sum',
                                                                              'pass_attempt' : 'sum',
                                                                              'total_plays' : 'sum'})

In [29]:
games_df = games_df.merge(drive_summary, how='left', on=['game_id', 'team'])

In [30]:
games_df['pass_per_attempt'] = games_df['pass_yards'].div(games_df['pass_attempt']).round(1)
games_df['perc_pass'] = games_df['pass_attempt'].div(games_df['total_plays']).round(3)
games_df['sec_per_play'] = games_df['poss_time'].div(games_df['total_plays']).round(1)

In [31]:
def get_starting_qb():

  throwers = df.groupby('passer_player_name')['pass_attempt'].sum().reset_index()
  qbs = [qb for qb in throwers[throwers['pass_attempt'] > 18]['passer_player_name'].unique()]

  starting_qb = (
      df[df['passer_player_name'].isin(qbs)]
      .groupby(['game_id', 'posteam'], as_index=False)['passer_player_name']
      .first()
      .rename(columns={
          'posteam' : 'team',
          'passer_player_name' : 'starting_qb'}
          )
  )

  return starting_qb

starting_qb = get_starting_qb()
games_df = games_df.merge(starting_qb, how='left', on=['game_id', 'team'])

def get_home_away_qb():

  home_away_qbs = (
      games_df
      .groupby(['game_id', 'home'])['starting_qb']
      .apply(lambda x: x.unique()[0])
      .reset_index()
      .set_index(['game_id', 'home'])
      .unstack()

  )
  
  home_away_qbs.columns = ['away_start_qb', 'home_start_qb']
  home_away_qbs = home_away_qbs.reset_index()
  
  return home_away_qbs


home_away_qbs = get_home_away_qb()
games_df = games_df.merge(home_away_qbs, how='left', on=['game_id'])

In [32]:
adj_epa = get_team_adjusted_epa()

off_epa_game = (
    adj_epa
    .groupby(['game_id', 'posteam', 'play_type'], as_index=False)['epa']
    .agg(['sum'])
    .reset_index()
    .pivot_table(index=['game_id', 'posteam'],
                 columns='play_type',
                 values='sum')
    .reset_index()
    .rename(columns={
        'pass' : 'off_pass_epa',
        'run' : 'off_run_epa',
        'posteam' : 'team'
    })
    .round(2)
)

def_epa_game = (
    adj_epa
    .groupby(['game_id', 'defteam', 'play_type'], as_index=False)['epa']
    .agg(['sum'])
    .reset_index()
    .pivot_table(index=['game_id', 'defteam'],
                 columns='play_type',
                 values='sum')
    .reset_index()
    .rename(columns={
        'pass' : 'def_pass_epa',
        'run' : 'def_run_epa',
        'defteam' : 'team'
    })
    .round(2)
)

In [33]:
games_df = games_df.merge(off_epa_game, how='left', on=['game_id', 'team'])
games_df = games_df.merge(def_epa_game, how='left', on=['game_id', 'team'])

In [34]:
def_stats = get_def_stats()

games_df = games_df.merge(def_stats, how='left', on=['game_id', 'team', 'season'])

In [35]:
rolling_qb = get_rolling_qb_epa()

In [53]:
def populate_table(df=None, table_name=None, URI=URI):

    engine = create_engine(URI)
    print('connected to the database..')
    
    df.head(0).to_sql(table_name, engine, if_exists='replace',index=False) 

    conn = engine.raw_connection()
    cur = conn.cursor()
    print('creating the cursor..')

    output = io.StringIO()
    
    print('writing the csv to file..')
    
    df.to_csv(output, sep='\t', header=False, index=False)
    output.seek(0)
    
    
    contents = output.getvalue()
    
    cur.copy_from(output, table_name, null="") # null values become ''
    
    cur.close()
    conn.commit()
    conn.close()

In [51]:
def create_table(df, table_name, URI=URI):

  engine = create_engine(URI)
  print('connected to the db..')

  df.head(0).to_sql(table_name, engine, if_exists='replace', index=False)

In [54]:
create_table(df, 'pbp', URI=URI)

connected to the db..


In [55]:
populate_table(df=df, table_name='pbp', URI=URI)

connected to the database..
creating the cursor..
writing the csv to file..


MemoryError: 

In [None]:
def insert_into_table(df, table_name, URI=URI):

  engine = create_engine(URI)

  conn = engine.raw_connection()
  cur = conn.cursor()

  output = io.StringIO()

  df.to_csv(output, sep='\t', header=False, index=False)
  output.seek(0)

  contents = output.getvalue()

  cur.copy_from(output, table_name, null="")

  conn.commit()
  cur.close()
  conn.close()

In [None]:
date_threshold = '2021-10-16'
pbp_df = df[df['game_date'] >= date_threshold].copy()

for chunk in np.array_split(pbp_df, 3):
  df_ = pd.DataFrame(chunk)

  insert_into_table(df_, 'pbp', URI)

In [None]:
def drop_table(table_name, URI):
  
  psyco_conn = psycopg2.connect(URI)
  cursor = psyco_conn.cursor()
  psyco_conn.autocommit = True

  cursor.execute("""DROP TABLE %s;"""%table_name)
  cursor.close()
  psyco_conn.close()

In [None]:
populate_table(qb_df, 'qb_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(games_df, 'games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rec_df, 'receiving_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rush_df, 'rush_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(def_stats, 'defense_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(kicker_df, 'kicker_games', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rz_rushing, 'red_zone_rush', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rz_receiving, 'red_zone_rec', URI)

connected to the database..
creating the cursor..
writing the csv to file..


In [None]:
populate_table(rz_qbs, 'red_zone_qb', URI)

connected to the database..
creating the cursor..
writing the csv to file..
