In [57]:
import os
import warnings
from urllib.parse import quote

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sqlalchemy import create_engine

warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

In [2]:
# Load the Postgres connection settings from environment variables so that
# no credentials are stored in the notebook itself.
user = os.environ['DB_USER']
password = os.environ['DB_PASSWORD']
host = os.environ['DB_HOST']
database = os.environ['DB_NAME']
port = os.environ['DB_PORT']

# Percent-encode the user name and password before embedding them in the URL:
# characters such as '@', ':' or '/' would otherwise be read as URL delimiters
# and corrupt the connection string. The raw variables above are left as-is.
# NOTE(review): assumes the environment holds the raw (not already
# percent-encoded) credentials -- confirm.
URI = f'postgresql://{quote(user, safe="")}:{quote(password, safe="")}@{host}:{port}/{database}'

In [3]:
# Every row of the lee_sharpe table from the 2012 season onward.
lee_sharpe_query = """
    select *
    from lee_sharpe
    where season >= 2012
"""

# pandas is handed the connection URI directly for this read
ls = pd.read_sql(sql=lee_sharpe_query, con=URI)

In [4]:
# Quarterback rows of player_stats from the 2012 season onward, returned in
# chronological (season, week) order.
query = """
    select *
    from player_stats
    where season >=2012 and position = 'QB'
    order by season asc, week asc
"""

# explicit SQLAlchemy engine, passed to pandas for the read
engine = create_engine(URI)
df = pd.read_sql(sql=query, con=engine)

In [5]:
# Build a lookup of every starting quarterback for each (season, week).
# It is used to flag each row of the main dataframe so that only games
# in which the quarterback started are kept.

ls_dict = (
    ls
    .groupby(['season', 'week'])[['away_qb_id', 'home_qb_id']]
    .agg({'away_qb_id' : 'unique', 'home_qb_id' : 'unique'})
    .reset_index()
    # join the away and home id arrays into one array of starters per week
    .assign(all_starters=lambda x: x[['away_qb_id', 'home_qb_id']]
                                 .apply(lambda row: np.concatenate(row), axis=1))
    .set_index(['season', 'week'])['all_starters']
    .to_dict()
)

# Set view of the lookup for the membership test below: hashing gives an exact
# per-id match instead of an element-wise array comparison for every row, and
# missing ids are dropped so a null player_id can never be flagged as a starter.
starter_sets = {
    key: {qb_id for qb_id in qb_ids if pd.notna(qb_id)}
    for key, qb_ids in ls_dict.items()
}

# .get(...) keeps a (season, week) that is absent from `ls` from raising
# KeyError; such rows are simply flagged as non-starts.
no_starters = frozenset()
df['starter'] = [
    int(p in starter_sets.get((s, w), no_starters))
    for p, s, w in zip(df['player_id'], df['season'], df['week'])
]

# dropping all rows where the player did not start the game
df = df[df['starter'] == 1].copy()

In [16]:
def rolling_player_statistics(data=None, window=None, min_periods=None):
    """
    Compute a lagged rolling average of a pandas Series.

    The series is first shifted by one period, so the value at each position
    only summarises earlier observations. The rolling mean is then taken over
    `window` periods, with at least `min_periods` non-missing observations
    required to produce a value.
    """
    lagged = data.shift()
    windowed = lagged.rolling(window=window, min_periods=min_periods)
    return windowed.mean()

In [32]:
# Per-game statistics that get rolling and expanding features.
rolling_stats_columns = ['attempts', 'completions', 'passing_yards', 'passing_tds', 'interceptions',
                        'sacks', 'sack_yards', 'sack_fumbles', 'passing_air_yards', 'passing_yards_after_catch',
                        'passing_first_downs', 'passing_epa', 'pacr', 'carries', 'rushing_yards', 'rushing_first_downs',
                        'fantasy_points_ppr']

# (window, min_periods) for each rolling feature set: 5, 10 and 15 games
rolling_windows = [(5, 1), (10, 3), (15, 3)]

# Group once and reuse the grouping for every feature. The shifts below follow
# the row order of df within each player, so df is assumed to still be in the
# chronological order produced by the player_stats query.
stats_by_player = df.groupby('player_id')

# All new columns are collected here and attached in a single concat; adding
# them to df one at a time fragments the frame and triggers pandas'
# PerformanceWarning.
new_features = {}

# rolling means of the previous games, one set of columns per window
for window, min_periods in rolling_windows:
    for col in rolling_stats_columns:
        new_features[f'rolling_{col}_{window}'] = stats_by_player[col].transform(
            lambda x: rolling_player_statistics(x, window, min_periods)
        )

# expanding mean and standard deviation over all previous games of each player
for col in rolling_stats_columns:
    player_col = stats_by_player[col]
    new_features[f'expanding_{col}_mean'] = player_col.transform(lambda x: x.shift().expanding().mean())
    new_features[f'expanding_{col}_std'] = player_col.transform(lambda x: x.shift().expanding().std())

# Dropping any same-named columns first keeps the cell safe to re-run
# (no duplicated column names on a second execution).
df = pd.concat(
    [df.drop(columns=list(new_features), errors='ignore'), pd.DataFrame(new_features)],
    axis=1,
)

In [61]:
# Compare the interceptions recorded in each game with the player's
# expanding mean of interceptions from the preceding games.
compare_cols = ['interceptions', 'expanding_interceptions_mean']
actual_col, predicted_col = compare_cols

# rows lacking either value cannot be scored, so drop them once up front
scored = df.dropna(subset=compare_cols)

mean_absolute_error(scored[actual_col], scored[predicted_col])

0.7566192276387919

In [77]:
# Display the first 30 column names of df.
df.columns[:30]

Index(['player_id', 'player_name', 'player_display_name', 'position',
       'position_group', 'headshot_url', 'recent_team', 'season', 'week',
       'season_type', 'completions', 'attempts', 'passing_yards',
       'passing_tds', 'interceptions', 'sacks', 'sack_yards', 'sack_fumbles',
       'sack_fumbles_lost', 'passing_air_yards', 'passing_yards_after_catch',
       'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr',
       'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles'],
      dtype='object')

In [75]:
# Narrow the lee_sharpe frame to the columns listed below; .copy() gives an
# independent frame rather than a view of the original.
# NOTE: this overwrites `ls` and removes away_qb_id / home_qb_id, which the
# starter-lookup cell reads, so that cell cannot be re-run afterwards without
# reloading `ls`.
ls = ls.loc[:, [
    'season', 'week', 'away_team', 'home_team', 'location',
    'away_moneyline', 'home_moneyline',
    'away_spread_odds', 'home_spread_odds', 'total_line',
    'surface', 'roof', 'temp', 'wind',
]].copy()

Index(['game_id', 'season', 'game_type', 'week', 'gameday', 'weekday',
       'gametime', 'away_team', 'away_score', 'home_team', 'home_score',
       'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis',
       'nfl_detail_id', 'pfr', 'pff', 'espn', 'away_rest', 'home_rest',
       'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds',
       'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game',
       'roof', 'surface', 'temp', 'wind', 'away_qb_id', 'home_qb_id',
       'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee',
       'stadium_id', 'stadium'],
      dtype='object')