In [11]:
import os
import warnings
from urllib.parse import quote

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_absolute_error

from sqlalchemy import create_engine

# Silence pandas' PerformanceWarning for the rest of the notebook.
# NOTE(review): presumably aimed at the DataFrame-fragmentation warning raised
# when many columns are added one at a time (as the rolling-feature loops
# further down do) -- confirm that is the only warning being hidden.
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

In [3]:
# Load the Postgres credentials from the environment (never hardcode them).
# A missing variable raises KeyError here, before any connection is attempted.
user = os.environ['DB_USER']
password = os.environ['DB_PASSWORD']
host = os.environ['DB_HOST']
database = os.environ['DB_NAME']
port = os.environ['DB_PORT']

# Build the connection URI. The user name and password are percent-encoded so
# that characters such as '@', ':', '/' or '%' inside a credential cannot be
# mistaken for URL delimiters; SQLAlchemy decodes them again when it parses
# the URL. Credentials without special characters produce the same URI as
# plain interpolation would.
URI = f"postgresql://{quote(user, safe='')}:{quote(password, safe='')}@{host}:{port}/{database}"

# Create the SQLAlchemy engine for the database.
engine = create_engine(URI)

In [4]:
# Every row of the lee_sharpe table from the 2012 season onward.
lee_sharpe_query = """
    select *
    from lee_sharpe
    where season >= 2012
"""

# Run the query through the already-created engine instead of handing pandas
# the raw URI string: this matches how the player_stats query is executed and
# avoids pandas building a second, implicit engine from the string.
ls = pd.read_sql(lee_sharpe_query, con=engine)

In [5]:
# Rename relocated franchises in the Lee Sharpe dataframe so its team
# abbreviations match the ones used in the nflfastr dataframe. With matching
# names the two sources can be combined via game_id, which the nflfastr
# player table does not carry; recomputing it from the pbp data did not seem
# worth the effort -- maybe later.

# (abbreviation to replace, replacement, only seasons strictly before this one)
relocations = [
    ('OAK', 'LV', 2020),
    ('STL', 'LA', 2016),
    ('SD', 'LAC', 2017),
]

for old_abbr, new_abbr, cutoff_season in relocations:
    # The same rewrite is applied to both sides of the matchup.
    for side in ('home_team', 'away_team'):
        stale = (ls[side] == old_abbr) & (ls['season'] < cutoff_season)
        ls.loc[stale, side] = new_abbr

In [6]:
# player_stats rows for quarterbacks from the 2012 season onward, returned in
# season/week order.
query = """
    select *
    from player_stats
    where season >=2012 and position = 'QB'
    order by season asc, week asc
"""
df = pd.read_sql(sql=query, con=engine)

In [7]:
# Build a (season, week, team) -> game_id lookup from the Lee Sharpe data.
# Home-team keys go in first; the away-team keys are then merged on top.
home_keyed = ls.set_index(['season', 'week', 'home_team'])['game_id']
away_keyed = ls.set_index(['season', 'week', 'away_team'])['game_id']

game_id_dict = home_keyed.to_dict()
ls_gameid_away = away_keyed.to_dict()
game_id_dict.update(ls_gameid_away)

# Tag every player-stats row with its game id; rows whose
# (season, week, recent_team) key is not in the lookup get the 'NA' sentinel.
df['game_id'] = [
    game_id_dict.get(key, 'NA')
    for key in zip(df.season, df.week, df.recent_team)
]

# Report how many rows could not be matched.
n_missing = int((df['game_id'] == 'NA').sum())
print("there are now", n_missing, "missing game ids")

there are now 0 missing game ids


In [8]:
# Attach the Lee Sharpe game-level columns to each player-stats row. Each
# (game_id, season, week) key must appear at most once in `ls`; `validate`
# raises otherwise.
df = df.merge(
    ls,
    how='left',
    on=['game_id', 'season', 'week'],
    validate='many_to_one',
)

# Keep only rows where the player is listed as the home or away quarterback
# for that game, i.e. the quarterbacks who started.
cond = df['player_id'].eq(df['home_qb_id']) | df['player_id'].eq(df['away_qb_id'])
starters = df.loc[cond].reset_index(drop=True)

# Sanity check: game ids present in the Lee Sharpe data that have no starter
# row at all (an empty set means every game is represented).
set(ls['game_id'].unique()) - set(starters['game_id'].unique())

set()

In [9]:
def rolling_player_statistics(data=None, window=None, min_periods=None):
    """
    Compute a lagged rolling mean of ``data``.

    The series is first shifted by one period, so the value at each position
    is built only from earlier observations, and then averaged over a rolling
    window.

    Parameters
    ----------
    data : pandas.Series
        Values to average.
    window : int
        Size of the rolling window, forwarded to ``Series.rolling``.
    min_periods : int
        Minimum number of non-missing observations in the window required to
        produce a value; positions with fewer are NaN.

    Returns
    -------
    pandas.Series
        Rolling mean of the shifted series, aligned to ``data``'s index.
    """
    lagged = data.shift()
    roller = lagged.rolling(window=window, min_periods=min_periods)
    return roller.mean()

In [10]:
# Per-game stat columns that get lagged rolling / expanding features.
rolling_stats_columns = [
    'attempts', 'completions', 'passing_yards', 'passing_tds', 'interceptions',
    'sacks', 'sack_yards', 'sack_fumbles', 'passing_air_yards', 'passing_yards_after_catch',
    'passing_first_downs', 'passing_epa', 'pacr', 'carries', 'rushing_yards', 'rushing_first_downs',
    'fantasy_points_ppr',
]

# (window size, min_periods) for each rolling horizon, in the order the
# columns are created: 5 games, then 10 games, then 15 games.
rolling_windows = [(5, 1), (10, 3), (15, 3)]

for window, min_periods in rolling_windows:
    for col in rolling_stats_columns:
        # Per-player rolling mean of the previous games (shifted by one, so
        # the current game's value is never part of its own feature).
        starters[f'rolling_{col}_{window}'] = starters.groupby('player_id')[col].transform(
            lambda games, w=window, m=min_periods: rolling_player_statistics(games, w, m)
        )

# Per-player expanding mean and standard deviation over all previous games.
for col in rolling_stats_columns:
    per_player = starters.groupby('player_id')[col]
    expanding_mean = per_player.transform(lambda games: games.shift().expanding().mean())
    expanding_std = per_player.transform(lambda games: games.shift().expanding().std())
    starters[f'expanding_{col}_mean'] = expanding_mean
    starters[f'expanding_{col}_std'] = expanding_std

In [20]:
# Columns pulled into the modelling frame: the predictors plus the target
# ('passing_yards'), which is split off into `y` below.
features = [
    'rolling_passing_yards_5', 'rolling_passing_yards_10', 'rolling_passing_yards_15',
    'expanding_passing_yards_mean', 'wind', 'passing_yards', 'rolling_attempts_15',
    'rolling_completions_15', 'rolling_passing_tds_15', 'temp', 'total_line',
]

# Missing wind is filled with 0 and missing temp with 72; rows that do not yet
# have a 15-game rolling passing-yards value are dropped.
model_df = (
    starters[features]
    .fillna({'wind': 0, 'temp': 72})
    .dropna(subset=['rolling_passing_yards_15'])
)

model = LassoCV()

X = model_df.drop(columns='passing_yards')
y = model_df['passing_yards']

# Positional split: the first 4000 rows train the model, the rest are held out.
train_rows, test_rows = slice(None, 4000), slice(4000, None)
X_train, X_test = X.iloc[train_rows].copy(), X.iloc[test_rows].copy()
y_train, y_test = y.iloc[train_rows].copy(), y.iloc[test_rows].copy()

model.fit(X_train, y_train)

In [22]:
# Display the fitted Lasso coefficients.
# NOTE(review): presumably one value per column of X, in X's column order
# (i.e. `features` without 'passing_yards') -- confirm before interpreting.
model.coef_

array([ 0.13443835, -0.11864641,  0.17409532,  0.2431664 , -0.77215004,
        0.30120949,  0.98578967,  0.        ,  0.17428912,  3.1340437 ])

In [21]:
# Mean absolute error of the model's predictions on the held-out rows
# (X_test / y_test, i.e. everything after the first 4000 rows).
mean_absolute_error(y_test, model.predict(X_test))

59.360334019981686