<a href="https://colab.research.google.com/github/yaobviously/DS-Unit-2-Kaggle-Challenge/blob/main/first_stab_pymc_nfl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install bambi 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import bambi
import scipy.linalg

from sqlalchemy import create_engine

import theano.tensor as tt
import pymc3 as pm
import arviz as az
import xarray

az.style.use('arviz-darkgrid')

  import pandas.util.testing as tm


In [111]:
URI = 'postgresql://yeunwjcsjwwzge:24f76f29b196dbec6342d9cbe0588297b58bdbd2e058ac5da8eae280d7d2370c@ec2-18-215-44-132.compute-1.amazonaws.com:5432/d6mvs6tutt0f4m'

engine = create_engine(URI)

df = pd.read_sql('SELECT * FROM rolling_qb_dk', con=engine)
rb_df = pd.read_sql('SELECT * FROM rolling_rush_dk', con=engine)
wr_df = pd.read_sql('SELECT * FROM rolling_receiver_dk', con=engine)
games_df = pd.read_sql('SELECT * FROM games', con=engine)
roster_df = pd.read_sql('SELECT * FROM depth_charts WHERE season >=2016', con=engine)

In [5]:
def prepare_qb_data(df=df, min_season=2016, min_games=6):   

  df['season'] = [int(x.split('_')[0]) for x in df['game_id']]
  df = df[df['season'] >= min_season].copy()

  df = (
      df
      .groupby(['starting_qb'])
      .filter(lambda x: x['game_id'].count() >= min_games)
      .reset_index(drop=True)
  )

  df = (
      df
      .merge(games_df[['game_id', 'starting_qb', 'spread_line']],
             how='left',
             on=['game_id', 'starting_qb'])
  )

  df = df.fillna(0)

  return df

In [6]:
qb_df = prepare_qb_data()

In [7]:
qb_df_small = qb_df.groupby('starting_qb')[['rolling_dk_points', 'rolling_rush_tds']].last()

In [None]:
qb_df

In [17]:
qb_model = bambi.Model('dk_points ~ total_line + spread_line + rolling_dk_points + rolling_pass_tds + rolling_rush_tds + (1|starting_qb)', qb_df)

group_specific_sd = bambi.Prior("HalfNormal", sigma=10)
group_specific_prior = bambi.Prior("Normal", mu=0, sigma=group_specific_sd)
qb_model.set_priors(group_specific=group_specific_prior)

In [18]:
qb_results = qb_model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [dk_points_sigma, 1|starting_qb_offset, 1|starting_qb_sigma, Intercept, rolling_rush_tds, rolling_pass_tds, rolling_dk_points, spread_line, total_line]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 43 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [None]:
print(az.summary(qb_results).to_string())

In [None]:
az.plot_forest(
    qb_results,
    figsize=(10, 32),
    kind='ridgeplot'
)

In [21]:
qb_posterior_predictive = qb_model.predict(qb_results, kind='pps', draws=5000)


In [23]:
predictions = pd.DataFrame(qb_results.posterior_predictive["dk_points"][0].values)

qb_df['bayes_mean'] = predictions.mean().values

In [None]:
qb_df[qb_df['season'] == 2021][['starting_qb', 'total_line', 'spread_line', 'bayes_mean']].sort_values(by='bayes_mean', ascending=False)

In [None]:
intercepts = []

for i in range(50):
  int_ = qb_results.posterior["1|starting_qb"].stack(draws=("chain", "draw"))[i].values.mean()

  intercepts.append(int_)

In [60]:
def prepare_wr_data(df=wr_df, min_season=2016, min_games=12):

  df = df[wr_df['season'] >= min_season].copy()
  df = df.groupby('player_id').filter(lambda x: x['game_id'].count() >= min_games)
  df['week'] = [int(x.split('_')[1]) for x in df['game_id']]

  df = (
      df
      .merge(roster_df[['player_id', 'position', 'depth_team', 'season', 'week']], how='left', on=['player_id', 'season', 'week'])
  )

  df = df[df['position'] == 'WR'].copy()

  return df

In [61]:
wr_model_df = prepare_wr_data()

In [65]:
wr_model = bambi.Model('rec_dk_pts ~ total_line + spread_line + rolling_rec_dk_pts + rolling_targets  + rolling_target_share + (1|player_id)', wr_model_df)

In [66]:
wr_results = wr_model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [rec_dk_pts_sigma, 1|player_id_offset, 1|player_id_sigma, Intercept, rolling_target_share, rolling_targets, rolling_rec_dk_pts, spread_line, total_line]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 224 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [None]:
print(az.summary(wr_results).to_string())

In [None]:
az.plot_forest(
    wr_results,
    figsize=(10, 100),
    kind='ridgeplot'
)

In [154]:
wr_df_small = wr_df.groupby('player_id')[['player', 'rolling_rec_dk_pts', 'rolling_targets', 'rolling_target_share']].last()

In [157]:
wr_intercepts = []

for intercept in range(1599):
  int_ = wr_results.posterior['1|player_id'].stack(draws=('chain', 'draw'))[intercept].values.mean()
  wr_intercepts.append(int_)

In [71]:
def prepare_te_data(df=wr_df, min_season=2016, min_games=12):

  df = df[wr_df['season'] >= min_season].copy()
  df = df.groupby('player_id').filter(lambda x: x['game_id'].count() >= min_games)
  df['week'] = [int(x.split('_')[1]) for x in df['game_id']]

  df = (
      df
      .merge(roster_df[['player_id', 'position', 'depth_team', 'season', 'week']], how='left', on=['player_id', 'season', 'week'])
  )

  df = df[df['position'] == 'TE'].copy()

  return df

In [72]:
te_model_df = prepare_te_data()

In [73]:
te_model = bambi.Model('rec_dk_pts ~ total_line + spread_line + rolling_rec_dk_pts + rolling_targets  + rolling_target_share + (1|player_id)', te_model_df)

In [74]:
te_results = te_model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [rec_dk_pts_sigma, 1|player_id_offset, 1|player_id_sigma, Intercept, rolling_target_share, rolling_targets, rolling_rec_dk_pts, spread_line, total_line]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 112 seconds.


In [None]:
print(az.summary(te_results).to_string())

In [81]:
def prepare_wr_te_data(df=wr_df, min_season=2016, min_games=12):

  df = df[wr_df['season'] >= min_season].copy()
  df = df.groupby('player_id').filter(lambda x: x['game_id'].count() >= min_games)
  df['week'] = [int(x.split('_')[1]) for x in df['game_id']]

  df = (
      df
      .merge(roster_df[['player_id', 'position', 'depth_team', 'season', 'week']], how='left', on=['player_id', 'season', 'week'])
  )

  df = df[df['position'].isin(['WR','TE'])].copy()

  return df

In [82]:
combined_model_df = prepare_wr_te_data()

In [87]:
combined_model = bambi.Model('rec_dk_pts ~ 0 + total_line + spread_line + rolling_rec_dk_pts + rolling_targets  + rolling_target_share + position + (1|player_id)', combined_model_df)

In [88]:
combined_results = combined_model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [rec_dk_pts_sigma, 1|player_id_offset, 1|player_id_sigma, position, rolling_target_share, rolling_targets, rolling_rec_dk_pts, spread_line, total_line]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 459 seconds.
The number of effective samples is smaller than 25% for some parameters.


In [None]:
print(az.summary(combined_results).to_string())

In [None]:
az.plot_posterior(combined_results)

In [187]:
def prepare_rb_data(df=rb_df, min_season=2016, min_games=6):

  df = df[df['season'] >= min_season].copy()
  df = df.groupby('player_id').filter(lambda x: x['game_id'].count() >= min_games)
  df['week'] = [int(x.split('_')[1]) for x in df['game_id']]

  df = (
      df
      .merge(roster_df[['player_id', 'position', 'depth_team', 'team', 'season', 'week']], how='left', on=['player_id', 'team', 'season', 'week'])
  )

  df['total_dk'] = df['rec_dk_pts'].add(df['rush_dk_pts'])
  df['rolling_total'] = df.groupby('player_id')['total_dk'].transform(lambda x: x.shift().rolling(8, min_periods=2).mean())

  df = df[df['position'].isin(['RB'])].copy().dropna()

  return df

In [188]:
check = prepare_rb_data()

In [189]:
check

Unnamed: 0,game_id,player_id,player,team,rolling_rush_att,rolling_rush_yds,rolling_rush_td,rolling_rush_att_share,rolling_rush_epa,rolling_success,rolling_rush_dk_pts,rolling_rush_dk_max,rolling_80th,rush_dk_pts,total_line,spread_line,season,rolling_targets,rolling_receptions,rolling_air_yards,rolling_rec_yards,rolling_target_share,rolling_aDOT,rolling_td,rolling_over_100,rolling_rec_dk_pts,rolling_rec_dk_max,rec_dk_pts,total_dk_pts,week,position,depth_team,total_dk,rolling_total
2,2016_01_BUF_BAL,00-0024217,R.Bush,BUF,5.53,23.13,0.20,0.2,-0.74,2.47,3.51,12.1,5.62,-0.4,44.5,3.0,2016,4.33,2.73,5.00,15.53,12.78,2.44,0.00,0.0,4.29,10.4,0.0,-0.4,1,RB,3.0,-0.4,-0.400000
34,2016_01_CHI_HOU,00-0032404,T.Ervin,HOU,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.3,43.0,-5.5,2016,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0,1.5,1.8,1,RB,2.0,1.8,1.800000
35,2016_01_CHI_HOU,00-0032404,T.Ervin,HOU,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.3,43.0,-5.5,2016,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0,1.5,1.8,1,RB,2.0,1.8,1.800000
36,2016_01_CHI_HOU,00-0032404,T.Ervin,HOU,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.3,43.0,-5.5,2016,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0,1.5,1.8,1,RB,2.0,1.8,1.800000
37,2016_01_CHI_HOU,00-0032404,T.Ervin,HOU,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.0,0.00,0.3,43.0,-5.5,2016,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0,1.5,1.8,1,RB,2.0,1.8,1.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13434,2021_01_SF_DET,00-0031687,R.Mostert,SF,13.47,72.67,0.87,0.5,0.76,7.60,12.60,49.0,15.22,2.0,46.0,-9.5,2021,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.0,0.00,0.0,0.0,2.0,1,RB,1.0,2.0,13.337500
13436,2021_01_SF_DET,00-0033948,J.Williams,DET,8.87,38.40,0.13,0.3,0.51,5.27,4.64,13.7,6.74,11.4,46.0,9.5,2021,3.07,2.73,0.34,19.60,8.80,0.11,0.07,0.0,5.09,17.5,13.6,25.0,1,RB,2.0,25.0,5.862500
13437,2021_01_SF_DET,00-0035806,J.Hasty,SF,6.50,24.67,0.17,0.2,0.01,3.33,3.47,8.9,5.70,6.3,46.0,-9.5,2021,1.33,1.17,-3.82,5.50,3.90,-2.00,0.00,0.0,1.72,3.0,2.5,8.8,1,RB,3.0,8.8,5.183333
13438,2021_01_SF_DET,00-0035806,J.Hasty,SF,6.50,24.67,0.17,0.2,0.01,3.33,3.47,8.9,5.70,6.3,46.0,-9.5,2021,1.33,1.17,-3.82,5.50,3.90,-2.00,0.00,0.0,1.72,3.0,2.5,8.8,1,RB,3.0,8.8,5.700000


In [142]:
player_rb_dict = {}

for a, b in zip(roster_df['player_id'], roster_df['position']):

  if b == 'RB':
    player_rb_dict[a] = b

rb_model_df = rb_df[rb_df['player_id'].isin(player_rb_dict.keys())].copy()

In [None]:
rb_model_df

In [144]:
rb_model_df['total_dk'] = rb_model_df['rec_dk_pts'].add(rb_model_df['rush_dk_pts'])
rb_model_df['rolling_total_dk'] = rb_model_df.groupby('player_id')['total_dk'].transform(lambda x: x.shift().rolling(10, min_periods=1).mean())

In [173]:
rb_model_df = rb_model_df.merge(roster_df[['season', 'week', 'player_id', 'depth_team']], how='left', on=['season', 'week', 'player_id']).drop_duplicates(subset=['game_id', 'player_id'], keep='first')

In [180]:
rb_model_df['depth_team'] = rb_model_df['depth_team'].astype('category')

In [176]:
rb_model_df.dropna(inplace=True)

In [181]:
rb_model = bambi.Model('total_dk ~ 0 + total_line + spread_line + rolling_target_share + rolling_targets + rolling_td + depth_team + rolling_rush_att + rolling_rush_td + (1|player_id)', rb_model_df)

In [182]:
rb_results = rb_model.fit()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Sequential sampling (2 chains in 1 job)
NUTS: [total_dk_sigma, 1|player_id_offset, 1|player_id_sigma, rolling_rush_td, rolling_rush_att, depth_team, rolling_td, rolling_targets, rolling_target_share, spread_line, total_line]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 229 seconds.
The acceptance probability does not match the target. It is 0.8878291792903208, but should be close to 0.8. Try to increase the number of tuning steps.


In [None]:
print(az.summary(rb_results).to_string())

In [201]:
rb_results.mean().posterior['total_line'] 

In [None]:
rb_intercepts = []

for intercept in range(190):
  int_ = rb_results.posterior['1|player_id'].stack(draws=('chain', 'draw'))[intercept].values.mean()
  rb_intercepts.append(int_)