In [None]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 3.2 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.0


In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder, CatBoostEncoder

from sqlalchemy import create_engine

from sklearn.preprocessing import StandardScaler, QuantileTransformer, MinMaxScaler, RobustScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_absolute_error

import tensorflow as tf
import tensorflow_probability as tfp
# Short aliases for the Keras and TFP-distributions namespaces.
tfk = tf.keras
tfd = tfp.distributions

# Make Keras layers default to float64 weights and computations.
tfk.backend.set_floatx('float64')

## defining helper functions

def neg_log_likelihood(x, rv_x):
    """Return the negated log-probability of ``x`` under ``rv_x``.

    ``rv_x`` only needs to expose ``log_prob``; the notebook passes this
    function to ``compile(loss=...)`` for its distribution-output models.
    """
    return -rv_x.log_prob(x)

  import pandas.util.testing as tm


In [None]:
# The connection string contains the database password, so it must come from the
# environment rather than live in the notebook.
# NOTE(review): the credential that used to be hard-coded here has been exposed
# and should be rotated.
URI = os.environ.get('DATABASE_URL')
if not URI:
    raise RuntimeError(
        'Set the DATABASE_URL environment variable to the Postgres connection string.'
    )

engine = create_engine(URI)

# Raw tables: per-game QB rows and per-team game rows.
qb_df = pd.read_sql('SELECT * FROM rolling_qb_dk', con=engine)
games_df = pd.read_sql('SELECT * FROM games', con=engine)

In [None]:
# Trailing 10-row mean of each team's pass rate. shift() drops the current row
# from its own window; the leading NaNs are then back-filled.
# `.bfill()` replaces the deprecated `fillna(method='bfill')` spelling.
# NOTE(review): assumes rows are in chronological order within each team, and
# the back-fill copies a later window onto a team's first rows -- confirm that
# this look-ahead is acceptable.
games_df['rolling_perc_pass'] = (
    games_df
    .groupby('team')['perc_pass']
    .transform(lambda x: x.shift().rolling(10).mean().bfill())
)

In [None]:
# Exponentially weighted (span=15) mean of each team's defensive pass EPA;
# shift() keeps the current row out of its own average.
games_df['rolling_pass_def_epa'] = (
    games_df
    .groupby('team')['def_pass_epa']
    .transform(lambda x: x.shift().ewm(span=15, adjust=True).mean())
)

# The opponent is whichever of home/away is not the QB's own team.
qb_df['opp_team'] = np.where(qb_df['home_team'] == qb_df['team'], qb_df['away_team'], qb_df['home_team'])

# Join the opponent's row for the same game. A renamed copy is used so that
# games_df itself is never left with a renamed column if the merge fails.
opp_features = games_df.rename(columns={'team': 'opp_team'})[
    ['game_id', 'opp_team', 'rolling_pass_def_epa', 'spread_line', 'week', 'rolling_perc_pass']
]
qb_df = qb_df.merge(opp_features, how='left', on=['game_id', 'opp_team'])

In [None]:
def prepare_qb_data(df=None, min_season=2013, min_games=6):
  """Filter the QB game log down to the rows used for modelling.

  Parameters
  ----------
  df : DataFrame, optional
      QB game rows with at least ``game_id``, ``starting_qb`` and
      ``spread_line``. Defaults to the notebook-level ``qb_df`` as it exists at
      call time. The input frame is not modified.
  min_season : int
      Earliest season to keep; the season is the integer before the first
      ``_`` in ``game_id``.
  min_games : int
      Minimum number of rows a ``starting_qb`` needs to be kept.

  Returns
  -------
  DataFrame
      Filtered, de-duplicated rows with a ``season`` column added,
      ``spread_line`` sign-flipped, and values rounded to 3 decimals.
  """
  if df is None:
    # Resolved at call time so later re-assignments of qb_df are picked up.
    df = qb_df

  # assign() returns a new frame, so the caller's data is left untouched.
  seasons = df['game_id'].str.split('_').str[0].astype(int)
  df = df.assign(season=seasons)
  df = df[df['season'] >= min_season]

  df = (
      df
      .groupby(['starting_qb'])
      .filter(lambda x: x['game_id'].count() >= min_games)
      .reset_index(drop=True)
  )

  # NOTE(review): every column taken from games_df is also a join key, so this
  # merge adds no columns; it can only repeat rows, which drop_duplicates()
  # below removes again. Confirm whether it is still needed.
  df = df.merge(
      games_df[['game_id', 'starting_qb', 'spread_line']],
      how='left',
      on=['game_id', 'starting_qb', 'spread_line'],
  )

  # Flip the sign of the spread (presumably to express it from the QB's own
  # team's side -- TODO confirm).
  df = df.assign(spread_line=df['spread_line'].mul(-1))
  df = df.drop_duplicates()

  return df.round(3)

In [None]:
# Prepared QB rows with every incomplete row removed.
df = prepare_qb_data(min_games=6).dropna()

# Keep only games with a strictly positive DraftKings score.
positive_dk = df['dk_points'] > 0
df = df.loc[positive_dk].copy()

qb_names = df['starting_qb'].values

In [None]:
# 1 for any game whose season_type is not 'REG', 0 otherwise.
df['playoffs'] = df['season_type'].ne('REG').astype(int)

df['playoffs'].value_counts()

0    4170
1     176
Name: playoffs, dtype: int64

In [None]:
# Weighted sum of the box-score columns; the weights are the per-unit point
# values (e.g. 0.1 per rushing yard, 6 per rushing TD).
# NOTE(review): 'touchdown' is weighted like a passing TD -- confirm that the
# column holds passing touchdowns only.
df['fanduel_points'] = (
    df['rush_yds'] * 0.1
    + df['rush_td'] * 6
    + df['pass_yards'] * 0.04
    + df['interception'] * -1
    + df['fumbles'] * -2
    + df['touchdown'] * 4
)

In [None]:
# Trailing 15-row mean of each player's fantasy points, excluding the current
# row via shift(); leading NaNs are back-filled within the player.
# `.bfill()` replaces the deprecated `fillna(method='bfill')` spelling.
# NOTE(review): the back-fill copies a later window onto a player's first rows
# (look-ahead) -- confirm that this is acceptable for a predictive feature.
df['rolling_fd_points'] = (
    df
    .groupby('player_id')['fanduel_points']
    .transform(lambda x: x.shift().rolling(15).mean().bfill())
)

In [None]:
# Discard rows that still have no rolling fantasy-point average.
has_rolling_avg = df['rolling_fd_points'].notna()
df = df[has_rolling_avg].copy()

In [None]:
# ordinal encoding the QB column and mapping the qb entries in the df to the
# ordinal encodings

all_unique_qbs = set(df['starting_qb'])
n_unique_qbs = len(all_unique_qbs)

# Codes are assigned in sorted-name order. Iterating the set directly would
# give a different name -> code mapping in every kernel session, because string
# hashing is randomised per process.
ordinal_encoded_QBs = {qb: code for code, qb in enumerate(sorted(all_unique_qbs))}

# NOTE: this overwrites the names with their codes, so the cell must not be
# re-run without rebuilding df first (the second pass would map to NaN).
df['starting_qb'] = df['starting_qb'].map(ordinal_encoded_QBs)

# Fourth root of the number of categories, rounded up.
n_dim = int(np.ceil(n_unique_qbs ** (1/4)))

# code -> name lookup for turning predictions back into QB names
reverse_qb_dict = {code: qb for qb, code in ordinal_encoded_QBs.items()}

In [None]:
# Sanity check that codes map back to names. This reads df rather than new_df,
# which is not defined until a later cell, and shows only the first rows
# instead of the full list.
df['starting_qb'].map(reverse_qb_dict).head(10)

In [None]:
# encoded QB ids as a single-column 2-D array (one row per game)
qbs = df[['starting_qb']].values

# creating a new dataframe that includes only the features in the model

model_cols = ['rolling_pass_attempts', 'rolling_pass_yards', 'rolling_air_yards',
              'rolling_pass_tds', 'rolling_rush_att', 'rolling_pass_def_epa',
              'starting_qb', 'spread_line', 'season', 'total_line', 'rolling_fd_points',
              'rolling_perc_pass']

new_df = df[model_cols].copy()

# creating a list of columns that will be used when restoring the data to a frame
columns = new_df.columns.to_list()

# feature frame and single-column target frame (no scaling happens here)
X_t = new_df.copy()
y_t = df[['fanduel_points']].copy()

# defining the input and output columns
outputs = 'fanduel_points'

# dataset = features + target, as an independent copy so later edits to it
# (dropna, season re-coding) cannot leak back into X_t
dataset = X_t.copy()
dataset[outputs] = y_t[outputs]

inputs = [col for col in dataset.columns if col != outputs]

In [None]:
dataset = dataset.dropna()

# Seasons are coded 0..k-1 in ascending order so the codes are truly ordinal
# and independent of row order, with no hard-coded cap on the number of seasons.
ordinal_encoding_seasons = {
    season: code for code, season in enumerate(sorted(dataset['season'].unique()))
}

# assign() builds a fresh frame, avoiding a write into the dropna() result.
dataset = dataset.assign(season=dataset['season'].map(ordinal_encoding_seasons))

In [None]:
# First 90% of the rows (by position) train the models; the rest is held out.
train_size = int(0.9 * len(dataset))

train_data = dataset.iloc[:train_size]
test_data = dataset.iloc[train_size:]

In [None]:
# preparing the arrays for a model with an embedding for the QBs (built with
# the Keras functional API)

# float64 columns = the numeric features plus the target
numeric_train = train_data.select_dtypes(include=['float64'])
target = numeric_train['fanduel_points'].values

# drop() returns a new frame; dropping in place on this derived frame is what
# raised the SettingWithCopyWarning
scaler = RobustScaler()
num_cols = scaler.fit_transform(numeric_train.drop(columns='fanduel_points').values)

# categorical inputs as single-column 2-D arrays
qb_cols = train_data['starting_qb'].values.reshape(-1, 1)
season_cols = train_data['season'].values.reshape(-1, 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# creating an embedding model with non-constant variance

# shape must be a tuple; `shape=n,` passes a bare int followed by an
# argument-list comma
num_inputs = tfk.layers.Input(shape=(num_cols.shape[1],))
qb_inputs = tfk.layers.Input(shape=(qb_cols.shape[1],))

# each QB code is looked up in a learned 3-dimensional embedding
qb_embedding = tfk.layers.Embedding(input_dim=n_unique_qbs, output_dim=3)(qb_inputs)
qb_flatten = tfk.layers.Flatten()(qb_embedding)

concatenate_qb = tfk.layers.Concatenate()([num_inputs, qb_flatten])

hidden = tfk.layers.Dense(8, activation='relu')(concatenate_qb)

# two raw outputs per game: one for the mean, one for the (pre-softplus) scale
params = tfk.layers.Dense(2, activation='linear')(hidden)

# softplus keeps the scale positive; 1e-3 keeps it away from zero
output = tfp.layers.DistributionLambda(
    lambda t: tfd.Normal(loc=t[..., :1],
                         scale=1e-3 + tf.math.softplus(0.05 * t[..., 1:])))(params)

embedding_model = tfk.Model(inputs=[num_inputs, qb_inputs], outputs=output)

embedding_model.compile(tfk.optimizers.Adam(learning_rate=0.03),
                        loss=neg_log_likelihood)

In [None]:
# Stop once val_loss has not improved for 25 epochs and roll back to the best
# weights seen.
early_stopping = tfk.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=25,
    restore_best_weights=True,
)

In [None]:
# training the embedding model; 10% of the training rows are held out for
# validation, which is what the early-stopping callback monitors

history = embedding_model.fit(
    x=[num_cols, qb_cols],
    y=target,
    epochs=251,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# extracting 500 predictions for each QB game: every predict() call on the
# distribution-output model yields one draw per game

n_sims = 500
embedding_sims = np.concatenate(
    [embedding_model.predict([num_cols, qb_cols], verbose=0) for _ in range(n_sims)],
    axis=1,
)

# summarising the draws per game (rows = games, columns = simulations) and
# comparing the median, boom, and bust predictions to actual outcomes

draws = pd.DataFrame(embedding_sims)
embedding_df = pd.DataFrame({
    'mean': draws.mean(axis=1),
    'median': draws.median(axis=1),
    'bust': draws.quantile(0.1, axis=1),
    'boom': draws.quantile(0.9, axis=1),
})

# label every row with the QB's name
embedding_df.index = [reverse_qb_dict[code] for code in qb_cols.ravel()]

embedding_df['actual'] = target

# calibration flags: a well-calibrated model should give roughly 0.5 / 0.1 / 0.1
embedding_df['over_median'] = (embedding_df['actual'] > embedding_df['median']).astype(int)
embedding_df['over_90th'] = (embedding_df['actual'] > embedding_df['boom']).astype(int)
# "under" means the actual score fell BELOW the 10th-percentile draw
embedding_df['under_10th'] = (embedding_df['actual'] < embedding_df['bust']).astype(int)

print(embedding_df.over_median.mean(), embedding_df.over_90th.mean(),
      embedding_df.under_10th.mean())

In [None]:
# Average actual vs. predicted mean per QB, showing the 40 highest predicted means.
(
    embedding_df
    .groupby(embedding_df.index)[['actual', 'mean']]
    .mean()
    .sort_values(by='mean')
    .tail(40)
)

Creating a Mixture Density Network

In [None]:
# The mixture model uses only the numeric features, so the two encoded
# categorical columns are removed before scaling.
number_inputs = X_t.drop(columns=['starting_qb', 'season'])

# NOTE: this rebinds `scaler`, replacing any scaler fitted earlier.
scaler = RobustScaler().fit(number_inputs)

number_inputs_scaled = scaler.transform(number_inputs)

In [None]:
# creating a mixture density network model using only the numerical columns

event_shape = [1]
num_components = 2

# number of raw outputs the MixtureNormal layer needs for this configuration
mixture_params = tfp.layers.MixtureNormal.params_size(num_components, event_shape)

mixture_model = tfk.Sequential()
mixture_model.add(tfk.layers.Dense(8, activation='relu'))
mixture_model.add(tfk.layers.Dense(4, activation='relu'))
# NOTE(review): softplus makes every raw mixture parameter positive before the
# MixtureNormal layer sees it -- confirm that this constraint is intended.
mixture_model.add(tfk.layers.Dense(mixture_params, activation='softplus'))
mixture_model.add(
    tfp.layers.MixtureNormal(num_components=num_components, event_shape=event_shape)
)

# loss = negative log-likelihood of the observed value under the predicted mixture
mixture_model.compile(
    optimizer=tfk.optimizers.Adam(learning_rate=0.02),
    loss=lambda y, rv_y: -rv_y.log_prob(y),
)


In [None]:
# Train the mixture model on the scaled numeric features, holding out 10% of
# the rows for validation and early stopping.
mixture_history = mixture_model.fit(
    x=number_inputs_scaled,
    y=y_t,
    batch_size=100,
    epochs=200,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
# pulling the fitted mixture parameters for a single example row (index 333)

x = number_inputs_scaled[333, :].reshape(1, -1)

# one forward pass gives the predicted mixture distribution for that row
row_mixture = mixture_model(x)

weights = row_mixture.mixture_distribution.probs_parameter()
means = row_mixture.components_distribution.mean()
variances = row_mixture.components_distribution.variance()

np.round(weights.numpy(), 4)

array([[0.5971, 0.4029]])

In [None]:
# # drawing 500 predictions for each game from the MixtureNormal model

sims = np.concatenate(
    [mixture_model.predict(number_inputs_scaled) for _ in range(500)],
    axis=1,
)


def _summarise_draws(draws):
    """Return [mean, median, 10th percentile, 90th percentile] of one game's draws."""
    return [draws.mean(), draws.median(), draws.quantile(0.1), draws.quantile(0.9)]


# one row per game with the four summary statistics of its simulated draws
sims_df = pd.DataFrame(sims).T.apply(_summarise_draws).T
sims_df.columns = ['mean', 'median', 'bust', 'boom']

# label each row with the QB's name
sims_df.index = [reverse_qb_dict[code] for code in new_df.starting_qb.values]

sims_df['actual'] = y_t.values

# calibration flags comparing the actual score with the simulated quantiles
actual_pts = sims_df['actual']
sims_df['over_median'] = (actual_pts > sims_df['median']).astype(int)
sims_df['over_90th'] = (actual_pts > sims_df['boom']).astype(int)
sims_df['under_10th'] = (actual_pts < sims_df['bust']).astype(int)

print(sims_df[['over_median', 'over_90th', 'under_10th']].mean())

over_median    0.508776
over_90th      0.114729
under_10th     0.103027
dtype: float64


In [None]:
# Attach game context to each simulated row; values are matched by position,
# so sims_df, df and new_df must all be in the same row order.
sims_df = sims_df.assign(
    game_id=df['game_id'].values,
    wind=df['wind'].values,
    spread=new_df['spread_line'].values,
    total_line=new_df['total_line'].values,
)

# the 60 games with the highest predicted mean
sims_df.sort_values(by='mean', ascending=False).head(60)

Unnamed: 0,mean,median,bust,boom,actual,over_median,over_90th,under_10th,game_id,spread,total_line,wind
L.Jackson,29.421962,29.433133,15.476128,43.125767,17.56,0,0,0,2020_02_BAL_HOU,-7.5,49.5,0.0
L.Jackson,29.275052,29.045677,15.404691,43.293219,14.5,0,0,1,2020_05_CIN_BAL,-12.5,49.0,3.0
L.Jackson,29.032794,28.099088,16.343659,43.282144,26.16,0,0,0,2018_17_CLE_BAL,-7.0,41.0,2.0
P.Manning,28.649091,29.560903,14.719679,41.945399,14.1,0,0,1,2013_06_JAX_DEN,-27.0,53.0,7.0
L.Jackson,28.505695,28.00344,14.396472,42.289577,16.74,0,0,0,2018_15_TB_BAL,-8.5,44.5,10.0
L.Jackson,28.360067,28.236032,12.44077,44.193093,22.22,0,0,0,2018_12_OAK_BAL,-13.0,42.0,7.0
L.Jackson,28.323434,28.305176,15.54134,40.413637,26.02,0,0,0,2020_04_BAL_WAS,-14.5,45.0,5.0
L.Jackson,28.054216,28.26366,14.386558,40.987227,30.22,1,0,0,2020_15_JAX_BAL,-13.0,49.5,3.0
L.Jackson,27.831902,27.458616,16.257859,41.894078,27.5,1,0,0,2020_01_CLE_BAL,-7.0,47.0,5.0
J.Allen,27.809123,28.361785,15.401679,39.605614,21.02,0,0,0,2021_04_HOU_BUF,-19.0,47.5,0.0


In [None]:
# T.Brady's games ranked by predicted mean, highest first.
is_brady = sims_df.index == 'T.Brady'
sims_df.loc[is_brady].sort_values(by='mean', ascending=False).head(60)

Unnamed: 0,mean,median,bust,boom,actual,over_median,over_90th,under_10th,game_id,spread,total_line,wind
T.Brady,24.659751,24.245608,14.145884,35.949873,31.64,1,0,0,2021_02_ATL_TB,-13.5,52.0,0.0
T.Brady,24.059697,23.875036,13.406142,35.481194,18.98,0,0,0,2021_06_TB_PHI,-7.0,53.0,0.0
T.Brady,23.684671,23.356617,13.597568,34.448263,37.74,1,1,0,2021_05_MIA_TB,-11.0,48.0,0.0
T.Brady,23.149498,23.193385,13.140768,32.611849,18.68,0,0,0,2015_15_TEN_NE,-14.5,48.0,12.0
T.Brady,23.131461,22.94637,13.572707,32.974332,25.6,1,0,0,2017_11_NE_OAK,-7.0,55.0,0.0
T.Brady,23.029024,23.057887,13.564573,32.784747,22.72,0,0,0,2015_03_JAX_NE,-14.5,48.5,6.0
T.Brady,22.937406,22.877967,12.714013,32.556228,23.8,1,0,0,2018_06_KC_NE,-4.0,59.5,0.0
T.Brady,22.875194,22.755228,13.574293,32.37635,8.52,0,0,1,2018_03_NE_DET,-7.0,55.5,0.0
T.Brady,22.840676,23.034294,12.5125,33.73537,29.92,1,0,0,2020_16_TB_DET,-12.0,55.0,0.0
T.Brady,22.741976,23.296555,12.258751,32.766544,30.78,1,0,0,2017_02_NE_NO,-5.5,55.0,0.0


Creating and Evaluating a Negative Binomial Model

In [None]:
# creating a model with a negative binomial output layer as an exercise. the
# distribution of qb points doesn't really follow the neg binomial, but
# other positions do

# train_data holds the model features plus 'fanduel_points' and has no
# 'dk_points' column, so the fantasy-point target is read from 'fanduel_points'.
numeric_train = train_data.select_dtypes(include=['float64'])
neg_bin_target = numeric_train['fanduel_points'].values
target = neg_bin_target  # same array, under the name this cell used to define
num_cols = numeric_train.drop(columns='fanduel_points').values

# Only the encoded QB id feeds the QB embedding. Selecting every int64 column
# would presumably also pick up the encoded 'season' column and look season
# codes up in the QB table.
cat_cols = train_data[['starting_qb']].values


# shape must be a tuple; `shape=n,` passes a bare int
num_inputs = tfk.layers.Input(shape=(num_cols.shape[1],))
cat_inputs = tfk.layers.Input(shape=(cat_cols.shape[1],))

qb_embedding = tfk.layers.Embedding(input_dim=n_unique_qbs, output_dim=3)(cat_inputs)
flatten = tfk.layers.Flatten()(qb_embedding)

concatenate = tfk.layers.Concatenate()([num_inputs, flatten])
hidden = tfk.layers.Dense(12, activation='relu')(concatenate)


# Linear raw outputs: the softplus / sigmoid links are applied once, inside the
# distribution below. A softplus here as well would restrict probs to (0.5, 1).
params = tfk.layers.Dense(2, activation='linear')(hidden)

output = tfp.layers.DistributionLambda(
    lambda t: tfd.NegativeBinomial(
        total_count=tf.math.softplus(t[..., :1]),
        probs=tf.math.sigmoid(t[..., 1:])
    )
)(params)


neg_bin_model = tfk.Model(inputs=[num_inputs, cat_inputs], outputs=output)

neg_bin_model.compile(tfk.optimizers.Adam(learning_rate=0.004),
                      loss=neg_log_likelihood)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# `eg_bin_target` was never defined; the targets built alongside num_cols and
# cat_cols are held in `target`.
neg_bin_history = (
    neg_bin_model
    .fit(
        [num_cols, cat_cols],
        target,
        epochs=125,
        callbacks=[early_stopping],
        validation_split=0.2)
)

In [None]:
# drawing 500 predictions per game from the negative binomial model
neg_sims = np.concatenate(
    [neg_bin_model.predict([num_cols, cat_cols], verbose=0) for _ in range(500)],
    axis=1,
)

# per-game summary of the simulated draws (rows = games, columns = simulations)
neg_draws = pd.DataFrame(neg_sims)
neg_sims_df = pd.DataFrame({
    'mean': neg_draws.mean(axis=1),
    'median': neg_draws.median(axis=1),
    'bust': neg_draws.quantile(0.1, axis=1),
    'boom': neg_draws.quantile(0.9, axis=1),
})

# `target` is the array the model was fitted against (`neg_bin_target` was
# never defined)
neg_sims_df['actual'] = target

# per-game 0/1 flags, matching the other models' result frames; their means are
# the calibration rates printed below
neg_sims_df['over_median'] = (neg_sims_df['actual'] > neg_sims_df['median']).astype(int)
neg_sims_df['over_90th'] = (neg_sims_df['actual'] > neg_sims_df['boom']).astype(int)
neg_sims_df['under_10th'] = (neg_sims_df['actual'] < neg_sims_df['bust']).astype(int)

print(neg_sims_df[['over_median', 'over_90th', 'under_10th']].mean())