In [1]:
import nfl_data_py as nfl
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.cluster import KMeans



In [7]:
class WindowGenerator():
    def __init__(self, input_width, label_width, shift,
               train_df, val_df, test_df,
               label_columns=None, shuffle = True):
        # Store the raw data.
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        
        self.shuffle = shuffle

        # Work out the label column indices.
        self.label_columns = label_columns
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in
                                        enumerate(label_columns)}
        self.column_indices = {name: i for i, name in
                               enumerate(train_df.columns)}

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

    def split_window(self, features):
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        if self.label_columns is not None:
            labels = tf.stack(
                [labels[:, :, self.column_indices[name]] for name in self.label_columns],
                axis=-1)

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels
    
    def plot(self, model=None, plot_col='receptions', max_subplots=3):
        inputs, labels = self.example
        plt.figure(figsize=(12, 8))
        plot_col_index = self.column_indices[plot_col]
        max_n = min(max_subplots, len(inputs))
        for n in range(max_n):
            plt.subplot(max_n, 1, n+1)
            plt.ylabel(f'{plot_col} [normed]')
            plt.plot(self.input_indices, inputs[n, :, plot_col_index],
                     label='Inputs', marker='.', zorder=-10)

            if self.label_columns:
                label_col_index = self.label_columns_indices.get(plot_col, None)
            else:
                label_col_index = plot_col_index

            if label_col_index is None:
                continue

            plt.scatter(self.label_indices, labels[n, :, label_col_index],
                        edgecolors='k', label='Labels', c='#2ca02c', s=64)
            if model is not None:
                predictions = model(inputs)
                plt.scatter(self.label_indices, predictions[n, :, label_col_index],
                          marker='X', edgecolors='k', label='Predictions',
                          c='#ff7f0e', s=64)

            if n == 0:
                plt.legend()

        plt.xlabel('Week')
        return
    
    def make_dataset(self, data):
        data = np.array(data, dtype=np.float32)
        ds = tf.keras.utils.timeseries_dataset_from_array(
          data=data,
          targets=None,
          sequence_length=self.total_window_size,
          sequence_stride=1,
          shuffle=self.shuffle,
          batch_size=4,)

        ds = ds.map(self.split_window)

        return ds

    
    @property
    def train(self):
        return self.make_dataset(self.train_df)

    @property
    def val(self):
        return self.make_dataset(self.val_df)

    @property
    def test(self):
        return self.make_dataset(self.test_df)

    @property
    def example(self):
        """Get and cache an example batch of `inputs, labels` for plotting."""
        result = getattr(self, '_example', None)
        if result is None:
            # No example batch was found, so get one from the `.train` dataset
            result = next(iter(self.train))
            # And cache it for next time
            self._example = result
        return result

In [2]:
class PlayByPlay:
    def __init__(self, pbp, players):
        self.unique_players = players[["player_id", "player_name", "season"]].drop_duplicates()

        
        passers = pbp.groupby(['passer_id', 'week', 'game_date', 'spread_line', 'total_line', 'possession_team', 'home_team', 'season'], as_index = False).agg({'passing_yards': 'sum', 'pass_touchdown':'sum'}).rename(columns = {'pass_touchdown': 'passing_tds'})
        passers = passers.merge(self.unique_players, left_on=["passer_id", 'season'], right_on=["player_id", 'season']).drop(columns = ['passer_id', 'season'])
        
        receivers = pbp.groupby(['receiver_id', 'week', 'game_date', 'spread_line', 'total_line', 'possession_team', 'home_team', 'season'], as_index = False).agg({'complete_pass': 'sum', 'receiving_yards': 'sum', 'touchdown':'sum'}).rename(columns = {'complete_pass': 'receptions', 'touchdown': 'receiving_tds'})
        receivers = receivers.merge(self.unique_players, left_on=["receiver_id", 'season'], right_on=["player_id", 'season']).drop(columns = ['receiver_id', 'season'])
        
        rushers = pbp.groupby(['rusher_id', 'week', 'game_date', 'spread_line', 'total_line', 'possession_team', 'home_team', 'season'], as_index = False).agg({'rush': 'sum', 'rushing_yards': 'sum', 'touchdown':'sum'}).rename(columns = {'rush': 'carries', 'touchdown': 'rushing_tds'})
        rushers = rushers.merge(self.unique_players, left_on=["rusher_id", 'season'], right_on=["player_id", 'season']).drop(columns = ['rusher_id', 'season'])
        
        playerStats = receivers.merge(rushers, on =[ 'week', 'player_name', 'player_id',  'game_date', 'spread_line', 'total_line', 'possession_team', 'home_team'], how = 'outer')
        playerStats = playerStats.merge(passers, on =[ 'week', 'player_name', 'player_id',  'game_date', 'spread_line', 'total_line', 'possession_team', 'home_team'], how = 'outer').fillna(0)
        playerStats['spread_line'] = playerStats['spread_line']*(-2*(playerStats['possession_team'] == playerStats['home_team']) + 1) #Turns spread into the value relative to player rather than away team
        playerStats = playerStats.drop(columns=['possession_team', 'home_team']) #No longer need to keep track of teams after fixing spread
        playerStats['pred_score_for'] = (playerStats['total_line']-playerStats['spread_line'])/2
        playerStats['pred_score_against'] = (playerStats['total_line']+playerStats['spread_line'])/2
        playerStats = playerStats[['game_date', 'week', 'player_id', 'player_name', 'pred_score_for', 'pred_score_against', 'receptions', 'receiving_yards', 'receiving_tds', 'carries', 'rushing_yards', 'rushing_tds', 'passing_yards', 'passing_tds']]
        self.playerStats = playerStats.sort_values(by = ['game_date'])
        self._stats = self.playerStats[['pred_score_for', 'pred_score_against', 'receptions', 'receiving_yards', 'receiving_tds', 'carries', 'rushing_yards', 'rushing_tds', 'passing_yards', 'passing_tds']].copy()
        self.stat_labels = ['receptions', 'receiving_yards', 'receiving_tds', 'carries', 'rushing_yards', 'rushing_tds', 'passing_yards', 'passing_tds']
        
        defStats = pbp.groupby(['game_date', 'week', 'defteam'], as_index=False).agg({'passing_yards': 'sum', 'pass_touchdown':'sum', 'complete_pass': 'sum', 'receiving_yards': 'sum', 'touchdown':'sum', 'rushing_yards': 'sum', 'sack':'sum', 'interception':'sum', 'fumble_lost': 'sum'})
        self.defStats = defStats.rename(columns = {'complete_pass': 'receptions', 'sack': 'sacks', 'fumble_lost': 'fumbles', 'interception': 'interceptions'})
        
        self._makeModel()
        return
    
    def getPlayer(self, name):

        return self.playerStats[self.playerStats['player_name'] == name]
    
    def _getPlayerFeatures(self, name):
        """Returns player stats, but with everything stripped except the features to train on."""
        playerFeatures = self.getPlayer(name)
        playerFeatures = playerFeatures[['pred_score_for', 'pred_score_against', 'receptions', 'receiving_yards', 'receiving_tds', 'carries', 'rushing_yards', 'rushing_tds', 'passing_yards', 'passing_tds']]
        playerFeatures[['pred_score_for', 'pred_score_against']] = playerFeatures[['pred_score_for', 'pred_score_against']].shift(-1)
        
        return playerFeatures
    
    def _makeModel(self):
        self.lstm_model = tf.keras.models.Sequential([
            # Shape [batch, time, features] => [batch, time, lstm_units]
            tf.keras.layers.LSTM(32, return_sequences=True),
            # Shape => [batch, time, features]
            tf.keras.layers.Dense(units=len(self.stat_labels))
                ])
        self.lstm_model.compile(loss=tf.keras.losses.MeanSquaredError(),
                        optimizer=tf.keras.optimizers.Adam(),
                        metrics=[tf.keras.metrics.MeanAbsoluteError()])
        return
    
    def _trainModel(self, patience=2, max_epochs = 20, verbose = 1):
        
        for player in self.unique_players['player_name'].sample(frac=1): #Shuffle player order
            playerdata = self._getPlayerFeatures(player)
            n = len(playerdata)
            if n < 15:
                continue
            train_df= playerdata[0:int(n*0.7)]
            val_df = playerdata[int(n*0.7):int(n*0.9)]
            test_df = playerdata[int(n*0.9):]

            train_df = (train_df - self._stats.mean()) / self._stats.std()
            val_df = (val_df - self._stats.mean()) / self._stats.std()
            test_df = (test_df - self._stats.mean()) / self._stats.std()
            
            playerdata_window = WindowGenerator(
                            input_width=3, label_width=3, shift=1, train_df = train_df, val_df = val_df, test_df = test_df, label_columns = self.stat_labels)
            
            early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                        patience=patience,
                                                        mode='min')
            self.history = self.lstm_model.fit(playerdata_window.train, epochs=max_epochs,
                          validation_data=playerdata_window.val,
                          callbacks=[early_stopping], verbose = verbose)

        return

    def _calcfpoints(self, stats_df):
        """stats_df is a DataFrame with receptions, receiving_yards, receiving_tds, rushing_yards, rushing_tds, passing_yards, and passing_tds as column headers"""
        ppr = 0.5
        pp_passingyd = 0.04
        pp_rushingyd = 0.1
        pp_receivingyd = 0.1
        pp_td = 6
        pp_passingtd = 4
        stats_df = stats_df[-1:]
        return (pp_passingyd*stats_df['passing_yards'] + pp_passingtd*stats_df['passing_tds'] + ppr*stats_df['receptions'] + pp_receivingyd*stats_df['receiving_yards'] 
                + pp_td*stats_df['receiving_tds'] + pp_rushingyd*stats_df['rushing_yards'] + pp_td*stats_df['rushing_tds'])


    
    def _predPlayer(self, name):
        playerdata = self._getPlayerFeatures(name)[-5:-1] #Not including last point because the vegas odds are not input for those, will need to feed those in
        playerdata = (playerdata - self._stats.mean())/self._stats.std()
        if len(playerdata) < 4:
            return 0
        playerdata_tfds = tf.keras.utils.timeseries_dataset_from_array(
          data=playerdata,
          targets=None,
          sequence_length=4,
          sequence_stride=1,
          shuffle=True,
          batch_size=4,)
        
        player_pred = self.lstm_model.predict(playerdata_tfds)
        player_pred_df = pd.DataFrame(player_pred.reshape((4,8)))
        player_pred_df.columns = self.stat_labels
        player_pred_df_scaled = (player_pred_df*self._stats[self.stat_labels].std() + self._stats[self.stat_labels].mean())
        
        
        return self._calcfpoints(player_pred_df_scaled)
    
    def predictAll(self):
        fpoints = {}
        for player in self.unique_players['player_name'].sample(frac=1): #Shuffle player order
            fpoints[player] = self._predPlayer(player)
        fpoints_df = pd.DataFrame(fpoints).transpose()
        fpoints_df.columns = ['Fantasy Points']
        return fpoints_df.sort_values(by=['Fantasy Points'], ascending = False)

In [3]:
largeyears = [2008 + i for i in range(14)]
print(largeyears)

[2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]


In [4]:
pbp_largeyears = nfl.import_pbp_data(largeyears)
players_largeyears = nfl.import_rosters(largeyears)

2008 done.
2009 done.
2010 done.
2011 done.
2012 done.
2013 done.
2014 done.
2015 done.
2016 done.
2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
Downcasting floats.


In [5]:
PBPnew = PlayByPlay(pbp_largeyears, players_largeyears)

In [8]:
%%time
PBPnew._trainModel(max_epochs = 100, verbose = 0, patience = 5)

CPU times: total: 42min 50s
Wall time: 2h 35min 58s


In [9]:
fpoints = PBPnew.predictAll()



In [13]:
fpoints[50:100]

Unnamed: 0,Fantasy Points
Chris Carson,11.201167
Amon-Ra St. Brown,11.114491
Kendrick Bourne,11.067294
Colt McCoy,11.050848
Rod Smith,10.818093
Jaret Patterson,10.797392
Philip Rivers,10.692574
Derek Anderson,10.688589
Sean Mannion,10.653093
Tyler Huntley,10.610986


In [655]:
fpoints_df = pd.DataFrame(fpoints ).transpose()

In [656]:
fpoints_df.columns = ['Fantasy Points']

In [None]:
fpoints_df.sort_values(by=['Fantasy Points'], ascending = False)[:50]

In [14]:
PBPnew.lstm_model.save('saved_model/multyear_model')



INFO:tensorflow:Assets written to: saved_model/multyear_model\assets


INFO:tensorflow:Assets written to: saved_model/multyear_model\assets


In [721]:
new_model = tf.keras.models.load_model('saved_model/multyear_model')

In [None]:
Sum_of_squared_distances = []
K = range(1,10)
for num_clusters in K :
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(data_frame)
    Sum_of_squared_distances.append(kmeans.inertia_)
plt.plot(K,Sum_of_squared_distances,’bx-’)
plt.xlabel(‘Values of K’) 
plt.ylabel(‘Sum of squared distances/Inertia’) 
plt.title(‘Elbow Method For Optimal k’)
plt.show()