# faNFL - Exploring the possibilities of predicting NFL player performance for Fantasy NFL

[Greg Sieranski](http://wonbyte.com) (1) and [Samuel John](http://samueljohn.de) (2)


1.  Walmart, USA
2.  HörSys GmbH, Hannover, Germany

An [IPython](http://ipython.org/ipython-doc/dev/interactive/htmlnotebook.html) notebook can be downloaded from **todo: insert URL here after open sourcing**.

#### How to cite
*todo* after publication (PyCon 2015 poster)

## Setup and imports

In [0]:
import datetime
import itertools
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt  # the plots work-horse
import numpy as np
import pandas as pd  # tabular data thingy
import pyprind
import seaborn as sns  # nicer statistical plots
import sklearn  # machine learning

Import our local version of nflgame (ported to Python3)

In [0]:
# `data` is only created in the "Filling pandas table" section below, so it cannot be
# previewed here: on a fresh kernel (Restart & Run All) this cell raised a NameError.

In [0]:
# Plotting per-player means of `data` needs the DataFrame built in the
# "Filling pandas table" section below; here it raised a NameError on a fresh kernel.


In [0]:
import sys; sys.path.insert(0, "./nflgame")
import nflgame  # load NFL statistics

In [0]:
# import qgrid  # seems to throw an err on pip install lately
#qgrid.nbinstall()

Tune IPython notebook towards inline retina graphics

In [0]:
%matplotlib inline
%config InlineBackend.figure_format='svg'
%config InlineBackend.figure_format='retina'
sns.set()

## Some helper functions

In [0]:
def players_in_game(game, side=None):
    """Collect the ids of all players that have statistics in an `nflgame.Game`.

    Returns a dict ``{'home': set, 'away': set}`` of `player_id`s, or only the set
    for `side` ('home' or 'away') when `side` is given.
    """
    stat_categories = ('receiving', 'defense', 'punting', 'kickret', 'kicking',
                       'rushing', 'puntret', 'passing', 'fumbles')
    ids_by_side = {}
    for team_side in ('home', 'away'):
        team_stats = game.data[team_side]['stats']
        # Union of the player ids over every category the team has stats for.
        ids_by_side[team_side] = {player_id
                                  for category in stat_categories
                                  if category in team_stats
                                  for player_id in team_stats[category]}
    return ids_by_side if side is None else ids_by_side[side]

Getting the players for a certain year that are in the stats (not all players are)

In [0]:
def active_players_in_year(year):
    """Return the set of `player_id`s that have stats in at least one game of `year`."""
    return {player_id
            for game in nflgame.games(year)
            for side in ('away', 'home')
            for player_id in players_in_game(game, side=side)}

In [0]:
def lookup_player_name(player_id):
    """Look up a player's full name (str) in `nflgame.players` by his player_id (str)."""
    player = nflgame.players[player_id]
    return player.full_name

In [0]:
def opposite_side(side):
    """Map 'home' to 'away' and 'away' to 'home'; any other value yields None."""
    if side == 'home':
        return 'away'
    return 'home' if side == 'away' else None

In [0]:
def collect_stats_for_keys(keys, games):
    """Build a per-player, per-game DataFrame from `nflgame.Game` objects.

    Parameters
    ----------
    keys : list of (category, key) tuples
        Statistics to extract, e.g. ``('passing', 'yds')``. Each pair becomes a
        column named ``"<category>_<key>"``.
    games : iterable of nflgame.Game
        Games to read ``data``, ``schedule``, ``eid`` and ``season()`` from.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(player_id, eid)`` with the columns 'site', 'date', 'week',
        'team', 'op_team', 'season' followed by one column per entry of `keys`.
        A statistic that is not recorded for a player in a game is left empty.
    """
    columns = ['site', 'date', 'week', 'team', 'op_team', 'season']
    columns += [cat + "_" + key for cat, key in keys]

    # Group the requested keys by category once, in the order given by `keys`, so
    # the order of the resulting rows does not depend on set iteration order.
    keys_by_cat = {}
    for cat, key in keys:
        keys_by_cat.setdefault(cat, []).append(key)

    rows = []
    index = []
    for game in games:
        # Values shared by every player of this game.
        game_date = datetime.datetime(game.schedule['year'],
                                      game.schedule['month'],
                                      game.schedule['day'],
                                      int(game.schedule['time'].split(':')[0]))
        week = game.schedule['week']
        season = game.season()

        playing = {}
        for site in ['away', 'home']:
            site_stats = game.data[site]['stats']
            site_meta = {'site': site,
                         'date': game_date,
                         'week': week,
                         'team': game.data[site]['abbr'],
                         'op_team': game.data[opposite_side(site)]['abbr'],
                         'season': season}
            for cat, cat_keys in keys_by_cat.items():
                if cat not in site_stats:
                    continue
                stat = site_stats[cat]
                for player_id in stat:
                    player_row = playing.setdefault(player_id, {})
                    player_stat = stat[player_id]
                    for key in cat_keys:
                        if key in player_stat:
                            player_row[cat + "_" + key] = player_stat[key]
                    player_row.update(site_meta)

        for player_id, player_row in playing.items():
            index.append((player_id, game.eid))
            # Statistics that were not recorded stay None.
            rows.append([player_row.get(col) for col in columns])

    index_names = ('player_id', 'eid')
    if index:
        multi_index = pd.MultiIndex.from_tuples(index, names=index_names)
    else:
        # `from_tuples` cannot infer the number of levels from an empty list.
        multi_index = pd.MultiIndex.from_arrays([[], []], names=index_names)
    return pd.DataFrame(rows, index=multi_index, columns=columns)

### Training and test data set

These are just lists of all nflgame games as `nflgame.Game` objects. We transform them into a useful pandas DataFrame further below.

In [0]:
# All games of the 2009-2013 seasons, flattened into a single list.
train_set = list(itertools.chain.from_iterable(nflgame.games(season) for season in range(2009, 2014)))

In [0]:
# All games of the 2014 season.
test_set = list(itertools.chain.from_iterable(nflgame.games(season) for season in (2014,)))

We need to define the `nflgame` keys (for the games) that are interesting to us in order to build up a DataFrame.

In [0]:
# (category, key) pairs of the offensive statistics, grouped by nflgame category.
offense_keys = [
    (category, stat)
    for category, stats in [
        ('receiving', ['tds', 'yds', 'rec', 'lng', 'twoptm']),
        ('passing', ['yds', 'tds', 'att', 'cmp', 'ints']),
        ('rushing', ['yds', 'tds', 'att', 'lng']),
        ('fumbles', ['yds', 'rcv', 'tot']),
        ('kickret', ['tds', 'avg', 'ret']),
        ('kicking', ['fga', 'fgyds', 'xpb', 'xpmade', 'fgm']),
        ('puntret', ['lng', 'ret', 'tds', 'avg']),
    ]
    for stat in stats
]

In [0]:
# (category, key) pairs of the defensive and punting statistics.
defense_keys = [
    (category, stat)
    for category, stats in [
        ('defense', ['ffum', 'ast', 'int', 'sk', 'tkl']),
        ('punting', ['yds', 'i20', 'avg', 'lng']),
    ]
    for stat in stats
]

In [0]:
# keep_keys = ['site', 'date', 'week', 'team', 'op_team', 'season', 'n_op']
# keep_keys += [cat+"_"+key for cat, key in offense_keys+defense_keys]
# keep_keys += ["ops_"+cat+"_"+key for cat, key in offense_keys+defense_keys]

# d = data_test.copy()
# for c in d.columns:
#     if c in keep_keys:
#         continue
#     else:
#         del d[c]
#         print(c)
# data_test = d

### Appending columns that sum up the opposing team performance

... by summing over the mean performance of the players of the opposing team. Hopefully this is a valuable input to the predictors.

In [0]:
def append_ops_columns(data, keys=None):
    """Append "ops" (opposing players' summary) columns to `data` in place.

    For every row (one player in one game) and every ``(category, key)`` in `keys`,
    the column ``"ops_<category>_<key>"`` holds the sum, over all players of the
    opposing side in that game, of each opponent's mean ``"<category>_<key>"``
    value in `data` (NaN means are skipped), divided by the number of opposing
    players. The extra column ``'n_op'`` holds that number of opposing players.

    Parameters
    ----------
    data : pandas.DataFrame
        Indexed by ``(player_id, eid)`` with a 'site' column ('home'/'away') and
        one ``"<category>_<key>"`` column per entry of `keys`. Modified in place.
    keys : list of (category, key) tuples
        Required; the ``None`` default only exists for signature compatibility.

    Raises
    ------
    ValueError
        If `keys` is None.
    """
    if keys is None:
        raise ValueError("append_ops_columns() needs `keys`, a list of (category, key) tuples")

    stat_columns = ["{0}_{1}".format(*key) for key in keys]
    new_column_names = ["ops_" + column for column in stat_columns] + ['n_op']
    mean_values = data[stat_columns].groupby(level='player_id').mean()
    rows_by_game = data.groupby(level='eid')

    # New values keyed by row label, so they can be written back in the row order
    # of `data` no matter in which order the games are visited.
    new_values = {}
    bar = pyprind.ProgBar(len(rows_by_game))
    for game_id, game_rows in rows_by_game:
        players = players_in_game(nflgame.game.Game(game_id))
        # All players of one side face the same opponents: compute once per side.
        ops_for_site = {}
        for label, site in zip(game_rows.index, game_rows['site']):
            if site not in ops_for_site:
                opposing_players = list(players[opposite_side(site)])
                n = len(opposing_players)
                if n:
                    opponent_means = mean_values.loc[opposing_players].values.astype(float)
                    ops = list(np.nansum(opponent_means, axis=0) / n)
                else:
                    # No opposing players on record: nothing to average.
                    ops = [np.nan] * len(stat_columns)
                ops_for_site[site] = ops + [n]
            new_values[label] = ops_for_site[site]
        bar.update()

    ordered_rows = [new_values[label] for label in data.index]
    for position, column_name in enumerate(new_column_names):
        data[column_name] = [row[position] for row in ordered_rows]

## Filling pandas table

Let's have a big DataFrame (that is, a table) with all games (we are interested in) and how all players performed in those games. When a player wasn't in a game, we denote that by `NaN` (that is, `None`).

Now these are the DataFrames for training ("data") and test

In [0]:
hdf_file = Path(".")/"data.h5"

In [0]:
if hdf_file.exists():
    print("Loading HDF5 file {} that was last updated on".format(hdf_file))
    print(str(datetime.datetime.fromtimestamp(hdf_file.stat().st_mtime)), flush=True)
    # The context manager closes the store even when a key is missing.
    with pd.HDFStore(str(hdf_file), mode='r') as store:
        data = store['data']
        data_test = store['data_test']
else:
    print("Collecting stats for offense and defense keys", flush=True)
    data = collect_stats_for_keys(keys=offense_keys+defense_keys,
                                  games=train_set)
    data_test = collect_stats_for_keys(keys=offense_keys+defense_keys,
                                       games=test_set)
    # Position of each player, looked up by the player_id level of the index.
    data.insert(0, 'pos', [nflgame.players[pid].position
                           for pid in data.index.get_level_values('player_id')])
    data_test.insert(0, 'pos', [nflgame.players[pid].position
                                for pid in data_test.index.get_level_values('player_id')])
    print("Computing normalized ops values for data")
    append_ops_columns(data, keys=offense_keys+defense_keys)
    print("Computing normalized ops values for data_test")
    append_ops_columns(data_test, keys=offense_keys+defense_keys)
    # Cache both frames so the expensive collection is skipped on the next run.
    with pd.HDFStore(str(hdf_file)) as store:
        store['data'] = data
        store['data_test'] = data_test

In [0]:
# Names of the statistic columns and of their "ops_" counterparts, in key order.
all_data_columns = ["{0}_{1}".format(cat, key) for cat, key in offense_keys + defense_keys]
all_data_ops_columns = ["ops_" + column for column in all_data_columns]

In [0]:
# Per game, sum every statistic over the players of one side; a missing statistic counts
# as 0. `sort=False` keeps the games in their order of first appearance in `data`.
team_home_stats_per_game = (data.loc[data['site'] == 'home', all_data_columns]
                            .fillna(0)
                            .groupby(level='eid', sort=False)
                            .sum())
team_away_stats_per_game = (data.loc[data['site'] == 'away', all_data_columns]
                            .fillna(0)
                            .groupby(level='eid', sort=False)
                            .sum())

### Getting a set of players we are interested in

#### Defining active players of 2014

...by basically saying that they need to participate in at least `n` games. Participate means that nfl has some stats for them in the database.

In [0]:
n = 10
# A player is active when at least `n` of their rows have a non-NaN 'site' value.
# 'site' is counted because it is probably always filled in; any other column
# without NaNs would do as well.
active_players_in_data_test = {
    player_id
    for player_id, player_rows in data_test.groupby(level='player_id')
    if player_rows['site'].count() >= n}

So lets see how many players we removed by looking at the 0-th element for all the indices in `data_test.index.values`.

In [0]:
len({ v[0] for v in data_test.index.values})

... compared to the number of interesting players in 2014:

In [0]:
len(active_players_in_data_test)

#### Defining active players of the train `data`

In [0]:
n = 10
# A player is active when at least `n` of their rows have a non-NaN 'site' value.
# 'site' is counted because it is probably always filled in; any other column
# without NaNs would do as well.
active_players_in_data = {
    player_id
    for player_id, player_rows in data.groupby(level='player_id')
    if player_rows['site'].count() >= n}

In [0]:
len(active_players_in_data)

How many players are in the train data?

In [0]:
len(data.groupby(level='player_id').groups)

## Does the performance (measures) correlate with the performance of the opponent players *in that one game*?

Now we want to see if our idea about the **opponent player summed up average stats (ops)** has any predictive power on the performance of a player. Therefore we plot a performance measure like `receiving_yds` vs. different ops values

In [0]:
# One row per game: the away totals come first (suffix "_away"), followed by the home
# totals (suffix "_home"), so the column labels say which side they belong to.
team_stats_per_game = team_away_stats_per_game.join(team_home_stats_per_game[all_data_columns],
                                                    lsuffix="_away", rsuffix="_home")

In [0]:
#sns.set(context="paper", font="monospace")

corrmat = team_stats_per_game.corr().iloc[:len(team_away_stats_per_game.columns),
                                           len(team_away_stats_per_game.columns):]
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corrmat,
            vmax=.4, linewidths=0,
            ax=ax)

f.tight_layout()

In [0]:
sns.set(style="darkgrid")
color = sns.color_palette()[1]
g = sns.jointplot(team_home_stats_per_game["rushing_att"],
                  team_away_stats_per_game["rushing_att"], kind="reg",
                  color=color, size=9, joint_kws=dict())


# $f(k,player_{home},game) = \frac{\sum_{p=1}^{N(players_{away,game})}\frac{\sum_{g=1}^{N(games_p)}C_k(g,p)}{N(games_p)}}{N(players_{away,game})}$

In [0]:
# Interactive help lookup; use `help(sns.jointplot)` when the documentation is needed.

In [0]:
sns.set(style="darkgrid")
color = sns.color_palette()[1]
# NOTE: `normalized_data` is created in the "Normalizing data" section further below.
d = normalized_data
d = d[(d['pos']=='QB')]#.loc[nflgame.find("Shaun Hill")[0].player_id]
# One regression joint plot of the QBs' passing yards against each scaled ops column.
for ops_column in ("ops_receiving_yds", "ops_defense_tkl", "ops_kicking_fga", "ops_defense_ffum"):
    g = sns.jointplot(d["passing_yds"],
                      d[ops_column]*20, kind="reg",
                      color=color, size=9)




"ops_receiving_yds", "ops_defense_tkl", "ops_kicking_fga", "ops_defense_ffum"

In [0]:
# Interactive help lookup; use `help(sns.jointplot)` when the documentation is needed.

In [0]:
# Same joint plot for the RBs of the training data (palette colour 1) and of the test
# data (palette colour 2).
# NOTE: both normalized frames are created in the "Normalizing data" section further below.
for frame, palette_index in ((normalized_data, 1), (normalized_data_test, 2)):
    d = frame[frame["pos"] == "RB"]#.loc[nflgame.find("Peyton Manning")[0].player_id]
    color = sns.color_palette()[palette_index]
    g = sns.jointplot(d["kickret_ret"],
                      d["ops_kicking_xpmade"]*100, kind="reg",
                      color=color, size=9)




# Running scikit-learn

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
linreg_models = {}
linreg_generic_model = {}

In [0]:
measures_to_predict = ["{0}_{1}".format(cat, key) for cat, key in offense_keys]
players_to_predict = active_players_in_data_test & active_players_in_data

In [0]:
ops_values_per_team = data.groupby('op_team').mean().loc[:,all_data_ops_columns]

In [0]:
scale_ops_values = StandardScaler().fit(ops_values_per_team)

In [0]:
def makelike(arr, df):
    """Wrap `arr` in a DataFrame that reuses the index and the columns of `df`."""
    row_labels, column_labels = df.index, df.columns
    return pd.DataFrame(data=arr, index=row_labels, columns=column_labels)

In [0]:
fig, ax = plt.subplots(figsize=(12, 12))
sns.set_palette("husl")
sns.heatmap(makelike(scale_ops_values.transform(ops_values_per_team),
                     ops_values_per_team),
            linewidths=.4)
fig.tight_layout()

In [0]:
sns.set_palette("pastel")
makelike(scale_ops_values.transform(ops_values_per_team),ops_values_per_team)[['ops_passing_yds','ops_passing_tds']].plot(kind='bar')

## Cluster the teams based on their ops values 
to see which teams are similar to each other

In [0]:
from sklearn.decomposition import PCA,  KernelPCA
from sklearn.manifold import MDS, TSNE

In [0]:
pca =  KernelPCA(n_components=2, kernel="linear", gamma=10, fit_inverse_transform=True)
#pca = TSNE(n_components=2)
tmp_data = makelike(scale_ops_values.transform(ops_values_per_team),
                    ops_values_per_team)
teams_in_2d = pca.fit_transform(tmp_data)
teams_in_2d = pd.DataFrame(teams_in_2d,index=ops_values_per_team.index)

In [0]:
fig, ax = plt.subplots(figsize=(12,6))
sns.set_palette("pastel")
teams_in_2d.plot(0,1,kind='scatter', s=100,ax=ax)
for team in teams_in_2d.index.values:
    ax.annotate(team, teams_in_2d.loc[team])

## Training phase

### Normalizing data

In [0]:
player_means = data.groupby(level='player_id').mean()[all_data_columns]

In [0]:
normalized_data = data.copy()

In [0]:
all_player_in_data = list(set(v[0] for v in data.index.values))
# Centre every statistic on the player's own mean. The result is always derived from
# the untouched `data`, so re-running this cell gives the same values again.
normalized_data[all_data_columns] = (
    data[all_data_columns]
    - data[all_data_columns].groupby(level='player_id').transform('mean'))

In [0]:
normalized_data_test = data_test.copy()

In [0]:
player_means_test = data_test.groupby(level='player_id').mean()[all_data_columns]
all_player_in_data_test = list(set(v[0] for v in data_test.index.values))
# Centre every statistic on the player's own mean within the test data. The result is
# always derived from the untouched `data_test`, so re-running this cell is safe.
normalized_data_test[all_data_columns] = (
    data_test[all_data_columns]
    - data_test[all_data_columns].groupby(level='player_id').transform('mean'))

Computing coefficients

In [0]:
# Pairwise correlation of all columns of the player-centred training data.
# NOTE(review): the next cell assigns `ops_corrmat` again (QB rows of `data`), so this
# result is overwritten before it is used.
ops_corrmat = normalized_data.corr()

In [0]:
ops_corrmat = data.loc[data['pos']=='QB'].corr()

In [0]:
fig, ax = plt.subplots(figsize=(9, 9))
sns.set_palette("husl")
sns.heatmap(ops_corrmat.loc[all_data_columns,
                            all_data_ops_columns],
            linewidths=.4, vmin=-.2, vmax=0.2)
fig.tight_layout()

In [0]:
ops_test_corrmat = data_test.loc[data_test['pos']=='QB'].corr()
fig, ax = plt.subplots(figsize=(9, 9))
sns.set_palette("husl")
sns.heatmap(ops_test_corrmat.loc[all_data_columns,
                                 all_data_ops_columns],
            linewidths=.4, vmin=-.3, vmax=0.3)
fig.tight_layout()

In [0]:
fig, ax = plt.subplots(figsize=(9, 9))
corr_agreement = (ops_test_corrmat*ops_corrmat)#.mul(data[all_data_columns].count()/len(data), axis=0)
corr_agreement = corr_agreement.loc[all_data_columns,
                                    all_data_ops_columns]
sns.set_palette("husl")
sns.heatmap(corr_agreement,
            linewidths=.4, vmin=-0.1, vmax=.1)
fig.tight_layout()

In [0]:
fig, ax = plt.subplots(figsize=(9, 9))
corr_agreement = ops_test_corrmat*ops_corrmat
corr_agreement = corr_agreement.loc[all_data_columns,
                                    all_data_ops_columns]
sns.set_palette("husl")
sns.heatmap(corr_agreement,
            linewidths=.4, vmax=0.01)
fig.tight_layout()


### Using PCA projections as summary for the opposing team for training a LinearRegression on each player individually

In [0]:

# `top_n` only limits the columns printed for diagnostics in this cell; it is set here
# so the cell also runs on a fresh kernel.
top_n = 6
bar = pyprind.ProgBar(len(players_to_predict)*len(measures_to_predict))
player_models = {}
for player in players_to_predict:
    player_models[player] = {}
    player_data = data.loc[player]
    # 2-D projection of the opposing team for each of the player's games.
    opposing_teams_2d = teams_in_2d.loc[player_data['op_team']]
    for measure in measures_to_predict:
        bar.update()
        player_models[player][measure] = LinearRegression()
        colums_that_matter = list(
            corr_agreement.loc[measure].sort_values(ascending=False)[:top_n].index.values)
        target = player_data[measure]
        if target.count() < 40:
            # Too few recorded games: leave the model unfitted.
            continue
        try:
            player_models[player][measure].fit(opposing_teams_2d,
                                               target.fillna(target.mean()).fillna(0))
        except ValueError:
            print(data.loc[player,colums_that_matter].dropna())
            print(measure)
            raise
        

### Using plain top_n columns_that_matter of the opponent team's summed-up stats (OPS) to predict each player's performance

In [0]:
top_n = 6
bar = pyprind.ProgBar(len(players_to_predict)*len(measures_to_predict))
player_models = {}
for player in players_to_predict:
    player_models[player] = {}
    player_data = data.loc[player]
    for measure in measures_to_predict:
        bar.update()
        player_models[player][measure] = LinearRegression()
        # The `top_n` ops columns whose correlation with the measure agrees best
        # between training and test data.
        colums_that_matter = list(
            corr_agreement.loc[measure].sort_values(ascending=False)[:top_n].index.values)
        target = player_data[measure]
        if target.count() < 40:
            # Too few recorded games: leave the model unfitted.
            continue
        try:
            player_models[player][measure].fit(data.loc[player,colums_that_matter].dropna(axis=1),
                                               target.fillna(target.mean()).fillna(0))
        except ValueError:
            print(data.loc[player,colums_that_matter].dropna())
            print(measure)
            raise
        
        

### A bag for each player type (position)

In [0]:
top_n = 5
linreg_models = {}
bags = ["QB", "WR"]
bar = pyprind.ProgBar(len(measures_to_predict)*len(bags))
for bag in bags:
    all_in_bag = [ pid for pid in players_to_predict if nflgame.players[pid].position == bag]
    # All rows of the players with this position; the same for every measure.
    bag_data = data.loc[data.index.isin(all_in_bag, level='player_id')]
    linreg_models[bag] = {}
    for measure in measures_to_predict:
        bar.update()
        linreg_models[bag][measure] = LinearRegression()
        colums_that_matter = list(
            corr_agreement.loc[measure].sort_values(ascending=False)[:top_n].index.values)
        linreg_models[bag][measure].fit(bag_data[colums_that_matter].dropna(axis=1),
                                        bag_data[measure].fillna(0))
    

### Try to cluster players of a certain type (position) into subgroups in each bag

We start with one of the most interesting types, the QB:

In [0]:
QB_data = data[data['pos'] == 'QB'][all_data_columns]

In [0]:
# Keep only the statistics that are recorded in at least half of the QB rows and treat
# the remaining gaps as 0. (`how` and `thresh` must not be combined in `dropna`.)
QB_data = QB_data.dropna(thresh=int(np.ceil(len(QB_data)/2)), axis=1).fillna(0)

In [0]:
np.atleast_2d(QB_data["passing_yds"].groupby(level='player_id').mean())

In [0]:
from sklearn.cluster import MeanShift
from sklearn.mixture import GMM
# Mean passing yards / attempts per QB; the clustering only looks at the passing yards.
qb_means = QB_data[["passing_yds","passing_att"]].groupby(level='player_id').mean()
cluster = MeanShift(bandwidth=6)
cluster.fit(qb_means[["passing_yds"]])
clusters = cluster.predict(qb_means[["passing_yds"]])
fig, ax = plt.subplots(figsize=(16,8))
qb_means.plot(0,1,ax=ax, kind='scatter', s=100,c=clusters, cmap='Set2')
# Overlay the first 50 individual games, coloured by player.
tmp = QB_data.iloc[:50]
player_ids = [v[0] for v in tmp.index.values]
tmp[["passing_yds","passing_att"]].plot(0,1,kind='scatter', ax=ax,
                                            s=10,
                                            c=pd.Categorical(
                                                player_ids,
                                                # sorted: stable colour per player across runs
                                                categories=sorted(set(player_ids))).codes,
                                            cmap='Set2')
# Label every game point with its player id; `ax.text` needs x, y and the text.
for player_id, (yds, att) in zip(player_ids, tmp[["passing_yds","passing_att"]].values):
    ax.text(yds, att, player_id)

In [0]:
pd.Categorical([v[0] for v in df.index.values],
               categories=set([v[0] for v in tmp.index.values]))

In [0]:
from sklearn.manifold import TSNE
#kpca = KernelPCA(n_components=2, kernel="rbf", gamma=1/20, fit_inverse_transform=True)
# t-SNE is stochastic; the fixed seed makes the embedding reproducible.
kpca = TSNE(n_components=2, random_state=0)
clean_up_normalized_data = normalized_data.loc[normalized_data['pos'] == 'QB',
                                               ["pos"]+all_data_columns]
# Keep the columns that are filled in for at least half of the QB rows; gaps become 0.
clean_up_normalized_data = clean_up_normalized_data.dropna(
    thresh=int(np.ceil(len(clean_up_normalized_data)/2)), axis=1).fillna(0)
norm_data_2d = kpca.fit_transform(clean_up_normalized_data.drop(["pos"], axis=1))
fig, ax = plt.subplots(figsize=(16,8))
df = pd.DataFrame(norm_data_2d, index=clean_up_normalized_data.index)
df.plot(0,1,
        kind='scatter',
        s=100,ax=ax,
        c=pd.Categorical([v[0] for v in df.index.values],
               # sorted: stable colour per player across runs
               categories=sorted(set(v[0] for v in df.index.values))).codes, cmap="Set2")


In [0]:
from mpl_toolkits.mplot3d.axes3d import Axes3D
fig, ax = plt.subplots(subplot_kw=dict(projection='3d'), figsize=(12,12))
tmp = []
colors = sns.color_palette("Set2",24)
groupmean = data["receiving_yds"].mean()
for i, pid in enumerate(p for p in players_to_predict if nflgame.players[p].position == 'QB'):
    xs = normalized_data.loc[[pid],'receiving_yds']
    ax.scatter(xs,
               data.loc[[pid],"ops_defense_ffum"],
               data.loc[[pid],"ops_defense_tkl"],
#                teams_in_2d.loc[data.loc[[pid],'op_team']][0],
#                teams_in_2d.loc[data.loc[[pid],'op_team']][1],
               c="black", cmap="Set2", marker=".",
               label=lookup_player_name(pid))
    xs = data.loc[[pid],'receiving_yds']
    ax.scatter(xs-groupmean,
               data.loc[[pid],"ops_defense_ffum"],
               data.loc[[pid],"ops_defense_tkl"],
#                teams_in_2d.loc[data.loc[[pid],'op_team']][0],
#                teams_in_2d.loc[data.loc[[pid],'op_team']][1],
                marker="x", cmap="Set2",
               label=lookup_player_name(pid))


    
ax.set_xlabel("receiving_yds")
ax.set_ylabel("ops_defense_ffum")
ax.set_zlabel("ops_defense_tkl")
ax.legend()
ax.view_init(90,0)

In [0]:
from sklearn.cluster import MeanShift, KMeans
from sklearn.mixture import GMM
from sklearn.linear_model import Ridge
from copy import deepcopy
top_n = 2
n_cluster = 1
linreg_models = {}
player_models = {}
columns_for_bag = {}
ops_columns_that_matter ={}
bags = ["QB"]
kmeans = {}
columns_that_matter = None
bar = pyprind.ProgBar(len(measures_to_predict)*len(bags)*n_cluster)
for bag in bags:
    linreg_models[bag] = {}
    # get all the data for the players of the type (position) as in `bag`:
    is_in_bag = normalized_data['pos'] == bag
    bag_data_ops = normalized_data.loc[is_in_bag, all_data_ops_columns]
    bag_data = normalized_data.loc[is_in_bag, ['op_team']+all_data_columns]
    # Drop statistics that are recorded in fewer than 1/40 of the rows; gaps become 0.
    bag_data = bag_data.dropna(thresh=int(np.ceil(len(bag_data)/40)), axis=1).fillna(0)

    # Split into clusters
    kmeans[bag] = KMeans(n_clusters=n_cluster, random_state=0)
    if bag == "QB":
        cluster_based_on = bag_data[['passing_yds']]
        columns_that_matter = ["ops_receiving_yds", "ops_defense_tkl"]
    elif bag == "RB":
        cluster_based_on = bag_data[['kickret_ret']]
        columns_that_matter = ["ops_kicking_fgm", "ops_kicking_xpmade"]
    else:
        raise Exception("palyer type not yet supported")
    ops_columns_that_matter[bag] = columns_that_matter
    columns_for_bag[bag] = cluster_based_on.columns
    kmeans[bag].fit(cluster_based_on)
    clusters = kmeans[bag].predict(cluster_based_on)
    for cluster_nr in set(clusters):
        in_cluster = clusters == cluster_nr
        linreg_models[bag][cluster_nr] = {}
        cluster_bag_data = bag_data_ops.loc[in_cluster]
        cluster_targets = bag_data.loc[in_cluster]
        cluster_player_ids = set(cluster_targets.index.get_level_values('player_id'))
        for measure in measures_to_predict:
            bar.update()
            if measure not in cluster_targets.columns:
                # The measure was dropped above as too sparse for this position:
                # there is nothing to fit, so no model is registered for it.
                continue
            model = Ridge(fit_intercept=True, alpha=.1)
            # Alternative inputs that were tried: the `top_n` best columns of
            # `corr_agreement`, or the 2-D team representation
            # `teams_in_2d.loc[cluster_targets['op_team']]`.
            model.fit(cluster_bag_data[columns_that_matter], cluster_targets[measure])
            linreg_models[bag][cluster_nr][measure] = model

            # Get all players in this cluster and assign the model to them:
            for pid in cluster_player_ids:
                player_models.setdefault(pid, {})[measure] = model
                #del linreg_models[bag][cluster_nr][measure]


### Applying the prediction on test data set

In [0]:
# Exclude outlier 
pid = nflgame.find("Brian Hoyer")[0].player_id
players_to_predict = {p for p in players_to_predict if p != pid}

In [0]:
predicted = {}
true = {}
d = normalized_data
weighting_of_bag_model = .4
measure = "passing_yds"
#measure = "kickret_ret"

for pid in pyprind.prog_bar([ pid for pid in players_to_predict if nflgame.players[pid].position == 'QB']):
    bag = nflgame.players[pid].position
    player_games = d.loc[pid]
    try:
        # One call for all games of the player: `predict` expects a 2-D input with
        # one row per game and one column per feature.
        raw_prediction = player_models[pid][measure].predict(
            player_games[ops_columns_that_matter[bag]])
    except sklearn.utils.validation.NotFittedError:
        continue
    # Damped model output added on top of the player's own mean.
    _predicted = (np.asarray(raw_prediction).reshape(-1)*weighting_of_bag_model
                  + player_means.loc[pid][measure])
    _true = data.loc[pid][measure].reindex(player_games.index)
    for game in _true.index[_true.isnull()]:
        print("Warning: NAN for "+lookup_player_name(pid)+" in game "+game+ " for measure "+ measure)

    true[pid] = _true.values
    predicted[pid] = _predicted
    

In [0]:
# Per player, sum the absolute errors of the linear predictor and of a baseline that
# always predicts the player's mean of `measure` in `data`.
predictor_off = []
mean_off = []
#predictions = []
for pid in predicted.keys():
    df = pd.DataFrame(index=d.loc[pid].index.values) 
    df['true'] = true[pid]
    df['lin_predictor'] = predicted[pid]
    df['mean_predictor'] = data.loc[pid][measure].mean()
    # Turn the predictions into signed errors (prediction - true value).
    df['lin_predictor'] -= true[pid]
    df['mean_predictor'] -= true[pid]
    # The 'true' column is reduced to its own error; it is not used afterwards.
    df['true'] -= true[pid]
    mean_off.append(df['mean_predictor'].abs().sum())
    predictor_off.append(df['lin_predictor'].abs().sum())
# One row per player (full name), one column per predictor.
result = pd.DataFrame({'lin_pred':predictor_off,'mean':mean_off},
                      index=[lookup_player_name(pid) for pid in predicted.keys()])

In [0]:
sns.set_palette('pastel')
print(result.sum())
result.plot(kind='bar', figsize=(13,6))

In [0]:
measure="passing_yds"
pid = nflgame.find("Aaron Rodgers")[0].player_id
p = []
m = []
t = []
d = normalized_data_test ## XXXXXX FIX ME
d2 = data_test
for game in d.loc[pid].index.values:
#     print("predicted based on bag model:")
#     print(p[-1])
#     print("predicted based on individual player mean:")
    m.append(player_means.loc[pid][measure])
    p.append(m[-1]+10*weighting_of_bag_model*player_models[pid][measure].predict(d.loc[(pid,game)][ops_columns_that_matter['QB']])[0])


    #     print(m[-1])
#     print("True:")
    t.append(d2.loc[(pid,game)][measure])
#     print(t[-1])
#     print("-----")
pd.DataFrame(dict(lin_pred=p, mean=m, true=t)).plot(marker="o", lw=.5)

#### version info

In [0]:
%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark --updated -c %Y-%m-%d -v -m -p numpy,scipy,matplotlib,seaborn,pandas -g