# Statistics and Regression for NFL player performance

You should start the IPython notebook in the dir where this faNFL.ipynb file is located:

    ipython3 notebook .
    
For the server with certificate and with anaconda:
 
    ~/anaconda/bin/ipython notebook --profile=faNFL

### Imports

In [0]:
import sys; sys.path.insert(0, "./nflgame")
from collections import defaultdict, OrderedDict
import nflgame  # load NFL statistics
import seaborn as sns  # nicer statistical plots
import matplotlib.pyplot as plt  # the plots work-horse
import sklearn  # machine learning
import pandas as pd  # tabular data thingy
from pandas import Series
import numpy as np
import itertools
import qgrid
import pyprind

In [0]:
qgrid.nbinstall()

In [0]:
%matplotlib inline
%config InlineBackend.figure_format='svg'
%config InlineBackend.figure_format='retina'

In [0]:
pd.__version__

## Defense team score

Three questions we want to address here at first ...

- You have to find all of the statistical plays that contribute to the team's score.
- You have to find out how many points the other team scored on that defense.
- You have to find out how many yards were given up by that defense.

In [0]:
games = nflgame.games(2014, week=1, kind='REG')

In [0]:
# Fantasy points per occurrence of each stat, as (stat attribute, points) pairs.
# Return touchdowns are looked up on every player.
RETURN_TD_POINTS = (('kickret_tds', 6),
                    ('puntret_tds', 6))
# Stats credited to the team's defense score, looked up on defensive players.
DEFENSE_POINTS = (('defense_int_tds', 6),
                  ('defense_frec_tds', 6),
                  ('defense_misc_tds', 6),
                  ('defense_int', 2),
                  ('defense_frec', 2),
                  ('kicking_fgb', 2),
                  ('kicking_xpb', 2),
                  ('punting_block', 2),
                  ('defense_safe', 2),
                  ('defense_sk', 1))
# The subset of defensive stats that put points on the scoreboard.
DEFENSE_SCORING_POINTS = (('defense_int_tds', 6),
                          ('defense_frec_tds', 6),
                          ('defense_misc_tds', 6),
                          ('defense_safe', 2))

team_defense_score = defaultdict(int)
# Points a team scored without its offense; they are later subtracted from
# what the opposing defense is charged with.
not_against_team_defense = defaultdict(int)

for p in nflgame.combine_plays(games).players():
    for stat_name, points in RETURN_TD_POINTS:
        earned = getattr(p, stat_name) * points
        team_defense_score[p.team] += earned
        not_against_team_defense[p.team] += earned

for p in nflgame.combine_plays(games).players().defense():
    for stat_name, points in DEFENSE_POINTS:
        team_defense_score[p.team] += getattr(p, stat_name) * points
    for stat_name, points in DEFENSE_SCORING_POINTS:
        not_against_team_defense[p.team] += getattr(p, stat_name) * points

In [0]:
team_defense_score

In [0]:
# Bonus ladders as (inclusive/exclusive upper bound, bonus) pairs, checked in order.
POINTS_AGAINST_BONUS = ((6, 4), (13, 3), (17, 1), (27, 0), (34, -1), (45, -3))
YARDS_AGAINST_BONUS = ((100, 5), (200, 3), (300, 2), (350, 0),
                       (400, -1), (450, -3), (500, -5), (550, -6))


def points_against_bonus(pts):
    """Return the defense bonus for allowing `pts` points (a shutout earns 5, more than 45 costs 5)."""
    if pts == 0:
        return 5
    for upper, bonus in POINTS_AGAINST_BONUS:
        if pts <= upper:
            return bonus
    return -5


def yards_against_bonus(yds):
    """Return the defense bonus for allowing `yds` yards (under 100 earns 5, 550 or more costs 7)."""
    for upper, bonus in YARDS_AGAINST_BONUS:
        if yds < upper:
            return bonus
    return -7


# 2) Points allowed by each defense: the opponent's final score minus the
#    points the opponent scored on returns / defensive plays.
# 3) Yards allowed by each defense: the opponent's total yards.
#    The previous version summed the rushing/passing yards of a team's *own*
#    defensive players, which is not what that defense gave up.
#    NOTE(review): assumes nflgame's `stats_home` / `stats_away` team totals
#    expose `total_yds` -- confirm against the nflgame version in ./nflgame.
team_points_against = defaultdict(int)
team_yards_against = defaultdict(int)
for g in games:
    # `games` holds a single week here, so every team appears at most once.
    team_points_against[g.home] = g.score_away-not_against_team_defense[g.away]
    team_points_against[g.away] = g.score_home-not_against_team_defense[g.home]
    team_yards_against[g.home] = g.stats_away.total_yds
    team_yards_against[g.away] = g.stats_home.total_yds

# NOTE: these loops add to `team_defense_score` in place, so re-running this
# cell without re-running the cell that builds `team_defense_score` applies
# the bonuses twice.
for team in team_points_against:
    team_defense_score[team] += points_against_bonus(team_points_against[team])
for team in team_yards_against:
    team_defense_score[team] += yards_against_bonus(team_yards_against[team])

print(team_defense_score['NE'])
print(team_defense_score['CHI'], team_points_against['CHI'])
print(team_defense_score['MIN'], team_points_against['MIN'])
print(team_defense_score['OAK'])

## Computing a fantasy league score

for a player in a game.

In [0]:
def yahoo_expected_points(yahoo_scoring_system=None,
                    player_passing_yds=None,
                    player_passing_ints=None,
                    player_passing_twoptm=None,
                    player_passing_tds=None,
                    player_rushing_yds=None,
                    points_rushing_tds=None,
                    player_rushing_tds=None):
    """Compute a fantasy score from a player's passing and rushing stats.

    Parameters
    ----------
    yahoo_scoring_system : dict, optional
        Overrides for the scoring rules. The ``*_yds`` entries are "yards per
        point" (divisors); all other entries are points per occurrence.
        Missing entries fall back to the defaults listed in the function body.
    player_passing_yds, player_passing_ints, player_passing_twoptm,
    player_passing_tds, player_rushing_yds, player_rushing_tds : number, optional
        The player's stats. ``None`` counts as 0.
    points_rushing_tds : number, optional
        Legacy, misnamed slot for the player's rushing touchdowns. It keeps its
        position so positional callers still work, and is only used when
        ``player_rushing_tds`` is not given.

    Returns
    -------
    number
        The summed fantasy points.
    """
    scoring = dict(points_passing_yds=25,
                   points_passing_ints=-1,
                   points_passing_twoptm=2,
                   points_passing_tds=4,
                   points_rushing_yds=10,
                   points_rushing_tds=6)
    if yahoo_scoring_system:
        scoring.update(yahoo_scoring_system)

    # The original body read `player_rushing_tds`, a name that did not exist,
    # so every call raised NameError.
    if player_rushing_tds is None:
        player_rushing_tds = points_rushing_tds

    def amount(value):
        # A stat that was not supplied contributes nothing.
        return 0 if value is None else value

    score = 0
    score += amount(player_passing_yds) / scoring["points_passing_yds"]
    score += amount(player_passing_ints) * scoring["points_passing_ints"]
    score += amount(player_passing_twoptm) * scoring["points_passing_twoptm"]
    score += amount(player_passing_tds) * scoring["points_passing_tds"]
    score += amount(player_rushing_yds) / scoring["points_rushing_yds"]
    score += amount(player_rushing_tds) * scoring["points_rushing_tds"]

    return score

In [0]:
def _player_category_stats(game, player_id, category):
    """Return the player's stat dict for `category` from either side of `game`, or None if absent."""
    for side in ('away', 'home'):
        category_stats = game.data[side]['stats'].get(category, {})
        if player_id in category_stats:
            return category_stats[player_id]
    return None


def fantasy_score(game, player_id,
                  points_passing_yards=25,
                  points_passing_ints=-1,
                  points_passing_twoptm=2,
                  points_passing_tds=4,
                  points_rushing_yds=10,
                  points_rushing_tds=6):
    """Compute a player's fantasy score for one game from passing and rushing stats.

    Parameters
    ----------
    game : object
        Must provide ``data[side]['stats'][category][player_id]`` stat dicts
        for the sides 'away' and 'home', plus ``nice_score()`` for error text.
    player_id : str
        Id of the player to score.
    points_passing_yards, points_rushing_yds : number
        Yards needed for one point (used as divisors).
    points_passing_ints, points_passing_twoptm, points_passing_tds,
    points_rushing_tds : number
        Points per occurrence.

    Returns
    -------
    number
        The summed fantasy points.

    Raises
    ------
    LookupError
        If the player has neither passing nor rushing stats in this game.
        Earlier versions raised whenever *either* category was missing, which
        made it impossible to score e.g. a running back who never passed; a
        missing category now simply contributes nothing.
    """
    passing = _player_category_stats(game, player_id, 'passing')
    rushing = _player_category_stats(game, player_id, 'rushing')
    if passing is None and rushing is None:
        raise LookupError(
            "Given player_id {0} not found in 'passing' or 'rushing' in that game {1}".format(
                player_id,
                game.nice_score()))

    score = 0
    if passing is not None:
        score += passing['yds'] / points_passing_yards
        score += passing['ints'] * points_passing_ints
        score += passing['twoptm'] * points_passing_twoptm
        score += passing['tds'] * points_passing_tds
    if rushing is not None:
        score += rushing['yds'] / points_rushing_yds
        score += rushing['tds'] * points_rushing_tds
    return score

## Filling pandas table

Let's build one big DataFrame (that is, a table) with all the games we are interested in and how every player performed in those games. When a player has no value for a statistic in a game, we denote that by `NaN` (stored as `None` while collecting).

We need two helper functions to extract all the player ids of a certain game (certain year)

In [0]:
def players_in_game(game, side='away'):
    """Collect the distinct `player_id`s that have stats on the given `side` of a `nflgame.Game`."""
    side_stats = game.data[side]['stats']
    player_ids = set()
    for category in ('receiving', 'defense', 'punting', 'kickret', 'kicking',
                     'rushing', 'puntret', 'passing', 'fumbles'):
        # Not every game has stats in every category.
        if category in side_stats:
            player_ids.update(side_stats[category].keys())
    return player_ids

In [0]:
def active_players_in_year(year):
    """Return the set of player ids that have stats in any game of `year`, on either side."""
    seen = set()
    for game in nflgame.games(year):
        seen |= players_in_game(game, side='away')
        seen |= players_in_game(game, side='home')
    return seen

In [0]:
def lookup_player_name(player_id):
    """Look up a player by `player_id` (str) in `nflgame.players` and return the full name."""
    player = nflgame.players[player_id]
    return player.full_name

In [0]:
def opposite_side(side):
    """Return the other side of a game: 'away' for 'home' and 'home' for 'away'.

    Any other value yields None.
    """
    for this_side, other_side in (('home', 'away'), ('away', 'home')):
        if side == this_side:
            return other_side
    return None

In [0]:
cats=('receiving', 'defense', 'punting', 'kickret', 'kicking', 'rushing', 'puntret', 'passing', 'fumbles')

In [0]:
def collect_stats_for_keys(keys, games):
    """Build one row per (player, game) holding the requested stats.

    Parameters
    ----------
    keys : iterable of (category, stat) tuples
        E.g. ``('passing', 'yds')``; each becomes a column ``"passing_yds"``.
    games : iterable
        Game objects providing ``data[side]['stats']``, ``data[side]['abbr']``,
        ``schedule``, ``eid`` and ``season()``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ('player_id', 'eid'), with the columns 'site', 'date',
        'week', 'team', 'op_team', 'season' followed by one column per key.
        A stat the player has no value for is left empty (None / NaN).
    """
    # pandas no longer re-exports `datetime`, so take it from the stdlib.
    from datetime import datetime

    # Stat names grouped per category; each category is visited exactly once.
    stats_by_cat = OrderedDict()
    for cat, key in keys:
        stats_by_cat.setdefault(cat, []).append(key)

    columns = ['site', 'date', 'week', 'team', 'op_team', 'season']
    columns += [cat + "_" + key for cat, key in keys]

    rows = []
    index = []
    for game in games:
        kickoff = datetime(game.schedule['year'],
                           game.schedule['month'],
                           game.schedule['day'],
                           int(game.schedule['time'].split(':')[0]))
        playing = OrderedDict()
        for site, other_site in (('away', 'home'), ('home', 'away')):
            site_stats = game.data[site]['stats']
            site_info = {'site': site,
                         'date': kickoff,
                         'week': game.schedule['week'],
                         'team': game.data[site]['abbr'],
                         'op_team': game.data[other_site]['abbr'],
                         'season': game.season()}
            for cat, stat_names in stats_by_cat.items():
                if cat not in site_stats:
                    continue
                for player_id, player_stat in site_stats[cat].items():
                    # Reuse the player's record across categories. Creating a
                    # fresh one per category (as before) threw away the stats
                    # already collected from earlier categories.
                    record = playing.setdefault(player_id, {})
                    record.update(site_info)
                    for key in stat_names:
                        if key in player_stat:
                            record[cat + "_" + key] = player_stat[key]
        for player_id, record in playing.items():
            index.append((player_id, game.eid))
            rows.append([record.get(col) for col in columns])
    return pd.DataFrame(rows,
                        index=pd.MultiIndex.from_tuples(index, names=('player_id', 'eid')),
                        columns=columns)

In [0]:
train_set = list(itertools.chain(*(nflgame.games(year) for year in [2009, 2010, 2011, 2012, 2013])))

In [0]:
test_set = list(itertools.chain(*(nflgame.games(year) for year in [2014])))

In [0]:
g1 = train_set[0]

In [0]:
# Offensive / special-teams stats to collect, grouped by nflgame stat category.
_OFFENSE_STATS_BY_CATEGORY = (
    ('receiving', ('tds', 'yds', 'rec', 'lng')),
    ('passing', ('yds', 'tds', 'att', 'cmp', 'ints')),
    ('rushing', ('yds', 'tds', 'att', 'lng')),
    ('fumbles', ('yds', 'rcv', 'trcv', 'tot', 'lost')),
    ('kickret', ('tds', 'avg', 'ret')),
    ('kicking', ('fga', 'fgyds', 'xpb', 'xptot', 'xpa',
                 'xpmissed', 'xpmade', 'fgm', 'totpfg')),
    ('puntret', ('lng', 'ret', 'tds', 'avg')),
)
# Flattened into the (category, stat) pairs that `collect_stats_for_keys` expects.
offense_keys = [(category, stat)
                for category, stats in _OFFENSE_STATS_BY_CATEGORY
                for stat in stats]

In [0]:
# Defensive and punting stats to collect, grouped by nflgame stat category.
_DEFENSE_STATS_BY_CATEGORY = (
    ('defense', ('ffum', 'ast', 'int', 'sk', 'tkl')),
    ('punting', ('pts', 'yds', 'i20', 'avg', 'lng')),
)
# Flattened into the (category, stat) pairs that `collect_stats_for_keys` expects.
defense_keys = [(category, stat)
                for category, stats in _DEFENSE_STATS_BY_CATEGORY
                for stat in stats]

In [0]:
data = collect_stats_for_keys(keys=offense_keys+defense_keys,
                                 games=train_set)

In [0]:
offense = defense = data

In [0]:
data.info()

In [0]:
# Completion rate per player and game. Several later cells plot or select the
# 'passing_success_rate' column and raise KeyError while this line is disabled.
# `offense` is the same object as `data`, so the column is added there as well.
offense['passing_success_rate'] = offense['passing_cmp'] / offense['passing_att']

In [0]:
opposite_players = [ players_in_game(nflgame.game.Game(i[1]), side=opposite_side(offense.loc[i]['site'])) for i in offense.index.values]

In [0]:
# `position` was never defined in this notebook, so the cell raised NameError.
# NOTE(review): assumes the entries of `nflgame.players` expose a `.position`
# attribute next to `.full_name` -- confirm against the nflgame version in use.
[ nflgame.players[pid].position for pid in opposite_players[4] ]

In [0]:
offense_means = offense.groupby(level='player_id').mean()

In [0]:
offense_means.loc['00-0010346']['passing_yds']

In [0]:
offense_means.loc['00-0023459']['passing_yds']

In [0]:
lookup_player_name('00-0023459')

In [0]:
offense_means.sort('passing_tds', ascending=False)['passing_tds'].head(200).plot()

In [0]:
offense.sort('passing_yds', ascending=False)['passing_yds'].head(200).plot()

In [0]:
g = sns.PairGrid(offense, vars=['passing_yds', 'passing_tds', 'passing_success_rate'],
                 hue='op_team', palette='husl')
g.map(plt.scatter, alpha=0.5)
g.add_legend();

In [0]:
offense[offense['season']>=2012].groupby('op_team').mean()[['receiving_tds', 'receiving_yds','passing_yds','passing_tds', 'passing_att', 'passing_cmp', 'passing_ints', 'rushing_yds', 'rushing_tds', 'passing_success_rate']]

In [0]:
# Build the table explicitly instead of passing `_` (the previous cell's
# output), which only works when the cells are run in exactly this order.
op_team_means_since_2012 = offense[offense['season']>=2012].groupby('op_team').mean()[
    ['receiving_tds', 'receiving_yds', 'passing_yds', 'passing_tds', 'passing_att',
     'passing_cmp', 'passing_ints', 'rushing_yds', 'rushing_tds', 'passing_success_rate']]
qgrid.show_grid(op_team_means_since_2012)

In [0]:
offense[(offense['season']==2013) & (offense['op_team'] == 'CAR')]

In [0]:
pd.scatter_matrix(offense, alpha=0.2, figsize=(14, 14), diagonal='kde');

In [0]:
from pandas.tools.plotting import lag_plot

In [0]:
# NOTE(review): `offense2014` is only assigned in a later cell (the
# `collect_stats_for_keys(..., games=test_set)` call), so on a fresh kernel
# this cell raises NameError unless that cell has been run first.
offense2014_by_player = offense2014.groupby(level='player_id')

### Defining active players of 2014

...by basically saying that they need to participate in at least 10 games. Participate means that nfl has some stats for them in the database.

In [0]:
n = 10  # minimum number of 2014 games with recorded stats for a player to count as active
# A set comprehension gives us unique ids after grouping and counting the 'site' column.
# The 'site' column is probably always populated, so it is a safe bet, but we could use
# other columns that do not contain NaNs.
active_players_2014 = {ind[0]
    for ind in offense2014_by_player.filter(lambda x: x['site'].count() >= n).index.values}

So let's see how many players we removed by looking at the 0-th element of every index entry in `offense2014`.

In [0]:
len({ v[0] for v in offense2014.index.values})

... compared to the number of interesting players in 2014:

In [0]:
len(active_players_2014)

In [0]:
# Sort by date once; the sort does not depend on the player, so repeating it
# inside the loop only re-sorted the whole frame for every player.
offense2014_by_date = offense2014.sort('date')
for player in active_players_2014:
    try:
        lag_plot(offense2014_by_date.loc[player]['receiving_yds'], s=1)
    except KeyError:
        # Print a dot for every player that could not be plotted.
        print(".",end="")

In [0]:
autocorrelation_plot?

In [0]:
from pandas.tools.plotting import autocorrelation_plot
# Sort by date once; the sort does not depend on the player, so repeating it
# inside the loop only re-sorted the whole frame for every player.
offense2014_by_date = offense2014.sort('date')
for player in active_players_2014:
    try:
        autocorrelation_plot(offense2014_by_date.loc[player]['receiving_yds'],lw=0.5)
    except KeyError:
        # Print a dot for every player that could not be plotted.
        print(".",end="")

In [0]:
class SimpleOracle():
    """Predict a stat as the player's mean of that stat over all given games.

    Parameters
    ----------
    offense : pandas.DataFrame
        Per-game stats whose index has a level named 'player_id'.
    defense : pandas.DataFrame
        Stored on the instance; not consulted by the predictions.
    """

    def __init__(self, offense, defense):
        self.offense = offense
        self.defense = defense

    def predict(self, player_id, what='passing_yds'):
        """Return the mean of column `what` over the player's games.

        Returns NaN when the player or the column is unknown.
        """
        try:
            # Use the frame given to this instance (the previous code read the
            # notebook-level `offense` instead) and average only the requested
            # column of the requested player rather than the whole table.
            return self.offense[what].xs(player_id, level='player_id').mean()
        except KeyError:
            return np.nan

    def predict_passing_yds(self, player_id):
        """Return the player's mean passing yards, or NaN if unknown."""
        return self.predict(player_id, what='passing_yds')
    

In [0]:
class SimpleOpTeamBasedOracle():
    """Predict a stat as the player's mean of that stat against one opposing team.

    Parameters
    ----------
    offense : pandas.DataFrame
        Per-game stats with an 'op_team' column and an index level named
        'player_id'.
    defense : pandas.DataFrame
        Stored on the instance; not consulted by the predictions.
    """

    def __init__(self, offense, defense):
        self.offense = offense
        self.defense = defense

    def predict(self, player_id, what='passing_yds', op_team=None):
        """Return the mean of column `what` over the player's games against `op_team`.

        Returns NaN when the player never faced that team or the column is
        unknown.
        """
        try:
            # Use the frame given to this instance; the previous code read the
            # notebook-level `offense` instead.
            frame = self.offense
            against_team = frame.loc[frame['op_team'] == op_team, what]
            return against_team.xs(player_id, level='player_id').mean()
        except KeyError:
            return np.nan

    def predict_passing_yds(self, player_id, op_team):
        """Return the player's mean passing yards against `op_team`, or NaN."""
        return self.predict(player_id, what='passing_yds', op_team=op_team)

In [0]:
class OpTeamBasedOracle():
    """Predict a stat as the mean over all players' games against one opposing team.

    Parameters
    ----------
    offense : pandas.DataFrame
        Per-game stats with an 'op_team' column.
    defense : pandas.DataFrame
        Stored on the instance; not consulted by the predictions.
    """

    def __init__(self, offense, defense):
        self.offense = offense
        self.defense = defense

    def predict(self, what='passing_yds', op_team=None):
        """Return the mean of column `what` over every game played against `op_team`.

        Returns NaN when no game matches or the column is unknown.
        """
        try:
            # Use the frame given to this instance; the previous code read the
            # notebook-level `offense` instead.
            frame = self.offense
            return frame.loc[frame['op_team'] == op_team, what].mean()
        except KeyError:
            return np.nan

    def predict_passing_yds(self, op_team):
        """Return the mean passing yards recorded against `op_team`, or NaN."""
        return self.predict(what='passing_yds', op_team=op_team)

In [0]:
def append_ops_columns(data, player_ids=None, keys=None):
    """Add "ops_<category>_<stat>" columns (opposing players' summed means) to `data` in place.

    For every (player, game) row, each new column holds the sum, over all
    players on the opposing side of that game, of those players' mean value of
    the stat across all of `data`. Opponents without a mean for the stat
    contribute nothing.

    Parameters
    ----------
    data : pandas.DataFrame
        Indexed by ('player_id', game id) with a 'site' column and one
        "<category>_<stat>" column per entry of `keys`. Modified in place.
    player_ids : iterable of str, optional
        Players to compute the columns for; defaults to every player in
        `data`. Rows of other players are left as NaN.
    keys : iterable of (category, stat) tuples, optional
        Stats to aggregate. With no keys the function does nothing.

    Returns
    -------
    None
    """
    if not keys:
        # Nothing requested; the previous code crashed iterating over None.
        return
    stat_columns = ["{0}_{1}".format(*key) for key in keys]
    mean_values = data.groupby(level='player_id').mean()

    if player_ids is None:
        # If not provided, make a unique set of player ids from the data.
        player_ids = { i[0] for i in data.index.values }

    # (game_id, side) -> list of totals, one per stat column. Every player on
    # one side of a game faces the same opponents, so each game is loaded and
    # summed at most once per side instead of once per row.
    totals_by_game_side = {}

    def opponent_totals(game_id, side):
        cache_key = (game_id, side)
        if cache_key not in totals_by_game_side:
            opposing_players = players_in_game(nflgame.game.Game(game_id), side=side)
            # reindex() gives all-NaN rows for opponents that have no means,
            # and nansum() ignores NaN, so missing values add nothing.
            opponent_means = mean_values.reindex(list(opposing_players))
            totals_by_game_side[cache_key] = [
                float(np.nansum(opponent_means[column].values))
                for column in stat_columns]
        return totals_by_game_side[cache_key]

    row_index = []
    row_totals = []
    bar = pyprind.ProgBar(len(player_ids))
    for player_id in player_ids:
        for game_id, row in data.loc[player_id].iterrows():
            row_index.append((player_id, game_id))
            row_totals.append(opponent_totals(game_id, opposite_side(row['site'])))
        bar.update()

    if not row_totals:
        for column in stat_columns:
            data["ops_" + column] = np.nan
        return

    # The rows above were produced player by player, which is not the row
    # order of `data`. Label them with their (player, game) key and let pandas
    # align on the index; assigning the plain lists attached the values to the
    # wrong rows.
    ops = pd.DataFrame(row_totals,
                       index=pd.MultiIndex.from_tuples(row_index, names=data.index.names),
                       columns=stat_columns)
    for column in stat_columns:
        data["ops_" + column] = ops[column]

In [0]:
append_ops_columns(data, keys=offense_keys)

Some ideas for better oracles:

- For the simple predictor don't use so many seasons but just the last season
- check if there is a correlation if the game was on "home" or "away". That would be an easy plot based on the data we already have.
- Incorporating the time (so treating performance as a time series) by looking at the previous n games.
- we don't have these: (get value if injuries are reported)
- weather
- how about the position of the team in the league? Can we get the position from some kind of top list to the time of each game?
- looking at the opposing players (their ids or something)
- instead of individual opponents: Just use the mean value of each opponent that is going to play to make up new columns like "op_intercepts", "op_fumble_successes" ... and these columns could be the sum over all player of the opposing team. So we have certain fixed columns and don't have to care about individual player_ids and such.
- same as for opponents, the mean value of the stats of his teammates might also matter
- 

In [0]:
# TODO: unfinished sketch of a prediction call -- `predict(playerid, team, ...)`.
# Kept as a comment because the incomplete expression is a SyntaxError.

Seeing how often the `op_team` was "HOU" for a certain "player_id", we realize that this is a *very* small sample size.

In [0]:
offense[offense['op_team'] == 'HOU'].groupby(level='player_id').groups[nflgame.find("Peyton Manning")[0].playerid]

In [0]:
# Offensive stats collected for the games in `test_set`, as (category, stat) pairs.
offense2014_keys = [(category, stat)
                    for category, stats in (('receiving', ('tds', 'yds')),
                                            ('passing', ('yds', 'tds', 'att', 'cmp', 'ints')),
                                            ('rushing', ('yds', 'tds')))
                    for stat in stats]
offense2014 = collect_stats_for_keys(keys=offense2014_keys, games=test_set)

In [0]:
# Defensive stats collected for the games in `test_set`, as (category, stat) pairs.
defense2014_keys = [('defense', stat) for stat in ('ffum', 'ast', 'int', 'sk')]
defense2014 = collect_stats_for_keys(keys=defense2014_keys, games=test_set)

In [0]:
# One list of passing-yard predictions per oracle, in the row order of `offense2014`.
simple_oracle_predictions = []
simple_opteam_based_oracle_predictions = []
opteam_based_oracle_predictions = []
progbar = pyprind.ProgBar(len(offense2014))
simple = SimpleOracle(offense, defense)
simple_opteam = SimpleOpTeamBasedOracle(offense, defense)
opteam = OpTeamBasedOracle(offense, defense)

for index, row in offense2014.iterrows():
    progbar.update()
    player_id = index[0]  # the index entries are (player_id, eid)
    opponent = row['op_team']
    simple_oracle_predictions.append(simple.predict_passing_yds(player_id))
    simple_opteam_based_oracle_predictions.append(
        simple_opteam.predict_passing_yds(player_id, op_team=opponent))
    opteam_based_oracle_predictions.append(
        opteam.predict_passing_yds(op_team=opponent))

# Column prefixes: sp = SimpleOracle, so = SimpleOpTeamBasedOracle, op = OpTeamBasedOracle.
for prefix, predictions in (('sp', simple_oracle_predictions),
                            ('so', simple_opteam_based_oracle_predictions),
                            ('op', opteam_based_oracle_predictions)):
    offense2014[prefix + '_pass_yds'] = predictions
# "<prefix>_pass_yds_off" is the actual value minus the prediction.
for prefix in ('sp', 'so', 'op'):
    offense2014[prefix + '_pass_yds_off'] = (offense2014['passing_yds']
                                             - offense2014[prefix + '_pass_yds'])

In [0]:
offense2014[['op_team','passing_yds','sp_pass_yds', 'so_pass_yds', 'op_pass_yds', 'sp_pass_yds_off', 'so_pass_yds_off', 'op_pass_yds_off']].dropna()

In [0]:
offense2014[['sp_pass_yds_off','op_pass_yds_off']].dropna().boxplot(return_type='axes')
offense2014[['sp_pass_yds','op_pass_yds']].dropna().plot()

In [0]:
seaborn.jointplot?

In [0]:
pd.scatter_matrix(offense2014[['passing_yds','sp_pass_yds','op_pass_yds']].dropna(), alpha=1, figsize=(10, 10), diagonal='kde')

In [0]:
offense2014['so_pass_yds_off'].dropna().abs().plot()

In [0]:
offense2014['so_pass_yds_off'].dropna().abs().mean()

In [0]:
# The trailing "." left this cell as a SyntaxError; show the first rows instead.
offense2014[['sp_pass_yds_off', 'op_pass_yds_off', 'passing_yds']].head()

In [0]:
# `.plot` without parentheses only displayed the plotting accessor; call it to draw the figure.
offense2014[['sp_pass_yds_off', 'op_pass_yds_off', 'passing_yds']].plot();

In [0]:
g1 = games[0]

In [0]:
g1.data['home']['stats'].keys()

In [0]:
g1.data['home']['stats']['team']

In [0]:
def color_players(x):
    """Return the plot colour for player id `x`: two highlighted players, 'white' for everyone else."""
    highlighted = (('00-0010346', 'orange'),
                   ('00-0023459', 'green'))
    for player_id, colour in highlighted:
        if x == player_id:
            return colour
    return 'white'

In [0]:
lookup_player_name('00-0023459')

In [0]:
offense_means.loc[['00-0023459','00-0010346']]['passing_yds']

In [0]:
offense.groupby(level='player_id').std().loc[['00-0023459','00-0010346']]['passing_yds']

In [0]:
np.unique(np.array(list(color_players(v[0]) for v in offense.index.values)))

In [0]:
offense.plot(kind='scatter', x='passing_success_rate', y='passing_yds',
             c=list(color_players(v[0]) for v in offense.index.values),
             s=offense['passing_att'], figsize=(16, 6))

In [0]:
offense_means.index.values

In [0]:
offense_means.plot(kind='scatter', x='passing_success_rate', y='passing_yds',
                   c=list(color_players(v) for v in offense_means.index.values),
                   s=offense_means['passing_att']*2, figsize=(16, 6))

I set the size of the circles to `passing_att`.
Now the "outliers" with a success rate of 1.0 show up as tiny circles, and it is clear that they don't matter.

I see. I dunno....
good point ... 

In [0]:
offense.plot(kind='scatter', x='passing_att', y='passing_yds', c='passing_tds', s=100)

In [0]:
offense.plot(kind='scatter', x='passing_yds', y='passing_tds', s=50)

In [0]:
tmp = offense_means['passing_yds'].dropna().mean(level=0).sort(inplace=False).tail(50)

In [0]:
tmp.plot(kind='bar')

In [0]:
offense.loc[tmp.index[-1], '2009091312' ][['season','week','team','team']]

In [0]:
g1 = nflgame.games(*offense.loc[tmp.index[-1], '2009091312'][['season','week','team','team']])[0]
g1

In [0]:
g1.stats_home

Example for lookup_player_name of that `tmp` list that has the exceptionally good average

In [0]:
lookup_player_name(tmp.index[-1])

What is the minimum number of players that have played in a game (by looking at all the statistics we have for each game)?

In [0]:
players = active_players_in_year(2013)
games = list(itertools.chain(*(nflgame.games(year) for year in [2009, 2010, 2011, 2012, 2013])))

In [0]:
games

## Just messing around

In [0]:
def players_in_game(game, side='away'):
    """
    Given a nflgame.Game object, return the set of `player_id`s found in the stats for the given `side`.

    NOTE: this redefines the `players_in_game` helper from the "Filling pandas table" section.
    """
    side_stats = game.data[side]['stats']
    categories = ('receiving', 'defense', 'punting', 'kickret', 'kicking',
                  'rushing', 'puntret', 'passing', 'fumbles')
    return {player_id
            for category in categories
            if category in side_stats
            for player_id in side_stats[category].keys()}

Demonstrate the usage of `nice_score` for any game

In [0]:
for game in nflgame.games(2014,1, home="IND", away="IND"):
    print(game.nice_score())

How to use `nflgame.find` to lookup a player by (full) name and get his `player_id`

In [0]:
pl = nflgame.find("Andrew Luck")[0]
pl.player_id

How to get a nicely formatted (text) summary of the total stats for a certain period:

In [0]:
pl.stats(2013).formatted_stats()

The scheme by which points are assigned in Greg's fantasy league

In [0]:
greg_league_points = dict(points_passing_yards=20,
                          points_passing_ints=-2,
                          points_passing_twoptm=2,
                          points_passing_tds=6)

In [0]:
fantasy_score(g1, player_id=pl.player_id, **greg_league_points)