In [524]:
import psycopg2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from psycopg2.extensions import AsIs
from sklearn import linear_model
from __future__ import division
%matplotlib inline

class NflTeam:
    """Features describing a team's most recent game before ``game_date``.

    All values are read through ``db_conn`` from the ``game``, ``agg_play``,
    ``drive`` and ``play`` tables. Every feature key is prefixed with
    ``"home_"`` or ``"away_"`` depending on the role the team plays in the
    upcoming game.

    Parameters
    ----------
    team_name : value compared against ``home_team`` / ``away_team`` /
        ``pos_team`` in the queries (e.g. ``"SF"``).
    game_date : date of the upcoming game; it is bound into
        ``date %(date)s`` in SQL (callers in this file pass ``"YYYY-MM-DD"``).
    home_team : ``1`` when the team is the home side of the upcoming game;
        any other value selects the ``"away_"`` prefix.
    db_conn : DB-API connection exposing ``cursor()``.

    Raises
    ------
    ValueError
        From the constructor when no game involving the team starts before
        ``game_date``.
    """

    def __init__(self, team_name, game_date, home_team, db_conn):
        self.team_name = team_name
        self.db_conn = db_conn
        self.game_date = game_date
        self.last_game_id = self.get_last_game_id()
        self.prefix = "home_" if home_team == 1 else "away_"

    def _fetch_one(self, query, params):
        """Execute ``query`` with ``params`` and return the first row or None.

        The cursor is closed even when ``execute``/``fetchone`` raises, so a
        failing query does not leak a cursor on the shared connection.
        """
        cur = self.db_conn.cursor()
        try:
            cur.execute(query, params)
            return cur.fetchone()
        finally:
            cur.close()

    def collect_data(self):
        """Return one dict holding every previous-game feature of the team."""
        features = {}
        # dict.update works on both Python 2 and 3; adding the results of
        # ``.items()`` together only works on Python 2 where they are lists.
        for part in (self.get_last_agg_play_data(),
                     self.get_last_game_data(),
                     self.get_last_play_data()):
            features.update(part)
        return features

    def get_last_agg_play_data(self):
        """Return summed passing and rushing yards from the previous game.

        Only plays whose drive has ``pos_team`` equal to this team are summed.
        """
        all_vars = [self.prefix + "prev_game_pass_yds",
                    self.prefix + "prev_game_rush_yds"]
        # The row is read positionally, so the SQL needs no per-team column
        # aliases and nothing but bound parameters is sent with the query.
        row = self._fetch_one(
            """
            select sum(plays.passing_yds),
                   sum(plays.rushing_yds)
            from agg_play plays
            left join drive drive on plays.gsis_id = drive.gsis_id and
                plays.drive_id = drive.drive_id
            where plays.gsis_id = %(gsis)s and drive.pos_team = %(team)s
            """,
            {'gsis': self.last_game_id, 'team': self.team_name})
        return self.db_to_dict(row, all_vars)

    def get_last_game_data(self):
        """Return the win flag and score margin of the previous game.

        A tie is reported as ``1`` (won) for the team whichever side it
        played on; the margin is this team's score minus the opponent's.
        """
        all_vars = [self.prefix + "won_prev_game",
                    self.prefix + "prev_game_spread"]
        row = self._fetch_one(
            """
            select case when home_team = %(team)s and home_score >= away_score then 1
                        when home_team = %(team)s and home_score < away_score then 0
                        when away_team = %(team)s and home_score > away_score then 0
                        when away_team = %(team)s and home_score <= away_score then 1
                    end as won_last_game,
                   case when home_team = %(team)s then home_score - away_score
                        when away_team = %(team)s then away_score - home_score
                    end as game_spread
            from game
            where gsis_id = %(gsis)s;
            """,
            {'team': self.team_name, 'gsis': self.last_game_id})
        return self.db_to_dict(row, all_vars)

    def get_last_play_data(self):
        """Return the team's summed ``first_down`` value for the previous game."""
        all_vars = [self.prefix + "prev_num_first_downs"]
        row = self._fetch_one(
            """
            select sum(first_down)
            from play
            where gsis_id = %(gsis)s and pos_team = %(team)s;
            """,
            {'team': self.team_name, 'gsis': self.last_game_id})
        return self.db_to_dict(row, all_vars)

    def get_last_game_id(self):
        """Return the ``gsis_id`` of the team's latest game before ``game_date``.

        Raises
        ------
        ValueError
            If no earlier game involving the team exists.
        """
        row = self._fetch_one(
            """
            select gsis_id
            from game
            where start_time < date %(date)s and
                (home_team = %(team)s or away_team = %(team)s)
            order by start_time desc
            limit 1;
            """,
            {'date': self.game_date, 'team': self.team_name})
        if row is None:
            # Fail with a readable message instead of indexing into None.
            raise ValueError("no game found for team %r before %s"
                             % (self.team_name, self.game_date))
        return row[0]

    def db_to_dict(self, db_values, variables):
        """Pair each name in ``variables`` with the matching value of a row."""
        return dict(zip(variables, db_values))
    
class NflGame:
    """One match-up: both teams' previous-game features plus the result.

    Parameters
    ----------
    home_team, away_team : objects exposing ``team_name``, ``game_date`` and
        ``collect_data()`` (``NflTeam`` instances in this notebook). Both
        must carry the same ``game_date``.
    conn : DB-API connection exposing ``cursor()``.
    """

    def __init__(self, home_team, away_team, conn):
        assert home_team.game_date == away_team.game_date, \
            "home and away team must refer to the same game date"
        self.home_team = home_team
        self.away_team = away_team
        self.db_conn = conn

    def get_outcome(self):
        """Return ``{'gsis_id': ..., 'home_won': ...}`` for this game.

        ``home_won`` is 1 when ``home_score >= away_score`` (a tie counts for
        the home side) and 0 otherwise.

        Raises
        ------
        ValueError
            If the ``game`` table has no row for this home/away pair on the
            game date.
        """
        cur = self.db_conn.cursor()
        try:
            cur.execute(""" select gsis_id, case when home_score >= away_score then 1
                                   when home_score < away_score then 0 end as home_won
                            from game
                            where home_team = %(home)s and away_team = %(away)s and
                                date(start_time) = date %(date)s; """,
                        {'date': self.home_team.game_date,
                         'home': self.home_team.team_name,
                         'away': self.away_team.team_name})
            row = cur.fetchone()
        finally:
            # Close the cursor even when the query fails.
            cur.close()
        if row is None:
            raise ValueError("no game found for %r vs %r on %s"
                             % (self.home_team.team_name,
                                self.away_team.team_name,
                                self.home_team.game_date))
        return dict(zip(('gsis_id', 'home_won'), row))

    def get_data(self):
        """Return the home and away feature dicts merged into one dict."""
        # dict.update works on both Python 2 and 3; ``items() + items()``
        # only works on Python 2.
        features = dict(self.home_team.collect_data())
        features.update(self.away_team.collect_data())
        return features

    def get_pred_row(self):
        """Return the features together with ``gsis_id`` and ``home_won``."""
        row = self.get_data()
        row.update(self.get_outcome())
        return row
    
class NflSeason:
    """All regular-season games of one season, usable as a train/test set.

    Parameters
    ----------
    season : value compared against ``game.season_year``.
    conn : DB-API connection exposing ``cursor()``.

    The list of games is fetched once in the constructor and stored in
    ``self.games`` as ``(home_team, away_team, date)`` rows.
    """

    def __init__(self, season, conn):
        self.season = season
        self.db_conn = conn
        self.games = self.get_games()

    def get_games(self):
        """Return ``(home_team, away_team, date)`` rows for the season."""
        cur = self.db_conn.cursor()
        try:
            cur.execute(""" select home_team, away_team, date(start_time)
                            from game
                            where season_year = %(season)s and season_type = 'Regular'; """,
                        {'season': self.season})
            return cur.fetchall()
        finally:
            # Close the cursor even when the query fails.
            cur.close()

    def get_game_data(self):
        """Build one prediction row (a dict) per game in ``self.games``.

        Each call re-queries the database for every game.
        """
        game_rows = []
        for home_name, away_name, game_day in self.games:
            game_date = str(game_day)
            home_team = NflTeam(home_name, game_date, 1, self.db_conn)
            away_team = NflTeam(away_name, game_date, 0, self.db_conn)
            game_rows.append(
                NflGame(home_team, away_team, self.db_conn).get_pred_row())
        return game_rows

    def train_on_season(self):
        """Fit and return a ``LogisticRegression`` on this season's games."""
        X, y = self.get_X_y()
        logreg = linear_model.LogisticRegression()
        logreg.fit(X, y)
        return logreg

    def test_on_season(self, trained_model):
        """Return ``(predicted, actual)`` home-win labels for this season.

        ``trained_model`` only needs a ``predict(X)`` method.
        """
        X, y = self.get_X_y()
        return trained_model.predict(X), y

    def get_X_y(self):
        """Return the feature matrix ``X`` and the ``home_won`` labels ``y``.

        ``gsis_id`` is an identifier, not a feature, so it is dropped along
        with the label column.
        """
        games = pd.DataFrame(self.get_game_data())
        y = games['home_won'].values
        # Non-mutating drop instead of ``del`` on the frame.
        X = games.drop(['home_won', 'gsis_id'], axis=1).values
        return X, y

def connect_to_nfl_db(dsn="dbname=nfldb user=nfldb"):
    """Open and return a new psycopg2 connection.

    Parameters
    ----------
    dsn : connection string handed to ``psycopg2.connect``. The default is
        the string this notebook has always used, so existing no-argument
        calls behave as before.

    The caller owns the returned connection and is responsible for closing it.
    """
    return psycopg2.connect(dsn)

def get_teams_in_season(season, conn):
    """Return a DataFrame of the distinct ``home_team`` values of a season.

    Parameters
    ----------
    season : value compared against ``game.season_year``.
    conn : database connection accepted by ``pandas.read_sql``.

    ``season`` is bound by the database driver (psycopg2 ``%(name)s`` style)
    rather than formatted into the SQL text, so an arbitrary string cannot
    alter the statement.
    """
    return pd.read_sql(
        "select distinct home_team from game where season_year = %(season)s",
        conn,
        params={'season': season})

In [512]:
# One connection shared by every NflTeam / NflGame / NflSeason created below.
conn = connect_to_nfl_db()

In [513]:
# Smoke test on a single match-up: SF (away) at NYG (home) on 2014-11-16.
# Constructing each NflTeam already runs a query to find its previous game.
# NOTE(review): `game` is not referenced by any later visible cell.
sanfran = NflTeam("SF", "2014-11-16", 0, conn)
nyg = NflTeam("NYG", "2014-11-16", 1, conn)
game = NflGame(nyg, sanfran, conn)

In [514]:
# Fit the logistic regression on every regular-season game of 2013.
season_2013 = NflSeason("2013", conn)
trained_season = season_2013.train_on_season()

In [517]:
# Score the 2013 model on the 2014 regular season: predicted vs. actual
# home-win labels.
season_2014 = NflSeason("2014", conn)
pred_home_wins, home_wins = season_2014.test_on_season(trained_season)

In [525]:
# Accuracy: share of 2014 games whose predicted home-win label matches the
# actual one. True division comes from `from __future__ import division`.
sum(pred_home_wins == home_wins) / pred_home_wins.shape[0]

0.57421875

256

In [503]:
# NOTE(review): hidden state. `games` is not defined by any visible top-level
# cell, and this cell's execution count (503) is lower than every cell above,
# so it will not survive Restart & Run All. It repeats the body of
# NflSeason.get_X_y and mutates `games` in place, so a second run fails once
# the columns have been deleted.
y = games['home_won'].values
del games['home_won']
del games['gsis_id']
X = games.values

array([[272,  44,  -3, ...,  -8,  20,   0],
       [208, 116,   8, ..., -22,  11,   0],
       [297,  53,  16, ...,  15,  13,   1],
       ..., 
       [400, 114,  24, ..., -13,  14,   0],
       [153, 155, -16, ...,  13,  24,   1],
       [158, 129,  10, ...,  -7,  10,   0]])