## Import library

In [1]:
import pandas as pd
import numpy as np
import csv
import json
import requests
from xml.etree import ElementTree
import glob
import re
import sklearn.linear_model as lm

## Load dataset

In [242]:
points = pd.read_csv('data/2014_point_spread_footballlocks.csv')
points.head()

Unnamed: 0,eid,season,week,Home,Away,HomeScore,AwayScore,Day,Time,Favorite,Underdog,Spread,CoverOrNot
0,2014090400,2014,1,SEA,GB,36,16,Thu,8:30,SEA,GB,-5.0,1.0
1,2014090700,2014,1,ATL,NO,37,34,Sun,1:00,NO,ATL,-3.0,-1.0
2,2014090701,2014,1,BAL,CIN,16,23,Sun,1:00,BAL,CIN,-1.0,-1.0
3,2014090702,2014,1,CHI,BUF,20,23,Sun,1:00,CHI,BUF,-7.0,-1.0
4,2014090703,2014,1,HOU,WAS,17,6,Sun,1:00,HOU,WAS,-3.0,1.0


In [243]:
a = points.Underdog.unique()
b = points.Favorite.unique()
c = np.setdiff1d(np.union1d(a, b), np.intersect1d(a, b))
c

array(['OAK'], dtype=object)

In [3]:
# ## TODO: change to complete dataset
# covers = []
# for i, r in points.iterrows():
#     h = r['Home']
#     a = r['Away']
#     f = r['Favorite']
#     u = r['Underdog']
#     hs = r['HomeScore']
#     vs = r['AwayScore']
#     fs = -1
#     us = -1
#     if h == f:
#         fs = hs
#         us = vs
#     else:
#         fs = vs
#         us = hs
#     if (fs == -1) or (us == -1):
#         print "Favorite or Underdog not match"
#         print r
#         break
#     if (fs - us) > -r['Spread']:
#         cover = 1
#     elif (fs - us) == -r['Spread']:
#         cover = 0
#     else:
#         cover = -1
#     covers.append(cover)
# points["CoverOrFail"] = pd.Series(covers)

In [192]:
len(points.Favorite.unique())

31

In [191]:
len(points.Underdog.unique())

32

## Take Season 2014 week 1 game 1 as example

In [245]:
eid = 2014090400

In [246]:
plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
plays = pd.read_csv(str(eid)+'.csv')

In [247]:
plays_parse = plays.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
plays_parse = plays_parse.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

# [time, Missing Value] make endtime row as 0, since they are nan
time_null_idx = plays_parse['time'].index[pd.isnull(plays_parse['time'].values)]
plays_parse.loc[time_null_idx, 'time'] = '0:00'
plays_parse['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in plays_parse['time'].values])
plays_parse['lefttime'] = 3600 - 15*60*(plays_parse['qtr']-1) - plays_parse['time']

# [yrdln, Missing Value] Missing Value: make nan the same as previous one
plays_parse['yrdln'] = plays_parse['yrdln'].fillna(method='ffill')

plays_parse.head()

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0


In [248]:
# Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
plays_parse['Home'] = points[points['eid'] == eid].Home.iloc[0]
plays_parse['Away'] = points[points['eid'] == eid].Away.iloc[0]
# plays_parse['Favorite'] = points[points['eid'] == eid].Favorite.iloc[0]
# plays_parse['Underdog'] = points[points['eid'] == eid].Underdog.iloc[0]

plays_parse['HomeScore'] = 0
plays_parse['AwayScore'] = 0
plays_parse['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]
plays_parse.head()

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,HomeScore,AwayScore,CoverOrNot
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0,SEA,GB,0,0,1.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0,SEA,GB,0,0,1.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0,SEA,GB,0,0,1.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0,SEA,GB,0,0,1.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0,SEA,GB,0,0,1.0


In [249]:
# Add score according FG and TD
for i, r in plays_parse.iterrows():
    
    # Dealing with points
    if r['note'] == 'TD':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 6
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 6
    if r['note'] == 'FG':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 3
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 3
    if r['note'] == 'XP':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 1
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 1
    if r['note'] == 'SAF':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 2
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 2
            
    # yrdln: home 0, away 100
    side = r['yrdln'].split(' ')[0]
    yrdln = int(r['yrdln'].split(' ')[1])
    if side == r['Home']:
        h_yrdln = yrdln
        v_yrdln = 100 - yrdln
    elif side == r['Away']:
        v_yrdln = yrdln
        h_yrdln = 100 - yrdln
    plays_parse.loc[i, 'h_yrdln'] = h_yrdln
    plays_parse.loc[i, 'v_yrdln'] = v_yrdln

plays_parse.head()

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,HomeScore,AwayScore,CoverOrNot,h_yrdln,v_yrdln
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0,SEA,GB,0,0,1.0,35.0,65.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0,SEA,GB,0,0,1.0,87.0,13.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0,SEA,GB,0,0,1.0,81.0,19.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0,SEA,GB,0,0,1.0,76.0,24.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0,SEA,GB,0,0,1.0,61.0,39.0


In [250]:
print plays_parse.iloc[-1]['HomeScore'] == points.iloc[0]['HomeScore'] and plays_parse.iloc[-1]['AwayScore'] == points.iloc[0]['AwayScore']

True


## Go through season 2014, take REG 1-16 as training set, 17 as testing set

In [332]:
def parseMatches(df, eid, points):
    try:
        df = df.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
        df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

        # [time, Missing Value] make endtime row as 0, since they are nan
        time_null_idx = df['time'].index[pd.isnull(df['time'].values)]
        df.loc[time_null_idx, 'time'] = '0:00'
        df['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in df['time'].values])

        if len(df.qtr.unique()) == 4:
            df['lefttime'] = 3600 - 15*60*(df['qtr']) + df['time'] # unit: seconds
        else: # suppose only 1 overtime
            df['lefttime'] = 3600 + 15*60 - df.iloc[-1].time - 15*60*(df['qtr']) + df['time'] # unit: seconds


        # [yrdln, Missing Value] Missing Value: make nan the same as previous one
        df['yrdln'] = df['yrdln'].fillna(method='ffill')

        # Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
        df['Home'] = points[points['eid'] == eid].Home.iloc[0]
        df['Away'] = points[points['eid'] == eid].Away.iloc[0]
        df['Favorite'] = points[points['eid'] == eid].Favorite.iloc[0]
        df['Underdog'] = points[points['eid'] == eid].Underdog.iloc[0]
        df['HomeScore'] = 0
        df['AwayScore'] = 0
        df['eid'] = eid
        df['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]
        
        h_inter_i = []
        v_inter_i = []
        h_inter = 0
        v_inter = 0
        
        

        for i, r in df.iterrows():
            # Add score according FG and TD
            if r['note'] == 'TD':
                # In case the touch down has been recovered
                if 'RECOVERED' in r['desc']:
                    rec_team = r['desc'].split("RECOVERED by ", 1)[1][0:3]
                    if not rec_team[-1].isalpha():
                        rec_team = rec_team[0:2]

                    if r['Home'] == rec_team:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                        h_inter += 1
                        h_inter_i.append(i)
                    elif r['Away'] == rec_team:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                        v_inter += 1
                        v_inter_i.append(i)


                # In case the touch down is catch by kick and run
                elif 'kicks' in r['desc'] and r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                    v_inter += 1
                    v_inter_i.append(i)
                elif 'kicks' in r['desc'] and r['posteam'] == r['Away']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                    h_inter += 1
                    h_inter_i.append(i)




                # In case the touch down complished by intercept
                elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                    v_inter += 1
                    v_inter_i.append(i)
                elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Away']: # In case the touch down has been recovered
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                    h_inter += 1
                    h_inter_i.append(i)

                # In case the touch down complished by punts
                elif 'punts' in r['desc']:
                    if r['Home'] == r['posteam']:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                        v_inter += 1
                        v_inter_i.append(i)
                    elif r['Away'] == r['posteam']:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                        h_inter += 1
                        h_inter_i.append(i)

                # TODO: check, In case there's other penalty while touch down
#                 elif 'PENALTY' in r['desc']:
#                     pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
#                     if r['Home'] == pen_team:
#                         df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
#                     elif r['Away'] == pen_team:
#                         df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

                # default touchdown
                elif r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

            if r['note'] == 'PENALTY':
                if 'TWO-POINT CONVERSION ATTEMPT' in r['desc']:
                    pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
                    if r['Away'] == pen_team:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                    elif r['Home'] == pen_team:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2



            if r['note'] == 'FG':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 3
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 3
            if r['note'] == 'XP':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 1
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 1
            if r['note'] == '2PS':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2

            if r['note'] == '2PR':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2


            if r['note'] == 'SAF':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2

            # yrdln: home 0, away 100
            yrdln_split = r['yrdln'].split(' ')
            if len(yrdln_split) == 1 and yrdln_split[0] == str(50):
                f_yrdln = 50
                u_yrdln = 50
            elif len(yrdln_split) == 2:
                side = r['yrdln'].split(' ')[0]
                yrdln = int(r['yrdln'].split(' ')[1])
                if side == r['Favorite']:
                    f_yrdln = yrdln
                    u_yrdln = 100 - yrdln
                elif side == r['Underdog']:
                    u_yrdln = yrdln
                    f_yrdln = 100 - yrdln
            else:
                print 'Error in parsing yrdln, eid=', eid, 'yrdln=', r['yrdln']

            df.loc[i, 'f_yrdln'] = f_yrdln
            df.loc[i, 'u_yrdln'] = u_yrdln
    except AttributeError:
        print 'AttributeError:', eid, df['time'].values
        
    # Assign Favorite score and Underdog score
    if df['Home'].values[0] == df['Favorite'].values[0]:
        df['FavoriteScore'] = df['HomeScore']
        df['UnderdogScore'] = df['AwayScore']
    elif df['Home'].values[0] == df['Underdog'].values[0]:
        df['FavoriteScore'] = df['AwayScore']
        df['UnderdogScore'] = df['HomeScore']


    ## Verify with final value
    parse_hs = df.iloc[-1]['HomeScore']
    parse_vs = df.iloc[-1]['AwayScore']
    real_hs = points[points['eid'] == eid]['HomeScore'].values[0]
    real_vs = points[points['eid'] == eid]['AwayScore'].values[0]
    
    if parse_hs == real_hs and parse_vs == real_vs:
        print 'Successfully finish:', eid
    else:
        # TODO: check which interception actually had following extra point, right now just start from first one
        if (0 < (real_hs - parse_hs) <= h_inter) or (0 < (real_vs - parse_vs) <= v_inter):
            try:
                for i in range(real_hs - parse_hs):
                    idx = h_inter_i[i]
                    df.loc[idx:, 'HomeScore'] += 1
                    print 'Adjust Home Score', eid
                for i in range(real_vs - parse_vs):
                    idx = v_inter_i[i]
                    df.loc[idx:, 'AwayScore'] += 1
                    print 'Adjust Away Score', eid
            except IndexError:
                print 'IndexError:', h_inter_i, v_inter_i, real_vs - parse_vs

        else:
            print 'Fail finishing:', eid
            print 'parse home score =', df.iloc[-1]['HomeScore'], 'real home score =', points[points['eid'] == eid]['HomeScore'].values[0]
            print 'parse away score =', df.iloc[-1]['AwayScore'], 'real away score =', points[points['eid'] == eid]['AwayScore'].values[0]
    return df

In [344]:
for i, r in points[(points['season'] == 2014) & (points['week'] == 9)].iterrows():
    eid = r['eid']
    plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
    plays = parseMatches(plays, eid, points)
    f_plays = plays.loc[:, ['qtr', 'lefttime', 'FavoriteScore', 'f_yrdln', 'CoverOrNot']]
    f_plays.rename(columns={
        'qtr': 'qtr',
        'lefttime': 'lefttime',
        'FavoriteScore': 'score',
        'f_yrdln': 'yrdln',
        'CoverOrNot': 'cover',
    }, inplace=True)
    f_plays['team'] = plays.Favorite.unique()[0]
    u_plays = plays.loc[:, ['qtr', 'lefttime', 'UnderdogScore', 'u_yrdln', 'CoverOrNot']]
    u_plays.rename(columns={
        'qtr': 'qtr',
        'lefttime': 'lefttime',
        'UnderdogScore': 'score',
        'u_yrdln': 'yrdln',
        'CoverOrNot': 'cover',
    }, inplace=True)
    u_plays['team'] = plays.Underdog.values[0]
    u_plays['cover'] = -u_plays['cover']

Successfully finish: 2014103000
Successfully finish: 2014110201
Adjust Home Score 2014110202
Adjust Home Score 2014110203
Successfully finish: 2014110204
Successfully finish: 2014110200
Successfully finish: 2014110205
Successfully finish: 2014110206
Successfully finish: 2014110207
Adjust Home Score 2014110208
Adjust Home Score 2014110209
Adjust Away Score 2014110209
Fail finishing: 2014110210
parse home score = 43 real home score = 43
parse away score = 25 real away score = 23
Successfully finish: 2014110300


### Validation Block

In [330]:
eid = 2014100512
df = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
df = df.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

# [time, Missing Value] make endtime row as 0, since they are nan
time_null_idx = df['time'].index[pd.isnull(df['time'].values)]
df.loc[time_null_idx, 'time'] = '0:00'
df['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in df['time'].values])
if len(df.qtr.unique()) == 4:
    df['lefttime'] = 3600 - 15*60*(df['qtr']) + df['time'] # unit: seconds
else: # suppose only 1 overtime
    df['lefttime'] = 3600 + 15*60 - df.iloc[-1].time - 15*60*(df['qtr']) + df['time'] # unit: seconds

# [yrdln, Missing Value] Missing Value: make nan the same as previous one
df['yrdln'] = df['yrdln'].fillna(method='ffill')

# Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
df['Home'] = points[points['eid'] == eid].Home.iloc[0]
df['Away'] = points[points['eid'] == eid].Away.iloc[0]
df['eid'] = eid

df['HomeScore'] = 0
df['AwayScore'] = 0
df['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]

h_inter_i = []
v_inter_i = []
h_inter = 0
v_inter = 0
for i, r in df.iterrows():
    # Add score according FG and TD
    if r['note'] == 'TD':
        # In case the touch down has been recovered
        if 'RECOVERED' in r['desc']:
            rec_team = r['desc'].split("RECOVERED by ", 1)[1][0:3]
            if not rec_team[-1].isalpha():
                rec_team = rec_team[0:2]
            if r['Home'] == rec_team:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                h_inter += 1
                h_inter_i.append(i)
            elif r['Away'] == rec_team:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                v_inter += 1
                v_inter_i.append(i)


         # In case the touch down is catch by kick and run
        elif 'kicks' in r['desc'] and r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            v_inter += 1
            v_inter_i.append(i)
        elif 'kicks' in r['desc'] and r['posteam'] == r['Away']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            h_inter += 1
            h_inter_i.append(i)



        # In case the touch down complished by intercept
        elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            v_inter += 1
            v_inter_i.append(i)
        elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Away']: # In case the touch down has been recovered
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            h_inter += 1
            h_inter_i.append(i)

        # In case the touch down complished by punts
        elif 'punts' in r['desc']:
            if r['Home'] == r['posteam']:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                v_inter += 1
                v_inter_i.append(i)
            elif r['Away'] == r['posteam']:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                h_inter += 1
                h_inter_i.append(i)

        # TODO: check, In case there's other penalty while touch down
#         elif 'PENALTY' in r['desc']:
#             pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
#             if r['Home'] == pen_team:
#                 df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
#             elif r['Away'] == pen_team:
#                 df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6

        # default touchdown
        elif r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            print r['posteam'], r['desc']
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            print r['posteam'], r['desc']

    if r['note'] == 'PENALTY':
        if 'TWO-POINT CONVERSION ATTEMPT' in r['desc']:
            pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
            if r['Away'] == pen_team:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
            elif r['Home'] == pen_team:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2

    if r['note'] == '2PR':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2



    if r['note'] == 'FG':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 3
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 3
    if r['note'] == 'XP':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 1
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 1
    if r['note'] == '2PS':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2

    if r['note'] == 'SAF':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2



    # yrdln: home 0, away 100
    yrdln_split = r['yrdln'].split(' ')
    if len(yrdln_split) == 1 and yrdln_split[0] == str(50):
        h_yrdln = 50
        v_yrdln = 50
    elif len(yrdln_split) == 2:
        side = r['yrdln'].split(' ')[0]
        yrdln = int(r['yrdln'].split(' ')[1])
        if side == r['Home']:
            h_yrdln = yrdln
            v_yrdln = 100 - yrdln
        elif side == r['Away']:
            v_yrdln = yrdln
            h_yrdln = 100 - yrdln
    else:
        print 'Error in parsing yrdln, eid=', eid, 'yrdln=', r['yrdln']


    df.loc[i, 'h_yrdln'] = h_yrdln
    df.loc[i, 'v_yrdln'] = v_yrdln

## Verify with final value
parse_hs = df.iloc[-1]['HomeScore']
parse_vs = df.iloc[-1]['AwayScore']
real_hs = points[points['eid'] == eid]['HomeScore'].values[0]
real_vs = points[points['eid'] == eid]['AwayScore'].values[0]

if parse_hs == real_hs and parse_vs == real_vs:
    print 'Successfully finish:', eid
else:
    # TODO: check which interception actually had following extra point, right now just start from first one
    if (0 < (real_hs - parse_hs) <= h_inter) or (0 < (real_vs - parse_vs) <= v_inter):
        for i in range(real_hs - parse_hs):
            idx = h_inter_i[i]
            df.loc[idx:, 'HomeScore'] += 1
            print 'Adjust Home Score', eid
        for i in range(real_vs - parse_vs):
            idx = v_inter_i[i]
            df.loc[idx:, 'AwayScore'] += 1
            print 'Adjust Away Score', eid
    else:
        print 'Fail finishing:', eid
        print 'parse home score =', df.iloc[-1]['HomeScore'], 'real home score =', points[points['eid'] == eid]['HomeScore'].values[0]
        print 'parse away score =', df.iloc[-1]['AwayScore'], 'real away score =', points[points['eid'] == eid]['AwayScore'].values[0]

NE (10:06) J.Devey reported in as eligible.  S.Ridley left guard for 1 yard, TOUCHDOWN.
NE (3:16) T.Brady pass short left to T.Wright for 17 yards, TOUCHDOWN.
CIN (11:32) (Shotgun) A.Dalton pass deep middle to M.Sanu for 37 yards, TOUCHDOWN.
NE (6:11) (Shotgun) T.Brady pass short middle to R.Gronkowski for 16 yards, TOUCHDOWN. 45th career Brady-to-Gronkowski TD pass, tying Bledsoe-to-Coates for 2nd most QB-to-TE TD passes all-time (Rivers-to-Gates, 65).  NE 87-Gronkowski 13th career 100-yard game, extends his Patriots record for a TE.
CIN (3:48) (Shotgun) A.Dalton pass deep right to A.Green for 17 yards, TOUCHDOWN. The Replay Official challenged the pass completion ruling, and the play was Upheld. The ruling on the field stands.
Adjust Home Score 2014100512


In [331]:
df[df.qtr == 3]

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,eid,HomeScore,AwayScore,CoverOrNot,h_yrdln,v_yrdln
95,133,900.0,S.Gostkowski kicks 70 yards from NE 35 to CIN ...,3,NE 35,NE,KICKOFF,1800.0,NE,CIN,2014100512,20,3,-1.0,35.0,65.0
96,135,895.0,(14:55) (Shotgun) A.Dalton pass incomplete sho...,3,CIN 19,CIN,,1795.0,NE,CIN,2014100512,20,3,-1.0,81.0,19.0
97,134,891.0,(14:51) (Shotgun) G.Bernard right tackle to CI...,3,CIN 19,CIN,,1791.0,NE,CIN,2014100512,20,3,-1.0,81.0,19.0
98,136,847.0,(14:07) (Shotgun) A.Dalton pass incomplete dee...,3,CIN 24,CIN,,1747.0,NE,CIN,2014100512,20,3,-1.0,76.0,24.0
99,132,842.0,"(14:02) K.Huber punts 35 yards to NE 41, Cente...",3,CIN 24,CIN,PUNT,1742.0,NE,CIN,2014100512,20,3,-1.0,76.0,24.0
100,147,836.0,(13:56) T.Brady pass short left to J.Develin t...,3,NE 31,NE,,1736.0,NE,CIN,2014100512,20,3,-1.0,31.0,69.0
101,143,804.0,(13:24) S.Ridley right end to NE 39 for -3 yar...,3,NE 42,NE,,1704.0,NE,CIN,2014100512,20,3,-1.0,42.0,58.0
102,144,768.0,"(12:48) (Shotgun) PENALTY on NE-B.Stork, False...",3,NE 39,NE,PENALTY,1668.0,NE,CIN,2014100512,20,3,-1.0,39.0,61.0
103,145,755.0,(12:35) (Shotgun) T.Brady pass short left to S...,3,NE 34,NE,,1655.0,NE,CIN,2014100512,20,3,-1.0,34.0,66.0
104,142,727.0,(12:07) (Shotgun) T.Brady pass short left to R...,3,NE 31,NE,,1627.0,NE,CIN,2014100512,20,3,-1.0,31.0,69.0


In [329]:
df.iloc[120].desc, df.iloc[120].posteam

('S.Gostkowski kicks 71 yards from NE 35 to CIN -6. B.Tate to CIN 11 for 17 yards (B.Bolden). FUMBLES (B.Bolden), RECOVERED by NE-K.Arrington at CIN 9. K.Arrington for 9 yards, TOUCHDOWN.',
 'NE')

In [280]:
eid = 2014092807
df = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()
df[df.sp == 1]

Unnamed: 0,index,drive,play,down,time,desc,ydstogo,qtr,ydsnet,yrdln,sp,posteam,note
5,18,2,147,2,13:07,(13:07) M.Glennon pass short right to M.Evans ...,7,1,9,PIT 7,1,TB,TD
7,19,2,167,0,13:01,"P.Murray extra point is GOOD, Center-A.DePaola...",0,1,9,PIT 2,1,TB,XP
17,32,4,441,4,10:07,(10:07) (Field Goal formation) P.Murray 50 yar...,7,1,23,PIT 32,1,TB,FG
28,23,5,761,4,06:22,(6:22) (Field Goal formation) S.Suisham 25 yar...,7,1,73,TB 7,1,PIT,FG
44,43,7,1181,3,00:20,(:20) (Shotgun) B.Roethlisberger pass short mi...,11,1,60,TB 11,1,PIT,TD
45,48,7,1202,0,00:15,"S.Suisham extra point is GOOD, Center-G.Warren...",0,1,60,TB 2,1,PIT,XP
69,66,9,1809,2,05:05,"(5:05) (No Huddle, Shotgun) B.Roethlisberger p...",13,2,93,TB 27,1,PIT,TD
70,59,9,1840,0,04:58,"S.Suisham extra point is GOOD, Center-G.Warren...",0,2,93,TB 2,1,PIT,XP
103,121,12,2644,2,12:31,"(12:31) D.Martin left end for 3 yards, TOUCHDO...",3,3,80,PIT 3,1,TB,TD
106,123,12,2672,0,12:27,"P.Murray extra point is GOOD, Center-A.DePaola...",0,3,80,PIT 2,1,TB,XP


In [151]:
df.iloc[67].desc, df.iloc[67].posteam

('(2:15) T.Way punt is BLOCKED by A.Blue, Center-N.Sundberg, RECOVERED by HOU-A.Blue at WAS 5. A.Blue for 5 yards, TOUCHDOWN.',
 'WAS')

In [75]:
range(2)

[0, 1]

### Create clean dataset

In [345]:
dataset = pd.DataFrame()
for w in range(1, 9):
    for i, r in points[(points['season'] == 2014) & (points['week'] == w)].iterrows():
        eid = r['eid']
        plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
        plays = parseMatches(plays, eid, points)
        f_plays = plays.loc[:, ['qtr', 'lefttime', 'FavoriteScore', 'f_yrdln', 'CoverOrNot']]
        f_plays.rename(columns={
            'qtr': 'qtr',
            'lefttime': 'lefttime',
            'FavoriteScore': 'score',
            'f_yrdln': 'yrdln',
            'CoverOrNot': 'cover',
        }, inplace=True)
        f_plays['eid'] = eid
        f_plays['season'] = 2014
        f_plays['week'] = w
        f_plays['team'] = plays.Favorite.unique()[0]
        print list(f_plays)
        u_plays = plays.loc[:, ['qtr', 'lefttime', 'UnderdogScore', 'u_yrdln', 'CoverOrNot']]
        u_plays.rename(columns={
            'qtr': 'qtr',
            'lefttime': 'lefttime',
            'UnderdogScore': 'score',
            'u_yrdln': 'yrdln',
            'CoverOrNot': 'cover',
        }, inplace=True)
        u_plays['eid'] = eid
        u_plays['season'] = 2014
        u_plays['week'] = w
        u_plays['team'] = plays.Underdog.values[0]
        u_plays['cover'] = -u_plays['cover']
        dataset = pd.concat([dataset, f_plays, u_plays])
dataset.shape

Successfully finish: 2014090400
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090700
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090701
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090702
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014090703
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090704
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090705
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090706
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014090707
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team'

Successfully finish: 2014100511
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014100512
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014100600
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014100900
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101201
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101202
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101203
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101204
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101205
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team'

(43262, 9)

In [346]:
dataset.to_csv('data/201401-201408_baseline_cleandata.csv', index=False)