## Import library

In [1]:
import pandas as pd
import numpy as np
import csv
import json
import requests
from xml.etree import ElementTree
import glob
import re
import sklearn.linear_model as lm

## Load dataset

In [2]:
points = pd.read_csv('data/2014_point_spread_footballlocks.csv')
points

Unnamed: 0,eid,season,week,Home,Away,HomeScore,AwayScore,Day,Time,Favorite,Underdog,Spread,CoverOrNot
0,2014090400,2014,1,SEA,GB,36,16,Thu,8:30,SEA,GB,-5.0,1.0
1,2014090700,2014,1,ATL,NO,37,34,Sun,1:00,NO,ATL,-3.0,-1.0
2,2014090701,2014,1,BAL,CIN,16,23,Sun,1:00,BAL,CIN,-1.0,-1.0
3,2014090702,2014,1,CHI,BUF,20,23,Sun,1:00,CHI,BUF,-7.0,-1.0
4,2014090703,2014,1,HOU,WAS,17,6,Sun,1:00,HOU,WAS,-3.0,1.0
5,2014090704,2014,1,KC,TEN,10,26,Sun,1:00,KC,TEN,-3.0,-1.0
6,2014090705,2014,1,MIA,NE,33,20,Sun,1:00,NE,MIA,-4.0,-1.0
7,2014090706,2014,1,NYJ,OAK,19,14,Sun,1:00,NYJ,OAK,-6.5,-1.0
8,2014090707,2014,1,PHI,JAC,34,17,Sun,1:00,PHI,JAC,-10.0,1.0
9,2014090708,2014,1,PIT,CLE,30,27,Sun,1:00,PIT,CLE,-6.0,-1.0


In [3]:
# ## TODO: change to complete dataset
# covers = []
# for i, r in points.iterrows():
#     h = r['Home']
#     a = r['Away']
#     f = r['Favorite']
#     u = r['Underdog']
#     hs = r['HomeScore']
#     vs = r['AwayScore']
#     fs = -1
#     us = -1
#     if h == f:
#         fs = hs
#         us = vs
#     else:
#         fs = vs
#         us = hs
#     if (fs == -1) or (us == -1):
#         print "Favorite or Underdog not match"
#         print r
#         break
#     if (fs - us) > -r['Spread']:
#         cover = 1
#     elif (fs - us) == -r['Spread']:
#         cover = 0
#     else:
#         cover = -1
#     covers.append(cover)
# points["CoverOrFail"] = pd.Series(covers)

## Take Season 2014 week 1 game 1 as example

In [4]:
eid = 2014090400

In [5]:
plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
plays = pd.read_csv(str(eid)+'.csv')

In [7]:
plays_parse = plays.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
plays_parse = plays_parse.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

# [time, Missing Value] make endtime row as 0, since they are nan
time_null_idx = plays_parse['time'].index[pd.isnull(plays_parse['time'].values)]
plays_parse.loc[time_null_idx, 'time'] = '0:00'
plays_parse['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in plays_parse['time'].values])
plays_parse['lefttime'] = 3600 - 15*60*(plays_parse['qtr']-1) - plays_parse['time']

# [yrdln, Missing Value] Missing Value: make nan the same as previous one
plays_parse['yrdln'] = plays_parse['yrdln'].fillna(method='ffill')

plays_parse

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0
5,83,786.0,(13:06) (Shotgun) A.Rodgers pass short left to...,1,GB 41,GB,,2814.0
6,89,750.0,(12:30) (Shotgun) A.Rodgers sacked at GB 39 fo...,1,GB 39,GB,,2850.0
7,85,717.0,"(11:57) T.Masthay punts 29 yards to SEA 32, Ce...",1,GB 39,GB,PUNT,2883.0
8,98,713.0,(11:53) R.Wilson pass short left to P.Harvin t...,1,SEA 35,SEA,,2887.0
9,102,680.0,(11:20) M.Lynch left tackle to SEA 44 for 5 ya...,1,SEA 39,SEA,,2920.0


In [8]:
# Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
plays_parse['Home'] = points[points['eid'] == eid].Home.iloc[0]
plays_parse['Away'] = points[points['eid'] == eid].Away.iloc[0]
# plays_parse['Favorite'] = points[points['eid'] == eid].Favorite.iloc[0]
# plays_parse['Underdog'] = points[points['eid'] == eid].Underdog.iloc[0]

plays_parse['HomeScore'] = 0
plays_parse['AwayScore'] = 0
plays_parse['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]
plays_parse

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,HomeScore,AwayScore,CoverOrNot
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0,SEA,GB,0,0,1.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0,SEA,GB,0,0,1.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0,SEA,GB,0,0,1.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0,SEA,GB,0,0,1.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0,SEA,GB,0,0,1.0
5,83,786.0,(13:06) (Shotgun) A.Rodgers pass short left to...,1,GB 41,GB,,2814.0,SEA,GB,0,0,1.0
6,89,750.0,(12:30) (Shotgun) A.Rodgers sacked at GB 39 fo...,1,GB 39,GB,,2850.0,SEA,GB,0,0,1.0
7,85,717.0,"(11:57) T.Masthay punts 29 yards to SEA 32, Ce...",1,GB 39,GB,PUNT,2883.0,SEA,GB,0,0,1.0
8,98,713.0,(11:53) R.Wilson pass short left to P.Harvin t...,1,SEA 35,SEA,,2887.0,SEA,GB,0,0,1.0
9,102,680.0,(11:20) M.Lynch left tackle to SEA 44 for 5 ya...,1,SEA 39,SEA,,2920.0,SEA,GB,0,0,1.0


In [9]:
# Add score according FG and TD
for i, r in plays_parse.iterrows():
    
    # Dealing with points
    if r['note'] == 'TD':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 6
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 6
    if r['note'] == 'FG':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 3
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 3
    if r['note'] == 'XP':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 1
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 1
    if r['note'] == 'SAF':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 2
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 2
            
    # yrdln: home 0, away 100
    side = r['yrdln'].split(' ')[0]
    yrdln = int(r['yrdln'].split(' ')[1])
    if side == r['Home']:
        h_yrdln = yrdln
        v_yrdln = 100 - yrdln
    elif side == r['Away']:
        v_yrdln = yrdln
        h_yrdln = 100 - yrdln
    plays_parse.loc[i, 'h_yrdln'] = h_yrdln
    plays_parse.loc[i, 'v_yrdln'] = v_yrdln

plays_parse

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,HomeScore,AwayScore,CoverOrNot,h_yrdln,v_yrdln
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0,SEA,GB,0,0,1.0,35.0,65.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0,SEA,GB,0,0,1.0,87.0,13.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0,SEA,GB,0,0,1.0,81.0,19.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0,SEA,GB,0,0,1.0,76.0,24.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0,SEA,GB,0,0,1.0,61.0,39.0
5,83,786.0,(13:06) (Shotgun) A.Rodgers pass short left to...,1,GB 41,GB,,2814.0,SEA,GB,0,0,1.0,59.0,41.0
6,89,750.0,(12:30) (Shotgun) A.Rodgers sacked at GB 39 fo...,1,GB 39,GB,,2850.0,SEA,GB,0,0,1.0,61.0,39.0
7,85,717.0,"(11:57) T.Masthay punts 29 yards to SEA 32, Ce...",1,GB 39,GB,PUNT,2883.0,SEA,GB,0,0,1.0,61.0,39.0
8,98,713.0,(11:53) R.Wilson pass short left to P.Harvin t...,1,SEA 35,SEA,,2887.0,SEA,GB,0,0,1.0,35.0,65.0
9,102,680.0,(11:20) M.Lynch left tackle to SEA 44 for 5 ya...,1,SEA 39,SEA,,2920.0,SEA,GB,0,0,1.0,39.0,61.0


In [10]:
print plays_parse.iloc[-1]['HomeScore'] == points.iloc[0]['HomeScore'] and plays_parse.iloc[-1]['AwayScore'] == points.iloc[0]['AwayScore']

True


## Go through season 2014, take REG 1-16 as training set, 17 as testing set

In [152]:
def parseMatches(df, eid, points):
    try:
        df = df.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
        df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

        # [time, Missing Value] make endtime row as 0, since they are nan
        time_null_idx = df['time'].index[pd.isnull(df['time'].values)]
        df.loc[time_null_idx, 'time'] = '0:00'
        df['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in df['time'].values])

        if len(df.qtr.unique()) == 4:
            df['lefttime'] = 3600 - 15*60*(df['qtr']) + df['time'] # unit: seconds
        else: # suppose only 1 overtime
            df['lefttime'] = 3600 + 15*60 - df.iloc[-1].time - 15*60*(df['qtr']) + df['time'] # unit: seconds


        # [yrdln, Missing Value] Missing Value: make nan the same as previous one
        df['yrdln'] = df['yrdln'].fillna(method='ffill')

        # Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
        df['Home'] = points[points['eid'] == eid].Home.iloc[0]
        df['Away'] = points[points['eid'] == eid].Away.iloc[0]

        df['HomeScore'] = 0
        df['AwayScore'] = 0
        df['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]
        
        h_inter_i = []
        v_inter_i = []
        h_inter = 0
        v_inter = 0
        
        

        for i, r in df.iterrows():
            # Add score according FG and TD
            if r['note'] == 'TD':
                 # In case the touch down is catch by kick and run
                if 'kicks' in r['desc'] and r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                    v_inter += 1
                    v_inter_i.append(i)
                elif 'kicks' in r['desc'] and r['posteam'] == r['Away']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                    h_inter += 1
                    h_inter_i.append(i)


                # In case the touch down has been recovered
                elif 'RECOVERED' in r['desc']:
                    rec_team = r['desc'].split("RECOVERED by ", 1)[1][0:3]
                    if not rec_team[-1].isalpha():
                        rec_team = rec_team[0:2]

                    if r['Home'] == rec_team:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                        h_inter += 1
                        h_inter_i.append(i)
                    elif r['Away'] == rec_team:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                        v_inter += 1
                        v_inter_i.append(i)


                # In case the touch down complished by intercept
                elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                    v_inter += 1
                    v_inter_i.append(i)
                elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Away']: # In case the touch down has been recovered
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                    h_inter += 1
                    h_inter_i.append(i)

                # In case the touch down complished by punts
                elif 'punts' in r['desc']:
                    if r['Home'] == r['posteam']:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                        v_inter += 1
                        v_inter_i.append(i)
                    elif r['Away'] == r['posteam']:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                        h_inter += 1
                        h_inter_i.append(i)

                # TODO: check, In case there's other penalty while touch down
                elif 'PENALTY' in r['desc']:
                    pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
                    if r['Home'] == pen_team:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                    elif r['Away'] == pen_team:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

                # default touchdown
                elif r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

            if r['note'] == 'PENALTY':
                if 'TWO-POINT CONVERSION ATTEMPT' in r['desc']:
                    pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
                    if r['Away'] == pen_team:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                    elif r['Home'] == pen_team:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2



            if r['note'] == 'FG':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 3
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 3
            if r['note'] == 'XP':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 1
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 1
            if r['note'] == '2PS':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2

            if r['note'] == 'SAF':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2

            # yrdln: home 0, away 100
            yrdln_split = r['yrdln'].split(' ')
            if len(yrdln_split) == 1 and yrdln_split[0] == str(50):
                h_yrdln = 50
                v_yrdln = 50
            elif len(yrdln_split) == 2:
                side = r['yrdln'].split(' ')[0]
                yrdln = int(r['yrdln'].split(' ')[1])
                if side == r['Home']:
                    h_yrdln = yrdln
                    v_yrdln = 100 - yrdln
                elif side == r['Away']:
                    v_yrdln = yrdln
                    h_yrdln = 100 - yrdln
            else:
                print 'Error in parsing yrdln, eid=', eid, 'yrdln=', r['yrdln']


            df.loc[i, 'h_yrdln'] = h_yrdln
            df.loc[i, 'v_yrdln'] = v_yrdln
    except AttributeError:
        print 'AttributeError:', eid, df['time'].values

    ## Verify with final value
    parse_hs = df.iloc[-1]['HomeScore']
    parse_vs = df.iloc[-1]['AwayScore']
    real_hs = points[points['eid'] == eid]['HomeScore'].values[0]
    real_vs = points[points['eid'] == eid]['AwayScore'].values[0]
    
    if parse_hs == real_hs and parse_vs == real_vs:
        print 'Successfully finish:', eid
    else:
        # TODO: check which interception actually had following extra point, right now just start from first one
        if (0 < (real_hs - parse_hs) <= h_inter) or (0 < (real_vs - parse_vs) <= v_inter):
            for i in range(real_hs - parse_hs):
                idx = h_inter_i[i]
                df.loc[idx:, 'HomeScore'] += 1
                print 'Adjust Home Score', eid
            for i in range(real_vs - parse_vs):
                idx = v_inter_i[i]
                df.loc[idx:, 'AwayScore'] += 1
                print 'Adjust Away Score', eid
        else:
            print 'Fail finishing:', eid
            print 'parse home score =', df.iloc[-1]['HomeScore'], 'real home score =', points[points['eid'] == eid]['HomeScore'].values[0]
            print 'parse away score =', df.iloc[-1]['AwayScore'], 'real away score =', points[points['eid'] == eid]['AwayScore'].values[0]

In [156]:
for i, r in points[(points['season'] == 2014) & (points['week'] == 4)].iterrows():
    eid = r['eid']
    plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
    parseMatches(plays, eid, points)

Successfully finish: 2014092500
Successfully finish: 2014092801
Adjust Home Score 2014092802
Successfully finish: 2014092803
Successfully finish: 2014092800
Successfully finish: 2014092805
Adjust Away Score 2014092806
Fail finishing: 2014092807
parse home score = 30 real home score = 24
parse away score = 21 real away score = 27
Fail finishing: 2014092808
parse home score = 33 real home score = 33
parse away score = 8 real away score = 14
Successfully finish: 2014092804
Adjust Away Score 2014092809
Adjust Away Score 2014092809
Adjust Away Score 2014092809
Successfully finish: 2014092810
Adjust Home Score 2014092900


### Validation Block

In [147]:
eid = 2014090703
df = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
df = df.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

# [time, Missing Value] make endtime row as 0, since they are nan
time_null_idx = df['time'].index[pd.isnull(df['time'].values)]
df.loc[time_null_idx, 'time'] = '0:00'
df['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in df['time'].values])
if len(df.qtr.unique()) == 4:
    df['lefttime'] = 3600 - 15*60*(df['qtr']) + df['time'] # unit: seconds
else: # suppose only 1 overtime
    df['lefttime'] = 3600 + 15*60 - df.iloc[-1].time - 15*60*(df['qtr']) + df['time'] # unit: seconds

# [yrdln, Missing Value] Missing Value: make nan the same as previous one
df['yrdln'] = df['yrdln'].fillna(method='ffill')

# Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
df['Home'] = points[points['eid'] == eid].Home.iloc[0]
df['Away'] = points[points['eid'] == eid].Away.iloc[0]

df['HomeScore'] = 0
df['AwayScore'] = 0
df['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]

h_inter_i = []
v_inter_i = []
h_inter = 0
v_inter = 0
for i, r in df.iterrows():
    # Add score according FG and TD
    if r['note'] == 'TD':
         # In case the touch down is catch by kick and run
        if 'kicks' in r['desc'] and r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            v_inter += 1
            v_inter_i.append(i)
        elif 'kicks' in r['desc'] and r['posteam'] == r['Away']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            h_inter += 1
            h_inter_i.append(i)

        # In case the touch down has been recovered
        elif 'RECOVERED' in r['desc']:
            rec_team = r['desc'].split("RECOVERED by ", 1)[1][0:3]
            if not rec_team[-1].isdigit():
                rec_team = rec_team[0:2]
            if r['Home'] == rec_team:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                h_inter += 1
                h_inter_i.append(i)
            elif r['Away'] == rec_team:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                v_inter += 1
                v_inter_i.append(i)


        # In case the touch down complished by intercept
        elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            v_inter += 1
            v_inter_i.append(i)
        elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Away']: # In case the touch down has been recovered
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            h_inter += 1
            h_inter_i.append(i)

        # In case the touch down complished by punts
        elif 'punts' in r['desc']:
            if r['Home'] == r['posteam']:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                v_inter += 1
                v_inter_i.append(i)
            elif r['Away'] == r['posteam']:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                h_inter += 1
                h_inter_i.append(i)

        # TODO: check, In case there's other penalty while touch down
        elif 'PENALTY' in r['desc']:
            pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
            if r['Home'] == pen_team:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            elif r['Away'] == pen_team:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

        # default touchdown
        elif r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

    if r['note'] == 'PENALTY':
        if 'TWO-POINT CONVERSION ATTEMPT' in r['desc']:
            pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
            if r['Away'] == pen_team:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
            elif r['Home'] == pen_team:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2



    if r['note'] == 'FG':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 3
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 3
    if r['note'] == 'XP':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 1
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 1
    if r['note'] == '2PS':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2

    if r['note'] == 'SAF':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2



    # yrdln: home 0, away 100
    yrdln_split = r['yrdln'].split(' ')
    if len(yrdln_split) == 1 and yrdln_split[0] == str(50):
        h_yrdln = 50
        v_yrdln = 50
    elif len(yrdln_split) == 2:
        side = r['yrdln'].split(' ')[0]
        yrdln = int(r['yrdln'].split(' ')[1])
        if side == r['Home']:
            h_yrdln = yrdln
            v_yrdln = 100 - yrdln
        elif side == r['Away']:
            v_yrdln = yrdln
            h_yrdln = 100 - yrdln
    else:
        print 'Error in parsing yrdln, eid=', eid, 'yrdln=', r['yrdln']


    df.loc[i, 'h_yrdln'] = h_yrdln
    df.loc[i, 'v_yrdln'] = v_yrdln

## Verify with final value
parse_hs = df.iloc[-1]['HomeScore']
parse_vs = df.iloc[-1]['AwayScore']
real_hs = points[points['eid'] == eid]['HomeScore'].values[0]
real_vs = points[points['eid'] == eid]['AwayScore'].values[0]

if parse_hs == real_hs and parse_vs == real_vs:
    print 'Successfully finish:', eid
else:
    # TODO: check which interception actually had following extra point, right now just start from first one
    if (0 < (real_hs - parse_hs) <= h_inter) or (0 < (real_vs - parse_vs) <= v_inter):
        for i in range(real_hs - parse_hs):
            idx = h_inter_i[i]
            df.loc[idx:, 'HomeScore'] += 1
            print 'Adjust Home Score', eid
        for i in range(real_vs - parse_vs):
            idx = v_inter_i[i]
            df.loc[idx:, 'AwayScore'] += 1
            print 'Adjust Away Score', eid
    else:
        print 'Fail finishing:', eid
        print 'parse home score =', df.iloc[-1]['HomeScore'], 'real home score =', points[points['eid'] == eid]['HomeScore'].values[0]
        print 'parse away score =', df.iloc[-1]['AwayScore'], 'real away score =', points[points['eid'] == eid]['AwayScore'].values[0]

Fail finishing: 2014090703
parse home score = 10 real home score = 17
parse away score = 6 real away score = 6


In [149]:
df[df.qtr == 2]

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,HomeScore,AwayScore,CoverOrNot,h_yrdln,v_yrdln
37,61,900.0,(15:00) (Shotgun) R.Griffin pass short right t...,2,WAS 34,WAS,,2700.0,HOU,WAS,0,0,1.0,66.0,34.0
38,57,862.0,(14:22) (Shotgun) R.Griffin pass short middle ...,2,WAS 43,WAS,,2662.0,HOU,WAS,0,0,1.0,57.0,43.0
39,50,825.0,(13:45) (Shotgun) R.Griffin pass short left to...,2,WAS 45,WAS,,2625.0,HOU,WAS,0,0,1.0,55.0,45.0
40,52,786.0,(13:06) (Shotgun) R.Helu up the middle pushed ...,2,WAS 48,WAS,,2586.0,HOU,WAS,0,0,1.0,52.0,48.0
41,58,759.0,(12:39) R.Griffin pass short right to D.Young ...,2,HOU 36,WAS,,2559.0,HOU,WAS,0,0,1.0,36.0,64.0
42,59,759.0,"(12:39) (Shotgun) PENALTY on WAS-C.Chester, Fa...",2,HOU 35,WAS,PENALTY,2559.0,HOU,WAS,0,0,1.0,35.0,65.0
43,53,712.0,(11:52) (Shotgun) R.Helu up the middle to HOU ...,2,HOU 40,WAS,,2512.0,HOU,WAS,0,0,1.0,40.0,60.0
44,55,675.0,(11:15) (Shotgun) R.Griffin sacked at HOU 46 f...,2,HOU 34,WAS,,2475.0,HOU,WAS,0,0,1.0,34.0,66.0
45,56,640.0,"(10:40) T.Way punts 38 yards to HOU 8, Center-...",2,HOU 46,WAS,PUNT,2440.0,HOU,WAS,0,0,1.0,46.0,54.0
46,48,631.0,(10:31) (Shotgun) A.Foster right guard to HOU ...,2,HOU 8,HOU,,2431.0,HOU,WAS,0,0,1.0,8.0,92.0


In [150]:
eid = 2014090703
df = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()
df[df.sp == 1]

Unnamed: 0,index,drive,play,down,time,desc,ydstogo,qtr,ydsnet,yrdln,sp,posteam,note
55,70,10,1322,1,06:11,"(6:11) D.Young up the middle for 1 yard, TOUCH...",1,2,46,HOU 1,1,WAS,TD
61,64,11,1473,2,04:28,(4:28) R.Fitzpatrick pass deep right to D.Hopk...,7,2,89,HOU 24,1,HOU,TD
62,65,11,1506,0,04:15,"R.Bullock extra point is GOOD, Center-J.Weeks,...",0,2,89,WAS 2,1,HOU,XP
67,83,12,1617,4,02:15,"(2:15) T.Way punt is BLOCKED by A.Blue, Center...",7,2,3,WAS 24,1,WAS,TD
147,139,19,3596,4,02:00,"(2:00) R.Bullock 42 yard field goal is GOOD, C...",17,4,68,WAS 24,1,HOU,FG


In [151]:
df.iloc[67].desc, df.iloc[67].posteam

('(2:15) T.Way punt is BLOCKED by A.Blue, Center-N.Sundberg, RECOVERED by HOU-A.Blue at WAS 5. A.Blue for 5 yards, TOUCHDOWN.',
 'WAS')

In [75]:
range(2)

[0, 1]