## Import library

In [1]:
import pandas as pd
import numpy as np
import csv
import json
import requests
from xml.etree import ElementTree
import glob
import re
import sklearn.linear_model as lm

## Load dataset

In [2]:
points = pd.read_csv('data/2014_point_spread_footballlocks.csv')
points.head()

Unnamed: 0,eid,season,week,Home,Away,HomeScore,AwayScore,Day,Time,Favorite,Underdog,Spread,CoverOrNot
0,2014090400,2014,1,SEA,GB,36,16,Thu,8:30,SEA,GB,-5.0,1.0
1,2014090700,2014,1,ATL,NO,37,34,Sun,1:00,NO,ATL,-3.0,-1.0
2,2014090701,2014,1,BAL,CIN,16,23,Sun,1:00,BAL,CIN,-1.0,-1.0
3,2014090702,2014,1,CHI,BUF,20,23,Sun,1:00,CHI,BUF,-7.0,-1.0
4,2014090703,2014,1,HOU,WAS,17,6,Sun,1:00,HOU,WAS,-3.0,1.0


In [3]:
a = points.Underdog.unique()
b = points.Favorite.unique()
c = np.setdiff1d(np.union1d(a, b), np.intersect1d(a, b))
c

array(['OAK'], dtype=object)

In [3]:
# ## TODO: change to complete dataset
# covers = []
# for i, r in points.iterrows():
#     h = r['Home']
#     a = r['Away']
#     f = r['Favorite']
#     u = r['Underdog']
#     hs = r['HomeScore']
#     vs = r['AwayScore']
#     fs = -1
#     us = -1
#     if h == f:
#         fs = hs
#         us = vs
#     else:
#         fs = vs
#         us = hs
#     if (fs == -1) or (us == -1):
#         print "Favorite or Underdog not match"
#         print r
#         break
#     if (fs - us) > -r['Spread']:
#         cover = 1
#     elif (fs - us) == -r['Spread']:
#         cover = 0
#     else:
#         cover = -1
#     covers.append(cover)
# points["CoverOrFail"] = pd.Series(covers)

In [192]:
len(points.Favorite.unique())

31

In [191]:
len(points.Underdog.unique())

32

## Take Season 2014 week 1 game 1 as example

In [245]:
eid = 2014090400

In [246]:
plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
plays = pd.read_csv(str(eid)+'.csv')

In [247]:
plays_parse = plays.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
plays_parse = plays_parse.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

# [time, Missing Value] make endtime row as 0, since they are nan
time_null_idx = plays_parse['time'].index[pd.isnull(plays_parse['time'].values)]
plays_parse.loc[time_null_idx, 'time'] = '0:00'
plays_parse['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in plays_parse['time'].values])
plays_parse['lefttime'] = 3600 - 15*60*(plays_parse['qtr']-1) - plays_parse['time']

# [yrdln, Missing Value] Missing Value: make nan the same as previous one
plays_parse['yrdln'] = plays_parse['yrdln'].fillna(method='ffill')

plays_parse.head()

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0


In [248]:
# Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
plays_parse['Home'] = points[points['eid'] == eid].Home.iloc[0]
plays_parse['Away'] = points[points['eid'] == eid].Away.iloc[0]
# plays_parse['Favorite'] = points[points['eid'] == eid].Favorite.iloc[0]
# plays_parse['Underdog'] = points[points['eid'] == eid].Underdog.iloc[0]

plays_parse['HomeScore'] = 0
plays_parse['AwayScore'] = 0
plays_parse['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]
plays_parse.head()

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,HomeScore,AwayScore,CoverOrNot
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0,SEA,GB,0,0,1.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0,SEA,GB,0,0,1.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0,SEA,GB,0,0,1.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0,SEA,GB,0,0,1.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0,SEA,GB,0,0,1.0


In [249]:
# Add score according FG and TD
for i, r in plays_parse.iterrows():
    
    # Dealing with points
    if r['note'] == 'TD':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 6
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 6
    if r['note'] == 'FG':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 3
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 3
    if r['note'] == 'XP':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 1
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 1
    if r['note'] == 'SAF':
        if r['posteam'] == r['Home']:
            plays_parse.loc[i:, 'AwayScore'] = plays_parse.loc[i:, 'AwayScore'] + 2
        elif r['posteam'] == r['Away']:
            plays_parse.loc[i:, 'HomeScore'] = plays_parse.loc[i:, 'HomeScore'] + 2
            
    # yrdln: home 0, away 100
    side = r['yrdln'].split(' ')[0]
    yrdln = int(r['yrdln'].split(' ')[1])
    if side == r['Home']:
        h_yrdln = yrdln
        v_yrdln = 100 - yrdln
    elif side == r['Away']:
        v_yrdln = yrdln
        h_yrdln = 100 - yrdln
    plays_parse.loc[i, 'h_yrdln'] = h_yrdln
    plays_parse.loc[i, 'v_yrdln'] = v_yrdln

plays_parse.head()

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,HomeScore,AwayScore,CoverOrNot,h_yrdln,v_yrdln
0,86,900.0,S.Hauschka kicks 71 yards from SEA 35 to GB -6...,1,SEA 35,SEA,KICKOFF,2700.0,SEA,GB,0,0,1.0,35.0,65.0
1,84,896.0,(14:56) E.Lacy right tackle to GB 19 for 6 yar...,1,GB 13,GB,,2704.0,SEA,GB,0,0,1.0,87.0,13.0
2,90,870.0,(14:30) E.Lacy left tackle to GB 22 for 3 yard...,1,GB 19,GB,PENALTY,2730.0,SEA,GB,0,0,1.0,81.0,19.0
3,88,851.0,(14:11) (Shotgun) E.Lacy up the middle to GB 3...,1,GB 24,GB,,2749.0,SEA,GB,0,0,1.0,76.0,24.0
4,87,812.0,(13:32) (No Huddle) J.Starks right guard to GB...,1,GB 39,GB,,2788.0,SEA,GB,0,0,1.0,61.0,39.0


In [250]:
print plays_parse.iloc[-1]['HomeScore'] == points.iloc[0]['HomeScore'] and plays_parse.iloc[-1]['AwayScore'] == points.iloc[0]['AwayScore']

True


## Go through season 2014, take REG 1-16 as training set, 17 as testing set

In [33]:
def parseMatches(df, eid, points):
    try:
        df = df.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
        df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

        # [time, Missing Value] make endtime row as 0, since they are nan
        time_null_idx = df['time'].index[pd.isnull(df['time'].values)]
        df.loc[time_null_idx, 'time'] = '0:00'
        df['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in df['time'].values])

        if len(df.qtr.unique()) == 4:
            df['lefttime'] = 3600 - 15*60*(df['qtr']) + df['time'] # unit: seconds
        else: # suppose only 1 overtime
            df['lefttime'] = 3600 + 15*60 - df.iloc[-1].time - 15*60*(df['qtr']) + df['time'] # unit: seconds


        # [yrdln, Missing Value] Missing Value: make nan the same as previous one
        df['yrdln'] = df['yrdln'].fillna(method='ffill')

        # Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
        df['Home'] = points[points['eid'] == eid].Home.iloc[0]
        df['Away'] = points[points['eid'] == eid].Away.iloc[0]
        df['Favorite'] = points[points['eid'] == eid].Favorite.iloc[0]
        df['Underdog'] = points[points['eid'] == eid].Underdog.iloc[0]
        df['HomeScore'] = 0
        df['AwayScore'] = 0
        df['eid'] = eid
        df['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]
        
        h_inter_i = []
        v_inter_i = []
        h_inter = 0
        v_inter = 0
        
        

        for i, r in df.iterrows():
            # Add score according FG and TD
            if r['note'] == 'TD':
                # In case the touch down has been recovered
                if 'RECOVERED' in r['desc']:
                    rec_team = r['desc'].split("RECOVERED by ", 1)[1][0:3]
                    if not rec_team[-1].isalpha():
                        rec_team = rec_team[0:2]
                    if 'REVERSED' in r['desc'].split("RECOVERED by ", 1)[1]:
                        if r['Home'] == rec_team:
                            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                            v_inter += 1
                            v_inter_i.append(i)
                        elif r['Away'] == rec_team:
                            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                            h_inter += 1
                            h_inter_i.append(i)

                    else:
                        if r['Home'] == rec_team:
                            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                            h_inter += 1
                            h_inter_i.append(i)
                        elif r['Away'] == rec_team:
                            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                            v_inter += 1
                            v_inter_i.append(i)


                # In case the touch down is catch by kick and run
                elif 'kicks' in r['desc'] and r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                    v_inter += 1
                    v_inter_i.append(i)
                elif 'kicks' in r['desc'] and r['posteam'] == r['Away']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                    h_inter += 1
                    h_inter_i.append(i)




                # In case the touch down complished by intercept
                elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                    v_inter += 1
                    v_inter_i.append(i)
                elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Away']: # In case the touch down has been recovered
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                    h_inter += 1
                    h_inter_i.append(i)

                # In case the touch down complished by punts
                elif 'punts' in r['desc']:
                    if r['Home'] == r['posteam']:
                        df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                        v_inter += 1
                        v_inter_i.append(i)
                    elif r['Away'] == r['posteam']:
                        df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                        h_inter += 1
                        h_inter_i.append(i)

                # TODO: check, In case there's other penalty while touch down
#                 elif 'PENALTY' in r['desc']:
#                     pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
#                     if r['Home'] == pen_team:
#                         df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
#                     elif r['Away'] == pen_team:
#                         df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

                # default touchdown
                elif r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

            if r['note'] == 'PENALTY':
                if 'TWO-POINT CONVERSION ATTEMPT' in r['desc']:
                    pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
                    if not pen_team[-1].isalpha():
                        pen_team = pen_team[0:2]

                    if 'ATTEMPT FAILS' in r['desc']:
                        pass
                    elif 'ATTEMPT SUCCEEDS' in r['desc']:
                        if r['Away'] == pen_team:
                            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                        elif r['Home'] == pen_team:
                            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2


            if r['note'] == 'FG':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 3
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 3
            if r['note'] == 'XP':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 1
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 1
            if r['note'] == '2PS':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2

            if r['note'] == '2PR':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2


            if r['note'] == 'SAF':
                if r['posteam'] == r['Home']:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2
                elif r['posteam'] == r['Away']:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
            
            # Missing data: 2014112307 2nd quarter starter -- TOUCHDOWN
            if eid == 2014112307 and i == 52:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6

            # yrdln: home 0, away 100
            yrdln_split = r['yrdln'].split(' ')
            if len(yrdln_split) == 1 and yrdln_split[0] == str(50):
                f_yrdln = 50
                u_yrdln = 50
            elif len(yrdln_split) == 2:
                side = r['yrdln'].split(' ')[0]
                yrdln = int(r['yrdln'].split(' ')[1])
                if side == r['Favorite']:
                    f_yrdln = yrdln
                    u_yrdln = 100 - yrdln
                elif side == r['Underdog']:
                    u_yrdln = yrdln
                    f_yrdln = 100 - yrdln
            else:
                print 'Error in parsing yrdln, eid=', eid, 'yrdln=', r['yrdln']

            df.loc[i, 'f_yrdln'] = f_yrdln
            df.loc[i, 'u_yrdln'] = u_yrdln
    except AttributeError:
        print 'AttributeError:', eid, df['time'].values
        
    # Assign Favorite score and Underdog score
    if df['Home'].values[0] == df['Favorite'].values[0]:
        df['FavoriteScore'] = df['HomeScore']
        df['UnderdogScore'] = df['AwayScore']
    elif df['Home'].values[0] == df['Underdog'].values[0]:
        df['FavoriteScore'] = df['AwayScore']
        df['UnderdogScore'] = df['HomeScore']


    ## Verify with final value
    parse_hs = df.iloc[-1]['HomeScore']
    parse_vs = df.iloc[-1]['AwayScore']
    real_hs = points[points['eid'] == eid]['HomeScore'].values[0]
    real_vs = points[points['eid'] == eid]['AwayScore'].values[0]
    
    if parse_hs == real_hs and parse_vs == real_vs:
        print 'Successfully finish:', eid
    else:
        # TODO: check which interception actually had following extra point, right now just start from first one
        if (0 < (real_hs - parse_hs) <= h_inter) or (0 < (real_vs - parse_vs) <= v_inter):
            try:
                for i in range(real_hs - parse_hs):
                    idx = h_inter_i[i]
                    df.loc[idx:, 'HomeScore'] += 1
                    print 'Adjust Home Score', eid
                for i in range(real_vs - parse_vs):
                    idx = v_inter_i[i]
                    df.loc[idx:, 'AwayScore'] += 1
                    print 'Adjust Away Score', eid
            except IndexError:
                print 'IndexError:', h_inter_i, v_inter_i, real_vs - parse_vs

        else:
            print 'Fail finishing:', eid
            print 'parse home score =', df.iloc[-1]['HomeScore'], 'real home score =', points[points['eid'] == eid]['HomeScore'].values[0]
            print 'parse away score =', df.iloc[-1]['AwayScore'], 'real away score =', points[points['eid'] == eid]['AwayScore'].values[0]
    return df

In [35]:
for i, r in points[(points['season'] == 2014) & (points['week'] == 18)].iterrows():
    eid = r['eid']
    plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
    plays = parseMatches(plays, eid, points)
    f_plays = plays.loc[:, ['qtr', 'lefttime', 'FavoriteScore', 'f_yrdln', 'CoverOrNot']]
    f_plays.rename(columns={
        'qtr': 'qtr',
        'lefttime': 'lefttime',
        'FavoriteScore': 'score',
        'f_yrdln': 'yrdln',
        'CoverOrNot': 'cover',
    }, inplace=True)
    f_plays['team'] = plays.Favorite.unique()[0]
    u_plays = plays.loc[:, ['qtr', 'lefttime', 'UnderdogScore', 'u_yrdln', 'CoverOrNot']]
    u_plays.rename(columns={
        'qtr': 'qtr',
        'lefttime': 'lefttime',
        'UnderdogScore': 'score',
        'u_yrdln': 'yrdln',
        'CoverOrNot': 'cover',
    }, inplace=True)
    u_plays['team'] = plays.Underdog.values[0]
    u_plays['cover'] = -u_plays['cover']

Successfully finish: 2015010300
Successfully finish: 2015010301
Successfully finish: 2015010400
Successfully finish: 2015010401


### Validation Block

In [27]:
eid = 2015010300
df = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
df = df.loc[:, ['time', 'desc', 'qtr', 'yrdln', 'posteam', 'note']].copy()
df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()

# [time, Missing Value] make endtime row as 0, since they are nan
time_null_idx = df['time'].index[pd.isnull(df['time'].values)]
df.loc[time_null_idx, 'time'] = '0:00'
df['time'] = pd.Series([float(a.split(':')[0])*60+float(a.split(':')[1]) for a in df['time'].values])
if len(df.qtr.unique()) == 4:
    df['lefttime'] = 3600 - 15*60*(df['qtr']) + df['time'] # unit: seconds
else: # suppose only 1 overtime
    df['lefttime'] = 3600 + 15*60 - df.iloc[-1].time - 15*60*(df['qtr']) + df['time'] # unit: seconds

# [yrdln, Missing Value] Missing Value: make nan the same as previous one
df['yrdln'] = df['yrdln'].fillna(method='ffill')

# Assign Home, Away, HomeScore, AwayScore, CoverOrNot to current match
df['Home'] = points[points['eid'] == eid].Home.iloc[0]
df['Away'] = points[points['eid'] == eid].Away.iloc[0]
df['eid'] = eid

df['HomeScore'] = 0
df['AwayScore'] = 0
df['CoverOrNot'] = points[points['eid'] == eid].CoverOrNot.iloc[0]

h_inter_i = []
v_inter_i = []
h_inter = 0
v_inter = 0
for i, r in df.iterrows():
    # Add score according FG and TD
    if r['note'] == 'TD':
        # In case the touch down has been recovered
        if 'RECOVERED' in r['desc']:
            rec_team = r['desc'].split("RECOVERED by ", 1)[1][0:3]
            if not rec_team[-1].isalpha():
                rec_team = rec_team[0:2]
            if r['Home'] == rec_team:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                h_inter += 1
                h_inter_i.append(i)
            elif r['Away'] == rec_team:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                v_inter += 1
                v_inter_i.append(i)


         # In case the touch down is catch by kick and run
        elif 'kicks' in r['desc'] and r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            v_inter += 1
            v_inter_i.append(i)
        elif 'kicks' in r['desc'] and r['posteam'] == r['Away']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            h_inter += 1
            h_inter_i.append(i)



        # In case the touch down complished by intercept
        elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            v_inter += 1
            v_inter_i.append(i)
        elif 'INTERCEPTED' in r['desc'] and r['posteam'] == r['Away']: # In case the touch down has been recovered
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            h_inter += 1
            h_inter_i.append(i)

        # In case the touch down complished by punts
        elif 'punts' in r['desc']:
            if r['Home'] == r['posteam']:
                df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
                v_inter += 1
                v_inter_i.append(i)
            elif r['Away'] == r['posteam']:
                df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
                h_inter += 1
                h_inter_i.append(i)

        # TODO: check, In case there's other penalty while touch down
#         elif 'PENALTY' in r['desc']:
#             pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
#             if r['Home'] == pen_team:
#                 df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
#             elif r['Away'] == pen_team:
#                 df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6

        # default touchdown
        elif r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 6
            print r['posteam'], r['desc']
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 6
            print r['posteam'], r['desc']

    if r['note'] == 'PENALTY':
        if 'TWO-POINT CONVERSION ATTEMPT' in r['desc']:
            pen_team = r['desc'].split("PENALTY on ", 1)[1][0:3]
            if not pen_team[-1].isalpha():
                pen_team = pen_team[0:2]

            if 'ATTEMPT FAILS' in r['desc']:
                pass
            elif 'ATTEMPT SUCCEEDS' in r['desc']:
                if r['Away'] == pen_team:
                    df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
                elif r['Home'] == pen_team:
                    df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2


    if r['note'] == '2PR':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2



    if r['note'] == 'FG':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 3
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 3
    if r['note'] == 'XP':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 1
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 1
    if r['note'] == '2PS':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2

    if r['note'] == 'SAF':
        if r['posteam'] == r['Home']:
            df.loc[i:, 'AwayScore'] = df.loc[i:, 'AwayScore'] + 2
        elif r['posteam'] == r['Away']:
            df.loc[i:, 'HomeScore'] = df.loc[i:, 'HomeScore'] + 2



    # yrdln: home 0, away 100
    yrdln_split = r['yrdln'].split(' ')
    if len(yrdln_split) == 1 and yrdln_split[0] == str(50):
        h_yrdln = 50
        v_yrdln = 50
    elif len(yrdln_split) == 2:
        side = r['yrdln'].split(' ')[0]
        yrdln = int(r['yrdln'].split(' ')[1])
        if side == r['Home']:
            h_yrdln = yrdln
            v_yrdln = 100 - yrdln
        elif side == r['Away']:
            v_yrdln = yrdln
            h_yrdln = 100 - yrdln
    else:
        print 'Error in parsing yrdln, eid=', eid, 'yrdln=', r['yrdln']


    df.loc[i, 'h_yrdln'] = h_yrdln
    df.loc[i, 'v_yrdln'] = v_yrdln

## Verify with final value
parse_hs = df.iloc[-1]['HomeScore']
parse_vs = df.iloc[-1]['AwayScore']
real_hs = points[points['eid'] == eid]['HomeScore'].values[0]
real_vs = points[points['eid'] == eid]['AwayScore'].values[0]

if parse_hs == real_hs and parse_vs == real_vs:
    print 'Successfully finish:', eid
else:
    # TODO: check which interception actually had following extra point, right now just start from first one
    if (0 < (real_hs - parse_hs) <= h_inter) or (0 < (real_vs - parse_vs) <= v_inter):
        for i in range(real_hs - parse_hs):
            idx = h_inter_i[i]
            df.loc[idx:, 'HomeScore'] += 1
            print 'Adjust Home Score', eid
        for i in range(real_vs - parse_vs):
            idx = v_inter_i[i]
            df.loc[idx:, 'AwayScore'] += 1
            print 'Adjust Away Score', eid
    else:
        print 'Fail finishing:', eid
        print 'parse home score =', df.iloc[-1]['HomeScore'], 'real home score =', points[points['eid'] == eid]['HomeScore'].values[0]
        print 'parse away score =', df.iloc[-1]['AwayScore'], 'real away score =', points[points['eid'] == eid]['AwayScore'].values[0]

CAR (5:29) (No Huddle, Shotgun) J.Stewart right tackle for 13 yards, TOUCHDOWN.
ARI (15:00) R.Lindley pass short middle to D.Fells for 1 yard, TOUCHDOWN.
CAR (5:50) (Shotgun) C.Newton pass short right to F.Whittaker for 39 yards, TOUCHDOWN.
CAR (4:04) C.Newton pass short left to M.Tolbert for 1 yard, TOUCHDOWN.
Fail finishing: 2015010300
parse home score = 33 real home score = 27
parse away score = 10 real away score = 16


In [30]:
df[df.qtr == 2]

Unnamed: 0,index,time,desc,qtr,yrdln,posteam,note,lefttime,Home,Away,eid,HomeScore,AwayScore,CoverOrNot,h_yrdln,v_yrdln
43,107,900.0,(15:00) R.Lindley pass short middle to D.Fells...,2,CAR 1,ARI,TD,2700.0,CAR,ARI,2015010300,10,6,1.0,1.0,99.0
44,90,896.0,(14:56) (Shotgun) C.Newton pass incomplete dee...,2,CAR 20,CAR,PENALTY,2696.0,CAR,ARI,2015010300,10,6,1.0,20.0,80.0
45,96,896.0,C.Catanzaro kicks 65 yards from ARI 35 to end ...,2,ARI 35,ARI,KICKOFF,2696.0,CAR,ARI,2015010300,10,6,1.0,65.0,35.0
46,101,896.0,"C.Catanzaro extra point is GOOD, Center-M.Leac...",2,CAR 2,ARI,XP,2696.0,CAR,ARI,2015010300,10,7,1.0,2.0,98.0
47,89,891.0,(14:51) (Shotgun) J.Stewart left end to CAR 40...,2,CAR 30,CAR,,2691.0,CAR,ARI,2015010300,10,7,1.0,30.0,70.0
48,98,891.0,"(14:51) PENALTY on ARI-D.Williams, Neutral Zon...",2,CAR 25,CAR,PENALTY,2691.0,CAR,ARI,2015010300,10,7,1.0,25.0,75.0
49,92,861.0,"(14:21) (No Huddle, Shotgun) J.Stewart right t...",2,CAR 40,CAR,,2661.0,CAR,ARI,2015010300,10,7,1.0,40.0,60.0
50,93,825.0,"(13:45) (No Huddle) PENALTY on CAR-B.Bell, Fal...",2,ARI 30,CAR,PENALTY,2625.0,CAR,ARI,2015010300,10,7,1.0,70.0,30.0
51,94,825.0,"(13:45) (No Huddle, Shotgun) C.Newton right en...",2,ARI 42,CAR,,2625.0,CAR,ARI,2015010300,10,7,1.0,58.0,42.0
52,97,774.0,(12:54) (Shotgun) C.Newton pass incomplete dee...,2,ARI 35,CAR,,2574.0,CAR,ARI,2015010300,10,7,1.0,65.0,35.0


In [31]:
df.iloc[78].desc, df.iloc[78].posteam

('(2:50) M.Grice up the middle to CAR 1 for no gain (K.Short). FUMBLES (K.Short), RECOVERED by CAR-R.Harper at CAR 1. The Replay Official challenged the fumble ruling, and the play was REVERSED. M.Grice up the middle for 1 yard, TOUCHDOWN.',
 'ARI')

In [408]:
eid = 2014112307
df = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
df = df.sort_values(by=['qtr', 'time'], ascending=[True, False], axis=0).reset_index()
df[df.sp == 1]

Unnamed: 0,index,drive,play,down,time,desc,ydstogo,qtr,ydsnet,yrdln,sp,posteam,note
0,73,1,36,0,15:00,R.Succop kicks 72 yards from TEN 35 to PHI -7....,0,1,0,TEN 35,1,TEN,TD
1,72,1,54,0,14:47,"C.Parkey extra point is GOOD, Center-J.Dorenbo...",0,1,0,TEN 2,1,PHI,XP
14,76,3,334,2,10:49,(10:49) (No Huddle) D.Sproles left end for 4 y...,4,1,51,TEN 4,1,PHI,TD
15,80,3,351,0,10:44,"C.Parkey extra point is GOOD, Center-J.Dorenbo...",0,1,51,TEN 2,1,PHI,XP
41,104,7,982,4,03:28,"(3:28) C.Parkey 36 yard field goal is GOOD, Ce...",18,1,46,TEN 18,1,PHI,FG
52,130,10,1268,0,14:51,"R.Succop extra point is GOOD, Center-B.Brinkle...",0,2,30,PHI 2,1,TEN,XP
59,128,11,1467,4,11:33,"(11:33) C.Parkey 26 yard field goal is GOOD, C...",2,2,73,TEN 7,1,PHI,FG
67,146,12,1642,2,08:26,"(8:26) S.Greene up the middle for 2 yards, TOU...",2,2,81,PHI 2,1,TEN,TD
69,148,12,1659,0,08:22,"R.Succop extra point is GOOD, Center-B.Brinkle...",0,2,81,PHI 2,1,TEN,XP
79,141,13,1906,1,04:37,"(4:37) (No Huddle, Shotgun) L.McCoy left guard...",2,2,69,TEN 2,1,PHI,TD


In [373]:
df.iloc[160].desc, df.iloc[160].posteam

('(Pass formation) TWO-POINT CONVERSION ATTEMPT. T.Bridgewater pass to Ch.Johnson is complete. ATTEMPT SUCCEEDS. PENALTY on GB-M.Neal, Roughing the Passer, 15 yards, enforced between downs.',
 'MIN')

In [75]:
range(2)

[0, 1]

### Create clean dataset

In [36]:
dataset = pd.DataFrame()
for w in range(1, 23):
    for i, r in points[(points['season'] == 2014) & (points['week'] == w)].iterrows():
        eid = r['eid']
        plays = pd.read_csv('data/game_data/'+str(eid)+'_plays.csv')
        plays = parseMatches(plays, eid, points)
        f_plays = plays.loc[:, ['qtr', 'lefttime', 'FavoriteScore', 'f_yrdln', 'CoverOrNot']]
        f_plays.rename(columns={
            'qtr': 'qtr',
            'lefttime': 'lefttime',
            'FavoriteScore': 'score',
            'f_yrdln': 'yrdln',
            'CoverOrNot': 'cover',
        }, inplace=True)
        f_plays['eid'] = eid
        f_plays['season'] = 2014
        f_plays['week'] = w
        f_plays['team'] = plays.Favorite.unique()[0]
        print list(f_plays)
        u_plays = plays.loc[:, ['qtr', 'lefttime', 'UnderdogScore', 'u_yrdln', 'CoverOrNot']]
        u_plays.rename(columns={
            'qtr': 'qtr',
            'lefttime': 'lefttime',
            'UnderdogScore': 'score',
            'u_yrdln': 'yrdln',
            'CoverOrNot': 'cover',
        }, inplace=True)
        u_plays['eid'] = eid
        u_plays['season'] = 2014
        u_plays['week'] = w
        u_plays['team'] = plays.Underdog.values[0]
        u_plays['cover'] = -u_plays['cover']
        dataset = pd.concat([dataset, f_plays, u_plays])
dataset.shape

Successfully finish: 2014090400
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090700
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090701
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090702
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014090703
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090704
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090705
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014090706
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014090707
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team'

Successfully finish: 2014100511
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014100512
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014100600
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014100900
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101201
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101202
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101203
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101204
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014101205
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team'

Adjust Home Score 2014111000
Adjust Home Score 2014111000
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111300
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111601
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111602
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111604
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111600
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111605
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111606
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014111607
['qtr', 'lefttime', 'score', 'yrdln', 'cover',

Successfully finish: 2014121411
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014121405
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014121412
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014121413
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014121500
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014121800
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014122000
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Adjust Home Score 2014122001
Adjust Away Score 2014122001
['qtr', 'lefttime', 'score', 'yrdln', 'cover', 'eid', 'season', 'week', 'team']
Successfully finish: 2014122103
['qtr', 'lefttime', 'score', 'yrdln', 'cover',

(94914, 9)

In [442]:
dataset.to_csv('data/201401-201417_baseline_cleandata.csv', index=False)