In [1]:
import pandas as pd
import numpy as np

# Load the per-drive table (tab-separated). index_col=False tells pandas not to
# use the first column of the file as the row index.
drives_path = "Data/2018drives_augmented.tsv"
df = pd.read_csv(
    drives_path,
    sep="\t",
    index_col=False,
)
df

Unnamed: 0,offense,defense,game_id,id,plays,start_yardline,yards,end_yardline,drive_result,points,start_distance,dist_range
0,Alabama,Louisville,401012246,4010122461,7,35,65,100,TD,7,65,"(60.0, 65.0]"
1,Louisville,Alabama,401012246,4010122462,4,75,20,55,PUNT,0,75,"(70.0, 75.0]"
2,Alabama,Louisville,401012246,4010122463,8,20,63,65,FUMBLE,0,80,"(75.0, 80.0]"
3,Louisville,Alabama,401012246,4010122464,3,83,-1,84,PUNT,0,83,"(80.0, 85.0]"
4,Alabama,Louisville,401012246,4010122465,6,45,55,100,TD,7,55,"(50.0, 55.0]"
5,Louisville,Alabama,401012246,4010122466,8,75,49,26,INT,0,75,"(70.0, 75.0]"
6,Alabama,Louisville,401012246,4010122467,6,25,75,100,TD,7,75,"(70.0, 75.0]"
7,Louisville,Alabama,401012246,4010122468,6,86,31,55,PUNT,0,86,"(85.0, 90.0]"
8,Alabama,Louisville,401012246,4010122469,5,22,19,41,PUNT,0,78,"(75.0, 80.0]"
9,Louisville,Alabama,401012246,40101224610,4,80,20,60,PUNT,0,80,"(75.0, 80.0]"


In [2]:
# Load the table aggregated by starting distance (start_distance, driveCount,
# meanPPD columns are used by the regression below).
grouped_path = "Data/2018drives_grouped.tsv"
grouped_df = pd.read_csv(
    grouped_path,
    sep="\t",
    index_col=False,
)
grouped_df

Unnamed: 0,start_distance,driveCount,meanPPD
0,3,16,5.875000
1,7,14,5.285714
2,9,14,5.071429
3,10,11,5.000000
4,11,15,5.000000
5,12,11,6.000000
6,13,16,4.687500
7,14,16,4.375000
8,15,13,5.230769
9,17,10,5.200000


In [3]:
from sklearn.linear_model import LinearRegression

# Fit meanPPD as a linear function of start_distance, weighting every
# start_distance bucket by the number of drives it contains.
# Both x and y are column vectors, so coef_ and predict() come back 2-D.
x = grouped_df["start_distance"].to_numpy().reshape(-1, 1)
y = grouped_df["meanPPD"].to_numpy().reshape(-1, 1)
weights = grouped_df["driveCount"].to_numpy()

reg = LinearRegression()
reg.fit(x, y, sample_weight=weights)

# Weighted R^2 of the fit, then the slope and intercept.
print(reg.score(x, y, sample_weight=weights))
print()
print(reg.coef_)
print(reg.intercept_)

0.8662863447771729

[[-0.04543692]]
[5.3591538]


In [4]:
# Keep only the columns needed from here on, then attach the regression's
# predicted points for each drive's starting distance.
keep_cols = ["offense", "defense", "game_id", "start_distance", "points"]
df = df.loc[:, keep_cols].copy()
start_distance_col = df["start_distance"].to_numpy().reshape(-1, 1)
df["expectedPoints"] = reg.predict(start_distance_col)
df

Unnamed: 0,offense,defense,game_id,start_distance,points,expectedPoints
0,Alabama,Louisville,401012246,65,7,2.405754
1,Louisville,Alabama,401012246,75,0,1.951385
2,Alabama,Louisville,401012246,80,0,1.724200
3,Louisville,Alabama,401012246,83,0,1.587890
4,Alabama,Louisville,401012246,55,7,2.860123
5,Louisville,Alabama,401012246,75,0,1.951385
6,Alabama,Louisville,401012246,75,7,1.951385
7,Louisville,Alabama,401012246,86,0,1.451579
8,Alabama,Louisville,401012246,78,0,1.815074
9,Louisville,Alabama,401012246,80,0,1.724200


In [5]:
# PRE = actual points scored on the drive minus the regression's expected points.
df["PRE"] = df["points"].sub(df["expectedPoints"])
df

Unnamed: 0,offense,defense,game_id,start_distance,points,expectedPoints,PRE
0,Alabama,Louisville,401012246,65,7,2.405754,4.594246
1,Louisville,Alabama,401012246,75,0,1.951385,-1.951385
2,Alabama,Louisville,401012246,80,0,1.724200,-1.724200
3,Louisville,Alabama,401012246,83,0,1.587890,-1.587890
4,Alabama,Louisville,401012246,55,7,2.860123,4.139877
5,Louisville,Alabama,401012246,75,0,1.951385,-1.951385
6,Alabama,Louisville,401012246,75,7,1.951385,5.048615
7,Louisville,Alabama,401012246,86,0,1.451579,-1.451579
8,Alabama,Louisville,401012246,78,0,1.815074,-1.815074
9,Louisville,Alabama,401012246,80,0,1.724200,-1.724200


In [6]:
# Mean PRE of each team's offensive drives (OPRE), indexed by team name.
# The PRE column is selected *before* aggregating: averaging the whole frame
# would also try to average the string `defense` column, which raises TypeError
# on pandas >= 2.0 (older versions silently dropped such columns).
teams_df = df.groupby("offense")["PRE"].mean().to_frame(name="OPRE")
teams_df.index.name = "team"
teams_df

Unnamed: 0_level_0,OPRE
team,Unnamed: 1_level_1
Air Force,0.158343
Akron,-0.858433
Alabama,1.683794
Appalachian State,0.085654
Arizona,0.042111
Arizona State,0.597568
Arkansas,-0.414746
Arkansas State,-0.034900
Army,1.167434
Auburn,-0.366151


In [7]:
# Mean PRE of the drives each team defended (DPRE), indexed by team name.
# The PRE column is selected *before* aggregating: averaging the whole frame
# would also try to average the string `offense` column, which raises TypeError
# on pandas >= 2.0 (older versions silently dropped such columns).
defenses = df.groupby("defense")["PRE"].mean().to_frame(name="DPRE")
defenses.index.name = "team"
defenses

Unnamed: 0_level_0,DPRE
team,Unnamed: 1_level_1
Air Force,0.252091
Akron,-0.543122
Alabama,-1.237862
Appalachian State,-0.854826
Arizona,0.003202
Arizona State,-0.100535
Arkansas,0.185136
Arkansas State,0.291680
Army,-0.102225
Auburn,-0.677161


In [8]:
# Put OPRE and DPRE side by side; the inner join keeps only teams present in
# both tables. NOTE: re-running this cell merges DPRE in a second time.
teams_df = teams_df.merge(defenses, how="inner", left_index=True, right_index=True)

# Dump every team's ratings, one team at a time.
for team, ratings in teams_df.iterrows():
    print(team)
    print(ratings)

Air Force
OPRE    0.158343
DPRE    0.252091
Name: Air Force, dtype: float64
Akron
OPRE   -0.858433
DPRE   -0.543122
Name: Akron, dtype: float64
Alabama
OPRE    1.683794
DPRE   -1.237862
Name: Alabama, dtype: float64
Appalachian State
OPRE    0.085654
DPRE   -0.854826
Name: Appalachian State, dtype: float64
Arizona
OPRE    0.042111
DPRE    0.003202
Name: Arizona, dtype: float64
Arizona State
OPRE    0.597568
DPRE   -0.100535
Name: Arizona State, dtype: float64
Arkansas
OPRE   -0.414746
DPRE    0.185136
Name: Arkansas, dtype: float64
Arkansas State
OPRE   -0.03490
DPRE    0.29168
Name: Arkansas State, dtype: float64
Army
OPRE    1.167434
DPRE   -0.102225
Name: Army, dtype: float64
Auburn
OPRE   -0.366151
DPRE   -0.677161
Name: Auburn, dtype: float64
BYU
OPRE   -0.298717
DPRE    0.050312
Name: BYU, dtype: float64
Ball State
OPRE   -0.494679
DPRE    0.539614
Name: Ball State, dtype: float64
Baylor
OPRE    0.160827
DPRE    0.517770
Name: Baylor, dtype: float64
Boise State
OPRE    0.672956
D

In [9]:
# Ten highest raw offensive ratings (OPRE).
(
    teams_df
    .sort_values(by="OPRE", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,OPRE,DPRE
team,Unnamed: 1_level_1,Unnamed: 2_level_1
Oklahoma,2.029725,0.313981
Alabama,1.683794,-1.237862
UCF,1.678212,-0.28684
Army,1.167434,-0.102225
Washington State,1.164704,-0.128937
Ohio,1.14607,0.173518
Clemson,1.128618,-1.166061
Georgia,1.027589,-0.709425
Georgia Tech,1.013195,0.343878
Ohio State,0.945836,-0.247485


In [10]:
# Ten lowest raw defensive ratings (DPRE); sorted ascending, so the most
# negative values come first.
(
    teams_df
    .sort_values(by="DPRE", ascending=True)
    .head(10)
)

Unnamed: 0_level_0,OPRE,DPRE
team,Unnamed: 1_level_1,Unnamed: 2_level_1
Michigan,0.659549,-1.3241
Alabama,1.683794,-1.237862
Clemson,1.128618,-1.166061
UAB,0.413834,-1.12214
Mississippi State,0.433034,-1.058787
Miami,-0.205864,-1.006897
Fresno State,0.59874,-0.964826
Utah State,0.724517,-0.932895
Cincinnati,0.274479,-0.91083
Kentucky,-0.468959,-0.897212


In [11]:
# Display the per-drive table (with expectedPoints and PRE) as it stands before
# the opponent-adjustment columns are added.
df

Unnamed: 0,offense,defense,game_id,start_distance,points,expectedPoints,PRE
0,Alabama,Louisville,401012246,65,7,2.405754,4.594246
1,Louisville,Alabama,401012246,75,0,1.951385,-1.951385
2,Alabama,Louisville,401012246,80,0,1.724200,-1.724200
3,Louisville,Alabama,401012246,83,0,1.587890,-1.587890
4,Alabama,Louisville,401012246,55,7,2.860123,4.139877
5,Louisville,Alabama,401012246,75,0,1.951385,-1.951385
6,Alabama,Louisville,401012246,75,7,1.951385,5.048615
7,Louisville,Alabama,401012246,86,0,1.451579,-1.451579
8,Alabama,Louisville,401012246,78,0,1.815074,-1.815074
9,Louisville,Alabama,401012246,80,0,1.724200,-1.724200


In [12]:
# First opponent adjustment, computed for all drives at once instead of one
# row at a time:
#   off_oppAdj = drive PRE minus the defending team's raw DPRE
#   def_oppAdj = drive PRE minus the attacking team's raw OPRE
# `.loc` with the team-name column looks up one rating per drive and raises
# KeyError if a team is missing from teams_df (same fail-fast behaviour as the
# per-row `.at` lookup). `.to_numpy()` drops the team-name index so the
# subtraction is positional rather than label-aligned.
opp_dpre = teams_df.loc[df["defense"], "DPRE"].to_numpy()
opp_opre = teams_df.loc[df["offense"], "OPRE"].to_numpy()

df["off_oppAdj"] = df["PRE"] - opp_dpre
df["def_oppAdj"] = df["PRE"] - opp_opre
df

Unnamed: 0,offense,defense,game_id,start_distance,points,expectedPoints,PRE,off_oppAdj,def_oppAdj
0,Alabama,Louisville,401012246,65,7,2.405754,4.594246,3.219069,2.910452
1,Louisville,Alabama,401012246,75,0,1.951385,-1.951385,-0.713523,-1.081505
2,Alabama,Louisville,401012246,80,0,1.724200,-1.724200,-3.099377,-3.407994
3,Louisville,Alabama,401012246,83,0,1.587890,-1.587890,-0.350027,-0.718009
4,Alabama,Louisville,401012246,55,7,2.860123,4.139877,2.764700,2.456083
5,Louisville,Alabama,401012246,75,0,1.951385,-1.951385,-0.713523,-1.081505
6,Alabama,Louisville,401012246,75,7,1.951385,5.048615,3.673438,3.364821
7,Louisville,Alabama,401012246,86,0,1.451579,-1.451579,-0.213717,-0.581699
8,Alabama,Louisville,401012246,78,0,1.815074,-1.815074,-3.190251,-3.498868
9,Louisville,Alabama,401012246,80,0,1.724200,-1.724200,-0.486338,-0.854320


In [13]:
# Per-team mean of the opponent-adjusted values. Each column is selected before
# aggregating so pandas never tries to average the string team columns (which
# raises TypeError on pandas >= 2.0). Assignment aligns on the team-name index.
teams_df["off_aPPD"] = df.groupby("offense")["off_oppAdj"].mean()
teams_df["def_aPPD"] = df.groupby("defense")["def_oppAdj"].mean()
teams_df

Unnamed: 0_level_0,OPRE,DPRE,off_aPPD,def_aPPD
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Air Force,0.158343,0.252091,0.178660,0.243646
Akron,-0.858433,-0.543122,-0.709744,-0.180363
Alabama,1.683794,-1.237862,1.323021,-1.166211
Appalachian State,0.085654,-0.854826,-0.285452,-0.824907
Arizona,0.042111,0.003202,0.008016,0.042345
Arizona State,0.597568,-0.100535,0.639211,0.121372
Arkansas,-0.414746,0.185136,-0.135609,0.061074
Arkansas State,-0.034900,0.291680,-0.268922,0.149760
Army,1.167434,-0.102225,0.973849,-0.303349
Auburn,-0.366151,-0.677161,-0.140102,-0.582088


In [14]:
def printLeaders():
    """Print the 25 leading teams on offense, then on defense, then a blank line.

    Reads the module-level ``teams_df``. Offense is ranked by ``off_aPPD`` from
    highest to lowest; defense is ranked by ``def_aPPD`` from lowest to highest.
    """
    rankings = (("off_aPPD", False), ("def_aPPD", True))
    for column, ascending in rankings:
        print(teams_df.sort_values(by=column, ascending=ascending).head(25))
    print()

# Show the leaders after the first opponent-adjustment pass.
printLeaders()

                       OPRE      DPRE  off_aPPD  def_aPPD
team                                                     
Oklahoma           2.029725  0.313981  1.896206  0.245772
Alabama            1.683794 -1.237862  1.323021 -1.166211
UCF                1.678212 -0.286840  1.278911 -0.107396
Georgia            1.027589 -0.709425  1.245091 -0.614168
Washington State   1.164704 -0.128937  1.154970  0.202665
NC State           0.791119 -0.004918  1.033996 -0.121133
Army               1.167434 -0.102225  0.973849 -0.303349
Clemson            1.128618 -1.166061  0.938166 -1.189250
Ohio               1.146070  0.173518  0.919951  0.311527
Michigan           0.659549 -1.324100  0.902730 -1.279235
Louisiana          0.556327  1.355490  0.839581  1.245701
Georgia Tech       1.013195  0.343878  0.785032  0.418257
West Virginia      0.904499 -0.150749  0.769445 -0.151479
Ohio State         0.945836 -0.247485  0.737124 -0.051645
Ole Miss           0.638130  0.634287  0.711253  0.458315
Purdue        

In [15]:
def adjust() -> None:
    """Run one opponent-adjustment pass over the module-level ``df`` / ``teams_df``.

    For every drive:
      * ``off_oppAdj`` = PRE minus the defending team's current ``def_aPPD``
      * ``def_oppAdj`` = PRE minus the attacking team's current ``off_aPPD``

    ``teams_df["off_aPPD"]`` / ``teams_df["def_aPPD"]`` are then replaced by the
    per-team means of those columns. Both frames are modified in place.

    Raises:
        KeyError: if a team named in ``df`` has no row in ``teams_df``.
    """
    # Read both opponent ratings before anything is overwritten, so the two
    # adjustments use ratings from the same (previous) pass.
    # `.to_numpy()` drops the team-name index so the subtraction is positional.
    opp_def = teams_df.loc[df["defense"], "def_aPPD"].to_numpy()
    opp_off = teams_df.loc[df["offense"], "off_aPPD"].to_numpy()

    df["off_oppAdj"] = df["PRE"] - opp_def
    df["def_oppAdj"] = df["PRE"] - opp_off

    # Select the column before taking the mean: averaging the whole frame would
    # hit the string team columns and raise TypeError on pandas >= 2.0.
    teams_df["off_aPPD"] = df.groupby("offense")["off_oppAdj"].mean()
    teams_df["def_aPPD"] = df.groupby("defense")["def_oppAdj"].mean()

In [16]:
# Repeat the opponent adjustment 40 times, printing Oklahoma's offensive rating
# after each pass as a convergence check, then show the resulting leaders.
# NOTE: adjust() updates teams_df in place, so re-running this cell continues
# from the current ratings rather than starting over.
n_passes = 40
for pass_number in range(n_passes):
    adjust()
    print(teams_df.at["Oklahoma", "off_aPPD"])
printLeaders()

2.216446365468757
2.143719829609683
2.2453961779274807
2.2157362995914704
2.2598632412120656
2.247366223328081
2.2697096543338096
2.264002206298379
2.2763007357210583
2.273405271416836
2.2805346433048053
2.2788797643432903
2.2831794923141002
2.2821070163985104
2.2848039842497103
2.2840192638624517
2.2857915251795413
2.285154399534258
2.286387914042678
2.285828438990205
2.2867464556831445
2.286228516141152
2.2869612771190435
2.28646579743565
2.287089628548535
2.2866063855628553
2.287166122008767
2.286689585660155
2.287211597026867
2.286738757122061
2.287238562438086
2.2867677725421474
2.28725450789355
2.2867848634453543
2.2872639078729278
2.286794909414311
2.287269429844732
2.2868007998893156
2.2872726605854847
2.2868042437473908
                       OPRE      DPRE  off_aPPD  def_aPPD
team                                                     
Oklahoma           2.029725  0.313981  2.286804  0.127629
Georgia            1.027589 -0.709425  1.806059 -1.239629
Alabama            1.683794 -

In [17]:
# Net rating: adjusted offense minus adjusted defense.
teams_df["net_aPPD"] = teams_df["off_aPPD"].sub(teams_df["def_aPPD"])

In [18]:
# Top 25 teams by net rating, with the team name moved from the index into a
# regular column.
(
    teams_df
    .sort_values(by="net_aPPD", ascending=False)
    .head(25)
    .reset_index()
)

Unnamed: 0,team,OPRE,DPRE,off_aPPD,def_aPPD,net_aPPD
0,Alabama,1.683794,-1.237862,1.701028,-1.647088,3.348116
1,Georgia,1.027589,-0.709425,1.806059,-1.239629,3.045688
2,Michigan,0.659549,-1.3241,1.055215,-1.466303,2.521519
3,Mississippi State,0.433034,-1.058787,1.003031,-1.446596,2.449627
4,Clemson,1.128618,-1.166061,0.955743,-1.413681,2.369424
5,Oklahoma,2.029725,0.313981,2.286804,0.127629,2.159175
6,Kentucky,-0.468959,-0.897212,0.253211,-1.671259,1.924469
7,Missouri,0.164812,-0.173537,0.97679,-0.941863,1.918654
8,Florida,-0.027571,-0.50468,0.757129,-1.160335,1.917464
9,LSU,-0.235831,-0.484196,0.683871,-1.207595,1.891466


In [19]:
# Add an "AP" column, set to the string "NR" for every team.
teams_df["AP"] = "NR"
teams_df

Unnamed: 0_level_0,OPRE,DPRE,off_aPPD,def_aPPD,net_aPPD,AP
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Air Force,0.158343,0.252091,0.067507,0.428786,-0.361279,NR
Akron,-0.858433,-0.543122,-0.738160,-0.254599,-0.483561,NR
Alabama,1.683794,-1.237862,1.701028,-1.647088,3.348116,NR
Appalachian State,0.085654,-0.854826,-0.509278,-0.724438,0.215160,NR
Arizona,0.042111,0.003202,0.038794,0.079288,-0.040494,NR
Arizona State,0.597568,-0.100535,0.720966,0.035762,0.685203,NR
Arkansas,-0.414746,0.185136,-0.038725,-0.180073,0.141348,NR
Arkansas State,-0.034900,0.291680,-0.468298,0.315165,-0.783463,NR
Army,1.167434,-0.102225,0.837792,-0.184240,1.022032,NR
Auburn,-0.366151,-0.677161,0.137888,-1.081157,1.219045,NR


In [20]:
# Write the team ratings to a tab-separated file; the team-name index is
# written as the first column.
teams_df.to_csv("Data/teams_aPPD.tsv", sep="\t")

In [21]:
# Summary statistics for the rating columns.
teams_df.describe()

Unnamed: 0,OPRE,DPRE,off_aPPD,def_aPPD,net_aPPD
count,130.0,130.0,130.0,130.0,130.0
mean,0.011512,0.005586,0.013006,0.004865,0.008142
std,0.605886,0.61896,0.706912,0.732938,1.198909
min,-1.098125,-1.3241,-1.676362,-1.671259,-2.835009
25%,-0.438875,-0.449844,-0.476028,-0.489064,-0.817101
50%,-0.029591,-0.005283,-0.011836,0.038228,-0.021155
75%,0.380258,0.328315,0.526338,0.431742,0.914509
max,2.029725,2.020214,2.286804,2.01893,3.348116
