In [333]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from seaborn import pairplot
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.ticker as plticker

In [206]:
df = pd.read_csv('ML_epa_data.csv')
df = df[df['total_line'].notna()]
df

Unnamed: 0.1,Unnamed: 0,game_id,total_line,spread_line,home_score,away_score,result,total,season,week,home_team,away_team,posteam,defteam,off_epa,off_pass_epa,off_rush_epa,def_epa,def_pass_epa,def_rush_epa
0,1,2011_01_NO_GB,47.5,-4.5,42.0,34.0,-8.0,76.0,2011.0,1.0,GB,NO,GB,NO,0.355356,0.554186,0.072405,0.160695,0.275887,-0.119056
1,2,2011_01_NO_GB,47.5,-4.5,42.0,34.0,-8.0,76.0,2011.0,1.0,GB,NO,NO,GB,0.160695,0.275887,-0.119056,0.355356,0.554186,0.072405
2,3,2011_01_PIT_BAL,37.0,-1.0,35.0,7.0,-28.0,42.0,2011.0,1.0,BAL,PIT,BAL,PIT,0.184658,0.345858,0.033533,-0.341011,-0.378989,-0.234197
3,4,2011_01_PIT_BAL,37.0,-1.0,35.0,7.0,-28.0,42.0,2011.0,1.0,BAL,PIT,PIT,BAL,-0.341011,-0.378989,-0.234197,0.184658,0.345858,0.033533
4,5,2011_01_ATL_CHI,40.5,1.0,30.0,12.0,-18.0,42.0,2011.0,1.0,CHI,ATL,ATL,CHI,-0.262554,-0.290874,-0.166269,-0.055269,0.014545,-0.150940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5673,5674,2021_12_LAC_DEN,49.0,1.5,,,,,,,,,,,,,,,,
5674,5675,2021_12_LA_GB,50.0,-1.5,,,,,,,,,,,,,,,,
5675,5676,2021_12_MIN_SF,48.0,-2.5,,,,,,,,,,,,,,,,
5676,5677,2021_12_CLE_BAL,48.0,-4.5,,,,,,,,,,,,,,,,


# I want to make a column where I can track each teams epa/play on the season- and use that number to predict total points scored in the game

In [297]:
# seperate EPA for each team / each catagory / each wk / each season

offense_epa = df.groupby(['posteam', 'season', 'week'], as_index=False)['off_epa'].mean()
defense_epa = df.groupby(['posteam', 'season', 'week'], as_index=False)['def_epa'].mean()



In [299]:
# lag EPA one period back
# show the epa for each team in these scenarios. grouped by posteam. 
### when (for example) arizona is on defense they will still be called the posteam in this example

offense_epa['epa_shifted'] = total_offense_epa.groupby('posteam')['off_epa'].shift()
defense_epa['epa_shifted'] = total_defense_epa.groupby('posteam')['def_epa'].shift()

In [301]:
# calculate EWMA (exponentially weighted moving average) with a static window and dynamic window and assign it as a column 

def dynamic_window_ewma(x):
    
    values = np.zeros(len(x))
    for i, (_, row) in enumerate(x.iterrows()):
        epa = x.epa_shifted[:i+1]
        if row.week > 4:
            values[i] = epa.ewm(min_periods=1, span=row.week).mean().values[-1]
        else:
            values[i] = epa.ewm(min_periods=1, span=4).mean().values[-1]
            
    return pd.Series(values, index=x.index)



offense_epa['ewma'] = passing_offense_epa.groupby('posteam')['epa_shifted'].transform(lambda x: x.ewm(min_periods=1, span=4).mean())
offense_epa['ewma_dynamic_window'] = passing_offense_epa.groupby('posteam').apply(dynamic_window_ewma).values

defense_epa['ewma'] = passing_defense_epa.groupby('posteam')['epa_shifted'].transform(lambda x: x.ewm(min_periods=1, span=4).mean())
defense_epa['ewma_dynamic_window'] = passing_defense_epa.groupby('posteam').apply(dynamic_window_ewma).values



In [313]:
#Merge the data 

epa = offense_epa.merge(defense_epa, on=['posteam', 'season', 'week'], suffixes=('_offense', '_defense')).rename(columns={"posteam": "team"})
epa 

Unnamed: 0,team,season,week,off_epa,epa_shifted_offense,ewma_offense,ewma_dynamic_window_offense,def_epa,epa_shifted_defense,ewma_defense,ewma_dynamic_window_defense
0,ARI,2011.0,1.0,0.132567,,,,0.125934,,,
1,ARI,2011.0,2.0,0.112179,0.132567,0.395432,0.395432,0.035036,0.125934,0.348553,0.348553
2,ARI,2011.0,3.0,-0.075423,0.112179,0.136076,0.136076,-0.099225,0.035036,0.134914,0.134914
3,ARI,2011.0,4.0,0.014615,-0.075423,0.043014,0.043014,0.097595,-0.099225,0.036944,0.036944
4,ARI,2011.0,5.0,-0.338153,0.014615,-0.000856,0.010447,-0.007806,0.097595,0.112811,0.112248
...,...,...,...,...,...,...,...,...,...,...,...
5639,WAS,2021.0,5.0,-0.145090,0.243477,0.188709,0.145009,0.134703,0.105805,0.233730,0.201883
5640,WAS,2021.0,6.0,-0.168297,-0.145090,-0.007768,-0.006488,0.126235,0.134703,0.229967,0.187683
5641,WAS,2021.0,7.0,-0.104424,-0.168297,-0.063287,-0.044932,0.093420,0.126235,0.236656,0.187154
5642,WAS,2021.0,8.0,-0.016824,-0.104424,-0.143397,-0.093784,0.078866,0.093420,0.256653,0.196601


In [314]:
# remove the first season 
epa = epa.loc[epa['season'] != epa['season'].unique()[0], :]

epa = epa.reset_index(drop=True)

epa.head()

Unnamed: 0,team,season,week,off_epa,epa_shifted_offense,ewma_offense,ewma_dynamic_window_offense,def_epa,epa_shifted_defense,ewma_defense,ewma_dynamic_window_defense
0,ARI,2012.0,1.0,-0.18469,-0.15994,-0.206774,-0.206774,-0.21492,-0.075943,-0.150386,-0.150386
1,ARI,2012.0,2.0,-0.060724,-0.18469,-0.141053,-0.141053,-0.083637,-0.21492,-0.148535,-0.148535
2,ARI,2012.0,3.0,-0.01778,-0.060724,-0.107644,-0.107644,-0.298261,-0.083637,-0.092408,-0.092408
3,ARI,2012.0,4.0,-0.212165,-0.01778,0.047295,0.047295,-0.073324,-0.298261,-0.233106,-0.233106
4,ARI,2012.0,5.0,-0.208127,-0.212165,-0.031411,-0.039147,-0.101606,-0.073324,-0.135715,-0.138678


In [327]:
schedule = df[['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score','spread_line','total_line']]\
.drop_duplicates().reset_index(drop=True)\
.assign(home_team_win = lambda x: (x.home_score > x.away_score).astype(int))

data = schedule.merge(epa.rename(columns={'team': 'home_team'}), on=['home_team', 'season', 'week'])\
.merge(epa.rename(columns={'team': 'away_team'}), on=['away_team', 'season', 'week'], suffixes=('_home', '_away'))

data.head()


Unnamed: 0,season,week,home_team,away_team,home_score,away_score,spread_line,total_line,home_team_win,off_epa_home,...,ewma_defense_home,ewma_dynamic_window_defense_home,off_epa_away,epa_shifted_offense_away,ewma_offense_away,ewma_dynamic_window_offense_away,def_epa_away,epa_shifted_defense_away,ewma_defense_away,ewma_dynamic_window_defense_away
0,2012.0,1.0,NYG,DAL,17.0,24.0,-3.5,45.5,0,-0.012447,...,0.043994,0.043994,0.275412,-0.126184,0.094602,0.094602,-0.012447,0.187463,0.328496,0.328496
1,2012.0,1.0,CHI,IND,41.0,21.0,-10.0,41.5,1,0.165871,...,0.034616,0.034616,-0.114713,-0.08755,-0.053805,-0.053805,0.165871,-0.134416,-0.2094,-0.2094
2,2012.0,1.0,CLE,PHI,16.0,17.0,9.0,42.0,0,-0.418577,...,0.00347,0.00347,-0.147893,0.13125,0.15736,0.15736,-0.418577,-0.130564,-0.217721,-0.217721
3,2012.0,1.0,DET,LA,27.0,23.0,-9.0,46.0,1,-0.084562,...,0.259318,0.259318,-0.02121,-0.172216,-0.262965,-0.262965,-0.084562,0.145449,0.13737,0.13737
4,2012.0,1.0,HOU,MIA,30.0,10.0,-13.0,41.5,1,0.089868,...,-0.124596,-0.124596,-0.395011,-0.104869,0.019133,0.019133,0.089868,-0.05974,-0.096226,-0.096226


In [328]:
data.columns

Index(['season', 'week', 'home_team', 'away_team', 'home_score', 'away_score',
       'spread_line', 'total_line', 'home_team_win', 'off_epa_home',
       'epa_shifted_offense_home', 'ewma_offense_home',
       'ewma_dynamic_window_offense_home', 'def_epa_home',
       'epa_shifted_defense_home', 'ewma_defense_home',
       'ewma_dynamic_window_defense_home', 'off_epa_away',
       'epa_shifted_offense_away', 'ewma_offense_away',
       'ewma_dynamic_window_offense_away', 'def_epa_away',
       'epa_shifted_defense_away', 'ewma_defense_away',
       'ewma_dynamic_window_defense_away'],
      dtype='object')

In [329]:
#pairplot(data)

In [330]:
target = 'home_team_win'
features = [column for column in data.columns if 'ewma' in column and 'dynamic' in column]
for feature in features:
  print(feature)

ewma_dynamic_window_offense_home
ewma_dynamic_window_defense_home
ewma_dynamic_window_offense_away
ewma_dynamic_window_defense_away


In [331]:
data = data.dropna()

X = data.loc[df['season'] != 2021, features].values
y = data.loc[df['season'] != 2021, target].values

clf = LogisticRegression()
clf.fit(X, y)


LogisticRegression()

In [334]:
accuracy_scores = cross_val_score(clf, X, y, cv=10)
log_losses = cross_val_score(clf, X, y, cv=10, scoring='neg_log_loss')

print('Model Accuracy:', np.mean(accuracy_scores))

Model Accuracy: 0.6093979779411764


In [335]:
print('Neg log loss:', np.mean(log_losses))

Neg log loss: -0.6515044371407708


In [344]:
data_2021 = data.loc[(data['season'] == 2021)].assign(
    predicted_winner = lambda x: clf.predict(x[features]),
    home_team_win_probability = lambda x: clf.predict_proba(x[features])[:, 1]
)\
[['home_team', 'away_team', 'week', 'predicted_winner', 'home_team_win_probability', 'home_team_win']]

data_2021['actual_winner'] = data_2021.apply(lambda x: x.home_team if x.home_team_win else x.away_team, axis=1)
data_2021['predicted_winner'] = data_2021.apply(lambda x: x.home_team if x.predicted_winner == 1 else x.away_team, axis=1)
data_2021['win_probability'] = data_2021.apply(lambda x: x.home_team_win_probability if x.predicted_winner == x.home_team else 1 - x.home_team_win_probability, axis=1)
data_2021['correct_prediction'] = (data_2021['predicted_winner'] == data_2021['actual_winner']).astype(int)

data_2021 = data_2021.drop(columns=['home_team_win_probability', 'home_team_win'])

data_2021.sort_values(by='win_probability', ascending=False).reset_index(drop=True).head(10)

Unnamed: 0,home_team,away_team,week,predicted_winner,actual_winner,win_probability,correct_prediction
0,DEN,NYJ,3.0,DEN,DEN,0.844502,1
1,ARI,CAR,10.0,ARI,CAR,0.825695,0
2,LA,DET,7.0,LA,LA,0.813181,1
3,CIN,JAX,4.0,CIN,CIN,0.805789,1
4,DEN,BAL,4.0,DEN,BAL,0.800698,0
5,ARI,HOU,7.0,ARI,ARI,0.793217,1
6,TB,ATL,2.0,TB,TB,0.790978,1
7,LV,MIA,3.0,LV,LV,0.778036,1
8,TB,CHI,7.0,TB,TB,0.775063,1
9,BUF,MIA,8.0,BUF,BUF,0.773205,1


In [345]:
correct = data_2021.loc[data_2021['correct_prediction'] == 1].groupby('week')['correct_prediction'].sum()

num_games = data_2021.groupby('week')['correct_prediction'].size()

results = correct / num_games

results

week
1.0     0.437500
2.0     0.500000
3.0     0.625000
4.0     0.437500
5.0     0.625000
6.0     0.714286
7.0     0.769231
8.0     0.400000
9.0     0.357143
10.0    0.500000
Name: correct_prediction, dtype: float64