In [9]:
import requests
import pandas as pd
import numpy as np
import time

def load_nfl_teams():
    """Load the shared NFL team table, trimmed to its leading columns.

    Reads ``nfl_teams.csv``, sets ``team_id_pfr`` to 'CRD' on the
    'St. Louis Cardinals' row, then discards every column from the fifth
    one onward together with ``team_name_short``.

    Returns:
        pandas.DataFrame: the team table with only the remaining columns.
    """
    team_table = pd.read_csv('../../shared_datasets/nfl_teams.csv')

    is_stl_cardinals = team_table.team_name == 'St. Louis Cardinals'
    team_table.loc[is_stl_cardinals, 'team_id_pfr'] = 'CRD'

    # Positional slice: whatever sits after the first four columns is dropped.
    unwanted = [*team_table.columns[4:], 'team_name_short']
    return team_table.drop(columns=unwanted)

def load_scores():
    """Read the raw Spreadspoke scores CSV from the shared datasets folder.

    Returns:
        pandas.DataFrame: the file contents exactly as parsed by pandas.
    """
    scores_path = '../../shared_datasets/spreadspoke_scores.csv'
    raw_scores = pd.read_csv(scores_path)
    return raw_scores

def get_game_id(date, home_team):
    """Build a game key from a slash-separated date and the home team id.

    Args:
        date: date string using '/' separators, e.g. '1/14/1968'.
        home_team: home team identifier appended after the date digits.

    Returns:
        str: ``date`` with every '/' removed, followed by ``home_team``
        (for example '1/14/1968', 'GNB' -> '1141968GNB').
    """
    # The date pieces are run together as-is; no zero padding is applied,
    # so e.g. '1/14/1968' and '11/4/1968' produce the same digits.
    compact_date = date.replace('/', '')
    return compact_date + home_team

In [10]:
# This loop is here more as a utility
# Adding more features here changes what features we use

# If you change which columns we use, you will also have to change the highlighted areas in cell 1
# (which, for some reason, sits lower in the notebook than this one).
from IPython.display import display
# Statistics carried over from the shared averages into this project's copies.
# Change this list and re-run the cell if we want to change the stats we grab.
kept_columns = ['game_id', 'avg_first_downs', 'avg_total_yds', 'avg_pass_yds', 'avg_rush_yds', 'avg_TO']

teams = load_nfl_teams()
team_ids = teams['team_id_pfr']
for team_id in team_ids:
    shared_path = f'../../shared_datasets/team_data/avg_data/{team_id}_avgs.csv'
    local_path = f'../datasets/team_data/avg_data/{team_id}_avgs.csv'
    # Read the shared file, keep only the selected stats, write the local copy.
    avg_df = pd.read_csv(shared_path)[kept_columns]
    avg_df.to_csv(local_path, index=False)

In [11]:
# The following cell inserts each team's ID into the scores data to make it a bit easier to stay consistent
# Raw game results; this cell adds the team-ID columns to it and saves the result.
scores = load_scores()
# Team table that the ID helpers below read through the global name `teams`.
teams = load_nfl_teams()
def _lookup_team_id(team_name):
    """Return the ``team_id_pfr`` of the single ``teams`` row named ``team_name``.

    Reads the module-level ``teams`` frame at call time.

    Raises:
        ValueError: if zero or several rows of ``teams`` carry that name.
            The message names the offending team, unlike the bare error
            ``Series.item()`` would raise on its own.
    """
    matches = teams.loc[teams['team_name'] == team_name, 'team_id_pfr']
    if len(matches) != 1:
        raise ValueError(
            f'expected exactly one team named {team_name!r} in teams, found {len(matches)}'
        )
    return matches.item()


def get_home_team_id(row):
    """Return the ``team_id_pfr`` for the team in ``row['team_home']``.

    Raises:
        ValueError: if the name does not match exactly one row of ``teams``.
    """
    return _lookup_team_id(row['team_home'])


def get_away_team_id(row):
    """Return the ``team_id_pfr`` for the team in ``row['team_away']``.

    Raises:
        ValueError: if the name does not match exactly one row of ``teams``.
    """
    return _lookup_team_id(row['team_away'])
    
def insert_ids(row):
    """Return the home and away team ids of ``row`` as a two-element Series.

    The home id comes first and the away id second, matching the
    ``['home_id', 'away_id']`` columns this is assigned to.
    """
    id_pair = [get_home_team_id(row), get_away_team_id(row)]
    return pd.Series(id_pair)
    
id_columns = ['home_id', 'away_id']
unused_columns = [
    'team_favorite_id', 'spread_favorite', 'stadium_neutral', 'weather_humidity',
    'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19',
]

# Resolve both team ids for every game, then discard the columns we never use.
scores[id_columns] = scores.apply(insert_ids, axis=1)
scores = scores.drop(columns=unused_columns)
scores.to_csv('../datasets/unclean_scores.csv', index=False)

In [12]:
# Just some more cleaning on the data. Added a total score column and gave each a label: over or under

# Load the intermediate file, drop the stadium column and the rows without a
# betting line, then add the combined score of both teams.
scores = (
    pd.read_csv('../datasets/unclean_scores.csv')
    .drop(columns=['stadium'])
    .dropna(subset=['over_under_line'])
    .assign(total_score=lambda games: games['score_home'] + games['score_away'])
)

def label_game(row):
    """Label one game 'over' or 'under' relative to its over/under line.

    Args:
        row: mapping/Series with ``total_score`` and ``over_under_line``.
            Either value may be a number or a numeric string.

    Returns:
        'over' when the total is strictly greater than the line, 'under'
        otherwise (a total exactly equal to the line is 'under'), or
        ``np.nan`` when either value is missing or not numeric.
    """
    # Some lines in the file are blank strings that NaN removal does not catch.
    # Parsing with float() handles both those and genuinely numeric columns;
    # anything that does not parse becomes NaN and is filtered out later.
    try:
        total = float(row['total_score'])
        line = float(row['over_under_line'])
    except (TypeError, ValueError):
        return np.nan

    # A missing score or line cannot be labelled; without this check a NaN
    # comparison would fall through to 'under'.
    if np.isnan(total) or np.isnan(line):
        return np.nan

    return 'over' if total > line else 'under'
scores['label'] = scores.apply(label_game, axis=1)

# Rows whose label could not be determined are discarded here.
has_label = scores['label'].isin(['over', 'under'])
drop = ['team_home', 'team_away', 'schedule_week', 'schedule_playoff']
clean_columns = [
    'schedule_date', 'schedule_season', 'home_id', 'away_id',
    'weather_temperature', 'weather_wind_mph',
    'score_home', 'score_away', 'total_score', 'over_under_line', 'label',
]
scores = scores[has_label].drop(columns=drop).reindex(columns=clean_columns)
scores.to_csv('../datasets/clean_scores.csv', index=False)
scores

Unnamed: 0,schedule_date,schedule_season,home_id,away_id,weather_temperature,weather_wind_mph,score_home,score_away,total_score,over_under_line,label
350,1/14/1968,1967,GNB,RAI,60.0,12.0,33,14,47,43,over
538,1/12/1969,1968,CLT,NYJ,66.0,12.0,7,16,23,40,under
727,1/11/1970,1969,KAN,MIN,55.0,14.0,23,7,30,39,under
916,1/17/1971,1970,CLT,DAL,59.0,11.0,16,13,29,36,under
1105,1/16/1972,1971,DAL,MIA,34.0,18.0,24,3,27,34,under
...,...,...,...,...,...,...,...,...,...,...,...
12942,1/17/2021,2020,KAN,CLE,,,22,17,39,56,under
12943,1/17/2021,2020,NOR,TAM,,,20,30,50,53,under
12944,1/24/2021,2020,GNB,TAM,,,26,31,57,53,over
12945,1/24/2021,2020,KAN,BUF,,,38,24,62,55,over


In [13]:
# The following code finally gives us our clean dataset
# We will need to change some stuff to add in more stats but it's nothing more than adding
# a few lines of code
# This adds the rolling average stats of each team for each individual game to the dataset
# Reload the labelled games written by the previous cell; get_game_avg below is applied to each row.
scores = pd.read_csv('../datasets/clean_scores.csv')
from functools import lru_cache

# Statistics copied from a team's averages file, in the order they are returned.
# TODO: Change this if we end up adding more statistics to the dataset (columns of dataset will be different)
_AVG_STAT_COLUMNS = ['avg_first_downs', 'avg_total_yds', 'avg_pass_yds', 'avg_rush_yds', 'avg_TO']


@lru_cache(maxsize=None)
def _load_team_avgs(team_id):
    """Read one team's averages CSV, cached so each file is parsed only once.

    The cache lives as long as this definition: re-run the cell (or call
    ``_load_team_avgs.cache_clear()``) after regenerating the averages files.
    """
    return pd.read_csv(f'../datasets/team_data/avg_data/{team_id}_avgs.csv')


def _team_game_avgs(team_id, game_id):
    """Return the ``_AVG_STAT_COLUMNS`` values from ``team_id``'s first row for ``game_id``.

    Raises:
        IndexError: if the team's file has no row with that ``game_id``.
    """
    team_df = _load_team_avgs(team_id)
    game_rows = team_df[team_df['game_id'] == game_id]
    if game_rows.empty:
        raise IndexError(f'no averages for game {game_id!r} in the {team_id} averages file')
    return [game_rows[col].iloc[0] for col in _AVG_STAT_COLUMNS]


def get_game_avg(row):
    """Look up both teams' average stats for the game described by ``row``.

    Args:
        row: mapping/Series with ``home_id``, ``away_id`` and ``schedule_date``.

    Returns:
        pandas.Series: ten values — first downs, total yards, pass yards,
        rush yards and turnovers for the home team, then the same five for
        the away team.

    Raises:
        IndexError: if either team's averages file has no row for the game.
    """
    home_id = row['home_id']
    away_id = row['away_id']
    # Both files are searched with the same id, built from the date and the HOME team.
    game_id = get_game_id(row['schedule_date'], home_id)

    return pd.Series(_team_game_avgs(home_id, game_id) + _team_game_avgs(away_id, game_id))

# Target columns, in the same order as the values returned by get_game_avg.
avg_columns = [
    'home_avg_first_downs', 'home_avg_yds', 'home_avg_pass_yds', 'home_avg_rush_yds', 'home_avg_TO',
    'away_avg_first_downs', 'away_avg_yds', 'away_avg_pass_yds', 'away_avg_rush_yds', 'away_avg_TO',
]
scores[avg_columns] = scores.apply(get_game_avg, axis=1)

scores.to_csv('../datasets/scores_w_avgs.csv', index=False)
scores

Unnamed: 0,schedule_date,schedule_season,home_id,away_id,weather_temperature,weather_wind_mph,score_home,score_away,total_score,over_under_line,...,home_avg_first_downs,home_avg_yds,home_avg_pass_yds,home_avg_rush_yds,home_avg_TO,away_avg_first_downs,away_avg_yds,away_avg_pass_yds,away_avg_rush_yds,away_avg_TO
0,1/14/1968,1967,GNB,RAI,60.0,12.0,33,14,47,43.0,...,17.647059,304.117647,167.764706,136.352941,2.529412,17.764706,361.764706,218.705882,143.058824,2.352941
1,1/12/1969,1968,CLT,NYJ,66.0,12.0,7,16,23,40.0,...,18.411765,332.529412,203.941176,128.588235,2.529412,18.705882,365.470588,247.411765,118.058824,1.823529
2,1/11/1970,1969,KAN,MIN,55.0,14.0,23,7,30,39.0,...,17.823529,315.470588,165.117647,150.352941,2.588235,16.941176,293.705882,162.176471,131.529412,2.235294
3,1/17/1971,1970,CLT,DAL,59.0,11.0,16,13,29,36.0,...,17.058824,301.470588,201.411765,100.058824,2.529412,16.117647,306.823529,139.647059,167.176471,2.000000
4,1/16/1972,1971,DAL,MIA,34.0,18.0,24,3,27,34.0,...,19.823529,342.000000,179.000000,163.000000,2.176471,16.294118,311.176471,146.647059,164.529412,1.705882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10391,1/17/2021,2020,KAN,CLE,,,22,17,39,56.0,...,24.764706,417.117647,304.058824,113.058824,1.000000,22.117647,370.882353,225.294118,145.588235,0.882353
10392,1/17/2021,2020,NOR,TAM,,,20,30,50,53.0,...,23.294118,378.294118,236.529412,141.764706,1.294118,22.764706,391.647059,291.529412,100.117647,0.882353
10393,1/24/2021,2020,GNB,TAM,,,26,31,57,53.0,...,22.235294,386.294118,256.000000,130.294118,0.764706,22.705882,392.352941,294.941176,97.411765,0.941176
10394,1/24/2021,2020,KAN,BUF,,,38,24,62,55.0,...,24.823529,421.235294,311.235294,110.000000,1.058824,23.882353,376.235294,272.117647,104.117647,1.176471


In [14]:
# Reordered the columns to make it a bit more readable from a coding standpoint
# Game info first, then each team's score followed by its averages, then the target columns.
ordered_columns = [
    'schedule_date', 'schedule_season', 'home_id', 'away_id',
    'weather_temperature', 'weather_wind_mph',
    'score_home',
    'home_avg_first_downs', 'home_avg_yds', 'home_avg_pass_yds', 'home_avg_rush_yds', 'home_avg_TO',
    'score_away',
    'away_avg_first_downs', 'away_avg_yds', 'away_avg_pass_yds', 'away_avg_rush_yds', 'away_avg_TO',
    'total_score', 'over_under_line', 'label',
]
scores_df = pd.read_csv('../datasets/scores_w_avgs.csv').reindex(columns=ordered_columns)
# The reordered table overwrites the file it was read from.
scores_df.to_csv('../datasets/scores_w_avgs.csv', index=False)
scores_df

Unnamed: 0,schedule_date,schedule_season,home_id,away_id,weather_temperature,weather_wind_mph,score_home,home_avg_first_downs,home_avg_yds,home_avg_pass_yds,...,home_avg_TO,score_away,away_avg_first_downs,away_avg_yds,away_avg_pass_yds,away_avg_rush_yds,away_avg_TO,total_score,over_under_line,label
0,1/14/1968,1967,GNB,RAI,60.0,12.0,33,17.647059,304.117647,167.764706,...,2.529412,14,17.764706,361.764706,218.705882,143.058824,2.352941,47,43.0,over
1,1/12/1969,1968,CLT,NYJ,66.0,12.0,7,18.411765,332.529412,203.941176,...,2.529412,16,18.705882,365.470588,247.411765,118.058824,1.823529,23,40.0,under
2,1/11/1970,1969,KAN,MIN,55.0,14.0,23,17.823529,315.470588,165.117647,...,2.588235,7,16.941176,293.705882,162.176471,131.529412,2.235294,30,39.0,under
3,1/17/1971,1970,CLT,DAL,59.0,11.0,16,17.058824,301.470588,201.411765,...,2.529412,13,16.117647,306.823529,139.647059,167.176471,2.000000,29,36.0,under
4,1/16/1972,1971,DAL,MIA,34.0,18.0,24,19.823529,342.000000,179.000000,...,2.176471,3,16.294118,311.176471,146.647059,164.529412,1.705882,27,34.0,under
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10391,1/17/2021,2020,KAN,CLE,,,22,24.764706,417.117647,304.058824,...,1.000000,17,22.117647,370.882353,225.294118,145.588235,0.882353,39,56.0,under
10392,1/17/2021,2020,NOR,TAM,,,20,23.294118,378.294118,236.529412,...,1.294118,30,22.764706,391.647059,291.529412,100.117647,0.882353,50,53.0,under
10393,1/24/2021,2020,GNB,TAM,,,26,22.235294,386.294118,256.000000,...,0.764706,31,22.705882,392.352941,294.941176,97.411765,0.941176,57,53.0,over
10394,1/24/2021,2020,KAN,BUF,,,38,24.823529,421.235294,311.235294,...,1.058824,24,23.882353,376.235294,272.117647,104.117647,1.176471,62,55.0,over


In [15]:
# Load the merged table and keep only rows with no missing value, renumbered from 0.
df = (
    pd.read_csv('../datasets/scores_w_avgs.csv')
    .dropna()
    .reset_index(drop=True)
)
df

Unnamed: 0,schedule_date,schedule_season,home_id,away_id,weather_temperature,weather_wind_mph,score_home,home_avg_first_downs,home_avg_yds,home_avg_pass_yds,...,home_avg_TO,score_away,away_avg_first_downs,away_avg_yds,away_avg_pass_yds,away_avg_rush_yds,away_avg_TO,total_score,over_under_line,label
0,1/14/1968,1967,GNB,RAI,60.0,12.0,33,17.647059,304.117647,167.764706,...,2.529412,14,17.764706,361.764706,218.705882,143.058824,2.352941,47,43.0,over
1,1/12/1969,1968,CLT,NYJ,66.0,12.0,7,18.411765,332.529412,203.941176,...,2.529412,16,18.705882,365.470588,247.411765,118.058824,1.823529,23,40.0,under
2,1/11/1970,1969,KAN,MIN,55.0,14.0,23,17.823529,315.470588,165.117647,...,2.588235,7,16.941176,293.705882,162.176471,131.529412,2.235294,30,39.0,under
3,1/17/1971,1970,CLT,DAL,59.0,11.0,16,17.058824,301.470588,201.411765,...,2.529412,13,16.117647,306.823529,139.647059,167.176471,2.000000,29,36.0,under
4,1/16/1972,1971,DAL,MIA,34.0,18.0,24,19.823529,342.000000,179.000000,...,2.176471,3,16.294118,311.176471,146.647059,164.529412,1.705882,27,34.0,under
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9682,12/27/2020,2020,HTX,CIN,72.0,0.0,31,20.117647,373.470588,279.058824,...,1.117647,37,20.882353,336.000000,232.529412,103.470588,1.411765,68,46.0,over
9683,1/3/2021,2020,DET,MIN,72.0,0.0,35,21.588235,347.529412,249.294118,...,1.294118,37,22.941176,378.764706,243.235294,135.529412,1.470588,72,53.5,over
9684,1/3/2021,2020,HTX,OTI,72.0,0.0,38,20.529412,379.176471,287.411765,...,1.117647,41,23.588235,390.470588,227.235294,163.235294,0.705882,79,55.5,over
9685,1/3/2021,2020,CLT,JAX,72.0,0.0,28,22.411765,372.000000,246.823529,...,1.000000,14,19.588235,327.705882,234.411765,93.294118,1.529412,42,48.0,under


In [16]:

# Feature columns: the two weather readings, then the five averages for each side.
_side_stats = ['avg_first_downs', 'avg_yds', 'avg_pass_yds', 'avg_rush_yds', 'avg_TO']
cols = ['weather_temperature', 'weather_wind_mph'] + [
    f'{side}_{stat}' for side in ('home', 'away') for stat in _side_stats
]
def new_labels(row):
    """Encode a game's text label as an integer.

    Returns:
        int: -1 when ``row['label']`` is 'over', 1 for any other value.
    """
    return -1 if row['label'] == 'over' else 1

# Numeric target column derived from the text label via new_labels.
df = df.assign(encoded_label=df.apply(new_labels, axis=1))
# NOTE(review): unlike the other exports in this notebook, this one is written
# without index=False, so the row index is saved too — confirm readers expect it.
df.to_csv('../datasets/non_pca_weather_scores_w_avgs.csv')

In [17]:
from sklearn.preprocessing import StandardScaler
# Run the selected feature columns through StandardScaler; the result is an array, not a DataFrame.
features = StandardScaler().fit_transform(df[cols])

# Integer-encoded target column, shown as the cell output.
labels = df['encoded_label']
labels

0      -1
1       1
2       1
3       1
4       1
       ..
9682   -1
9683   -1
9684   -1
9685    1
9686    1
Name: encoded_label, Length: 9687, dtype: int64

In [18]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 95%) of the variance.
pca = PCA(n_components = 0.95)
principalComponents = pca.fit_transform(features)

# The number of retained components therefore depends on the data, so the
# column names are derived from the result instead of hard-coding eight of
# them (which raises as soon as PCA keeps a different number).
pc_names = [f'pc{i + 1}' for i in range(principalComponents.shape[1])]
pca_df = pd.DataFrame(data = principalComponents, columns = pc_names)

# Attach the encoded labels column-wise and save the PCA dataset.
final_df = pd.concat([pca_df, labels], axis = 1)
final_df.to_csv('../datasets/pca_weather_scores_w_avgs.csv', index=False)