In [2]:
import requests
import pandas as pd
import numpy as np
import time

def load_nfl_teams():
    nfl_teams = pd.read_csv('../datasets/nfl_teams.csv')
    nfl_teams.loc[(nfl_teams.team_name == 'St. Louis Cardinals'), 'team_id_pfr'] = 'CRD'
    labels = list(nfl_teams.columns)[4:] + ['team_name_short']
    nfl_teams.drop(labels, axis=1, inplace=True)
    return nfl_teams

def load_scores():
    return pd.read_csv('../datasets/spreadspoke_scores.csv')

def transform_date(date):
    mapping = {
        '1'  : 'January',
        '2'  : 'February',
        '3'  : 'March',
        '4'  : 'April',
        '5'  : 'May',
        '6'  : 'June',
        '7'  : 'July',
        '8'  : 'August',
        '9'  : 'September',
        '10' : 'October',
        '11' : 'November',
        '12' : 'December',
    }
    
    arr = date.split('/')
    month = arr[0]
    day = arr[1]
    
    return mapping[month] + ' ' + day

def get_team_id(name):
    teams = load_nfl_teams()
    return teams.loc[teams['team_name'] == name, 'team_id_pfr'].iloc[0]

def get_game_stats(date, year, home_team_id):
    scores = load_scores()
    teams = load_nfl_teams()
    
    # Get the URL for this team and d,p the data into a dataframe
    url = 'https://www.pro-football-reference.com/teams/' + home_team_id.lower() + '/' + str(year) +'.htm'
    df = pd.read_html(url)[1]

    # Sort out some weird formatting leftover from the HTML
    df.columns = [col[1] for col in df.columns.values]

    # Drop the unnecessary labels (already in score dataset)
    drop_labels = [
        'Week',
        'Day',
        'Tm',
        'Rec',
        'OT',
        'Unnamed: 3_level_1',
        'Unnamed: 4_level_1',
        'Unnamed: 5_level_1',
        'Unnamed: 8_level_1'
    ]
    df.drop(columns=drop_labels, inplace=True)
    
    # In later seasons, PFR added an "expected points" metric
    # If we see this happening, we drop those expected points metrics
    if len(df.columns) > 13:
        df.drop(columns=['Offense', 'Defense', 'Sp. Tms'], inplace=True)

    # Rename some of the labels for clarity
    new_labels = [list(df.columns)[0].lower()] + ['away_team', 'opp_score', 'home_first_downs', 'home_total_yds', 'home_pass_yds', 'home_rush_yds', 'home_TO', 'away_first_downs', 'away_total_yds', 'away_pass_yds', 'away_rush_yds', 'away_TO']
    df.columns = [val for val in new_labels]
    df.drop(columns=['opp_score'], inplace=True)
    
    # Grab the stats from the specific date in question
    new_date = transform_date(date)
    stats = df.loc[df['date'] == new_date]
    away_team_name = stats['away_team'].to_string(index=False).strip()
    away_team_id = teams.loc[teams['team_name'] == away_team_name, 'team_id_pfr'].iloc[0]
    stats.loc[:, 'away_team'] = away_team_id
    
    arr = date.split('/')
    game_id = ''.join(arr) + home_team_id
    stats.insert(0, 'game_id', game_id)
    return stats.iloc[0]


def get_game_id(date, home_team):
    arr = date.split('/')
    return ''.join(arr) + home_team

In [None]:
scores = load_scores()

stats = ['game_id',
            'date',
            'away_team',
            'home_first_downs',
            'home_total_yds',
            'home_pass_yds',
            'home_rush_yds',
            'home_TO',
            'away_first_downs',
            'away_total_yds',
            'away_pass_yds',
            'away_rush_yds',
            'away_TO']
df = pd.DataFrame(columns=stats)

teams = load_nfl_teams()

# 0 thru 4031 are fine
# index 4032 is bad
# 4033 thru 8500 are good

scores.drop(4032, inplace=True)

# This loop takes forever; it gets the stats for every game in the original dataset
i = 0
for index, row in scores.iterrows():
    home_team_name = row['team_home']
    home_team_id = teams.loc[teams['team_name'] == home_team_name, 'team_id_pfr'].iloc[0]
    year = row['schedule_season']
    date = row['schedule_date']
    game_stats = get_game_stats(date, year, home_team_id)
    args = {'game_id' : game_stats['game_id'],
            'date' : game_stats['date'],
            'away_team' : game_stats['away_team'],
            'home_first_downs' : game_stats['home_first_downs'],
            'home_total_yds' : game_stats['home_total_yds'],
            'home_pass_yds' : game_stats['home_pass_yds'],
            'home_rush_yds' : game_stats['home_rush_yds'],
            'home_TO' : game_stats['home_TO'],
            'away_first_downs' : game_stats['away_first_downs'],
            'away_total_yds' : game_stats['away_total_yds'],
            'away_pass_yds' : game_stats['away_pass_yds'],
            'away_rush_yds' : game_stats['away_rush_yds'],
            'away_TO' : game_stats['away_TO']}

    df = df.append(args, ignore_index=True)
    i += 1
    
df.to_csv('../datasets/all_game_stats.csv')

In [None]:
# Cleaning up some things from the data that I missed in the cell above
# Basically I didn't want to run all of the requests again
# So I took the data and stored it in a CSV. This is the cleanup of that data
new_df = pd.read_csv('../datasets/all_game_stats.csv')
new_df.rename(columns={'Unnamed: 0': 'home_team'}, inplace=True)

for index, row in new_df.iterrows():
    new_df.iloc[index, 0] = row.loc['game_id'][-3:]
new_df
    
new_df.to_csv('../datasets/all_game_stats.csv', index=False)

In [8]:
df = pd.read_csv('../datasets/all_game_stats.csv')
teams = load_nfl_teams()
scores = load_scores()

team_ids = teams['team_id_pfr']
for team_id in team_ids:
    team_df = df.loc[(df['home_team'] == team_id) | (df['away_team'] == team_id)]
    filename = '../datasets/team_data/unclean_data/' + team_id + '_game_stats.csv'
    team_df.to_csv(filename)


In [6]:
cols = ['team_id', 'game_id', 'date', 'opp_team_id', 'first_downs',
       'total_yds', 'pass_yds', 'rush_yds', 'TO',
       'opp_first_downs', 'opp_total_yds', 'opp_pass_yds', 'opp_rush_yds',
       'opp_TO']

# This method creates a new CSV file for each team
# This was done because in the previous CSV file we knew which team was home
# and which was away, but that made it much more difficult to get a rolling average
# of each team's statistics because if a team was home one week and away the other,
# we would have to go back to make sure if they're home or away each week,
# and then find their previous 16 games, add them up, and take an average. It would've been a pain in the ass

# This way, we have intermediary datasets that both allow us to take a rolling average very easily
# with the pandas.rolling() builtin, but also allows us to more easily pick and choose
# which stats we want to include in our model and have them stay a uniform, consistent name throughout.

for team_id in team_ids:
    file = '../datasets/team_data/unclean_data/' + team_id + '_game_stats.csv'
    dirty_df = pd.read_csv(file)
    clean_df = pd.DataFrame(columns=cols)
    app = {'team_id': team_id}
    for index, row in dirty_df.iterrows():
        # If home team...
        app['game_id'] = row['game_id']
        app['date'] = row['date']
        if row['home_team'] == team_id:
            # Set the opposing team's id
            app['opp_team_id']     = row['away_team']
            
            # Set the team's offensive stats 
            app['first_downs']     = row['home_first_downs']
            app['total_yds']       = row['home_total_yds']
            app['pass_yds']        = row['home_pass_yds']
            app['rush_yds']        = row['home_rush_yds']
            app['TO']              = row['home_TO']
            
            # Set the opposition team's stats
            app['opp_first_downs'] = row['away_first_downs']
            app['opp_total_yds']   = row['away_total_yds']
            app['opp_pass_yds']    = row['away_pass_yds']
            app['opp_rush_yds']    = row['away_rush_yds']
            app['opp_TO']          = row['away_TO']
        else:
            # Set opposing team's id
            app['opp_team_id']     = row['home_team']
            
            # Set team's offensive stats
            app['first_downs']     = row['away_first_downs']
            app['total_yds']       = row['away_total_yds']
            app['pass_yds']        = row['away_pass_yds']
            app['rush_yds']        = row['away_rush_yds']
            app['TO']              = row['away_TO']
            
            # Set the opposition team's stats
            app['opp_first_downs'] = row['home_first_downs']
            app['opp_total_yds']   = row['home_total_yds']
            app['opp_pass_yds']    = row['home_pass_yds']
            app['opp_rush_yds']    = row['home_rush_yds']
            app['opp_TO']          = row['home_TO']

        clean_df = clean_df.append(app, ignore_index=True)
    clean_df = clean_df.iloc[:, [1, 2, 0, 4, 5, 6, 7, 8, 3, 9, 10, 11, 12, 13]]
    clean_df.to_csv('../datasets/team_data/clean_data/' + team_id + '_game_stats.csv', index=False)


In [11]:
# Example of what one of the CSV files we created above looks like
df = pd.read_csv(f'../datasets/team_data/clean_data/HTX_game_stats.csv')
df.head()

Unnamed: 0,game_id,date,team_id,first_downs,total_yds,pass_yds,rush_yds,TO,opp_team_id,opp_first_downs,opp_total_yds,opp_pass_yds,opp_rush_yds,opp_TO
0,982002HTX,September 8,HTX,13.0,210.0,123.0,87.0,1.0,DAL,11.0,267.0,112.0,155.0,2.0
1,9152002SDG,September 15,HTX,7.0,118.0,29.0,89.0,3.0,SDG,16.0,267.0,143.0,124.0,1.0
2,9222002HTX,September 22,HTX,10.0,204.0,78.0,126.0,2.0,CLT,16.0,339.0,251.0,88.0,2.0
3,9292002PHI,September 29,HTX,12.0,242.0,151.0,91.0,3.0,PHI,21.0,391.0,289.0,102.0,3.0
4,10132002HTX,October 13,HTX,18.0,338.0,197.0,141.0,0.0,BUF,25.0,403.0,230.0,173.0,1.0


In [5]:
# The following loop calculates each team's rolling average stats over the last 16 games and
# renames the statistic columns to have avg_ in front to better indicate their form

for team_id in team_ids:
    df = pd.read_csv(f'../datasets/team_data/clean_data/{team_id}_game_stats.csv')
    cols = df.columns
    new_cols = {}
    
    # Create mapping to rename the columns
    for col in enumerate(cols):
        if col[0] > 2 and col[0] != 8:
            s = 'avg_' + col[1]
            new_cols[col[1]] = s
        else:
            new_cols[col[1]] = col[1]
    
    # Combine the rolling average of every statistic with the qualitative data
    avg_df = pd.concat([df.loc[:, 'game_id'], df.loc[:, 'date'], df.loc[:, 'team_id'], df.loc[:, 'opp_team_id'], df.rolling(17).mean()], axis=1)
    avg_df = avg_df.rename(columns=new_cols)
    
    # Save to a CSV file. We can now access these values more easily for each game
    avg_df.to_csv(f'../datasets/team_data/avg_data/{team_id}_avgs.csv', index=False)

In [12]:
# This loop is here more as a utility
# We wanted to keep features low for this first model,
# so initially we'll only grab the total yards and turnovers for each team

# If you change which columns we use, we'll also have to change some highlighted ares in cell 1 (which for some reason 
# is lower than this one)
from IPython.display import display
for team_id in team_ids:
    avg_df = pd.read_csv(f'../datasets/team_data/avg_data/{team_id}_avgs.csv')
    # Change the following line and run this if we want to change the stats we grab
    avg_df = avg_df.loc[:, ['game_id', 'avg_total_yds', 'avg_TO']]
    avg_df.to_csv(f'../datasets/team_data/avg_data/{team_id}_avgs.csv', index=False)

In [None]:
# The following cell inserts each team's ID into the scores data to make it a bit easier to stay consistent
# It also drops columns that we don't want for the initial model, but they can easily be brought back by changing
# the second to last line of this cell.
scores = load_scores()
teams = load_nfl_teams()
def get_home_team_id(row):
    team_name = row['team_home']
    team = teams[teams['team_name'] == team_name]
    return team['team_id_pfr'].item()

def get_away_team_id(row):
    team_name = row['team_away']
    team = teams[teams['team_name'] == team_name]
    return team['team_id_pfr'].item()
    
def insert_ids(row):
    home_id = get_home_team_id(row)
    away_id = get_away_team_id(row)
    return pd.Series([home_id, away_id])
    
scores[['home_id', 'away_id']] = scores.apply(insert_ids, axis=1)
scores.drop(columns=['team_favorite_id', 'spread_favorite', 'stadium_neutral', 'weather_temperature', 'weather_wind_mph', 'weather_humidity', 'weather_detail', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19'], inplace=True)
scores.to_csv('../datasets/unclean_scores.csv', index=False)

In [18]:
# Just some more cleaning on the data. Added a total score column and gave each a label: over or under

scores = pd.read_csv('../datasets/unclean_scores.csv')
scores = scores.drop(columns=['stadium'])
scores = scores[scores['over_under_line'].notna()]
scores['total_score'] = scores['score_home'] + scores['score_away']

def label_game(row):
    total = row['total_score']
    ou = row['over_under_line']
    
    # This part is a little weird but essentially it checks to see if the
    # input string is empty. NaN removal didn't catch some of these, so we make the empty
    # ones nan and then remove them later.
    if len(ou) < 2:
        return np.nan
    
    if float(total) > float(ou):
        return 'over'
    else:
        return 'under'
scores['label'] = scores.apply(label_game, axis=1)

scores = scores[(scores['label'] == 'over') | (scores['label'] == 'under')]
drop = ['team_home', 'team_away', 'schedule_week', 'schedule_playoff']
scores = scores.drop(columns=drop, axis=1)
scores = scores.reindex(columns=['schedule_date', 'schedule_season', 'home_id', 'away_id', 'score_home', 'score_away', 'total_score', 'over_under_line', 'label'])

scores.to_csv('../datasets/clean_scores.csv', index=False)

In [None]:
# The following code finally gives us our clean dataset
# We will need to change some stuff to add in more stats but it's nothing more than adding
# a few lines of code
# This adds the rolling average stats of each team for each individual game to the dataset
scores = pd.read_csv('../datasets/clean_scores.csv')
def get_game_avg(row):
    home_id = row['home_id']
    away_id = row['away_id']
    game_id = get_game_id(row['schedule_date'], home_id)
    
    # TODO: Change this if we end up adding more statistics to the dataset (columns of dataset will be different)
    home_df = pd.read_csv(f'../datasets/team_data/avg_data/{home_id}_avgs.csv')
    home_avgs = home_df[home_df['game_id'] == game_id]
    home_avg_yards = home_avgs['avg_total_yds'].iloc[0]
    home_avg_TO = home_avgs['avg_TO'].iloc[0]
    
    away_df = pd.read_csv(f'../datasets/team_data/avg_data/{away_id}_avgs.csv')
    away_avgs = away_df[away_df['game_id'] == game_id]
    away_avg_yards = away_avgs['avg_total_yds'].iloc[0]
    away_avg_TO = away_avgs['avg_TO'].iloc[0]
    return pd.Series([home_avg_yards, home_avg_TO, away_avg_yards, away_avg_TO])

#get_game_avg(scores.iloc[10000])
scores[['home_avg_yards', 'home_avg_TO', 'away_avg_yards', 'away_avg_TO']] = scores.apply(get_game_avg, axis=1)

scores.to_csv('../datasets/scores_w_avgs.csv', index=False)

In [20]:
# Reordered the columns to make it a bit more readable from a coding standpoint
scores_df = pd.read_csv('../datasets/scores_w_avgs.csv')
scores_df = scores_df.reindex(columns=['schedule_date', 'schedule_season', 'home_id', 'away_id', 'score_home', 'home_avg_yards', 'home_avg_TO', 'score_away', 'away_avg_yards', 'away_avg_TO', 'total_score', 'over_under_line', 'label'])
scores_df.to_csv('../datasets/scores_w_avgs.csv', index=False)
scores_df

Unnamed: 0,schedule_date,schedule_season,home_id,away_id,score_home,home_avg_yards,home_avg_TO,score_away,away_avg_yards,away_avg_TO,total_score,over_under_line,label
0,1/14/1968,1967,GNB,RAI,33,304.117647,2.529412,14,361.764706,2.352941,47,43.0,over
1,1/12/1969,1968,CLT,NYJ,7,332.529412,2.529412,16,365.470588,1.823529,23,40.0,under
2,1/11/1970,1969,KAN,MIN,23,315.470588,2.588235,7,293.705882,2.235294,30,39.0,under
3,1/17/1971,1970,CLT,DAL,16,301.470588,2.529412,13,306.823529,2.000000,29,36.0,under
4,1/16/1972,1971,DAL,MIA,24,342.000000,2.176471,3,311.176471,1.705882,27,34.0,under
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10391,1/17/2021,2020,KAN,CLE,22,417.117647,1.000000,17,370.882353,0.882353,39,56.0,under
10392,1/17/2021,2020,NOR,TAM,20,378.294118,1.294118,30,391.647059,0.882353,50,53.0,under
10393,1/24/2021,2020,GNB,TAM,26,386.294118,0.764706,31,392.352941,0.941176,57,53.0,over
10394,1/24/2021,2020,KAN,BUF,38,421.235294,1.058824,24,376.235294,1.176471,62,55.0,over


In [None]:
# We now have a dataset that contains the result of >10000 NFL games,
# along with each team's rolling average over the past 16 games of their
# yards per game and turnovers per game
# All rows should be filled with proper information