In [None]:
# google.colab is only importable inside a Google Colab runtime
from google.colab import files

# Interactive step: asks the user to pick local files to upload to the runtime.
# NOTE(review): presumably these are the yearly .xlsx odds workbooks that a later
# cell reads from '/content' — confirm.
files.upload()

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Maps the team spellings found in the odds spreadsheets to team abbreviations.
# Several spellings (city only, city + nickname, relocated franchises) collapse
# onto the same abbreviation, so entries are grouped by their target code.
team_dict = {
    'Arizona': 'ARI',
    'Atlanta': 'ATL',
    'Baltimore': 'BAL',
    'Buffalo': 'BUF',
    'BuffaloBills': 'BUF',
    'Carolina': 'CAR',
    'Chicago': 'CHI',
    'Cincinnati': 'CIN',
    'Cleveland': 'CLE',
    'Dallas': 'DAL',
    'Denver': 'DEN',
    'Detroit': 'DET',
    'GreenBay': 'GB',
    'Houston': 'HOU',
    'HoustonTexans': 'HOU',
    'Indianapolis': 'IND',
    'Jacksonville': 'JAX',
    'KansasCity': 'KC',
    'KCChiefs': 'KC',
    'Kansas': 'KC',
    'St.Louis': 'LA',
    'LARams': 'LA',
    'LosAngeles': 'LA',
    'SanDiego': 'LAC',
    'LAChargers': 'LAC',
    'Oakland': 'LV',
    'LasVegas': 'LV',
    'LVRaiders': 'LV',
    'Miami': 'MIA',
    'Minnesota': 'MIN',
    'NewEngland': 'NE',
    'NewOrleans': 'NO',
    'NYGiants': 'NYG',
    # NOTE(review): 'NewYork' is ambiguous between the two New York teams;
    # it is mapped to NYG here — confirm against the raw rows.
    'NewYork': 'NYG',
    'NYJets': 'NYJ',
    'Philadelphia': 'PHI',
    'Pittsburgh': 'PIT',
    'Seattle': 'SEA',
    'SanFrancisco': 'SF',
    'TampaBay': 'TB',
    'Tampa': 'TB',
    'Tennessee': 'TEN',
    'Washington': 'WAS',
    # presumably a misspelling that occurs in the raw data; the key must stay as-is
    'Washingtom': 'WAS',
}

In [None]:
# Load every Excel workbook found in `folder` into one long DataFrame `df`.
folder = '/content'

season_frames = []

# os.listdir returns names in arbitrary order; sorting makes the row order
# (and therefore the exported file) reproducible between runs
for file in sorted(os.listdir(folder)):

  # match on the extension rather than any name that merely contains 'xlsx'
  if not file.endswith('.xlsx'):
    continue

  # read through the folder path so the cell does not depend on the
  # current working directory being `folder`
  df_ = pd.read_excel(os.path.join(folder, file))

  # normalise header case per workbook so differently-cased headers line up
  # in the concat below instead of producing duplicate columns
  df_.columns = df_.columns.str.lower()

  # characters 9-12 of the file name are used as the year
  # NOTE(review): assumes names shaped like 'nfl odds 2007-08.xlsx' — confirm
  df_['year'] = file[9:13]
  season_frames.append(df_)

if not season_frames:
  raise FileNotFoundError(f"no .xlsx files found in {folder!r}")

# one concat at the end (instead of growing the frame inside the loop) and a
# fresh 0..n-1 index instead of per-file indexes repeated
df = pd.concat(season_frames, ignore_index=True)

df['year'] = df['year'].astype(int)

# names missing from team_dict become NaN here without any error
df['team'] = df['team'].map(team_dict)

In [None]:
# Label rows two at a time: rows 0-1 get 0, rows 2-3 get 2, rows 4-5 get 4, ...
# NOTE(review): this treats each consecutive pair of rows as one game — assumes
# the spreadsheets list exactly two rows per game; verify.
games = [pair_start for pair_start in range(0, len(df), 2) for _ in range(2)]

df['game_identifier'] = games

In [None]:
# Collapse each two-row game into a single row: the first row of the pair
# supplies the "away" columns and the last row supplies the "home" columns.
# Note that groupby 'first'/'last' skip missing values, so a NaN team or
# moneyline is filled from the other row of the pair.
test = (
    df
    .groupby('game_identifier')
    .agg(
        year=('year', 'first'),
        date=('date', 'first'),
        away_team=('team', 'first'),
        home_team=('team', 'last'),
        away_ml=('ml', 'first'),
        home_ml=('ml', 'last'),
    )
    .reset_index(drop=True)
)

In [None]:
def calculate_win_probability(x):
  """Convert an American moneyline into the win probability it implies.

  Args:
    x: the moneyline as a number (e.g. -150 or 130).

  Returns:
    ``|x| / (100 + |x|)`` when ``x`` is negative, otherwise one minus that
    quantity. A moneyline of 0 therefore returns 1, and NaN propagates as NaN.
  """
  magnitude = abs(x)
  share = magnitude / (100 + magnitude)

  return share if x < 0 else 1 - share

In [None]:
# Step 1: turn each side's moneyline into an implied win probability.
# These still contain the vig.
away_with_vig = test['away_ml'].apply(calculate_win_probability)
home_with_vig = test['home_ml'].apply(calculate_win_probability)

test['away_win_proba_vig'] = away_with_vig
test['home_win_proba_vig'] = home_with_vig

# Step 2: strip the vig by dividing each side by the pair's total, so the
# away and home probabilities of a game sum to 1.
vig_total = away_with_vig + home_with_vig

test['sum_vig_proba'] = vig_total
test['away_win_proba_no_vig'] = away_with_vig / vig_total
test['home_win_proba_no_vig'] = home_with_vig / vig_total

In [None]:
# converting a messy date column into a proper pandas datetime w/weekday
# the weekday column will be necessary to assign 'weeks' to create game_id col

# The last two characters of `date` are taken as the day and everything before
# them as the month (both kept as strings, as before).
# NOTE(review): assumes values shaped like 909 / 1231 / 103 — confirm.
date_text = test['date'].astype(str)
test['month'] = date_text.str[:-2]
test['day'] = date_text.str[-2:]

# Remember the year that came from the file name the first time this cell
# runs. Without this, re-running the cell would add 1 to January/February
# games again because `year` is overwritten in place below.
if 'season_start_year' not in test.columns:
  test['season_start_year'] = test['year']

# January/February games belong to the next calendar year
test['year'] = np.where(
    test['month'].isin(['1', '2']),
    test['season_start_year'] + 1,
    test['season_start_year'],
)

# assemble all dates in one vectorised call instead of one to_datetime per row
test['date_dt'] = pd.to_datetime(
    pd.DataFrame({
        'year': test['year'],
        'month': test['month'].astype(int),
        'day': test['day'].astype(int),
    })
)
test['weekday'] = test['date_dt'].dt.day_name()

In [None]:
# One row per distinct (year, date_dt), sorted by those keys, carrying that
# date's weekday name. Every game on a date shares the same weekday, so taking
# the first value of each group is enough.
unique_dates = (
    test
    .groupby(['year', 'date_dt'])['weekday']
    .first()
    .reset_index()
)

In [None]:
# Season label: dates in January/February get `year - 1`, all other dates
# keep their `year`.
unique_dates['season'] = (
    np.where(
        unique_dates['date_dt'].dt.month.isin([1, 2]),
        unique_dates['year'] - 1,
        unique_dates['year']
        )
)

# counting each unique weekday in the season to figure out the NFL season week
# (cumcount numbers the rows of each (season, weekday) group 1, 2, 3, ... in
# the current row order of unique_dates)
unique_dates['weekday_count'] = unique_dates.groupby(['season', 'weekday']).cumcount() + 1
unique_dates['week'] = unique_dates['weekday_count']

# matching the Saturday and Thursday weeks with the succeeding Sunday's week
# order matters w/this solution
#
# shift(-1) hands each row the weekday_count of the NEXT row of unique_dates,
# whatever weekday that next row is, and without regard to season boundaries.
# NOTE(review): the last row has no successor, so a trailing Saturday/Thursday
# would receive NaN (and the later astype(int) on `week` would fail) — confirm
# this cannot happen in the data.
# NOTE(review): both assignments read `weekday_count`, not `week`, so the second
# line does not see the result of the first — confirm that is intended given
# the "order matters" remark above.

unique_dates.loc[unique_dates['weekday'] == 'Saturday', 'week'] = unique_dates['weekday_count'].shift(-1)
unique_dates.loc[unique_dates['weekday'] == 'Thursday', 'week'] = unique_dates['weekday_count'].shift(-1)

In [None]:
# attach season and week to every game via its date
week_lookup = unique_dates[['date_dt', 'season', 'week']]
final_df = test.merge(week_lookup, how='left', on='date_dt')

# week -> int -> zero-padded two-character string; season -> string.
# These are the text pieces of the game_id column that can be used to merge
# onto the games_df.
final_df['week'] = final_df['week'].astype(int).astype(str).str.zfill(width=2)
final_df['season'] = final_df['season'].astype(str)

# game_id layout: <season>_<week>_<away_team>_<home_team>
id_parts = zip(final_df['season'], final_df['week'], final_df['away_team'], final_df['home_team'])
final_df['game_id'] = ['_'.join(parts) for parts in id_parts]


In [None]:
# keep only the columns that go into the exported file
export_columns = ['game_id', 'away_team', 'home_team', 'home_ml', 'home_win_proba_no_vig']
final_df = final_df.loc[:, export_columns].copy()

# written without the DataFrame index
final_df.to_csv('nfl_moneylines_2007_2021.csv', index=False)