In [None]:
!nvidia-smi

Wed Jan 18 17:22:31 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P0    31W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install torchsummaryX

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchsummaryX
  Downloading torchsummaryX-1.3.0-py3-none-any.whl (3.6 kB)
Installing collected packages: torchsummaryX
Successfully installed torchsummaryX-1.3.0


In [None]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [None]:
import numpy as np
import pandas as pd
import torch
import os
from torch.utils.data import Dataset, DataLoader
from torchsummaryX import summary
from matplotlib import pyplot as plt 
import gc
import warnings
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)
pd.options.display.max_columns = 100

# Data Loading
We use 2 `csv` files containing statistics for games and rankings, respectively. The data was acquired from [this Kaggle link](https://www.kaggle.com/datasets/nathanlauga/nba-games).

In [None]:
DATA_DIR = './drive/MyDrive/basketball_analysis/game-prediction'

games = pd.read_csv(os.path.join(DATA_DIR, 'games.csv'))
print(games.columns)
rankings = pd.read_csv(os.path.join(DATA_DIR, 'ranking.csv'))
print(rankings.columns)
print(games.iloc[0])
print(rankings.iloc[0])

Index(['GAME_DATE_EST', 'GAME_ID', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID',
       'VISITOR_TEAM_ID', 'SEASON', 'TEAM_ID_home', 'PTS_home', 'FG_PCT_home',
       'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home', 'TEAM_ID_away',
       'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away',
       'REB_away', 'HOME_TEAM_WINS'],
      dtype='object')
Index(['TEAM_ID', 'LEAGUE_ID', 'SEASON_ID', 'STANDINGSDATE', 'CONFERENCE',
       'TEAM', 'G', 'W', 'L', 'W_PCT', 'HOME_RECORD', 'ROAD_RECORD',
       'RETURNTOPLAY'],
      dtype='object')
GAME_DATE_EST       2023-01-14
GAME_ID               22200641
GAME_STATUS_TEXT         Final
HOME_TEAM_ID        1610612748
VISITOR_TEAM_ID     1610612749
SEASON                    2022
TEAM_ID_home        1610612748
PTS_home                 111.0
FG_PCT_home              0.517
FT_PCT_home              0.833
FG3_PCT_home               0.5
AST_home                  17.0
REB_home                  51.0
TEAM_ID_away        1610612749
PTS_away   

# Create a mapping of teams

In [None]:
all_teams = list(set(list(games['HOME_TEAM_ID']) + list(games['VISITOR_TEAM_ID'])))
team_mapping = {}
team_name_to_id = {}
for i in range(len(all_teams)):
  team_mapping[all_teams[i]] = i
  team_name_to_id[rankings[rankings['TEAM_ID'] == all_teams[i]].iloc[0]['TEAM']] = all_teams[i]
team_name_to_id

{'Atlanta': 1610612737,
 'Boston': 1610612738,
 'Cleveland': 1610612739,
 'New Orleans': 1610612740,
 'Chicago': 1610612741,
 'Dallas': 1610612742,
 'Denver': 1610612743,
 'Golden State': 1610612744,
 'Houston': 1610612745,
 'LA Clippers': 1610612746,
 'L.A. Lakers': 1610612747,
 'Miami': 1610612748,
 'Milwaukee': 1610612749,
 'Minnesota': 1610612750,
 'Brooklyn': 1610612751,
 'New York': 1610612752,
 'Orlando': 1610612753,
 'Indiana': 1610612754,
 'Philadelphia': 1610612755,
 'Phoenix': 1610612756,
 'Portland': 1610612757,
 'Sacramento': 1610612758,
 'San Antonio': 1610612759,
 'Oklahoma City': 1610612760,
 'Toronto': 1610612761,
 'Utah': 1610612762,
 'Memphis': 1610612763,
 'Washington': 1610612764,
 'Detroit': 1610612765,
 'Charlotte': 1610612766}

# Data Preprocessing

## Feature selection

* Last 10 games stats of each team
* Last 3 matchups between 2 teams
* Current ranking of each team

## Labels
* 1 = Home team wins, 0 = Road team wins

Also, normalize individual statistics to make our model converge faster.

In [None]:
max_points = max(games['PTS_home'].max(), games['PTS_away'].max())
max_assists = max(games['AST_home'].max(), games['AST_away'].max())
max_reb = max(games['REB_home'].max(), games['REB_away'].max())
print(f'Max points: {max_points}, Max assists: {max_assists}, Max reb: {max_reb}')

Max points: 168.0, Max assists: 50.0, Max reb: 81.0


In [None]:
TEAM_HISTORY = 10
def get_last_games(t, before):
  try:
    home_team_games = games[(games['HOME_TEAM_ID'] == t)]
    home_team_games['IS_HOME'] = 1
    drop_cols = ['GAME_ID', 'GAME_STATUS_TEXT', 'HOME_TEAM_ID',
        'VISITOR_TEAM_ID', 'TEAM_ID_home', 'TEAM_ID_away',
        'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away',
        'REB_away']
    home_team_games.drop(columns=drop_cols, inplace=True)
    rename_cols = {
        'PTS_home': 'PTS', 'FG_PCT_home': 'FG_PCT',
        'FT_PCT_home': 'FT_PCT', 'FG3_PCT_home': 'FG3_PCT', 'AST_home': 'AST', 'REB_home': 'REB', 'HOME_TEAM_WINS': 'WIN'
    }
    home_team_games.rename(columns=rename_cols, inplace=True)
    away_team_games =  games[(games['VISITOR_TEAM_ID'] == t)]
    away_team_games['IS_HOME'] = 0
    away_team_games['WIN'] = away_team_games['HOME_TEAM_WINS'].map({1: 0, 0: 1})
    rename_cols = {
        'PTS_away': 'PTS', 'FG_PCT_away': 'FG_PCT',
        'FT_PCT_away': 'FT_PCT', 'FG3_PCT_away': 'FG3_PCT', 'AST_away': 'AST', 'REB_away': 'REB'
    }
    away_team_games.rename(columns=rename_cols, inplace=True)
    team_games = pd.concat((home_team_games, away_team_games))
    before_date = pd.to_datetime(before)
    team_games.loc[:, 'GAME_DATE_EST'] = pd.to_datetime(team_games['GAME_DATE_EST'])
    team_games = team_games.sort_values(by=['GAME_DATE_EST'])
    team_games.dropna(inplace=True)
    team_games = team_games[team_games['GAME_DATE_EST'] < before_date][-TEAM_HISTORY:]
    if len(team_games)<TEAM_HISTORY:
      return None
    team_games = team_games.loc[:, ['WIN', 'IS_HOME', 'PTS', 'FG_PCT', 'FT_PCT', 'FG3_PCT', 'AST', 'REB']]
    team_games.loc[:, 'PTS'] = team_games['PTS']/max_points
    team_games.loc[:, 'AST'] = team_games['AST']/max_assists
    team_games.loc[:, 'REB'] = team_games['REB']/max_reb
    team_games.reset_index(drop=True, inplace=True)
    return team_games
  except:
    return None

Let's get the last 10 games for a the New Orleans Pelicans before December 12, 2022. 

In [None]:
get_last_games(1610612740, '2022-12-13')

Unnamed: 0,WIN,IS_HOME,PTS,FG_PCT,FT_PCT,FG3_PCT,AST,REB
0,1,0,0.738095,0.472,0.865,0.364,0.46,0.654321
1,0,0,0.660714,0.442,0.909,0.405,0.5,0.407407
2,1,0,0.666667,0.495,0.625,0.417,0.64,0.617284
3,0,0,0.696429,0.461,0.714,0.32,0.7,0.62963
4,0,0,0.720238,0.435,0.714,0.304,0.46,0.728395
5,0,0,0.72619,0.495,0.706,0.412,0.64,0.530864
6,1,0,0.684524,0.5,0.778,0.158,0.4,0.617284
7,1,0,0.767857,0.573,0.7,0.424,0.66,0.518519
8,0,0,0.660714,0.43,0.871,0.323,0.52,0.506173
9,1,0,0.696429,0.435,0.962,0.286,0.5,0.654321


In [None]:
MATCHUP_HISTORY = 3
def get_matchup_history(home, away, before):
  matchup_games_home = games[((games['HOME_TEAM_ID'] == home) & (games['VISITOR_TEAM_ID'] == away))]
  matchup_games_home['IS_HOME'] = 1
  matchup_games_home['WIN'] = matchup_games_home['HOME_TEAM_WINS']
  matchup_games_away = games[((games['HOME_TEAM_ID'] == away) & (games['VISITOR_TEAM_ID'] == home))]
  rename_cols =   rename_cols = {
      'PTS_away': 'PTS_home', 'FG_PCT_away': 'FG_PCT_home',
      'FT_PCT_away': 'FT_PCT_home', 'FG3_PCT_away': 'FG3_PCT_home', 'AST_away': 'AST_home', 'REB_away': 'REB_home',
      'PTS_home': 'PTS_away', 'FG_PCT_home': 'FG_PCT_away',
      'FT_PCT_home': 'FT_PCT_away', 'FG3_PCT_home': 'FG3_PCT_away', 'AST_home': 'AST_away', 'REB_home': 'REB_away'
  }
  matchup_games_away.rename(columns=rename_cols, inplace=True)
  matchup_games_away['IS_HOME'] = 0
  matchup_games_away['WIN'] = matchup_games_away['HOME_TEAM_WINS'].map({1: 0, 0: 1})
  before_date = pd.to_datetime(before)
  matchup_games = pd.concat((matchup_games_home, matchup_games_away))
  matchup_games = matchup_games.dropna()
  matchup_games.loc[:, 'GAME_DATE_EST'] = pd.to_datetime(matchup_games['GAME_DATE_EST'])
  matchup_games = matchup_games.sort_values(by=['GAME_DATE_EST'])
  matchup_games = matchup_games[matchup_games['GAME_DATE_EST'] < before_date][-MATCHUP_HISTORY:]
  if len(matchup_games)<MATCHUP_HISTORY:
    return None
  matchup_games.loc[:, 'PTS_home'] = matchup_games['PTS_home']/max_points
  matchup_games.loc[:, 'PTS_away'] = matchup_games['PTS_away']/max_points

  matchup_games.loc[:, 'AST_home'] = matchup_games['AST_home']/max_assists
  matchup_games.loc[:, 'AST_away'] = matchup_games['AST_away']/max_assists


  matchup_games.loc[:, 'REB_home'] = matchup_games['REB_home']/max_reb
  matchup_games.loc[:, 'REB_away'] = matchup_games['REB_away']/max_reb
  matchup_games = matchup_games.loc[:, ['WIN', 'IS_HOME', 'PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home',
                                        'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away']]
  matchup_games.reset_index(drop=True, inplace=True)
  return matchup_games

Let's get the last 3 games between the New Orleans Pelicans and the Phoenix Suns before December 13, 2022.

In [None]:
get_matchup_history(1610612740, 1610612756, '2022-12-13')

Unnamed: 0,WIN,IS_HOME,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,0,0,0.660714,0.442,0.909,0.405,0.5,0.407407,0.738095,0.522,0.9,0.303,0.66,0.580247
1,1,1,0.761905,0.511,0.8,0.296,0.54,0.54321,0.696429,0.5,0.692,0.5,0.6,0.45679
2,1,1,0.767857,0.581,0.75,0.32,0.62,0.530864,0.738095,0.467,0.765,0.342,0.68,0.518519


In [None]:
def get_ranking(t, date):
  team_ranking = rankings[rankings['TEAM_ID'] == t]
  team_ranking.loc[:, 'STANDINGSDATE'] = pd.to_datetime(team_ranking['STANDINGSDATE'])
  date = pd.to_datetime(date)
  max_games = team_ranking['G'].max()
  team_ranking = team_ranking[team_ranking['STANDINGSDATE'] == date][:1]
  team_ranking.loc[:, 'G'] = team_ranking['G'].apply(lambda x: float(x)/max_games)
  team_ranking.loc[:, 'HOME_W_PCT'] = team_ranking['HOME_RECORD'].apply(lambda x: float(x.split('-')[0])/(max(1, float(x.split('-')[0]) + float(x.split('-')[1]))))
  team_ranking.loc[:, 'AWAY_W_PCT'] = team_ranking['ROAD_RECORD'].apply(lambda x: float(x.split('-')[0])/(max(1, float(x.split('-')[0]) + float(x.split('-')[1]))))
  team_ranking.drop(columns = ['SEASON_ID', 'W', 'L', 'HOME_RECORD', 'ROAD_RECORD', 'TEAM_ID', 'LEAGUE_ID', 'STANDINGSDATE', 'CONFERENCE', 'TEAM', 'RETURNTOPLAY'], inplace=True)
  return team_ranking

Let's get the ranking of the Pelicans on December 13, 2022.

In [None]:
get_ranking(1610612740, '2022-12-13')

Unnamed: 0,G,W_PCT,HOME_W_PCT,AWAY_W_PCT
826,0.329268,0.667,0.8,0.5


**Let's get a single training instance:**

Suppose we want to predict the following game: 

Golden State Warriors vs. Boston Celtics on December 10, 2022

In [None]:
warriors_id = rankings[rankings['TEAM'] == 'Golden State']['TEAM_ID'].iloc[0]
celtics_id = rankings[rankings['TEAM'] == 'Boston']['TEAM_ID'].iloc[0]
print(f'Warriors id: {warriors_id}, Celtics id: {celtics_id}')
date = '2022-12-10'
game = games[(games['GAME_DATE_EST'] == date) & (games['HOME_TEAM_ID'] == warriors_id) & (games['VISITOR_TEAM_ID'] == celtics_id)]
game

Warriors id: 1610612744, Celtics id: 1610612738


Unnamed: 0,GAME_DATE_EST,GAME_ID,GAME_STATUS_TEXT,HOME_TEAM_ID,VISITOR_TEAM_ID,SEASON,TEAM_ID_home,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,TEAM_ID_away,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away,HOME_TEAM_WINS
258,2022-12-10,22200392,Final,1610612744,1610612738,2022,1610612744,123.0,0.511,0.8,0.333,26.0,53.0,1610612738,107.0,0.437,0.731,0.3,17.0,39.0,1


In [None]:
warriors_history = get_last_games(warriors_id, date)
print('Warriors history:')
warriors_history

Warriors history:


Unnamed: 0,WIN,IS_HOME,PTS,FG_PCT,FT_PCT,FG3_PCT,AST,REB
0,0,0,0.64881,0.464,0.765,0.367,0.62,0.506173
1,0,0,0.767857,0.515,0.667,0.442,0.62,0.432099
2,0,0,0.625,0.493,0.92,0.375,0.46,0.358025
3,0,0,0.684524,0.467,0.938,0.34,0.54,0.530864
4,0,0,0.708333,0.457,1.0,0.429,0.6,0.493827
5,1,0,0.755952,0.535,0.733,0.511,0.76,0.493827
6,0,0,0.494048,0.378,0.895,0.233,0.34,0.419753
7,1,0,0.815476,0.575,0.85,0.426,0.72,0.580247
8,0,0,0.672619,0.477,0.783,0.256,0.54,0.617284
9,0,0,0.732143,0.459,0.76,0.333,0.52,0.518519


In [None]:
celtics_history = get_last_games(celtics_id, date)
print('Celtics history:')
celtics_history

Celtics history:


Unnamed: 0,WIN,IS_HOME,PTS,FG_PCT,FT_PCT,FG3_PCT,AST,REB
0,0,0,0.672619,0.409,0.963,0.268,0.52,0.641975
1,1,0,0.791667,0.534,1.0,0.529,0.6,0.419753
2,1,0,0.64881,0.463,0.75,0.324,0.52,0.555556
3,1,0,0.696429,0.429,0.875,0.375,0.54,0.518519
4,1,0,0.75,0.545,0.9,0.457,0.58,0.604938
5,1,0,0.696429,0.482,0.882,0.435,0.56,0.567901
6,0,0,0.636905,0.437,0.706,0.38,0.56,0.469136
7,1,0,0.613095,0.432,0.9,0.395,0.36,0.592593
8,1,0,0.690476,0.489,0.714,0.361,0.54,0.604938
9,1,0,0.744048,0.485,0.846,0.356,0.58,0.654321


In [None]:
matchup_history = get_matchup_history(warriors_id, celtics_id, date)
print('Matchup_history:')
matchup_history

Matchup_history:


Unnamed: 0,WIN,IS_HOME,PTS_home,FG_PCT_home,FT_PCT_home,FG3_PCT_home,AST_home,REB_home,PTS_away,FG_PCT_away,FT_PCT_away,FG3_PCT_away,AST_away,REB_away
0,1,0,0.636905,0.44,0.8,0.349,0.4,0.679012,0.577381,0.4,0.737,0.395,0.44,0.518519
1,1,1,0.619048,0.466,0.867,0.225,0.46,0.481481,0.559524,0.413,0.677,0.344,0.36,0.580247
2,1,0,0.613095,0.413,1.0,0.413,0.54,0.54321,0.535714,0.425,0.917,0.393,0.54,0.506173


In [None]:
warriors_ranking = get_ranking(warriors_id, date)
print('Warriors ranking:')
warriors_ranking

Warriors ranking:


Unnamed: 0,G,W_PCT,HOME_W_PCT,AWAY_W_PCT
877,0.329268,0.519,0.857143,0.153846


In [None]:
celtics_ranking = get_ranking(celtics_id, date)
print('Celtics ranking:')
celtics_ranking

Celtics ranking:


Unnamed: 0,G,W_PCT,HOME_W_PCT,AWAY_W_PCT
5145,0.329268,0.778,0.846154,0.714286


In [None]:
warriors_win = game['HOME_TEAM_WINS'].iloc[0]
warriors_win

1

Finally, let's actually create the data vector:

In [None]:
x = (warriors_history.to_numpy(), celtics_history.to_numpy(), matchup_history.to_numpy(), warriors_ranking.to_numpy(), celtics_ranking.to_numpy())
x

(array([[0.        , 0.        , 0.64880952, 0.464     , 0.765     ,
         0.367     , 0.62      , 0.50617284],
        [0.        , 0.        , 0.76785714, 0.515     , 0.667     ,
         0.442     , 0.62      , 0.43209877],
        [0.        , 0.        , 0.625     , 0.493     , 0.92      ,
         0.375     , 0.46      , 0.35802469],
        [0.        , 0.        , 0.68452381, 0.467     , 0.938     ,
         0.34      , 0.54      , 0.5308642 ],
        [0.        , 0.        , 0.70833333, 0.457     , 1.        ,
         0.429     , 0.6       , 0.49382716],
        [1.        , 0.        , 0.75595238, 0.535     , 0.733     ,
         0.511     , 0.76      , 0.49382716],
        [0.        , 0.        , 0.49404762, 0.378     , 0.895     ,
         0.233     , 0.34      , 0.41975309],
        [1.        , 0.        , 0.81547619, 0.575     , 0.85      ,
         0.426     , 0.72      , 0.58024691],
        [0.        , 0.        , 0.67261905, 0.477     , 0.783     ,
         0.

Now let's create a function that encapsulates all this logic above:

In [None]:
def create_dataset(df, for_team = None):
  X = []
  Y = []
  if for_team is not None:
    print(f'Fetching for specific team: {for_team}')
    df = df[(df['HOME_TEAM_ID'] == for_team) | (df['VISITOR_TEAM_ID'] == for_team)]
  for (i, g) in df.iterrows():
    if i%1000 == 0:
      print(f'{i} of {len(df.index)}')
    home = g['HOME_TEAM_ID']
    away = g['VISITOR_TEAM_ID']
    date = g['GAME_DATE_EST']
    home_history = get_last_games(home, date)
    if home_history is None:
      continue
    away_history = get_last_games(away, date)
    if away_history is None:
      continue
    matchup_history = get_matchup_history(home, away, date)
    if matchup_history is None:
      continue
    home_history = home_history.to_numpy()
    away_history = away_history.to_numpy()
    matchup_history = matchup_history.to_numpy()
    r1 = get_ranking(home, date).to_numpy()
    r2 = get_ranking(away, date).to_numpy()
    label = g['HOME_TEAM_WINS']
    X.append((home_history, away_history, matchup_history, r1, r2))
    Y.append(label)
  return np.array(X), np.array(Y)

Save the data for future use!

In [None]:
import pickle
if True:
  all_X, all_Y = create_dataset(games, team_name_to_id['Washington'])
  with open(os.path.join(DATA_DIR, 'x.pkl'), 'wb') as f:
    pickle.dump(all_X, f, protocol=pickle.HIGHEST_PROTOCOL)

  with open(os.path.join(DATA_DIR, 'y.pkl'), 'wb') as f:
    pickle.dump(all_Y, f, protocol=pickle.HIGHEST_PROTOCOL)
else:
  with open(os.path.join(DATA_DIR, 'x.pkl'), 'rb') as f:
    all_X = pickle.load(f)
  with open(os.path.join(DATA_DIR, 'y.pkl'), 'rb') as f:
    all_Y = pickle.load(f)
print(f'Created dataset with {len(all_X)} examples')
print('Example:')
print(f'{all_X[0]} --> {all_Y[0]}')

Fetching for specific team: 1610612764
Created dataset with 1664 examples
Example:
[array([[0.        , 0.        , 0.76190476, 0.527     , 0.565     ,
         0.487     , 0.8       , 0.2962963 ],
        [0.        , 0.        , 0.55357143, 0.429     , 0.923     ,
         0.36      , 0.4       , 0.51851852],
        [0.        , 0.        , 0.69642857, 0.463     , 0.792     ,
         0.256     , 0.48      , 0.59259259],
        [1.        , 0.        , 0.67261905, 0.475     , 0.703     ,
         0.407     , 0.44      , 0.50617284],
        [0.        , 0.        , 0.66666667, 0.561     , 0.765     ,
         0.333     , 0.4       , 0.45679012],
        [1.        , 0.        , 0.74404762, 0.557     , 0.708     ,
         0.313     , 0.6       , 0.49382716],
        [1.        , 0.        , 0.70833333, 0.56      , 0.8       ,
         0.429     , 0.66      , 0.65432099],
        [1.        , 0.        , 0.70238095, 0.532     , 0.688     ,
         0.259     , 0.58      , 0.66666667

# Dataset Creation

Create `StatsDataset` and `DataLoader` from the dataset.

In [None]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

In [None]:
class StatsDataset(Dataset):
  def __init__(self, stats, labels):
    self.stats = stats
    self.labels = labels

  def __len__(self):
    return len(self.stats)

  def __getitem__(self, idx):
    return torch.from_numpy(self.stats[idx][0]), torch.from_numpy(self.stats[idx][1]), torch.from_numpy(self.stats[idx][2]), self.stats[idx][3], self.stats[idx][4], self.labels[idx]

In [None]:
BATCH_SIZE = 64
train_X, val_X = np.split(all_X, [int(len(all_X)*0.9)])
train_Y, val_Y = np.split(all_Y, [int(len(all_Y)*0.9)])
print(f'Training examples: {len(train_X)}, Validation examples: {len(val_X)}')
train_dataset = StatsDataset(train_X, train_Y)
val_dataset = StatsDataset(val_X, val_Y)

train_loader = torch.utils.data.DataLoader(train_dataset,num_workers= 4,
                                           batch_size=BATCH_SIZE, pin_memory= True,
                                          shuffle= True)
val_loader = torch.utils.data.DataLoader(val_dataset,num_workers= 4,
                                           batch_size=BATCH_SIZE, pin_memory= True,
                                           shuffle= True)
print(f'Training size: {len(train_loader)}')
print(f'Val size: {len(val_loader)}')
for x1, x2, x3, x4, x5, y in train_loader:
  print(f'Home history: {x1.shape}')
  print(f'Away history: {x2.shape}')
  print(f'Matchup history: {x3.shape}')
  print(f'Rank: {x4.shape}')
  print(f'Away Rank: {x5.shape}')
  print(f'Label: {y}')
  break

Training examples: 1497, Validation examples: 167
Training size: 24
Val size: 3
Home history: torch.Size([64, 10, 8])
Away history: torch.Size([64, 10, 8])
Matchup history: torch.Size([64, 3, 14])
Rank: torch.Size([64, 1, 4])
Away Rank: torch.Size([64, 1, 4])
Label: tensor([1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0,
        0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
        1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0])


In [None]:
gc.collect()

63

# Model Creation
Let's create our model.

Our model should consist of 3 LSTM's to maintain the history of each team AND to maintain the history of the matchups between the 2 teams.

We'll combine the outputs of these LSTMs with a linear layer.

Next, we also will make a simple linear layer to handle the rankings.

Finally, we will combine everything through one fully connected layer.

The output of thiw `fc` layer will go into a sigmoid function since our output is binary.

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [None]:
TEAM_DIM = x1.shape[2]
MATCHUP_DIM = x3.shape[2]
RANK_DIM = x4.shape[2]

In [None]:
# Model Definition
class GamePredictionNetwork(nn.Module):
    def __init__(self, 
                 team_dim,
                 matchup_dim,
                 rank_dim, 
                 hidden_dim, 
                 n_layers,
                 dropout):
        
      super().__init__()
      
      self.home_lstm = nn.LSTM(team_dim, hidden_dim, num_layers = n_layers, dropout = 0.0, batch_first=True)
      self.away_lstm = nn.LSTM(team_dim, hidden_dim, num_layers = n_layers, dropout = 0.0, batch_first=True)
      self.matchup_lstm = nn.LSTM(matchup_dim, hidden_dim, num_layers = n_layers, dropout = 0.0, batch_first=True)
      
      self.lstm_linear = nn.Sequential(
          nn.Linear(hidden_dim*3, hidden_dim),
          nn.Dropout(0.0),
      )
      
      self.home_rank = nn.Sequential(
          nn.Linear(rank_dim, rank_dim*8),
          nn.GELU(),
          nn.Linear(rank_dim*8, rank_dim*16),
          nn.Dropout(0.0),
          nn.GELU(),
          nn.Linear(rank_dim*16, rank_dim*8),
          nn.GELU(),
          nn.Linear(rank_dim*8, rank_dim),
          nn.GELU(),
          nn.Dropout(0.0),
      )

      self.away_rank = nn.Sequential(
          nn.Linear(rank_dim, rank_dim*8),
          nn.GELU(),
          nn.Linear(rank_dim*8, rank_dim*16),
          nn.Dropout(0.0),
          nn.GELU(),
          nn.Linear(rank_dim*16, rank_dim*8),
          nn.GELU(),
          nn.Linear(rank_dim*8, rank_dim),
          nn.GELU(),     
          nn.Dropout(0.0),
      )
      
      self.fc = nn.Linear(rank_dim*2+hidden_dim, 1)

      self.sigmoid = nn.Sigmoid()

    def forward(self, home_history, away_history, matchup_history, home_ranking, away_ranking):
      _, (h_h, _) = self.home_lstm(home_history)
      _, (a_h, _) = self.away_lstm(away_history)
      _, (m_h, _) = self.matchup_lstm(matchup_history)

      lstm_combined = self.lstm_linear(torch.cat([h_h[-1].unsqueeze(1), a_h[-1].unsqueeze(1), m_h[-1].unsqueeze(1)], 2))
      home_ranking = self.home_rank(home_ranking)
      away_ranking = self.away_rank(away_ranking)
      rank_combined = torch.cat((home_ranking, away_ranking), dim=2)
      combined = torch.cat([lstm_combined, rank_combined], 2)
      
      out = self.fc(combined)
      out = self.sigmoid(out).squeeze(dim=1).squeeze(dim=1)
      return out

Potential Ablations

* Dropout
 - LSTM: 0.2, Linear: 0.3
   - Train Accuracy: 75%, Val Accuracy: 78%
 - LSTM: 0.3, Linear: 0.4
    - Train Accuracy: , Val Accuracy: 

 - LSTM: 0.3, Linear: 0.5
    - Train Accuracy: , Val Accuracy: 

 - LSTM: 0.5, Linear: 0.6
    - Train Accuracy: , Val Accuracy: 

* Optimizer
 - AdamW
    - Train Accuracy: , Val Accuracy: 
 - Adam
    - Train Accuracy: , Val Accuracy: 

* Activation Functions
 - Tanh 
    - Train Accuracy: , Val Accuracy: 

 - GELU
    - Train Accuracy: 75%, Val Accuracy: 78%

 - ReLU
    - Train Accuracy: 74%, Val Accuracy: 75%

* Linear layer architecture
  - Cylinder
     - Train Accuracy: , Val Accuracy: 

  - Pyramid
     - Train Accuracy: , Val Accuracy: 

  - Reverse Pyramid
     - Train Accuracy: , Val Accuracy: 


* Sequence Length
  - Longer (10, 3)
       - Train Accuracy: , Val Accuracy: 
  - Shorter (5, 2)
       - Train Accuracy: 74%, Val Accuracy: 77%


In [None]:
LEARNING_RATE = 0.01
model = GamePredictionNetwork(TEAM_DIM, MATCHUP_DIM, RANK_DIM, 64, 1, 0).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCELoss()
summary(model, x1.float().cuda(), x2.float().cuda(), x3.float().cuda(), x4.float().cuda(), x5.float().cuda())

                        Kernel Shape  Output Shape   Params Mult-Adds
Layer                                                                
0_home_lstm                        -  [64, 10, 64]  18.944k   18.432k
1_away_lstm                        -  [64, 10, 64]  18.944k   18.432k
2_matchup_lstm                     -   [64, 3, 64]   20.48k   19.968k
3_lstm_linear.Linear_0     [192, 64]   [64, 1, 64]  12.352k   12.288k
4_lstm_linear.Dropout_1            -   [64, 1, 64]        -         -
5_home_rank.Linear_0         [4, 32]   [64, 1, 32]    160.0     128.0
6_home_rank.GELU_1                 -   [64, 1, 32]        -         -
7_home_rank.Linear_2        [32, 64]   [64, 1, 64]   2.112k    2.048k
8_home_rank.Dropout_3              -   [64, 1, 64]        -         -
9_home_rank.GELU_4                 -   [64, 1, 64]        -         -
10_home_rank.Linear_5       [64, 32]   [64, 1, 32]    2.08k    2.048k
11_home_rank.GELU_6                -   [64, 1, 32]        -         -
12_home_rank.Linear_

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_home_lstm,-,"[64, 10, 64]",18944.0,18432.0
1_away_lstm,-,"[64, 10, 64]",18944.0,18432.0
2_matchup_lstm,-,"[64, 3, 64]",20480.0,19968.0
3_lstm_linear.Linear_0,"[192, 64]","[64, 1, 64]",12352.0,12288.0
4_lstm_linear.Dropout_1,-,"[64, 1, 64]",,
5_home_rank.Linear_0,"[4, 32]","[64, 1, 32]",160.0,128.0
6_home_rank.GELU_1,-,"[64, 1, 32]",,
7_home_rank.Linear_2,"[32, 64]","[64, 1, 64]",2112.0,2048.0
8_home_rank.Dropout_3,-,"[64, 1, 64]",,
9_home_rank.GELU_4,-,"[64, 1, 64]",,


In [None]:
def evaluate(model, optimizer, criterion, loader, scheduler = None):
  epoch_loss = 0
  epoch_acc = 0
  n_examples = 0
  model.eval()
  with torch.no_grad():
    for i, data in enumerate(loader):
      
      home_history, away_history, matchup_history, home_ranking, away_ranking, labels = data
      home_history = home_history.float().cuda()
      away_history = away_history.float().cuda()
      matchup_history = matchup_history.float().cuda()
      home_ranking = home_ranking.float().cuda()
      away_ranking = away_ranking.float().cuda()
      labels = labels.float().cuda()
      predictions = model(home_history, away_history, matchup_history, home_ranking, away_ranking)
          
      loss = criterion(predictions, labels)

      # Accumulate epoch stats
      epoch_loss += loss.item()
      epoch_acc += (predictions.round() == labels).sum().item()
      n_examples += predictions.size(0)

  return epoch_loss/n_examples, epoch_acc/n_examples*100

In [None]:
def train_epoch(model, optimizer, criterion, loader, scheduler = None):
  epoch_loss = 0
  epoch_acc = 0
  n_examples = 0
  model.train()
  for i, data in enumerate(loader):
    home_history, away_history, matchup_history, home_ranking, away_ranking, labels = data

    home_history = home_history.float().cuda()
    away_history = away_history.float().cuda()
    matchup_history = matchup_history.float().cuda()
    home_ranking = home_ranking.float().cuda()
    away_ranking = away_ranking.float().cuda()
    labels = labels.float().cuda()

    optimizer.zero_grad()
    predictions = model(home_history, away_history, matchup_history, home_ranking, away_ranking)
    # back prop + optimize
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()

    # Accumulate epoch stats
    epoch_loss += loss.item()
    epoch_acc += (predictions.round() == labels).sum().item()
    n_examples += predictions.size(0)

  return epoch_loss/n_examples, epoch_acc/n_examples*100

In [None]:
gc.collect()

0

In [None]:
EPOCHS = 100

for epoch in range(EPOCHS):
  train_loss, train_acc = train_epoch(model, optimizer, criterion, train_loader)
  val_loss, val_acc = evaluate(model, optimizer, criterion, val_loader)
  print(f'Epoch {epoch}:')
  print(f'Train loss: {train_loss}, Train acc: {train_acc}%')
  print(f'Val loss: {val_loss}, Val acc: {val_acc}%')

Epoch 0:
Train loss: 0.010429618632546567, Train acc: 60.320641282565134%
Val loss: 0.010916692768028396, Val acc: 67.06586826347305%
Epoch 1:
Train loss: 0.00885607589859921, Train acc: 70.67468269873079%
Val loss: 0.010090832403320038, Val acc: 69.46107784431138%
Epoch 2:
Train loss: 0.008603971165342974, Train acc: 71.67668670674684%
Val loss: 0.009995393053500239, Val acc: 68.8622754491018%
Epoch 3:
Train loss: 0.00853512178840204, Train acc: 72.94589178356713%
Val loss: 0.00938129335820318, Val acc: 69.46107784431138%
Epoch 4:
Train loss: 0.00832552167679679, Train acc: 72.87909151636607%
Val loss: 0.009274383148033461, Val acc: 70.05988023952095%
Epoch 5:
Train loss: 0.008318233247112256, Train acc: 71.20908483633934%
Val loss: 0.009842110965066327, Val acc: 71.25748502994011%
Epoch 6:
Train loss: 0.008505985807719513, Train acc: 70.875083500334%
Val loss: 0.009366163058195286, Val acc: 71.25748502994011%
Epoch 7:
Train loss: 0.008415899100746405, Train acc: 72.01068804275216%
Va

# There we have it!

Our model can achieve an approximate ~75-78% accuracy on validation and training data. This outperforms most existing prediction models.