Moneyball was a good movie, wasn't it? <br>
https://www.youtube.com/watch?v=Tzin1DgexlE

In [1]:
import statsapi
import re
import os
import ast
import json

import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

### Collect Data

In [4]:
# get raw game info and filter out non regular season games
games_raw = statsapi.schedule(start_date='03/29/2018',end_date='10/28/2018')
games_raw = [game for game in games_raw if game['game_type'] == 'R']

In [5]:
# parse through the raw games data
games = [] # list of filtered game data
team_ids = set([]) # set of unique team ids
for raw_game in games_raw:    # update team_ids with the teams playing
    team_ids.add(home_id := str(raw_game['home_id']))
    team_ids.add(away_id := str(raw_game['away_id']))
    
    # filter game data
    winner = 1 if raw_game['home_score'] > raw_game['away_score'] else (0 if raw_game['home_score'] == raw_game['away_score'] else -1)
    games.append({'winner': winner, 'home_id': home_id, 'away_id': away_id})


In [6]:
# create set of all player names
roster_regex = r'#\d+\s+(?:\w|\d)+\s+(.*)'
team_players = {}
player_names = set([])
for team_id in team_ids:
    roster = statsapi.roster(team_id, season='2018')
    roster_names = re.findall(roster_regex, roster)
    team_players[team_id] = roster_names
    player_names.update(roster_names)

In [9]:
# create set of all player names and ids
player_ids = {}
ping_count = 0
for player_name in player_names:
    print("Pings: %d" % ping_count, end='\r')
    ping_count += 1

    player_id = statsapi.lookup_player(player_name, season=2018)[0]['id']
    player_ids[player_name] = str(player_id)

Pings: 1351

In [10]:
# get stats for all players
data_regex = r'(\w+): (\d*.?\d+)'
player_data = {}
broken_players = []
ping_count = 0
for _, player_id in player_ids.items():
    print("Pings: %d" % ping_count, end='\r')
    ping_count += 1
    
    try:
        datalines = statsapi.player_stats(player_id, group='hitting', type='career').splitlines()
    except: # TODO what do we do about the missing player data?
        broken_players.append(player_id)
        continue

    data = [re.findall(data_regex, line)[0] for line in datalines if re.findall(data_regex, line)]
    player_data[player_id] = {}
    for data_key, data in data:
        player_data[player_id][data_key] = float(data)

Pings: 1351

### Dump Data

In [11]:
# print out games, team_ids, players, and player_data to file to save API calls
if not os.path.isdir('raw_data'):
   os.mkdir('raw_data')

with open('raw_data/games.data', 'w') as file:
   file.writelines([str(game) + '\n' for game in games])
   
with open('raw_data/team_players.data', 'w') as file:
    json.dump(team_players, file, indent=4)
   
with open('raw_data/player_ids.data', 'w') as file:
    json.dump(player_ids, file, indent=4)

with open('raw_data/player_data.data', 'w') as file:
    json.dump(player_data, file, indent=4)

### Read Data

In [2]:
# read in games, team_ids, and players from cached files
with open('raw_data/games.data', 'r') as file:
    games = [ast.literal_eval(line) for line in file]
   
with open('raw_data/team_players.data', 'r') as file:
    team_players = json.load(file)
   
with open('raw_data/player_ids.data', 'r') as file:
    player_ids = json.load(file)

with open('raw_data/player_data.data', 'r') as file:
    player_data = json.load(file)

### Create Dataset

In [4]:
# create a data set for the model
X = []
Y = []
feature_fields = ['avg', 'obp', 'slg', 'ops']
for game in games:
    # don't include tied games
    if game['winner'] == 0: continue

    print(game['home_id'], team_players.keys())
    # get the roster for the two playing teams
    home_roster = team_players[game['home_id']]
    away_roster = team_players[game['away_id']]
    
    # get the full player data for everyone on both teams
    home_players = [player_data[player_ids[player]] for player in home_roster if player_ids[player] in player_data]
    away_players = [player_data[player_ids[player]] for player in away_roster if player_ids[player] in player_data]

    # extract the features we care about into player vectors for both teams
    home_vectors = [[data[field] for field in feature_fields] for data in home_players if data['gamesPlayed'] > 100]
    away_vectors = [[data[field] for field in feature_fields] for data in home_players if data['gamesPlayed'] > 100]

    # remove ones with all zeros
    home_vectors = [vec for vec in home_vectors if any(e != 0 for e in vec)]
    away_vectors = [vec for vec in away_vectors if any(e != 0 for e in vec)]

    if len(home_vectors) == 0 or len(away_vectors) == 0:
        raise Exception

    # TODO: augment data with permutations and wrap all in tf/keras dataset
    for _ in range((NUM_PERM := 10)):
        print(len(home_vectors))
        break

146 dict_keys(['135', '145', '120', '117', '115', '113', '134', '141', '146', '109', '136', '140', '110', '111', '133', '118', '121', '137', '158', '119', '114', '139', '138', '142', '108', '112', '116', '147', '144', '143'])
26


NameError: name 'torch' is not defined

### Model Definition
Model & problem folumation relies heavily on the [Deepset](https://arxiv.org/pdf/1703.06114.pdf#page=9&zoom=100,150,325) paper

**Problem**
- We want to compare two teams based on the performance of their players
- Players are encoded as an $R^n$ feature vector
- We are given two teams, each a Set $T$ of players $(T \subset 2^{R^n})$, and one of $\{1, 0, -1\}$ depending on the outcome of a game

**Model**
- I train a model to "encode" any given team into a single real value. This "score" value can be used to compare teams directly.
- This model has two parts
    - $ \phi : R^N \to R^M$ takes a player ($x \in R^N$) and returns an "embedding" ($\phi(x) \in R^M$) of that player
    - $ \rho : 2^{R^M} \to R$ takes a set of embedded players $(\{\phi(x_i)\} \in 2^{R^M})$ and returns the team's score $(\rho(\sum^{|T|}_i\phi({x_i})) \in R)$
- To use this model, we apply $\rho$ to both teams. If the first team's score is higher, we return 1. If not we return -1. (I filtered out the ties. I'll figure those out later)

In [None]:
# get feature dim and choose embedding dim
N = len(feature_fields)
M = N

# function to spawn a keras implementation of the phi approximator
def PHI():
    return tf.keras.Sequential([
                tf.keras.layers.Dense(N*4, activation='relu'),
                tf.keras.layers.Dense(N*4, activation='relu'),
                tf.keras.layers.Dense(M)
            ])

# function to spawn a keras implementation of the rho approximator
def RHO():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(M*4, activation='relu')
        tf.keras.layers.Dense(M*4, activation='relu')
        tf.keras.layers.Dense(1, activation='tanh')
    ])

In [6]:
# \/ \/ \/ \/ \/ \/ \/ TO DELETE \/ \/ \/ \/ \/ \/ \/ 

In [None]:
# sub-model: encodes a team (list of feature vectors) into a single vector
PLAYER_FEATS = len(feature_fields)
LSTM_OUT_FEATS = 4
class TeamEncoder(nn.Module):
    def __init__(self):
        super(TeamEncoder, self).__init__()
        self.lstm = nn.LSTM(input_size=PLAYER_FEATS,
                            hidden_size=LSTM_OUT_FEATS,
                            num_layers=1,
                            batch_first=True)

    def forward(self, x):
        return self.lstm(x)

# sub-model: estimates winner given a "game vector"
class GamePredictor(nn.Module):
    def __init__(self):
        super(GamePredictor, self).__init__()
        self.l1 = nn.Linear(LSTM_OUT_FEATS, 4)
        self.l2 = nn.Linear(4, 4)
        self.l3 = nn.Linear(4, 1)

    def forward(self, x):
        x = torch.tanh(self.l1(x))
        x = torch.tanh(self.l2(x))
        x = torch.sigmoid(self.l3(x))
        return x

In [None]:
# create instances of the two sub-models
team_encoder = TeamEncoder().double()
game_predictor = GamePredictor().double()

# create loss and optimizer
loss_func = nn.BCELoss()
team_optim = optim.SGD(team_encoder.parameters(), lr=0.01)
game_optim = optim.SGD(game_predictor.parameters(), lr=0.01)

# train the TeamEncoder and GamePredictor models
for epoch in range(300):
    # zero the grads to start each epoch
    team_optim.zero_grad()
    game_optim.zero_grad()
    
    # loop through each of the games
    avg_loss = 0
    for (home_vec, away_vec), winner in zip(X, Y):
        # encode the two teams using the TeamEncoder model
        _, (home_enc, _) = team_encoder(home_vec)
        _, (away_enc, _) = team_encoder(away_vec)

        # concatenate the two team encodings and pass it through the game predictor
        # game_enc = torch.cat((home_enc, away_enc), dim=2)
        game_enc = home_enc - away_enc
        pred = game_predictor(game_enc).reshape((1,1))
        
        # calculate loss for this game and back propigate
        loss = loss_func(pred, winner)
        loss.backward()
        avg_loss += loss.item() / len(X)

        #if epoch == 0:
        #    print(game_enc)
        #    print(pred)
        #    print('---------------------------------------')

    # step our two optimizers
    team_optim.step()
    game_optim.step()
    if epoch % 10 == 0:
        print(avg_loss)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x4 and 8x4)