In [28]:
import pandas as pd
import numpy as np
import sys
import os

import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics

import torch
from torch import optim
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm as tqdm

import tensorflow as tf

In [2]:
pd.set_option('max_columns', 500)

# Task 1

I use the PyTorch framework to implement my 2-layer Neural Network. I used the pytorch documentation available at https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html as reference when building my model. PyTorch makes it easier to build a model by subclassing the nn.Module class. You use the custom class to implement forward propagation and to train the model.

# Task 2

## 1. Exploratory Data Analysis
### Load data files and combine into 1 DataFrame

In [3]:
fds = [x[0] for x in os.walk('archive')][1:]

In [4]:
dfs = []
for fd in fds:
    dfs.append(pd.read_csv(f"{fd}\\vegas.txt"))
odds_df = pd.concat(dfs, ignore_index=True)

### Drop unnessecary columns

In [5]:
odds_df.drop(columns=['PercentBet_OU', 'Open_Line_OU', 'Open_Odds_OU', 'Pinnacle_Line_OU',
       'Pinnacle_Odds_OU', '5dimes_Line_OU', '5dimes_Odds_OU',
       'Heritage_Line_OU', 'Heritage_Odds_OU', 'Bovada_Line_OU',
       'Bovada_Odds_OU', 'Betonline_Line_OU', 'Betonline_Odds_OU', 'Average_Odds_OU', 'Best_Line_OU', 'Worst_Line_OU',
       'Best_Odds_OU', 'Worst_Odds_OU',
                     'Open_Line_Spread', 'Open_Odds_Spread', 'Pinnacle_Line_Spread',
       'Pinnacle_Odds_Spread', '5dimes_Line_Spread', '5dimes_Odds_Spread',
       'Heritage_Line_Spread', 'Heritage_Odds_Spread', 'Bovada_Line_Spread',
       'Bovada_Odds_Spread', 'Betonline_Line_Spread', 'Betonline_Odds_Spread'], inplace=True)

### Drop insufficient rows

In [6]:
odds_df = odds_df[(~odds_df['Average_Line_ML'].isna()) * (odds_df['Average_Line_ML']!=0)]

### Replace n/a columns with average of other moneylines.

In [7]:
ml = ['Open_Line_ML', 'Pinnacle_ML', '5dimes_ML',
       'Heritage_ML', 'Bovada_ML', 'Betonline_ML']
odds_df.loc[:, ml] = odds_df[ml].where(~odds_df[ml].isna(), odds_df['Average_Line_ML'], axis=0)

### Feature Engineering

In [8]:
#Convert MoneyLines to Fractional Odds.
#This makes the range for these features (0,inf) as opposed to (-inf,-100] U [100,inf)

lines = ['Open_Line_ML', 'Pinnacle_ML', '5dimes_ML',
       'Heritage_ML', 'Bovada_ML', 'Betonline_ML', 'Average_Odds_Spread',
        'Best_Odds_Spread', 'Worst_Odds_Spread', 'Average_Line_ML','Best_Line_ML','Worst_Line_ML' ]
cond = odds_df[lines] > 0
pos = (odds_df[lines]/100)
neg = (-100/odds_df[lines])

odds_df.loc[:,lines] = pos.where(cond, other=neg)

In [9]:
cond2 = odds_df['Result'] == 'W'
odds_df.loc[cond2,'Result'] = 1
odds_df.loc[~cond2,'Result'] = 0

In [10]:
odds_df['Location'].replace(to_replace='home',value=1.0, inplace=True)
odds_df['Location'].replace(to_replace='away',value=-1.0,inplace=True)

In [11]:
scaled_features = ['PercentBet_ML', 'Open_Line_ML', 'Pinnacle_ML', '5dimes_ML',
       'Heritage_ML', 'Bovada_ML', 'Betonline_ML', 'PercentBet_Spread',
       'Average_Line_Spread', 'Average_Odds_Spread', 'Best_Line_Spread',
       'Worst_Line_Spread', 'Best_Odds_Spread', 'Worst_Odds_Spread', 
       'Average_Line_OU']
atributes = ['Pts', 'Spread', 'Result']

In [12]:
scaler = preprocessing.StandardScaler()
odds_df.loc[:,scaled_features] = scaler.fit_transform(odds_df[scaled_features])

### See Data

In [13]:
data = odds_df
data

Unnamed: 0,Date,Location,Team,OppTeam,TeamId,GameId,PercentBet_ML,Open_Line_ML,Pinnacle_ML,5dimes_ML,Heritage_ML,Bovada_ML,Betonline_ML,Average_Line_ML,Best_Line_ML,Worst_Line_ML,PercentBet_Spread,Average_Line_Spread,Average_Odds_Spread,Best_Line_Spread,Worst_Line_Spread,Best_Odds_Spread,Worst_Odds_Spread,Average_Line_OU,Pts,Spread,Result,Total
0,2012-10-30,-1.0,Washington,Cleveland,1610612764,21200001,-0.215798,0.210024,0.121038,0.086718,0.149203,0.037396,0.143883,2.090000,2.150000,2.000000,-0.325510,0.814890,0.046142,0.729006,0.813276,-0.761344,0.768062,-1.340814,84.0,-10.0,0,178.0
1,2012-10-30,1.0,Cleveland,Washington,1610612739,21200001,0.247130,-0.532391,-0.508086,-0.520333,-0.532197,-0.268073,-0.517031,0.406504,0.425532,0.392157,0.342055,-0.814900,-1.188992,-0.813198,-0.729014,-0.761344,-0.868950,-1.340814,94.0,10.0,1,178.0
2,2012-10-30,-1.0,Boston,Miami,1610612738,21200002,-0.207226,0.210024,0.248782,0.240078,0.229367,0.054774,0.241076,2.348000,2.440000,2.250000,-0.099015,0.919722,-0.223379,0.939307,0.953484,1.280837,-0.868950,-1.649198,107.0,-13.0,0,227.0
3,2012-10-30,1.0,Miami,Boston,1610612748,21200002,0.238557,-0.532391,-0.531342,-0.532622,-0.544073,-0.270645,-0.533693,0.370370,0.384615,0.357143,0.115560,-0.919732,-0.045775,-0.953398,-0.939327,-0.761344,0.768062,-1.649198,120.0,13.0,1,227.0
4,2012-10-30,-1.0,Dallas,L.A. Lakers,1610612742,21200003,-0.378680,0.741670,0.808599,0.930199,0.910767,0.315436,0.824235,3.946000,4.200000,3.750000,-0.051331,1.223037,0.499098,1.219707,1.304005,0.239919,0.768062,-1.696895,99.0,8.0,1,190.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17203,2019-04-10,1.0,Denver,Minnesota,1610612743,21801228,-0.190080,-0.644243,-0.620646,-0.633891,-0.648958,-0.318843,-0.629344,0.112511,0.125945,0.100000,0.580471,-1.770969,0.225396,-1.794601,-1.710472,-0.761344,0.768062,0.969188,99.0,4.0,1,194.0
17204,2019-04-10,-1.0,Utah,L.A. Clippers,1610612762,21801229,0.512884,0.124143,0.267567,0.374268,0.389697,0.132972,0.299392,2.628000,2.750000,2.490000,-0.605648,1.024555,-0.314451,1.079507,0.673068,0.239919,-0.868950,1.551417,137.0,-6.0,0,280.0
17205,2019-04-10,1.0,L.A. Clippers,Utah,1610612746,21801229,-0.481553,-0.519284,-0.534259,-0.554378,-0.571065,-0.285110,-0.542950,0.321337,0.355872,0.294118,0.622193,-1.024564,-1.105846,-0.672998,-1.079535,-0.761344,-5.743062,1.551417,143.0,6.0,1,280.0
17206,2019-04-10,-1.0,Sacramento,Portland,1610612758,21801230,0.362862,0.606714,-0.160749,-0.162493,-0.171456,-0.110313,-0.186574,1.310000,1.350000,1.250000,0.502985,0.512974,0.318169,0.869206,0.532860,-0.761344,0.768062,1.674771,131.0,-5.0,0,267.0


## 2.2 Perform a train-dev-test split.

In [14]:
n,_ = data.shape

train_dev_df, test_df = train_test_split(data, test_size=.1, random_state=4000)

train_df, dev_df = train_test_split(train_dev_df, test_size=.1, random_state=4000)

### 2.3 Implement the 2-layer Nerual Network

In [15]:
class basicMLP(nn.Module):
    
    def __init__(self):
        super(basicMLP, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            #feature layer
            nn.Linear(16,100),
            nn.Tanh(),
            #hidden layers
            
            nn.Linear(100,100),    
            nn.Tanh(), 
            
            nn.Linear(100,100),    
            nn.Tanh(), 
            
            #output layer
            nn.Linear(100,2)
            
        )
        
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [16]:
class Wrapper(nn.Module):
    
    def __init__(self, model):
        super(Wrapper, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            model,
            nn.Softmax(1)
        )
        
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [17]:
class OddsData(Dataset):
    def __init__(self, df):
        features = ['Location','PercentBet_ML', 'Open_Line_ML', 'Pinnacle_ML', '5dimes_ML',
       'Heritage_ML', 'Bovada_ML', 'Betonline_ML', 'PercentBet_Spread',
       'Average_Line_Spread', 'Average_Odds_Spread', 'Best_Line_Spread',
       'Worst_Line_Spread', 'Best_Odds_Spread', 'Worst_Odds_Spread', 
       'Average_Line_OU']
        self.data_df = df.reset_index()
        
        x = df.loc[:,features].values
        y = df.loc[:,'Result'].values.astype(int)
        
        self.X = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=int)
        
        #self.y = self.y.type(torch.LongTensor)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return (self.X[idx] ,self.y[idx])

In [18]:
train_data = OddsData(train_df)
dev_data   = OddsData(dev_df)
test_data  = OddsData(test_df)

In [19]:
bs = 64
train_loader = DataLoader(train_data, batch_size=bs, shuffle=True, num_workers=0)
dev_loader = DataLoader(dev_data, batch_size=bs, shuffle=True, num_workers=0)
test_loader = DataLoader(test_data, batch_size=bs, shuffle=True, num_workers=0)

In [20]:
labels = {
    0: 'L',
    1: 'W'
}
device = 'cpu'

In [21]:
def train_epoch(model, criterion, optimizer, trainloader):
   
    size = len(trainloader.dataset)
    
    for batchnum, (X,y) in enumerate(trainloader):
        model.train()
        
        yhat = model(X)
        loss = criterion(yhat, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [22]:
def validate(model, criterion, valloader):
    model.eval()
    
    size = len(valloader.dataset)
    num_batches = len(valloader)
    test_loss, correct = 0,0
    
    with torch.no_grad():
        for X, y in valloader:
            yhat = model(X)
            test_loss += criterion(yhat, y).item()
            correct += ( yhat.argmax(1) == y).type(torch.float).sum().item()
            
    loss = test_loss / num_batches
    accuracy = correct / size
    
    return (loss, accuracy)

In [None]:
model = basicMLP().to('cpu')

In [25]:
optimizer = optim.Adagrad(model.parameters(), lr=.0001, weight_decay=.001)

crit = nn.CrossEntropyLoss()

In [23]:
model = torch.load('model')

In [29]:
epochs = 10000
error = np.empty((epochs,2))

In [30]:
for epoch in tqdm(range(epochs)):
    
    train_epoch(model, crit, optimizer, train_loader)
    
    loss, accuracy = validate(model, crit, test_loader )
    if epoch % 100 == 0:
        print(f"Epoch {epoch+1}\n-------------------------------")
        print(f"Test Error: \n Accuracy: {(accuracy*100):>0.1f}%, Avg loss: {loss:>8f} \n")
    error[epoch] = [accuracy,loss]

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 1
-------------------------------
Test Error: 
 Accuracy: 68.0%, Avg loss: 0.622624 

Epoch 101
-------------------------------
Test Error: 
 Accuracy: 69.5%, Avg loss: 0.580877 

Epoch 201
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.580266 

Epoch 301
-------------------------------
Test Error: 
 Accuracy: 69.5%, Avg loss: 0.579927 

Epoch 401
-------------------------------
Test Error: 
 Accuracy: 69.4%, Avg loss: 0.579629 

Epoch 501
-------------------------------
Test Error: 
 Accuracy: 69.3%, Avg loss: 0.579942 

Epoch 601
-------------------------------
Test Error: 
 Accuracy: 69.3%, Avg loss: 0.580009 

Epoch 701
-------------------------------
Test Error: 
 Accuracy: 69.3%, Avg loss: 0.579926 

Epoch 801
-------------------------------
Test Error: 
 Accuracy: 69.3%, Avg loss: 0.579517 

Epoch 901
-------------------------------
Test Error: 
 Accuracy: 69.3%, Avg loss: 0.579674 

Epoch 1001
-------------------------------
Test Error: 
 Accur

Epoch 8701
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578422 

Epoch 8801
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578407 

Epoch 8901
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578989 

Epoch 9001
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578714 

Epoch 9101
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578445 

Epoch 9201
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578790 

Epoch 9301
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578487 

Epoch 9401
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578583 

Epoch 9501
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578217 

Epoch 9601
-------------------------------
Test Error: 
 Accuracy: 69.6%, Avg loss: 0.578324 

Epoch 9701
-------------------------------
Test Er

In [27]:
validate(model, crit, train_loader), validate(model, crit, dev_loader), validate(model, crit, test_loader)

((0.5869111595077252, 0.6880562733275911),
 (0.6051433753967285, 0.6623628147191737),
 (0.5786497052069064, 0.6955258570598489))

In [74]:
wrap = Wrapper(model)

In [46]:
winnings, bet, _ = evaluate(wrap,dev_data, min_dif=.047, standardize=False)
winnings.sum(), bet.sum(), winnings.sum()/ bet.sum()

(36.78974609569066, 39.0, 0.9433268229664272)

## Evaluation

The basic accuracy here is not that relevant to us. What we care about is how the model performs against the odds. For this purpose we use our model to select which bets are 'good'. I.e. the model predicts a higher probability of success than the implied odds. Hence a positive EV proposition. Because of the Vig, a model the perfectly mimicks the books implied odds will lose ~%5.

In [51]:
def implied_probability(frac):
    return 1/(1+frac)
def to_frac(lines):
    cond = odds_df[lines] > 0
    pos = (odds_df[lines]/100)
    neg = (-100/odds_df[lines])

    return pos.where(cond, other=neg)

In [56]:
def evaluate(model, data, min_dif=0, standardize=False):
    imp_prob = implied_probability(data.data_df['Best_Line_ML'])
    yhat_prob = model(data.X)[:,1]
    if type(yhat_prob) is torch.Tensor:
        yhat_prob = yhat_prob.detach().numpy() #predicted probability of Win
    
    dif = yhat_prob - imp_prob
    good_bets_idx = dif[dif > min_dif].index
    n = len(good_bets_idx)
    if standardize:
        bet = 1/(data.data_df.loc[good_bets_idx]['Best_Line_ML']+1)
    else:
        bet = np.ones(n)
    winnings = (data.data_df.loc[good_bets_idx]['Best_Line_ML']+1) * bet * data.y[good_bets_idx].detach().numpy()
    
    return (winnings, bet,good_bets_idx)

In [90]:
winnings, bet, gbi = evaluate(wrap, test_data, min_dif=0.0, standardize=False)
winnings.sum(), sum(bet), (winnings.sum()) / sum(bet)

(591.7779334027127, 632.0, 0.9363574895612543)

In [92]:
winnings, bet, gbi = evaluate(wrap, test_data, min_dif=0.03, standardize=True)
winnings.sum(), sum(bet), (winnings.sum()) / sum(bet)

(69.0, 68.94594457911502, 1.0007840261122676)

In [48]:
torch.save(model, "model")

# Task 3:

The number of nodes in the hidden layers being set at 100 is faily arbitrary. The use of Softmax activation function in the final layer of the Wrapper is because we want to output probabilities. I did use regulariation because the first couple times I trained the model the training accuracy was much higher than the test accuracy, hence it was overfitting. I used Adagrad optimization.

# Task 4:

In [38]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(train_data.X, train_data.y)
yhat = lr_model.predict(test_data.X)

In [39]:
metrics.accuracy_score(test_data.y, yhat)

0.6984311446833237

In [68]:
winnings, bet, gbi = evaluate(lr_model.predict_proba, test_data, min_dif=0.0, standardize=False)
winnings.sum(), sum(bet), (winnings.sum()) / sum(bet)

(619.7721722836905, 631.0, 0.9822062952197947)

In [94]:
winnings, bet, gbi = evaluate(lr_model.predict_proba, test_data, min_dif=0.03, standardize=True)
winnings.sum(), sum(bet), (winnings.sum()) / sum(bet)

(35.0, 35.34780093559485, 0.9901606061370394)

In [48]:
book_yhat = np.where(test_data.data_df['Best_Line_ML'] < 1,1,0)

In [49]:
metrics.accuracy_score(test_data.y, book_yhat)

0.6995932597327136

We compare the model to a baseline LogisticRegression model. The LR model performs better or comparable to the the neural network. This is somewhat surpising to me as the NN does not appear to be overfitting. What it really goes to show is that you really can't beat the book.