In [85]:
import pandas as pd
import dask.dataframe as dd
import pyarrow as pa
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import torch
import torch.nn.functional as F
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [60]:
df = pd.read_csv('/Users/stefanfeiler/Desktop/result.csv')

In [17]:
# We need to seperate 80% of games and 20% of games for test and train splits and then confirm their counts are proportional to those values 

# Step 1: Get a list of unique groups
groups = list(df.groupby('game_id').groups.keys())

# Step 2: Shuffle the list of unique groups randomly
random.shuffle(groups)

# Step 3: Calculate the number of groups for the 20% DataFrame and the 80% DataFrame
n_groups_20_percent = int(len(groups) * 0.2)
n_groups_80_percent = len(groups) - n_groups_20_percent

# Step 4: Use the loc accessor to select rows for each group and add them to the appropriate DataFrame
df_20_percent = pd.DataFrame()
df_80_percent = pd.DataFrame()

for i, group in enumerate(groups):
    if i < n_groups_20_percent:
        df_20_percent = pd.concat([df_20_percent, df.loc[df['game_id'] == group]])
    else:
        df_80_percent = pd.concat([df_80_percent, df.loc[df['game_id'] == group]])

# Optional: Reset the index of the resulting DataFrames
df_20_percent = df_20_percent.reset_index(drop=True)
df_80_percent = df_80_percent.reset_index(drop=True)

In [18]:
df.columns

Index(['team_1', 'barstool_team_1_prob', 'betfair_team_1_prob',
       'betmgm_team_1_prob', 'betonlineag_team_1_prob',
       'betrivers_team_1_prob', 'bovada_team_1_prob',
       'circasports_team_1_prob', 'draftkings_team_1_prob',
       'fanduel_team_1_prob', 'foxbet_team_1_prob', 'gtbets_team_1_prob',
       'pinnacle_team_1_prob', 'pointsbetus_team_1_prob',
       'sugarhouse_team_1_prob', 'twinspires_team_1_prob',
       'unibet_team_1_prob', 'williamhillus_team_1_prob',
       'wynnbet_team_1_prob', 'game_id', 'winning_team',
       'minutes_since_commence', 'snapshot_time_taken', 'hour_of_start',
       'day_of_week', 'barstool_last_update_time', 'betfair_last_update_time',
       'betmgm_last_update_time', 'betonlineag_last_update_time',
       'betrivers_last_update_time', 'bovada_last_update_time',
       'circasports_last_update_time', 'draftkings_last_update_time',
       'fanduel_last_update_time', 'foxbet_last_update_time',
       'gtbets_last_update_time', 'pinnacle_la

In [61]:
def add_category(column_name, data):
    
    column_index = data.columns.tolist().index(column_name)
    
    arr = data[column_name].values.reshape(-1,1)
    
    coder = OneHotEncoder(sparse_output=False)
    
    onehots = coder.fit_transform(arr)
    
    print(onehots.shape)
    
    return onehots

In [62]:
def add_numeric(column_name, data):
    
    column_index = data.columns.tolist().index(column_name)
    
    arr = data[column_name].values.reshape(-1,1).astype('float')
    
    return arr

In [64]:
data = df
full_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data),
    ],
    1
)

(87562, 30)
(87562, 9)
(87562, 7)


In [65]:
data = df_80_percent
train_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data),
    ],
    1
)

(71618, 30)
(71618, 9)
(71618, 7)


In [166]:
data = df_20_percent
test_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data),
    ],
    1
)

(15944, 30)
(15944, 9)
(15944, 7)


In [167]:
print(full_data.shape)

(87562, 65)


In [168]:
# Define the indices of the columns you want to standardize
continuous_vars_full = full_data[:, :19]
continuous_vars_test = test_data[:, :19]
continuous_vars_train = train_data[:, :19]

In [169]:
# Create an instance of StandardScaler and fit it on the training data
scaler = StandardScaler()
scaler.fit(continuous_vars_full)

In [170]:
# Standardize the columns of the training data
X_train = np.hstack((scaler.transform(continuous_vars_train), train_data[:, 19:]))
X_test = np.hstack((scaler.transform(continuous_vars_test), test_data[:, 19:]))

In [184]:
# Make our y var
y_train = df_80_percent['target'].values
y_test = df_20_percent['target'].values

In [245]:
# Making Torch datasets with our splits
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test).float(), torch.tensor(y_test).float())

In [246]:
# Set up loaders for each of our datasets 
loader = torch.utils.data.DataLoader(train_data, batch_size = 64, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = 64, shuffle = True)

In [273]:
# Set up the layers and activation functions of our model
model = torch.nn.Sequential(   
    torch.nn.Linear(65,800),
    torch.nn.SiLU(),
    torch.nn.Linear(800,64),
    torch.nn.SiLU(),
    torch.nn.Linear(64,64),
    torch.nn.SiLU(),
    torch.nn.Linear(64,64),
    torch.nn.SiLU(),
    torch.nn.Linear(64,64),
    torch.nn.SiLU(),
    torch.nn.Linear(64,64),
    torch.nn.SiLU(),
    torch.nn.Linear(64,16),
    torch.nn.SiLU(),
    torch.nn.Linear(16,1),
    torch.nn.Sigmoid()
)

In [274]:
# Defines our scoring function
def scoring_function(pred, label):
    return F.binary_cross_entropy(pred, label)

# Defines number of epochs we want to train through
num_epochs = 100

# Defines our optimizer and the learning rate 
optimizer = torch.optim.Adam( model.parameters(), .001 )

In [275]:
def train():  
    
    # Initializes a list that will contain our batch losses for an individual epoch
    epoch_losses = []
    
    # Defines how we want to step through each batch in the epoch
    for batch in loader:
        
        # Resets the grdient to zero
        optimizer.zero_grad()

        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)
        #batchY = batchY.unsqueeze(1)

        # Forward pass
        y_pred = model.forward(batchX)
        
        print(y_pred)
        #print(batchY[0])
        
        batchY = batchY.reshape(-1, 1)
        
        batchY.reshape(-1)
        
        # Compute the loss
        loss = scoring_function(y_pred, batchY)
        
        # Store the loss for this batch in the list
        epoch_losses.append(loss.detach().clone())

        # Compute the gradient of the error with respect to the model parameters
        loss.mean().backward()

        # update the model parameters
        optimizer.step()

    all_epoch_loss = torch.tensor(epoch_losses)
    return all_epoch_loss

In [None]:
e = 1
epoch_index_list = []
accuracy_list = []
# Calls the train function for each of our epochs, prints the running results
for epoch in range(num_epochs):
    ep_result = train()
    #accuracy = test_model()
    #accuracy_list.append(accuracy)
    epoch_index_list.append(e)
    e+=1
    print('Epoch {}, Average Error: {}'.format(epoch, ep_result.mean()))

  batchX = torch.tensor(batch[0], dtype=torch.float32)
  batchY = torch.tensor(batch[1], dtype=torch.float32)


tensor([[0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5114],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0.5113],
        [0