In [277]:
import pandas as pd
import dask.dataframe as dd
import pyarrow as pa
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import torch
import torch.nn.functional as F
from torch import nn

In [324]:
df = pd.read_csv('/Users/stefanfeiler/Desktop/result.csv')

In [325]:
# We need to seperate 80% of games and 20% of games for test and train splits and then confirm their counts are proportional to those values 

# Step 1: Get a list of unique groups
groups = list(df.groupby('game_id').groups.keys())

# Step 2: Shuffle the list of unique groups randomly
random.shuffle(groups)

# Step 3: Calculate the number of groups for the 20% DataFrame and the 80% DataFrame
n_groups_20_percent = int(len(groups) * 0.2)
n_groups_80_percent = len(groups) - n_groups_20_percent

# Step 4: Use the loc accessor to select rows for each group and add them to the appropriate DataFrame
df_20_percent = pd.DataFrame()
df_80_percent = pd.DataFrame()

for i, group in enumerate(groups):
    if i < n_groups_20_percent:
        df_20_percent = pd.concat([df_20_percent, df.loc[df['game_id'] == group]])
    else:
        df_80_percent = pd.concat([df_80_percent, df.loc[df['game_id'] == group]])

# Optional: Reset the index of the resulting DataFrames
df_20_percent = df_20_percent.reset_index(drop=True)
df_80_percent = df_80_percent.reset_index(drop=True)

In [326]:
df.columns

Index(['team_1', 'barstool_team_1_prob', 'betfair_team_1_prob',
       'betmgm_team_1_prob', 'betonlineag_team_1_prob',
       'betrivers_team_1_prob', 'bovada_team_1_prob',
       'circasports_team_1_prob', 'draftkings_team_1_prob',
       'fanduel_team_1_prob', 'foxbet_team_1_prob', 'gtbets_team_1_prob',
       'pinnacle_team_1_prob', 'pointsbetus_team_1_prob',
       'sugarhouse_team_1_prob', 'twinspires_team_1_prob',
       'unibet_team_1_prob', 'williamhillus_team_1_prob',
       'wynnbet_team_1_prob', 'game_id', 'winning_team',
       'minutes_since_commence', 'snapshot_time_taken', 'hour_of_start',
       'day_of_week', 'barstool_last_update_time', 'betfair_last_update_time',
       'betmgm_last_update_time', 'betonlineag_last_update_time',
       'betrivers_last_update_time', 'bovada_last_update_time',
       'circasports_last_update_time', 'draftkings_last_update_time',
       'fanduel_last_update_time', 'foxbet_last_update_time',
       'gtbets_last_update_time', 'pinnacle_la

In [327]:
def add_category(column_name, data):
    
    column_index = data.columns.tolist().index(column_name)
    
    arr = data[column_name].values.reshape(-1,1)
    
    coder = OneHotEncoder(sparse_output=False)
    
    onehots = coder.fit_transform(arr)
    
    print(onehots.shape)
    
    return onehots

In [328]:
def add_numeric(column_name, data):
    
    column_index = data.columns.tolist().index(column_name)
    
    arr = data[column_name].values.reshape(-1,1).astype('float')
    
    return arr

In [329]:
data = df
full_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data),
    ],
    1
)

(87562, 30)
(87562, 9)
(87562, 7)


In [330]:
data = df_80_percent
train_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data),
    ],
    1
)

(64464, 30)
(64464, 9)
(64464, 7)


In [331]:
data = df_20_percent
test_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data),
    ],
    1
)

(23098, 30)
(23098, 9)
(23098, 7)


In [332]:
print(full_data.shape)

(87562, 65)


In [336]:
# Define the indices of the columns you want to standardize
continuous_vars_full = full_data[:, :19]
continuous_vars_test = test_data[:, :19]
continuous_vars_train = train_data[:, :19]

(64464, 46)

In [288]:
# Create an instance of StandardScaler and fit it on the training data
scaler = StandardScaler()
scaler.fit(continuous_vars_full)

In [337]:
# Standardize the columns of the training data
X_train = np.hstack((scaler.transform(continuous_vars_train), train_data[:, 19:-1]))
X_test = np.hstack((scaler.transform(continuous_vars_test), test_data[:, 19:-1]))

In [338]:
X_train[0].shape

(64,)

In [339]:
# Make our y var
y_train = df_80_percent['target'].values
y_test = df_20_percent['target'].values

In [340]:
# Making Torch datasets with our splits
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test).float(), torch.tensor(y_test).float())

In [343]:
# Set up loaders for each of our datasets 
loader = torch.utils.data.DataLoader(train_data, batch_size = 64, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = 64, shuffle = True)

In [402]:
# Set up the layers and activation functions of our model
model = torch.nn.Sequential(   
    torch.nn.Linear(64,800),
    torch.nn.ReLU(),
    torch.nn.Linear(800,64),
    torch.nn.ReLU(),
    torch.nn.Linear(64,64),
    torch.nn.ReLU(),
    torch.nn.Linear(64,64),
    torch.nn.ReLU(),
    torch.nn.Linear(64,64),
    torch.nn.ReLU(),
    torch.nn.Linear(64,64),
    torch.nn.ReLU(),
    torch.nn.Linear(64,16),
    torch.nn.ReLU(),
    torch.nn.Linear(16,1),
    torch.nn.Sigmoid()
)

In [403]:
# Defines our scoring function
def scoring_function(pred, label):
    return F.binary_cross_entropy(pred, label)

# Defines number of epochs we want to train through
num_epochs = 10

# Defines our optimizer and the learning rate 
optimizer = torch.optim.Adam( model.parameters(), .001 )

In [404]:
def train():  
    
    # Initializes a list that will contain our batch losses for an individual epoch
    epoch_losses = []
    
    # Defines how we want to step through each batch in the epoch
    for batch in loader:
        
        # Resets the grdient to zero
        optimizer.zero_grad()

        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)
        #batchY = batchY.unsqueeze(1)

        # Forward pass
        y_pred = model.forward(batchX)
        
        #print(y_pred)
        #print(batchY[0])
        
        batchY = batchY.reshape(-1, 1)
        
        batchY.reshape(-1)
        
        # Compute the loss
        loss = scoring_function(y_pred, batchY)
        
        # Store the loss for this batch in the list
        epoch_losses.append(loss.detach().clone())

        # Compute the gradient of the error with respect to the model parameters
        loss.mean().backward()

        # update the model parameters
        optimizer.step()

    all_epoch_loss = torch.tensor(epoch_losses)
    return all_epoch_loss

In [405]:
e = 1
epoch_index_list = []
accuracy_list = []
# Calls the train function for each of our epochs, prints the running results
for epoch in range(num_epochs):
    ep_result = train()
    #accuracy = test_model()
    #accuracy_list.append(accuracy)
    epoch_index_list.append(e)
    e+=1
    print('Epoch {}, Average Error: {}'.format(epoch, ep_result.mean()))

  batchX = torch.tensor(batch[0], dtype=torch.float32)
  batchY = torch.tensor(batch[1], dtype=torch.float32)


Epoch 0, Average Error: 0.5270767211914062
Epoch 1, Average Error: 0.3349873423576355
Epoch 2, Average Error: 0.20958243310451508
Epoch 3, Average Error: 0.16215328872203827
Epoch 4, Average Error: 0.13778752088546753
Epoch 5, Average Error: 0.12171861529350281
Epoch 6, Average Error: 0.1116696372628212
Epoch 7, Average Error: 0.1025172770023346
Epoch 8, Average Error: 0.09650149196386337
Epoch 9, Average Error: 0.08949577808380127


In [406]:
# Defines a function such that each prediction is categorized into either favorable or unfavorable review
def transform_prediction_to_target_format(num):
    if num > 0.5:
        return 1
    else:
        return 0

In [439]:
# Initializes an empty list that will contain the modified predictions (favorable = 1, unfavorable = 0)
itemized_predictions = []
bs_probs = []



# Set the model to evaluation mode
model.eval()

# Iterate over the test set and collect the model's predictions
predictions = []
for batch in test_loader:
    
    original_data = scaler.inverse_transform(batch[0][:, :19])
    original_data = original_data[:, :18]
    
    
    mask = original_data > 0 #.00
    
    filtered_arr = original_data[mask]
    
    # Mask the zeros in each row
    #masked_arr = np.ma.masked_equal(original_data[:, :18], 0)
    
    #print(filtered_arr)

    # Calculate the mean of the non-zero values in each row
    row_means = np.ma.mean(filtered_arr)
    
    #print(row_means)
    
    #first_col = original_data[:, 0]
    bs_probs.append(row_means)
    
    # Get the raw outputs
    with torch.no_grad():
        outputs = model(batch[0])
    
    # Individually categorizes the predictions
    for output in outputs:
        #pred = torch.sigmoid(output)
        pred = output
        itemized_predictions.append(transform_prediction_to_target_format(pred))
        
# Calculate the accuracy on the test set
correct_predictions = 0
for i in range(len(itemized_predictions)):
    if itemized_predictions[i] == y_test[i]:
        correct_predictions += 1

accuracy = correct_predictions / len(y_test)
print(f"Accuracy: {accuracy}")
arr = np.array(bs_probs[:-1])
print(f"Average prob: {arr.mean()}")
q1 = np.percentile(arr, 25)
q2 = np.percentile(arr, 50)
q3 = np.percentile(arr, 75)
print(f"1st quartile: {q1}")
print(f"2nd quartile: {q2}")
print(f"3rd quartile: {q3}")

Accuracy: 0.5005628192917135
Average prob: 0.3840659548394302
1st quartile: 0.3629305176407024
2nd quartile: 0.3830253029831613
3rd quartile: 0.40457401653105124


In [432]:
ev = (160*.49) - (100 * (1-0.38412074891674536))

In [433]:
print(ev)

16.812074891674534
