In [1]:
import pandas as pd
import dask.dataframe as dd
import pyarrow as pa
import numpy as np
import random
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import torch
import torch.nn.functional as F
from torch import nn
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

In [164]:
# Read the raw data file
df = pd.read_parquet('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/mlb_raw_final_for_model.parquet')

In [165]:
#df = df.drop('home_away', axis=1)

In [166]:
# We need to seperate 80% of games and 20% of games for test and train splits and then confirm their counts are proportional to those values 

# Step 1: Get a list of unique groups
groups = list(df.groupby('game_id').groups.keys())

# Step 2: Shuffle the list of unique groups randomly
random.shuffle(groups)

# Step 3: Calculate the number of groups for the 20% DataFrame and the 80% DataFrame
n_groups_20_percent = int(len(groups) * 0.2)
n_groups_80_percent = len(groups) - n_groups_20_percent

# Step 4: Use the loc accessor to select rows for each group and add them to the appropriate DataFrame
df_20_percent = pd.DataFrame()
df_80_percent = pd.DataFrame()

for i, group in enumerate(groups):
    if i < n_groups_20_percent:
        df_20_percent = pd.concat([df_20_percent, df.loc[df['game_id'] == group]])
    else:
        df_80_percent = pd.concat([df_80_percent, df.loc[df['game_id'] == group]])

# Optional: Reset the index of the resulting DataFrames
df_20_percent = df_20_percent.reset_index(drop=True)
df_80_percent = df_80_percent.reset_index(drop=True)

In [167]:
df.columns

Index(['team_1', 'barstool_team_1_prob', 'betfair_team_1_prob',
       'betmgm_team_1_prob', 'betonlineag_team_1_prob',
       'betrivers_team_1_prob', 'bovada_team_1_prob',
       'circasports_team_1_prob', 'draftkings_team_1_prob',
       'fanduel_team_1_prob', 'foxbet_team_1_prob', 'gtbets_team_1_prob',
       'pinnacle_team_1_prob', 'pointsbetus_team_1_prob',
       'sugarhouse_team_1_prob', 'twinspires_team_1_prob',
       'unibet_team_1_prob', 'williamhillus_team_1_prob',
       'wynnbet_team_1_prob', 'game_id', 'winning_team',
       'minutes_since_commence', 'snapshot_time_taken', 'hour_of_start',
       'day_of_week', 'home_away', 'barstool_last_update_time',
       'betfair_last_update_time', 'betmgm_last_update_time',
       'betonlineag_last_update_time', 'betrivers_last_update_time',
       'bovada_last_update_time', 'circasports_last_update_time',
       'draftkings_last_update_time', 'fanduel_last_update_time',
       'foxbet_last_update_time', 'gtbets_last_update_time',

In [168]:
def add_category(column_name, data):
    
    column_index = data.columns.tolist().index(column_name)
    
    arr = data[column_name].values.reshape(-1,1)
    
    coder = OneHotEncoder(sparse_output=False)
    
    onehots = coder.fit_transform(arr)
    
    print(onehots.shape)
    
    return onehots

In [169]:
def add_numeric(column_name, data):
    
    # Get the index from the full df of the input column name
    column_index = data.columns.tolist().index(column_name)
    
    # Make an array of the column values 
    arr = data[column_name].values.reshape(-1,1).astype('float')
    
    return arr

In [170]:
data = df
full_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_numeric('home_away', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data)
    ],
    1
)

(87562, 30)
(87562, 9)
(87562, 7)


In [171]:
data = df_80_percent
train_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_numeric('home_away', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data)
    ],
    1
)

(71600, 30)
(71600, 9)
(71600, 7)


In [172]:
data = df_20_percent
test_data = np.concatenate(
    [
        add_numeric('barstool_team_1_prob', data),
        add_numeric('betfair_team_1_prob', data),
        add_numeric('betmgm_team_1_prob', data),
        add_numeric('betonlineag_team_1_prob', data),
        add_numeric('betrivers_team_1_prob', data),
        add_numeric('bovada_team_1_prob', data),
        add_numeric('circasports_team_1_prob', data),
        add_numeric('draftkings_team_1_prob', data),
        add_numeric('fanduel_team_1_prob', data),
        add_numeric('foxbet_team_1_prob', data),
        add_numeric('gtbets_team_1_prob', data),
        add_numeric('pinnacle_team_1_prob', data),
        add_numeric('pointsbetus_team_1_prob', data),
        add_numeric('sugarhouse_team_1_prob', data),
        add_numeric('twinspires_team_1_prob', data),
        add_numeric('unibet_team_1_prob', data),
        add_numeric('williamhillus_team_1_prob', data),
        add_numeric('wynnbet_team_1_prob', data),
        add_numeric('minutes_since_commence', data),
        add_numeric('home_away', data),
        add_category('team_1', data),
        add_category('hour_of_start', data),
        add_category('day_of_week', data)
    ],
    1
)

(15962, 30)
(15962, 9)
(15962, 7)


In [173]:
print(full_data.shape)
print(train_data.shape)
print(test_data.shape)

(87562, 66)
(71600, 66)
(15962, 66)


In [174]:
# Define the indices of the columns you want to standardize
continuous_vars_full = full_data[:, :19]
continuous_vars_test = test_data[:, :19]
continuous_vars_train = train_data[:, :19]

In [175]:
# Create an instance of StandardScaler and fit it on the training data
scaler = StandardScaler()
scaler.fit(continuous_vars_full)

In [176]:
# Standardize the columns of the training data
X_train = np.hstack((scaler.transform(continuous_vars_train), train_data[:, 19:-1]))
X_test = np.hstack((scaler.transform(continuous_vars_test), test_data[:, 19:-1]))

In [177]:
X_train[0].shape

(65,)

In [178]:
# Make our y var
y_train = df_80_percent['target'].values
y_test = df_20_percent['target'].values

In [179]:
# Making Torch datasets with our splits
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test).float(), torch.tensor(y_test).float())

In [180]:
# Set up loaders for each of our datasets 
train_loader = torch.utils.data.DataLoader(train_data, batch_size = 64, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = 64, shuffle = True)

In [181]:
# Set up the layers and activation functions of our model
# model = torch.nn.Sequential(   
#     torch.nn.Linear(65,256),
#     torch.nn.SiLU(),
#     torch.nn.Linear(256,128),
#     torch.nn.SiLU(),
#     torch.nn.Linear(128,64),
#     torch.nn.SiLU(),
#     torch.nn.Linear(64,64),
#     torch.nn.SiLU(),
#     torch.nn.Linear(64,16),
#     torch.nn.SiLU(),
#     torch.nn.Linear(16,1)
# )

In [188]:
# Set up the layers and activation functions of our model
model = torch.nn.Sequential(   
    torch.nn.Linear(65,800),
    torch.nn.SiLU(),
    torch.nn.Linear(800,800),
    torch.nn.SiLU(),
    torch.nn.Linear(800,800),
    torch.nn.SiLU(),
    torch.nn.Linear(800,800),
    torch.nn.SiLU(),
    torch.nn.Linear(800,256),
    torch.nn.SiLU(),
    torch.nn.Linear(256,1)
)

In [189]:
params_count = (65*800)+(800*800)+(800*800)+(800*800)+(800*256)+256
params_count

66832

In [190]:
# Defines our scoring function
def scoring_function(pred, label):
    return nn.functional.binary_cross_entropy_with_logits(pred, label)

# Defines number of epochs we want to train through
num_epochs = 25

# Defines our optimizer and the learning rate 
optimizer = torch.optim.Adam( model.parameters(), lr=.001  )#, weight_decay=.0001)

In [191]:
def calc_epoch_ev(wpp, avg_odds):
    profit_if_win = (100*(1/avg_odds))- 100
    return ((profit_if_win * wpp) - ((1-wpp)*100))

In [192]:
def train():
    # Initializes a list that will contain our batch losses for an individual epoch
    epoch_losses = []
    
    # Defines how we want to step through each batch in the epoch
    for batch in train_loader:
        
        # Resets the gradient to zero
        optimizer.zero_grad()

        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)
        
        # Forward pass
        y_pred = model.forward(batchX)
        batchY = batchY.unsqueeze(1)  # Reshape to (batch_size, 1)
        
        # Compute the loss with weighted BCEWithLogitsLoss
        pos_weight = torch.tensor([5.0])  # higher weight for positive class
        
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = criterion(y_pred, batchY)
        
        # Store the loss for this batch in the list
        epoch_losses.append(loss.detach().clone())

        # Compute the gradient of the error with respect to the model parameters
        loss.mean().backward()

        # update the model parameters
        optimizer.step()
        
    all_epoch_loss = torch.tensor(epoch_losses)
    epoch_loss = torch.mean(all_epoch_loss)
    
    return epoch_loss

In [193]:
def test_model():
    # Put model in evaluation mode
    model.eval()

    # Initialize counters
    hit_count = 0
    win_preds_count = 0
    win_pred_hit = 0
    mean_odds_list = np.array([])
    
    # Initialize y_test
    y_test = []

    # Loop over the test data
    for batch in test_loader:
        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)

        # Make predictions
        predictions = model(batchX)

        # Apply threshold
        predictions = torch.where(predictions > 0.75, 1, 0)

        # Count the number of positive predictions
        win_preds_count += torch.sum(predictions == 1).item()

        # Add the labels of the current batch to y_test
        y_test.extend(batchY.tolist())

        # Count the number of correctly predicted positive cases
        for i in range(len(predictions)):
            if predictions[i].item() == 1 and batchY[i].item() == 1:
                win_pred_hit += 1
            if predictions[i].item() == batchY[i].item():
                hit_count += 1
        
        # Filter batchX by positive predictions
        idx = predictions.squeeze() == 1
        
        batchX_test_scaled = scaler.inverse_transform(batchX[:, :19])
        
        batchX_test_scaled_tensor = torch.from_numpy(batchX_test_scaled)
        
        batchX_pos = batchX_test_scaled_tensor.masked_select(idx.unsqueeze(-1)).view(-1, 19)
        
        batchX_pos_first_col = batchX_pos[:, 0]
        
        batchX_pos_gt = batchX_pos_first_col[torch.gt(batchX_pos_first_col, .001)]
        
        mean_batch_odds = torch.mean(batchX_pos_gt)
        
        mean_odds_list = np.append(mean_odds_list, mean_batch_odds)
        
            
    # Convert y_test to a tensor
    y_test = torch.tensor(y_test, dtype=torch.float32)

    # Compute hit percentage and accuracy of win predictions
    hit_percent = hit_count / len(test_data)
    win_pred_accuracy = win_pred_hit / win_preds_count if win_pred_hit > 0 and win_preds_count > 0 else 0
    
    ev = calc_epoch_ev(win_pred_accuracy, np.mean(mean_odds_list))

    # Return the results
    return hit_percent, win_pred_accuracy, win_preds_count, np.mean(mean_odds_list), ev

In [194]:
# Calls the train function for each of our epochs, prints the running results
for epoch in range(num_epochs):
    
    ep_result = train()
    
    hit_percent, win_pred_accuracy, win_preds_count, avg_odds, ev = test_model()
    
    print(f'Epoch : {epoch + 1}, Loss: {ep_result}, Acc: {hit_percent: %}, WP Acc: {win_pred_accuracy: 2%}, WPs: {win_preds_count}, Avg Odds: {avg_odds}, EV: {ev}')

Epoch : 1, Loss: 1.0068098306655884, Acc:  66.119534%, WP Acc:  61.514365%, WPs: 11173, Avg Odds: 0.5966671508912762, EV: 3.0966174275468674
Epoch : 2, Loss: 0.9598028659820557, Acc:  65.405338%, WP Acc:  60.941532%, WPs: 11237, Avg Odds: 0.582149436032868, EV: 4.6836579500569755
Epoch : 3, Loss: 0.862000048160553, Acc:  64.164892%, WP Acc:  60.010626%, WPs: 11293, Avg Odds: 0.5749679901488706, EV: 4.3721164999088415
Epoch : 4, Loss: 0.6740446090698242, Acc:  67.842376%, WP Acc:  64.220092%, WPs: 10014, Avg Odds: 0.5947613964104749, EV: 7.976227540932307
Epoch : 5, Loss: 0.4726661741733551, Acc:  65.467986%, WP Acc:  63.213101%, WPs: 9343, Avg Odds: 0.5893542867424986, EV: 7.258235223006984
Epoch : 6, Loss: 0.3300187885761261, Acc:  66.276156%, WP Acc:  63.534070%, WPs: 9598, Avg Odds: 0.5907511126329554, EV: 7.547947416745323
Epoch : 7, Loss: 0.2581048011779785, Acc:  65.192332%, WP Acc:  64.071022%, WPs: 8617, Avg Odds: 0.5928521794439895, EV: 8.072508829562906
Epoch : 8, Loss: 0.222

In [114]:
teams = test_data[:, 20:49]

IndexError: too many indices for tensor of dimension 1

In [None]:
print(teams)

In [None]:
# Get the counts for each category
counts = np.sum(teams, axis=0)

print(counts)

# Plot the distribution using a bar chart
labels = [i for i in range(30)]
plt.bar(labels, counts)
plt.title('Distribution of Categorical Data')
plt.xlabel('Categories')
plt.ylabel('Counts')
plt.show()

# Plot the distribution using a histogram
categories = np.argmax(one_hot_data, axis=1)
plt.hist(categories, bins=3, align='left', rwidth=0.5)
plt.xticks([0, 1, 2], labels)
plt.title('Distribution of Categorical Data')
plt.xlabel('Categories')
plt.ylabel('Counts')
plt.show()

In [None]:
y_test = np.array(y_test)  # convert y_test to a numpy array
counts_1 = np.sum(teams[y_test.astype(int) == 1], axis=0)
counts_0 = np.sum(teams[y_test.astype(int) == 0], axis=0)
counts = np.sum(teams, axis=0)

print(counts_1)
print(counts_0)
print(counts)



In [None]:
# plot bars in stack manner
x = [i for i in range(30)]
plt.bar(x, counts_1, color='r')
#plt.bar(x, counts_0, bottom=counts_1, color='b')
plt.show()