In [40]:
import pandas as pd
import dask.dataframe as dd
import pyarrow as pa
import numpy as np
import random
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import torch
import torch.nn.functional as F
from torch import nn
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pickle
import pyperclip
import datetime

In [90]:
# Read the raw data file
df = pd.read_parquet('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/mlb_for_model.parquet')
df['number_of_game_today'] = df['number_of_game_today'].astype(int)

In [91]:
# Reassign two park_ids and one hour_of_start to the most common park_ids and hour_of_start because there are only 2 obsevations for each of these park_ids and splitting into test and train is impossible.
park_value = 'LOS03'
df.loc[df['park_id'] == 'WIL02', 'park_id'] = park_value
df.loc[df['park_id'] == 'DYE01', 'park_id'] = park_value

hour_value = 19
df.loc[df['hour_of_start'] == 20, 'hour_of_start'] = hour_value

In [92]:
# df.columns

In [93]:
encoders = {}

In [94]:
def add_category(column_name, data):
    
    column_index = data.columns.tolist().index(column_name)
    
    arr = data[column_name].values.reshape(-1,1)
    
    coder = OneHotEncoder(sparse_output=False)
    
    onehots = coder.fit_transform(arr)
    
    encoders[column_name] = coder
    
    return onehots

In [95]:
def add_numeric(column_name, data):
    
    # Get the index from the full df of the input column name
    column_index = data.columns.tolist().index(column_name)
    
    # Make an array of the column values 
    arr = data[column_name].values.reshape(-1,1).astype('float')
    
    return arr

In [96]:
def make_data(input_df):
    data = input_df
    return_data = np.concatenate(
        [
            add_numeric('barstool_1_odds', data),
            add_numeric('betclic_1_odds', data),
            add_numeric('betfair_1_odds', data),
            add_numeric('betfred_1_odds', data),
            add_numeric('betmgm_1_odds', data),
            add_numeric('betonlineag_1_odds', data),
            add_numeric('betrivers_1_odds', data),
            add_numeric('betus_1_odds', data),
            add_numeric('betway_1_odds', data),
            add_numeric('bovada_1_odds', data),
            add_numeric('casumo_1_odds', data),
            add_numeric('circasports_1_odds', data),
            add_numeric('coral_1_odds', data),
            add_numeric('draftkings_1_odds', data),
            add_numeric('fanduel_1_odds', data),
            add_numeric('foxbet_1_odds', data),
            add_numeric('gtbets_1_odds', data),
            add_numeric('ladbrokes_1_odds', data),
            add_numeric('lowvig_1_odds', data),
            add_numeric('marathonbet_1_odds', data),
            add_numeric('matchbook_1_odds', data),
            add_numeric('mrgreen_1_odds', data),
            add_numeric('mybookieag_1_odds', data),
            add_numeric('nordicbet_1_odds', data),
            add_numeric('onexbet_1_odds', data),
            add_numeric('paddypower_1_odds', data),
            add_numeric('pinnacle_1_odds', data),
            add_numeric('pointsbetus_1_odds', data),
            add_numeric('sport888_1_odds', data),
            add_numeric('sugarhouse_1_odds', data),
            add_numeric('superbook_1_odds', data),
            add_numeric('twinspires_1_odds', data),
            add_numeric('unibet_1_odds', data),
            add_numeric('unibet_eu_1_odds', data),
            add_numeric('unibet_uk_1_odds', data),
            add_numeric('unibet_us_1_odds', data),
            add_numeric('williamhill_1_odds', data),
            add_numeric('williamhill_us_1_odds', data),
            add_numeric('wynnbet_1_odds', data),
            add_numeric('minutes_since_commence', data),
            add_numeric('this_team_game_of_season', data),
            add_numeric('opponent_game_of_season', data),
            
            add_category('home_away', data),
            add_category('team_1', data),
            add_category('hour_of_start', data),
            add_category('day_of_week', data),
            add_category('number_of_game_today', data),
            add_category('day_night', data),
            add_category('park_id', data),
            add_category('this_team_league', data),
            add_category('opponent_league', data),
        ],
        1
    )
    return return_data

In [97]:
# unique_game_ids = df['game_id'].unique()
# unique_game_ids_train, unique_game_ids_test = train_test_split(unique_game_ids, test_size=0.2, random_state=42)
# training_data = df[df['game_id'].isin(unique_game_ids_train)]
# testing_data = df[df['game_id'].isin(unique_game_ids_test)]

In [98]:
# Obtain unique game IDs
unique_game_ids = df['game_id'].unique()

# Split unique game IDs into train, test, and validation sets
unique_game_ids_train, unique_game_ids_holdout = train_test_split(unique_game_ids, test_size=0.3, random_state=42)
unique_game_ids_test, unique_game_ids_val = train_test_split(unique_game_ids_holdout, test_size=0.5, random_state=42)

# Filter the original DataFrame based on the train, test, and validation game IDs
training_data = df[df['game_id'].isin(unique_game_ids_train)]
testing_data = df[df['game_id'].isin(unique_game_ids_test)]
validation_data = df[df['game_id'].isin(unique_game_ids_val)]

In [99]:
# Group by 'park_id' and count the number of unique 'game_id' values
park_id_game_count = df.groupby('hour_of_start')['game_id'].nunique().reset_index()

# Sort the counts in ascending order
sorted_counts = park_id_game_count.sort_values('game_id')

# Display the sorted counts
print(sorted_counts)

    hour_of_start  game_id
0               8       10
6              14       47
1               9       95
4              12      245
11             19      273
3              11      441
7              15      453
5              13      573
2              10      579
10             18      616
9              17      840
8              16     1632


In [100]:
full_y = df['target']
full_data = df.drop('target', axis='columns')

training_y = training_data['target']
training_data = training_data.drop('target', axis='columns')

testing_y = testing_data['target']
testing_data = testing_data.drop('target', axis='columns')

validation_y = validation_data['target']
validation_data = validation_data.drop('target', axis = 'columns')

In [101]:
full_data = make_data(df)
training_data = make_data(training_data)
testing_data = make_data(testing_data)
validation_data = make_data(validation_data)

In [102]:
print(full_data.shape)
print(training_data.shape)
print(testing_data.shape)
print(validation_data.shape)

(705958, 134)
(492668, 134)
(104948, 134)
(108342, 133)


In [52]:
full_data[:, :39]

array([[2.05, 2.2 , 2.14, ..., 0.  , 0.  , 0.  ],
       [2.17, 2.15, 1.53, ..., 0.  , 0.  , 0.  ],
       [2.02, 2.15, 1.96, ..., 0.  , 0.  , 0.  ],
       ...,
       [0.  , 0.  , 2.28, ..., 2.2 , 2.25, 0.  ],
       [0.  , 0.  , 2.28, ..., 2.2 , 2.25, 0.  ],
       [0.  , 0.  , 2.28, ..., 2.2 , 2.25, 0.  ]])

In [53]:
# Define the indices of the columns you want to standardize and those we don't
continuous_vars = full_data[:, :42]
categorical_vars = full_data[:, 42:]

In [54]:
# Create an instance of StandardScaler and fit it on the training data
scaler = StandardScaler()
scaler.fit(continuous_vars)

In [55]:
continuous_vars_train = training_data[:, :42]
categorical_vars_train = training_data[:, 42:]

continuous_vars_test = testing_data[:, :42]
categorical_vars_test = testing_data[:, 42:]

continuous_vars_full = full_data[:, :42]
categorical_vars_full = full_data[:, 42:]

In [56]:
# Standardize the columns of the dataset
X_train = np.hstack((scaler.transform(continuous_vars_train), categorical_vars_train))
X_test = np.hstack((scaler.transform(continuous_vars_test), categorical_vars_test))
X_full = np.hstack((scaler.transform(continuous_vars_full), categorical_vars_full))

In [57]:
print(X_train.shape)
print(X_test.shape)
print(X_full.shape)

(260770, 134)
(64975, 134)
(325745, 134)


In [58]:

# Convert input data to numpy arrays
X_train_np = X_train.astype(np.float32)
X_test_np = X_test.astype(np.float32)
y_train_np = training_y.values.astype(np.float32)  # Convert y_train to numpy array
y_test_np = testing_y.values.astype(np.float32)    # Convert y_test to numpy array

# Create Torch datasets with the numpy arrays
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train_np), torch.tensor(y_train_np))
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test_np), torch.tensor(y_test_np))

In [59]:
# Set a fixed random seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fbe132b55f0>

In [60]:
# Set up loaders for each of our datasets 
train_loader = torch.utils.data.DataLoader(train_data, batch_size = 64, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = 64, shuffle = True)

In [61]:
X_train_np.shape

(260770, 134)

In [23]:
# # Set up the layers and activation functions of our model
# model = torch.nn.Sequential(   
#     torch.nn.Linear(134,256),
#     torch.nn.ReLU(),
#     torch.nn.Linear(256,256),
#     torch.nn.ReLU(),
#     torch.nn.Linear(256,128),
#     torch.nn.ReLU(),
#     torch.nn.Linear(128,64),
#     torch.nn.ReLU(),
#     torch.nn.Linear(64,16),
#     torch.nn.ReLU(),
#     torch.nn.Linear(16,1)
# )
# params_count = (134*256)+(256*256)+(256*128)+(128*64)+(64*16)+16
# params_count


In [24]:
# # Set up the layers and activation functions of our model
# model = torch.nn.Sequential(   
#     torch.nn.Linear(134,256),
#     torch.nn.Sigmoid(),
#     torch.nn.Linear(256,256),
#     torch.nn.Sigmoid(),
#     torch.nn.Linear(256,128),
#     torch.nn.Sigmoid(),
#     torch.nn.Linear(128,64),
#     torch.nn.Sigmoid(),
#     torch.nn.Linear(64,16),
#     torch.nn.Sigmoid(),
#     torch.nn.Linear(16,1)
# )
# params_count = (134*256)+(256*256)+(256*128)+(128*64)+(64*16)+16
# params_count

In [25]:
# Set up the layers and activation functions of our model
# model = torch.nn.Sequential(   
#     torch.nn.Linear(137,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,256),
#     torch.nn.SiLU(),
#     torch.nn.Linear(256,1)
# )

In [62]:
# Defines our scoring function
def scoring_function(pred, label):
    return nn.functional.binary_cross_entropy_with_logits(pred, label)

# Defines number of epochs we want to train through
# num_epochs = 1

# Defines our optimizer and the learning rate 
# optimizer = torch.optim.Adam( model.parameters(), lr=.001)#  , weight_decay=.0001)

In [63]:
def calc_batch_stats(c_tensor, i_tensor):
    unscaled_correct = scaler.inverse_transform(c_tensor[:, :42]) if len(c_tensor) > 0 else np.array([]) # unscale the continuous variables
    unscaled_incorrect = scaler.inverse_transform(i_tensor[:, :42]) if len(i_tensor) > 0 else np.array([]) # unscale the continuous variables

    amt_correct_predictions = len(unscaled_correct) # amount of correct predictions
    amt_incorrect_predictions = len(unscaled_incorrect) # amount of incorrect predictions
    amt_total_bets = (amt_correct_predictions + amt_incorrect_predictions) # amount of total bets placed

    if len(unscaled_correct) > 0:
        unscaled_correct = unscaled_correct[:, :39] # only odds data
        sum = 0 # calculate the average market odds
        count = 0
        for row in unscaled_correct:
            for val in row:
                if val > 0.01: # don't average the 'missing' (0) values 
                    sum += val
                    count +=1
        average_odds_won = sum / count
    else:
        average_odds_won = 0

    money_profited = ((amt_correct_predictions * average_odds_won * 100) - (100 * amt_correct_predictions))

    money_lost = (amt_incorrect_predictions * 100)

    p_l = money_profited - money_lost
    
    if amt_total_bets > 0: 
        ev_per_bet = p_l/amt_total_bets
        correct_pred_percet = amt_correct_predictions/amt_total_bets
    else:
        correct_pred_percet = 0
        ev_per_bet = 0

    return p_l, correct_pred_percet, ev_per_bet, amt_total_bets

In [64]:
def calc_epoch_stats(c_list, i_list):
    odds_sum = 0 
    odds_count = 0
    correct_preds_count = 0
    incorrect_preds_count = 0

    # For each batch:
    for c_tensor in c_list:
        unscaled_correct = scaler.inverse_transform(c_tensor[:, :42]) if len(c_tensor) > 0 else np.array([]) # unscale the continuous variables
        amt_correct_predictions = len(unscaled_correct) # amount of correct predictions
        correct_preds_count += amt_correct_predictions

        if len(unscaled_correct) > 0:
            unscaled_correct = unscaled_correct[:, :39] # only odds data
            for row in unscaled_correct:
                for val in row:
                    if val > 0.01: # don't average the 'missing' (0) values 
                        odds_sum += val
                        odds_count +=1
    
    for i_tensor in i_list:
        unscaled_incorrect = scaler.inverse_transform(i_tensor[:, :42]) if len(i_tensor) > 0 else np.array([]) # unscale the continuous variables
        amt_incorrect_predictions = len(unscaled_incorrect) # amount of incorrect predictions
        incorrect_preds_count += amt_incorrect_predictions
        

    
    average_odds_won = odds_sum / odds_count
    amt_total_bets = (correct_preds_count + incorrect_preds_count) # amount of total bets placed
    money_profited = ((correct_preds_count * average_odds_won * 100) - (100 * correct_preds_count))
    money_lost = (incorrect_preds_count * 100)

    total_p_l = money_profited - money_lost

    if amt_total_bets > 0: 
        ev_per_bet = total_p_l/amt_total_bets
        correct_pred_percet = correct_preds_count/amt_total_bets
    else:
        correct_pred_percet = 0
        ev_per_bet = 0

    return total_p_l, correct_pred_percet, ev_per_bet, amt_total_bets

In [65]:
def calculate_auc(tpr, fpr):
    sorted_indices = np.argsort(fpr)  # Sort based on FPR
    sorted_tpr = np.array(tpr)[sorted_indices]
    sorted_fpr = np.array(fpr)[sorted_indices]

    # Calculate the AUC using the sorted TPR and FPR arrays
    auc = np.trapz(sorted_tpr, sorted_fpr)

    print(f"AUC: {auc}")

    return auc

In [66]:
def find_best_thresholds(my_thresholds, input_evs_per_bet, input_amounts_of_bets, input_precisions, input_auc):

  my_list = []

  thresholds_new = np.array([t.detach().numpy() for t in my_thresholds])
  evs_per_bet = np.array(input_evs_per_bet)
  amounts_of_bets = np.array(input_amounts_of_bets)
  precisions = np.array(input_precisions)


  min_bets = 1373
  max_bets = 6864

  # Select the subset that fits our bet frequency criteria
  filtered_amounts_of_bets = amounts_of_bets[(amounts_of_bets > min_bets) & (amounts_of_bets < max_bets)]
  filtered_evs = evs_per_bet[(amounts_of_bets > min_bets) & (amounts_of_bets < max_bets)]
  filtered_precisions = precisions[(amounts_of_bets > min_bets) & (amounts_of_bets < max_bets)]
  filtered_thresholds = thresholds_new[(amounts_of_bets > min_bets) & (amounts_of_bets < max_bets)]
  # Now sort by precisions
  sorted_indices = np.argsort(filtered_evs)[::-1]
  # sorted_indices = np.argsort(filtered_precisions)[::-1]
  
  sorted_filtered_precisions = filtered_precisions[sorted_indices]
  sorted_filtered_evs = filtered_evs[sorted_indices]
  sorted_filtered_thresholds = filtered_thresholds[sorted_indices]
  sorted_filtered_amounts = filtered_amounts_of_bets[sorted_indices]

  # Now select the best 5
  sorted_filtered_precisions_best = sorted_filtered_precisions[:5]
  sorted_filtered_evs_best = sorted_filtered_evs[:5]
  sorted_filtered_thresholds_best = sorted_filtered_thresholds[:5]
  sorted_filtered_amounts_best = sorted_filtered_amounts[:5]

  my_list.append(input_auc)
  for i in range(len(sorted_filtered_precisions_best)):
    my_list.append(sorted_filtered_thresholds_best[i])
    my_list.append(sorted_filtered_amounts_best[i])
    my_list.append(sorted_filtered_evs_best[i])
    my_list.append(sorted_filtered_precisions_best[i])

  return my_list

In [67]:
def calc_stats_full_train(list_predictions, list_targets, list_batchx, sigmoid):
    # We have fully trained our model

    # Define the thresholds we're testing based on the model architecture
    thresholds = []

    # Define some lists whose values we want to see change across thresholds
    tprs = []

    fprs = []

    precisions = []

    amounts_of_bets = []

    evs_per_bet = []

    # Gets data regarding the whole run 
    predictions = torch.cat(list_predictions, dim=0)
    targets = torch.cat(list_targets, dim=0)
    x_vals = torch.cat(list_batchx, dim=0)

    if sigmoid:
        for value in range(1, 101):
          threshold = value / 100.0
          thresholds.append(threshold)
    else:
       # Find the minimum and maximum values
        min_value = torch.min(predictions)
        max_value = torch.max(predictions)
        # Calculate the range and step size
        value_range = max_value - min_value
        step_size = value_range / 100.0
        # Step through the range
        for i in range(101):
            threshold = min_value + i * step_size
            thresholds.append(threshold)
    

    for threshold in thresholds:
        # Defines some variables 
        odds_sum = 0 
        bet_count = 0
        row_sum = 0
        row_count = 0

        thresh_predictions = torch.where(predictions > threshold, 1, 0)

        thresh_predictions = thresh_predictions.squeeze()
        
        # Splits our sets 
        true_pos = x_vals[(thresh_predictions == 1) & (targets == 1)] # True positives

        false_pos = x_vals[(thresh_predictions == 1) & (targets == 0)] # False positives
        
        true_neg = x_vals[(thresh_predictions == 0) & (targets == 0)] # True negatives
        false_neg = x_vals[(thresh_predictions == 0) & (targets == 1)] # False negatives

        # Gets info about our sets
        amount_of_correct_pos_preds = true_pos.shape[0]
        amount_of_incorrect_pos_preds = false_pos.shape[0]
        amount_of_bets = amount_of_correct_pos_preds + amount_of_incorrect_pos_preds
        
        # Unscale our true_pos set
        unscaled_true_pos = scaler.inverse_transform(true_pos[:, :42]) if len(true_pos) > 0 and hasattr(scaler, 'scale_') else np.array([])

        # If we have any amount of true positive predictions
        if len(unscaled_true_pos) > 0:
                # Select only the odds data
                unscaled_true_pos = unscaled_true_pos[:, :39] # only odds data
                # Iterate through each prediciton
                for row in unscaled_true_pos:

                    # Get the market average for that bet
                    row_sum = 0
                    row_count = 0
                    for val in row:
                        if val > 0.01 and val < 100: # don't average the 'missing' (0) values 
                            row_sum += val
                            row_count +=1
                    # If we have non-zero values in that row
                    if row_count > 0:
                        bet_avg_odds = row_sum / row_count
                        # Add this bets odds to the average 
                        odds_sum += bet_avg_odds
                        bet_count +=1
                    
        # Average odds across all positive predictions for this whole run
        if bet_count > 0:
            avg_odds = odds_sum / bet_count
            
            
            gross_p_l = (avg_odds * 100 * bet_count) - (bet_count * 100)

            net_p_l = gross_p_l - (100 * amount_of_incorrect_pos_preds)

            ev_per_bet = net_p_l / bet_count
            tpr = true_pos.shape[0] / (true_pos.shape[0] + false_neg.shape[0])
            fpr = false_pos.shape[0] / (false_pos.shape[0] + true_neg.shape[0])
            precision = true_pos.shape[0] / (true_pos.shape[0] + false_pos.shape[0])


        elif bet_count == 0:
            avg_odds = 0
            net_p_l = 0
            ev_per_bet = 0
            tpr = 0
            fpr = 0
            precision = 0
            

        tprs.append(tpr)

        fprs.append(fpr)

        precisions.append(precision)

        amounts_of_bets.append(amount_of_bets)

        evs_per_bet.append(ev_per_bet)
            
    auc = calculate_auc(tprs, fprs)

    info_list = find_best_thresholds(thresholds, evs_per_bet, amounts_of_bets, precisions, auc)
    
    return info_list

In [68]:
def calc_tpr_fpr_auc(list_predictions, list_targets, list_batchx):
    predictions = torch.cat(list_predictions, dim=0)

    targets = torch.cat(list_targets, dim=0)

    x_vals = torch.cat(list_batchx, dim=0)

    tprs = []

    fprs = []

    precisions = []

    min_pred = torch.min(predictions)

    max_pred = torch.min(predictions)

    # Convert the range step size to integers
    step_size = 0.01

    num_steps = int((max_pred - min_pred) / step_size)


    for value in range(1, 101):

        threshold = value / 100.0

        thresh_predictions = torch.where(predictions > threshold, 1, 0)

        true_pos = x_vals[(thresh_predictions.squeeze() == 1) & (targets.squeeze() == 1)] # True positives
        false_pos = x_vals[(thresh_predictions.squeeze() == 1) & (targets.squeeze() == 0)] # False positives
        true_neg = x_vals[(thresh_predictions.squeeze() == 0) & (targets.squeeze() == 0)] # True negatives
        false_neg = x_vals[(thresh_predictions.squeeze() == 0) & (targets.squeeze() == 1)] # False negatives

        tpr = true_pos.shape[0] / (true_pos.shape[0] + false_neg.shape[0])

        fpr = false_pos.shape[0] / (false_pos.shape[0] + true_neg.shape[0])

        precision = true_pos.shape[0] / (true_pos.shape[0] + false_pos.shape[0])

        tprs.append(tpr)

        fprs.append(fpr)

        precisions.append(precision)

    auc = calculate_auc(tprs, fprs)

    print(f'Precisions: {precisions}')
    
    return auc

In [69]:
def train(model, positive_weight):
    # Initializes a list that will contain our batch losses for an individual epoch
    epoch_losses = []
    
    # Defines how we want to step through each batch in the epoch
    for batch in train_loader:
        
        # Resets the gradient to zero
        optimizer.zero_grad()

        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)
        
        # Forward pass
        y_pred = model.forward(batchX)
        batchY = batchY.unsqueeze(1)  # Reshape to (batch_size, 1)
        
        # Compute the loss with weighted BCEWithLogitsLoss
        pos_weight = torch.tensor([positive_weight])  # higher weight for positive class
        
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        #criterion = nn.BCEWithLogitsLoss()

        loss = criterion(y_pred, batchY)
        
        # Store the loss for this batch in the list
        epoch_losses.append(loss.detach().clone())

        # Compute the gradient of the error with respect to the model parameters
        loss.mean().backward()

        # update the model parameters
        optimizer.step()
        
    all_epoch_loss = torch.tensor(epoch_losses)
    epoch_loss = torch.mean(all_epoch_loss)
    
    return epoch_loss

In [70]:
def test_model():
    epoch_correct_pred_tensors = []
    epoch_incorrect_pred_tensors = []

    # Put model in evaluation mode
    model.eval()

    # Loop over the test data
    for batch in test_loader:
        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)

        # Make predictions
        predictions = model(batchX)

        # Apply threshold
        # # TODO: HYPERPARAM
        # preds_numpy = predictions.detach().numpy()

        # preds_list.append(preds_numpy)

        predictions = torch.where((predictions > 5) & (predictions < 15), 1, 0)

        correct_pred_rows = batchX[(predictions.squeeze() == 1) & (batchY.squeeze() == 1)] # find all rows where we make a correct win prediction

        incorrect_pred_rows = batchX[(predictions.squeeze() == 1) & (batchY.squeeze() == 0)] # find all rows where we make an incorrect win prediction

        epoch_correct_pred_tensors.append(correct_pred_rows)

        epoch_incorrect_pred_tensors.append(incorrect_pred_rows)

    # Return the results
    return calc_epoch_stats(epoch_correct_pred_tensors, epoch_incorrect_pred_tensors)

In [71]:
def test_model_tpr_fpr_auc():
    # New list for all of the batch predicitons 
    all_predictions = []

    # New list for all of the batch targets 
    all_targets = []

    # New list for each batch
    all_batchx = []

    # Put model in evaluation mode
    model.eval()

    # Loop over the test data
    for batch in test_loader:
        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)

        # Make predictions
        predictions = model(batchX)

        all_predictions.append(predictions)

        all_targets.append(batchY)

        all_batchx.append(batchX)

    auc = calc_tpr_fpr_auc(all_predictions, all_targets, all_batchx)

    # Return the results
    return auc

In [72]:
def tune_model_test(model, sigmoid):
    # New list for all of the batch predicitons 
    all_predictions = []

    # New list for all of the batch targets 
    all_targets = []

    # New list for each batch
    all_batchx = []

    # Put model in evaluation mode
    model.eval()

    # Loop over the test data
    for batch in test_loader:
        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)

        # Make predictions
        predictions = model(batchX)

        all_predictions.append(predictions)

        all_targets.append(batchY)

        all_batchx.append(batchX)

    #auc = calc_tpr_fpr_auc(all_predictions, all_targets, all_batchx)
    stats = calc_stats_full_train(all_predictions, all_targets, all_batchx, sigmoid)

    # Return the results
    return stats

In [73]:
def define_model(architecture, my_lr, my_weight_decay, my_num_epochs, my_batch_size):
    print(f'Architecture: {architecture} learning rate: {my_lr} weight decay: {my_weight_decay} num epochs: {my_num_epochs} batch size: {my_batch_size}')

    if architecture == 'sigmoid':
        model = torch.nn.Sequential(   
            torch.nn.Linear(134,256),
            torch.nn.Sigmoid(),
            torch.nn.Linear(256,256),
            torch.nn.Sigmoid(),
            torch.nn.Linear(256,128),
            torch.nn.Sigmoid(),
            torch.nn.Linear(128,64),
            torch.nn.Sigmoid(),
            torch.nn.Linear(64,16),
            torch.nn.Sigmoid(),
            torch.nn.Linear(16,1)
        )
    elif architecture == 'relu': 
        model = torch.nn.Sequential(   
            torch.nn.Linear(134,256),
            torch.nn.ReLU(),
            torch.nn.Linear(256,256),
            torch.nn.ReLU(),
            torch.nn.Linear(256,128),
            torch.nn.ReLU(),
            torch.nn.Linear(128,64),
            torch.nn.ReLU(),
            torch.nn.Linear(64,16),
            torch.nn.ReLU(),
            torch.nn.Linear(16,1)
        )
    elif architecture == 'silu':
        model = torch.nn.Sequential(   
            torch.nn.Linear(134,256),
            torch.nn.SiLU(),
            torch.nn.Linear(256,256),
            torch.nn.SiLU(),
            torch.nn.Linear(256,128),
            torch.nn.SiLU(),
            torch.nn.Linear(128,64),
            torch.nn.SiLU(),
            torch.nn.Linear(64,16),
            torch.nn.SiLU(),
            torch.nn.Linear(16,1)
        )


    
    return model

In [74]:
hyper_params = pd.read_csv('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/tuning_info_test.csv')
stats_list = []
column_names = ['auc', 'thresh_1', 'tbp_1', 'ev_1', 'prec_1', 'thresh_2', 'tbp_2', 'ev_2', 'prec_2', 'thresh_3', 'tbp_3', 'ev_3', 'prec_3', 'thresh_4', 'tbp_4', 'ev_4', 'prec_4', 'thresh_5', 'tbp_5', 'ev_5', 'prec_5']
cols = pd.DataFrame(columns=column_names)
cols.to_csv('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/tuning_info.csv', index=False)

# for i in range(len(hyper_params)):
for i in range(1):
  architecture = hyper_params['architecture'].iloc[i]
  lr = hyper_params['learning_rate'].iloc[i]
  weight_decay = hyper_params['weight_decay'].iloc[i]
  num_epochs = int(hyper_params['num_epochs'].iloc[i])
  batch_size = int(hyper_params['batch_size'].iloc[i])
  pos_weight = int(hyper_params['pos_weight'].iloc[i])
  if architecture == 'sigmoid':
    sigmoid = True
  elif architecture != 'sigmoid':
    sigmoid = False

  model = define_model(architecture, lr, weight_decay, num_epochs, batch_size)

  optimizer = torch.optim.Adam( model.parameters(), lr=lr)#  , weight_decay=.0001)


  # Set up loaders for each of our datasets 
  train_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size, shuffle = True)
  test_loader = torch.utils.data.DataLoader(test_data, batch_size = batch_size, shuffle = True)

  if weight_decay == 'on':
      optimizer = torch.optim.Adam( model.parameters(), lr=lr, weight_decay=.0001)
  else:
      optimizer = torch.optim.Adam( model.parameters(), lr=lr)

  best_loss = float('inf')

  patience = 3  # Number of epochs without improvement before stopping

  epochs_without_improvement = 0

  # Calls the train function for each of our epochs, prints the running results
  for epoch in range(num_epochs):

    ep_result = train(model, pos_weight)

    # Check if the validation loss has improved
    if ep_result < best_loss:
        best_loss = ep_result
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    # Check if the training should stop
    if epochs_without_improvement >= patience:
        print("Early stopping: No improvement in training loss.")
        break
  
  # Gets the testing data
  info_list = tune_model_test(model, sigmoid)

  # do something with stats 
  info_df = pd.DataFrame(info_list).T

  info_df.to_csv('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/tuning_info.csv', mode='a', header=False, index=False)

  info_df = pd.read_csv('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/tuning_info.csv')

  new_df = pd.concat([hyper_params, info_df], axis = 1)

  new_df.to_csv('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/tuning_results.csv', index=False)

  print(f'Model {i + 1}: {len(hyper_params)} trained successfully. Best EV: {info_df["ev_1"].iloc[i]}')


Architecture: silu learning rate: 0.001 weight decay: on num epochs: 5 batch size: 4096
AUC: 0.6229795988559543
Model 1: 15 trained successfully. Best EV: 10.203719589059382


In [None]:
# Assuming your model is named 'model'
torch.save(model.state_dict(), 'MODELS/mlb_model_pikes.pth')

# Assuming 'encoder' is your trained OneHotEncoder object
with open('MODELS/mlb_encoders_pikes.pkl', 'wb') as file:
    pickle.dump(encoders, file)

# Save the scaler using pickle
with open('MODELS/mlb_scaler_pikes.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [None]:
big_list = []

In [None]:
test1 = ['test']
test2 = ['test2']

In [None]:
big_list.append([test1])

In [None]:
big_list.append(test2)

In [None]:
big_list

In [None]:
teams = test_data[:, 20:49]

In [None]:
print(teams)

In [None]:
# Get the counts for each category
counts = np.sum(teams, axis=0)

print(counts)

# Plot the distribution using a bar chart
labels = [i for i in range(30)]
plt.bar(labels, counts)
plt.title('Distribution of Categorical Data')
plt.xlabel('Categories')
plt.ylabel('Counts')
plt.show()

# Plot the distribution using a histogram
categories = np.argmax(one_hot_data, axis=1)
plt.hist(categories, bins=3, align='left', rwidth=0.5)
plt.xticks([0, 1, 2], labels)
plt.title('Distribution of Categorical Data')
plt.xlabel('Categories')
plt.ylabel('Counts')
plt.show()

In [None]:
y_test = np.array(y_test)  # convert y_test to a numpy array
counts_1 = np.sum(teams[y_test.astype(int) == 1], axis=0)
counts_0 = np.sum(teams[y_test.astype(int) == 0], axis=0)
counts = np.sum(teams, axis=0)

print(counts_1)
print(counts_0)
print(counts)



In [None]:
# plot bars in stack manner
x = [i for i in range(30)]
plt.bar(x, counts_1, color='r')
#plt.bar(x, counts_0, bottom=counts_1, color='b')
plt.show()