In [423]:
import pandas as pd
import dask.dataframe as dd
import pyarrow as pa
import numpy as np
import random
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import torch
import torch.nn.functional as F
from torch import nn
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [424]:
# Read the raw data file
df = pd.read_parquet('/Users/stefanfeiler/Desktop/SmartBetter/SmartBetter/data/mlb_for_model.parquet')

In [425]:
def add_category(column_name, data):
    
    column_index = data.columns.tolist().index(column_name)
    
    arr = data[column_name].values.reshape(-1,1)
    
    coder = OneHotEncoder(sparse_output=False)
    
    onehots = coder.fit_transform(arr)
    
    #print(onehots.shape)
    
    return onehots

In [426]:
def add_numeric(column_name, data):
    
    # Get the index from the full df of the input column name
    column_index = data.columns.tolist().index(column_name)
    
    # Make an array of the column values 
    arr = data[column_name].values.reshape(-1,1).astype('float')
    
    return arr

In [427]:
def make_full_data(input_df):
    data = input_df
    full_data = np.concatenate(
        [
            add_numeric('barstool_1_odds', data),
            add_numeric('betclic_1_odds', data),
            add_numeric('betfair_1_odds', data),
            add_numeric('betfred_1_odds', data),
            add_numeric('betmgm_1_odds', data),
            add_numeric('betonlineag_1_odds', data),
            add_numeric('betrivers_1_odds', data),
            add_numeric('betus_1_odds', data),
            add_numeric('betway_1_odds', data),
            add_numeric('bovada_1_odds', data),
            add_numeric('casumo_1_odds', data),
            add_numeric('circasports_1_odds', data),
            add_numeric('coral_1_odds', data),
            add_numeric('draftkings_1_odds', data),
            add_numeric('fanduel_1_odds', data),
            add_numeric('foxbet_1_odds', data),
            add_numeric('gtbets_1_odds', data),
            add_numeric('ladbrokes_1_odds', data),
            add_numeric('lowvig_1_odds', data),
            add_numeric('marathonbet_1_odds', data),
            add_numeric('matchbook_1_odds', data),
            add_numeric('mrgreen_1_odds', data),
            add_numeric('mybookieag_1_odds', data),
            add_numeric('nordicbet_1_odds', data),
            add_numeric('onexbet_1_odds', data),
            add_numeric('paddypower_1_odds', data),
            add_numeric('pinnacle_1_odds', data),
            add_numeric('pointsbetus_1_odds', data),
            add_numeric('sport888_1_odds', data),
            add_numeric('sugarhouse_1_odds', data),
            add_numeric('superbook_1_odds', data),
            add_numeric('twinspires_1_odds', data),
            add_numeric('unibet_1_odds', data),
            add_numeric('unibet_eu_1_odds', data),
            add_numeric('unibet_uk_1_odds', data),
            add_numeric('unibet_us_1_odds', data),
            add_numeric('williamhill_1_odds', data),
            add_numeric('williamhill_us_1_odds', data),
            add_numeric('wynnbet_1_odds', data),
            add_numeric('minutes_since_commence', data),
            add_numeric('this_team_game_of_season', data),
            add_numeric('opponent_game_of_season', data),
            add_category('home_away', data),
            add_category('team_1', data),
            add_category('hour_of_start', data),
            add_category('day_of_week', data),
            add_category('number_of_game_today', data),
            add_category('day_night', data),
            add_category('park_id', data),
            add_category('this_team_league', data),
            add_category('opponent_league', data),
        ],
        1
    )
    return full_data

In [428]:
def make_train_data(input_df):
    data = input_df
    train_data = np.concatenate(
        [
            add_numeric('barstool_1_odds', data),
            add_numeric('betclic_1_odds', data),
            add_numeric('betfair_1_odds', data),
            add_numeric('betfred_1_odds', data),
            add_numeric('betmgm_1_odds', data),
            add_numeric('betonlineag_1_odds', data),
            add_numeric('betrivers_1_odds', data),
            add_numeric('betus_1_odds', data),
            add_numeric('betway_1_odds', data),
            add_numeric('bovada_1_odds', data),
            add_numeric('casumo_1_odds', data),
            add_numeric('circasports_1_odds', data),
            add_numeric('coral_1_odds', data),
            add_numeric('draftkings_1_odds', data),
            add_numeric('fanduel_1_odds', data),
            add_numeric('foxbet_1_odds', data),
            add_numeric('gtbets_1_odds', data),
            add_numeric('ladbrokes_1_odds', data),
            add_numeric('lowvig_1_odds', data),
            add_numeric('marathonbet_1_odds', data),
            add_numeric('matchbook_1_odds', data),
            add_numeric('mrgreen_1_odds', data),
            add_numeric('mybookieag_1_odds', data),
            add_numeric('nordicbet_1_odds', data),
            add_numeric('onexbet_1_odds', data),
            add_numeric('paddypower_1_odds', data),
            add_numeric('pinnacle_1_odds', data),
            add_numeric('pointsbetus_1_odds', data),
            add_numeric('sport888_1_odds', data),
            add_numeric('sugarhouse_1_odds', data),
            add_numeric('superbook_1_odds', data),
            add_numeric('twinspires_1_odds', data),
            add_numeric('unibet_1_odds', data),
            add_numeric('unibet_eu_1_odds', data),
            add_numeric('unibet_uk_1_odds', data),
            add_numeric('unibet_us_1_odds', data),
            add_numeric('williamhill_1_odds', data),
            add_numeric('williamhill_us_1_odds', data),
            add_numeric('wynnbet_1_odds', data),
            add_numeric('minutes_since_commence', data),
            add_numeric('this_team_game_of_season', data),
            add_numeric('opponent_game_of_season', data),
            add_category('home_away', data),
            add_category('team_1', data),
            add_category('hour_of_start', data),
            add_category('day_of_week', data),
            add_category('number_of_game_today', data),
            add_category('day_night', data),
            add_category('park_id', data),
            add_category('this_team_league', data),
            add_category('opponent_league', data),
        ],
        1
    )
    return train_data

In [429]:
def make_test_data(input_df):
    data = input_df
    test_data = np.concatenate(
        [
            add_numeric('barstool_1_odds', data),
            add_numeric('betclic_1_odds', data),
            add_numeric('betfair_1_odds', data),
            add_numeric('betfred_1_odds', data),
            add_numeric('betmgm_1_odds', data),
            add_numeric('betonlineag_1_odds', data),
            add_numeric('betrivers_1_odds', data),
            add_numeric('betus_1_odds', data),
            add_numeric('betway_1_odds', data),
            add_numeric('bovada_1_odds', data),
            add_numeric('casumo_1_odds', data),
            add_numeric('circasports_1_odds', data),
            add_numeric('coral_1_odds', data),
            add_numeric('draftkings_1_odds', data),
            add_numeric('fanduel_1_odds', data),
            add_numeric('foxbet_1_odds', data),
            add_numeric('gtbets_1_odds', data),
            add_numeric('ladbrokes_1_odds', data),
            add_numeric('lowvig_1_odds', data),
            add_numeric('marathonbet_1_odds', data),
            add_numeric('matchbook_1_odds', data),
            add_numeric('mrgreen_1_odds', data),
            add_numeric('mybookieag_1_odds', data),
            add_numeric('nordicbet_1_odds', data),
            add_numeric('onexbet_1_odds', data),
            add_numeric('paddypower_1_odds', data),
            add_numeric('pinnacle_1_odds', data),
            add_numeric('pointsbetus_1_odds', data),
            add_numeric('sport888_1_odds', data),
            add_numeric('sugarhouse_1_odds', data),
            add_numeric('superbook_1_odds', data),
            add_numeric('twinspires_1_odds', data),
            add_numeric('unibet_1_odds', data),
            add_numeric('unibet_eu_1_odds', data),
            add_numeric('unibet_uk_1_odds', data),
            add_numeric('unibet_us_1_odds', data),
            add_numeric('williamhill_1_odds', data),
            add_numeric('williamhill_us_1_odds', data),
            add_numeric('wynnbet_1_odds', data),
            add_numeric('minutes_since_commence', data),
            add_numeric('this_team_game_of_season', data),
            add_numeric('opponent_game_of_season', data),
            
            add_category('home_away', data),
            add_category('team_1', data),
            add_category('hour_of_start', data),
            add_category('day_of_week', data),
            add_category('number_of_game_today', data),
            add_category('day_night', data),
            add_category('park_id', data),
            add_category('this_team_league', data),
            add_category('opponent_league', data),
        ],
        1
    )
    return test_data

In [430]:
target = df['target']
df = df.drop('target', axis='columns')
full_data = make_full_data(df)

In [431]:
full_data[:, :39]

array([[1.66, 1.68, 1.72, ..., 1.67, 1.65, 1.67],
       [1.66, 1.68, 1.71, ..., 1.67, 1.65, 1.67],
       [1.66, 1.68, 1.71, ..., 1.67, 1.65, 1.67],
       ...,
       [0.  , 0.  , 2.22, ..., 2.15, 2.2 , 0.  ],
       [0.  , 0.  , 2.22, ..., 2.15, 2.2 , 0.  ],
       [0.  , 0.  , 2.22, ..., 2.15, 2.2 , 0.  ]])

In [432]:
# Define the indices of the columns you want to standardize and those we don't
continuous_vars = full_data[:, :42]
categorical_vars = full_data[:, 42:]

#continuous_vars_full = full_data[continuous_vars]


In [433]:
# Create an instance of StandardScaler and fit it on the training data
scaler = StandardScaler()
scaler.fit(continuous_vars)

In [434]:
# Standardize the columns of the training data
# X_train_s = np.hstack((scaler.transform(continuous_vars_train), X_train.iloc[:, 42:].values))
# X_test_s = np.hstack((scaler.transform(continuous_vars_test), X_test.iloc[:, 42:].values))
standardized_data = np.hstack((scaler.transform(continuous_vars), categorical_vars))

In [435]:
standardized_data.shape

(1158646, 137)

In [436]:
# Make our datasets
X_train, X_test, y_train, y_test = train_test_split(standardized_data, target, test_size=0.2, random_state=42)

# Convert input data to numpy arrays
X_train_np = X_train.astype(np.float32)
X_test_np = X_test.astype(np.float32)
y_train_np = y_train.values.astype(np.float32)  # Convert y_train to numpy array
y_test_np = y_test.values.astype(np.float32)    # Convert y_test to numpy array

# Create Torch datasets with the numpy arrays
train_data = torch.utils.data.TensorDataset(torch.tensor(X_train_np), torch.tensor(y_train_np))
test_data = torch.utils.data.TensorDataset(torch.tensor(X_test_np), torch.tensor(y_test_np))


In [437]:
# Set up loaders for each of our datasets 
train_loader = torch.utils.data.DataLoader(train_data, batch_size = 128, shuffle = True)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = 128, shuffle = True)

In [438]:
X_train_np.shape

(926916, 137)

In [439]:
# Set up the layers and activation functions of our model
model = torch.nn.Sequential(   
    torch.nn.Linear(137,256),
    torch.nn.ReLU(),
    torch.nn.Linear(256,128),
    torch.nn.ReLU(),
    torch.nn.Linear(128,64),
    torch.nn.ReLU(),
    torch.nn.Linear(64,64),
    torch.nn.ReLU(),
    torch.nn.Linear(64,16),
    torch.nn.ReLU(),
    torch.nn.Linear(16,1)
)
params_count = (137*256)+(256*128)+(128*64)+(64*64)+(64*16)+16
params_count


81168

In [440]:
# # Set up the layers and activation functions of our model
# model = torch.nn.Sequential(   
#     torch.nn.Linear(137,256),
#     torch.nn.SiLU(),
#     torch.nn.Linear(256,128),
#     torch.nn.SiLU(),
#     torch.nn.Linear(128,16),
#     torch.nn.SiLU(),
#     torch.nn.Linear(16,1)
# )

In [441]:
# Set up the layers and activation functions of our model
# model = torch.nn.Sequential(   
#     torch.nn.Linear(137,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,800),
#     torch.nn.SiLU(),
#     torch.nn.Linear(800,256),
#     torch.nn.SiLU(),
#     torch.nn.Linear(256,1)
# )

In [442]:
params_count = (137*800)+(800*800)+(800*800)+(800*800)+(800*256)+256
params_count

2234656

In [443]:
# Defines our scoring function
def scoring_function(pred, label):
    return nn.functional.binary_cross_entropy_with_logits(pred, label)

# Defines number of epochs we want to train through
num_epochs = 10

# Defines our optimizer and the learning rate 
optimizer = torch.optim.Adam( model.parameters(), lr=.001  )#, weight_decay=.0001)

In [444]:
def calc_epoch_stats(c_tensor, i_tensor):
    unscaled_correct = scaler.inverse_transform(c_tensor[:, :42]) if len(c_tensor) > 0 else np.array([]) # unscale the continuous variables
    unscaled_incorrect = scaler.inverse_transform(i_tensor[:, :42]) if len(i_tensor) > 0 else np.array([]) # unscale the continuous variables

    amt_correct_predictions = len(unscaled_correct) # amount of correct predictions
    amt_incorrect_predictions = len(unscaled_incorrect) # amount of incorrect predictions
    amt_total_bets = (amt_correct_predictions + amt_incorrect_predictions) # amount of total bets placed

    if len(unscaled_correct) > 0:
        unscaled_correct = unscaled_correct[:, :39] # only odds data
        sum = 0 # calculate the average market odds
        count = 0
        for row in unscaled_correct:
            for val in row:
                if val > 0.01: # don't average the 'missing' (0) values 
                    sum += val
                    count +=1
        average_odds_won = sum / count
    else:
        average_odds_won = 0

    money_profited = ((amt_correct_predictions * average_odds_won * 100) - (100 * amt_correct_predictions))

    money_lost = (amt_incorrect_predictions * 100)

    p_l = money_profited - money_lost

    correct_pred_percet = amt_correct_predictions/amt_total_bets

    ev_per_bet = p_l/amt_total_bets

    return p_l, correct_pred_percet, ev_per_bet, amt_total_bets

In [445]:
def train():
    # Initializes a list that will contain our batch losses for an individual epoch
    epoch_losses = []
    
    # Defines how we want to step through each batch in the epoch
    for batch in train_loader:
        
        # Resets the gradient to zero
        optimizer.zero_grad()

        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)
        
        # Forward pass
        y_pred = model.forward(batchX)
        batchY = batchY.unsqueeze(1)  # Reshape to (batch_size, 1)
        
        # Compute the loss with weighted BCEWithLogitsLoss
        pos_weight = torch.tensor([10.0])  # higher weight for positive class
        
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        loss = criterion(y_pred, batchY)
        
        # Store the loss for this batch in the list
        epoch_losses.append(loss.detach().clone())

        # Compute the gradient of the error with respect to the model parameters
        loss.mean().backward()

        # update the model parameters
        optimizer.step()
        
    all_epoch_loss = torch.tensor(epoch_losses)
    epoch_loss = torch.mean(all_epoch_loss)
    
    return epoch_loss

In [446]:
def test_model():
    # Put model in evaluation mode
    model.eval()

    # Loop over the test data
    for batch in test_loader:
        # Prepare the input and output tensors for the current batch
        batchX = torch.tensor(batch[0], dtype=torch.float32)
        batchY = torch.tensor(batch[1], dtype=torch.float32)

        # Make predictions
        predictions = model(batchX)

        # Apply threshold
        # TODO: HYPERPARAM
        predictions = torch.where(predictions > 0.75, 1, 0)

        correct_pred_rows = batchX[(predictions.squeeze() == 1) & (batchY.squeeze() == 1)] # find all rows where we make a correct win prediction

        incorrect_pred_rows = batchX[(predictions.squeeze() == 1) & (batchY.squeeze() == 0)] # find all rows where we make an incorrect win prediction

        p_l, correct_pred_percet, ev_per_bet, amt_total_bets = calc_epoch_stats(correct_pred_rows, incorrect_pred_rows)

    # Return the results
    return p_l, correct_pred_percet, ev_per_bet, amt_total_bets

In [447]:
# Calls the train function for each of our epochs, prints the running results
for epoch in range(num_epochs):
    ep_result = train()
    p_l, correct_pred_percet, ev_per_bet, amt_total_bets = test_model()
    
    print(f'Epoch : {epoch + 1}, Loss: {ep_result}, WP Acc: {correct_pred_percet: 2%}, WPs: {amt_total_bets}, EV: {ev_per_bet}, P_L: {p_l}') 

Epoch : 1, Loss: 1.5318089723587036, WP Acc:  59.090909%, WPs: 44, EV: 16.29769000769327, P_L: 717.0983603385039
Epoch : 2, Loss: 1.1831661462783813, WP Acc:  62.857143%, WPs: 35, EV: 158.88612857711524, P_L: 5561.014500199033
Epoch : 3, Loss: 0.741929292678833, WP Acc:  72.413793%, WPs: 29, EV: 38.396216038391685, P_L: 1113.4902651133589
Epoch : 4, Loss: 0.5299798250198364, WP Acc:  84.848485%, WPs: 33, EV: 65.80093968718258, P_L: 2171.431009677025
Epoch : 5, Loss: 0.4128592014312744, WP Acc:  83.333333%, WPs: 30, EV: 207.3689766570811, P_L: 6221.069299712433
Epoch : 6, Loss: 0.33861300349235535, WP Acc:  80.645161%, WPs: 31, EV: 42.919755500695096, P_L: 1330.512420521548
Epoch : 7, Loss: 0.2918685972690582, WP Acc:  90.000000%, WPs: 30, EV: 62.97476042330685, P_L: 1889.2428126992054
Epoch : 8, Loss: 0.2564287781715393, WP Acc:  92.592593%, WPs: 27, EV: 98.18694809972385, P_L: 2651.047598692544
Epoch : 9, Loss: 0.23346322774887085, WP Acc:  95.833333%, WPs: 24, EV: 88.63225343575762, 

In [None]:
teams = test_data[:, 20:49]

In [None]:
print(teams)

In [None]:
# Get the counts for each category
counts = np.sum(teams, axis=0)

print(counts)

# Plot the distribution using a bar chart
labels = [i for i in range(30)]
plt.bar(labels, counts)
plt.title('Distribution of Categorical Data')
plt.xlabel('Categories')
plt.ylabel('Counts')
plt.show()

# Plot the distribution using a histogram
categories = np.argmax(one_hot_data, axis=1)
plt.hist(categories, bins=3, align='left', rwidth=0.5)
plt.xticks([0, 1, 2], labels)
plt.title('Distribution of Categorical Data')
plt.xlabel('Categories')
plt.ylabel('Counts')
plt.show()

In [None]:
y_test = np.array(y_test)  # convert y_test to a numpy array
counts_1 = np.sum(teams[y_test.astype(int) == 1], axis=0)
counts_0 = np.sum(teams[y_test.astype(int) == 0], axis=0)
counts = np.sum(teams, axis=0)

print(counts_1)
print(counts_0)
print(counts)



In [None]:
# plot bars in stack manner
x = [i for i in range(30)]
plt.bar(x, counts_1, color='r')
#plt.bar(x, counts_0, bottom=counts_1, color='b')
plt.show()