In [115]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [116]:
import pandas as pd

file_path = '/content/drive/My Drive/Sports Modeling/mlb/cleaned_over_under.csv'
df = pd.read_csv(file_path)

In [117]:
print(df.columns.tolist())

['Date', 'home_team', 'away_team', 'final_home', 'final_away', 'line', 'home_odds', 'away_odds']


In [118]:
# Get the count of null values in each column
null_counts = df.isnull().sum()

# Filter the columns with null values
null_counts = null_counts[null_counts > 0]

# Display the columns with their respective null value counts
print(null_counts)

home_odds    1
away_odds    1
dtype: int64


In [119]:
# Remove rows with any null values
df = df.dropna()

In [120]:
# rolling window (last 5 games)
window_size = 5

# 1. Feature: Recent Performance Based on Odds

# Rolling average of home and away odds
df['home_odds_recent_avg'] = df.groupby('home_team')['home_odds'].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
df['away_odds_recent_avg'] = df.groupby('away_team')['away_odds'].transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())

# Difference in odds (home vs. away) - This can indicate the relative strength as per recent odds
df['odds_difference'] = df['home_odds_recent_avg'] - df['away_odds_recent_avg']

# 2. Feature: Probability of Winning Based on Previous Records

# Initialize win/loss record trackers for home and away teams
df['home_win'] = (df['final_home'] > df['final_away']).astype(int)
df['home_team_wins'] = df.groupby('home_team')['home_win'].cumsum() - df['home_win']
df['away_team_wins'] = df.groupby('away_team')['home_win'].transform(lambda x: (1 - x).cumsum()) - (1 - df['home_win'])

# Create Target Variable (1 for over, 0 for under)
df['total_score'] = df['final_home'] + df['final_away']
df['over_under'] = (df['total_score'] > df['line']).astype(int)

# Total games played (for calculating win percentage)
df['home_team_games'] = df.groupby('home_team').cumcount()
df['away_team_games'] = df.groupby('away_team').cumcount()

# Win percentages for home and away teams
df['home_win_percentage'] = df['home_team_wins'] / df['home_team_games'].replace(0, np.nan)
df['away_win_percentage'] = df['away_team_wins'] / df['away_team_games'].replace(0, np.nan)

# Fill any NaNs generated by divisions (e.g., no games played yet)
df['home_win_percentage'].fillna(0.5, inplace=True)  # Assuming 50% win chance if no history
df['away_win_percentage'].fillna(0.5, inplace=True)

# 3. Combine the win percentages into a single feature that represents relative strength
df['win_prob_difference'] = df['home_win_percentage'] - df['away_win_percentage']

# 4. Remove columns that won't be used in the model
df = df.drop(columns=['home_team_wins', 'away_team_wins', 'home_team_games', 'away_team_games'])

# Display the updated DataFrame with the new features
print(df.head())


   Date home_team away_team  final_home  final_away  line  home_odds  \
0   404       BOS       NYY           9           7   9.0     -104.0   
1   405       WAS       PHI           1          11   7.5     -120.0   
2   405       NYM       MIA           7           1   7.0      105.0   
3   405       CIN       STL           6          11   7.5     -115.0   
4   405       PIT       LOS          11           5   8.5     -110.0   

   away_odds  home_odds_recent_avg  away_odds_recent_avg  odds_difference  \
0     -116.0                -104.0                -116.0             12.0   
1      100.0                -120.0                 100.0           -220.0   
2     -125.0                 105.0                -125.0            230.0   
3     -105.0                -115.0                -105.0            -10.0   
4     -110.0                -110.0                -110.0              0.0   

   home_win  total_score  over_under  home_win_percentage  \
0         1           16           1       

In [121]:
from sklearn.metrics import accuracy_score, classification_report

# 2. Convert Date Column
# Normalize the Date
min_date = df['Date'].min()
max_date = df['Date'].max()
df['normalized_date'] = (df['Date'] - min_date) / (max_date - min_date)

# Time Difference Feature
df['time_diff'] = df['Date'] - min_date

# 3. Encode Categorical Variables (home_team, away_team)
df = pd.get_dummies(df, columns=['home_team', 'away_team'])

# 4. Prepare Features and Target
X = df.drop(columns=['Date', 'final_home', 'final_away', 'home_win', 'total_score'])  # Drop non-predictive columns
y = df['home_win']

In [122]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Building a deeper neural network with more layers and batch normalization
winner_model = Sequential()

# Input layer and first hidden layer
winner_model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
winner_model.add(BatchNormalization())
winner_model.add(Dropout(0.5))

# Second hidden layer
winner_model.add(Dense(64, activation='relu'))
winner_model.add(BatchNormalization())
winner_model.add(Dropout(0.5))

# Third hidden layer
winner_model.add(Dense(32, activation='relu'))
winner_model.add(BatchNormalization())
winner_model.add(Dropout(0.5))

# Fourth hidden layer
winner_model.add(Dense(16, activation='relu'))

# Output layer
winner_model.add(Dense(1, activation='sigmoid'))

# Compile the model with a potentially lower learning rate
winner_model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = winner_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model on test data
loss, accuracy = winner_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.4759 - loss: 0.9916 - val_accuracy: 0.5092 - val_loss: 0.7151
Epoch 2/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4987 - loss: 0.8381 - val_accuracy: 0.5280 - val_loss: 0.7062
Epoch 3/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5046 - loss: 0.7764 - val_accuracy: 0.5292 - val_loss: 0.7027
Epoch 4/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5099 - loss: 0.7507 - val_accuracy: 0.5250 - val_loss: 0.7007
Epoch 5/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5126 - loss: 0.7343 - val_accuracy: 0.5302 - val_loss: 0.6989
Epoch 6/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5013 - loss: 0.7375 - val_accuracy: 0.5272 - val_loss: 0.6978
Epoch 7/10
[1m512/512[0m 

In [123]:
# Prepare Features and Target
X = df.drop(columns=['over_under'])  # Features
y = df['over_under']  # Target

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train_ou, X_test_ou, y_train_ou, y_test_ou = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Check the number of features in the training data
input_dim = X_train_ou.shape[1]

# Step 2: Build the Deep Learning Model

ou_model = Sequential()

# Input layer and first hidden layer
ou_model.add(Dense(64, input_dim=input_dim, activation='relu'))
ou_model.add(Dropout(0.5))  # Adding dropout for regularization

# Second hidden layer
ou_model.add(Dense(32, activation='relu'))
ou_model.add(Dropout(0.5))  # Adding dropout for regularization

# Third hidden layer (optional, add more layers for deeper networks)
ou_model.add(Dense(16, activation='relu'))

# Output layer (since this is binary classification)
ou_model.add(Dense(1, activation='sigmoid'))

# Compile the model
ou_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Train the Model

history = ou_model.fit(X_train_ou, y_train_ou, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

# Step 4: Evaluate the Model

# Evaluate on the test data
loss, accuracy = ou_model.evaluate(X_test_ou, y_test_ou)
print(f"Test Accuracy: {accuracy:.2f}")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.5972 - loss: 0.6972 - val_accuracy: 0.9433 - val_loss: 0.1789
Epoch 2/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.9046 - loss: 0.2344 - val_accuracy: 0.9707 - val_loss: 0.0839
Epoch 3/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9466 - loss: 0.1314 - val_accuracy: 0.9819 - val_loss: 0.0587
Epoch 4/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9639 - loss: 0.0905 - val_accuracy: 0.9866 - val_loss: 0.0492
Epoch 5/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9759 - loss: 0.0649 - val_accuracy: 0.9934 - val_loss: 0.0382
Epoch 6/10
[1m512/512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9830 - loss: 0.0605 - val_accuracy: 0.9919 - val_loss: 0.0334
Epoch 7/10
[1m512/512[0m [32m━━━━━━━

In [124]:
def make_predictions_with_confidence(model, X_test, threshold=0.5):
    # Predict probabilities for the test data
    predictions = model.predict(X_test)

    # Predict classes based on the threshold
    predicted_classes = (predictions > threshold).astype(int)

    # Calculate confidence scores
    confidence_scores = predictions if threshold == 0.5 else np.abs(predictions - threshold)

    # Convert confidence scores to percentages
    confidence_scores = confidence_scores * 100

    return predicted_classes, confidence_scores

# Making predictions for the winner model
predicted_winner, confidence_winner = make_predictions_with_confidence(winner_model, X_test)

# Making predictions for the over/under model
predicted_ou, confidence_ou = make_predictions_with_confidence(ou_model, X_test_ou)


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [125]:
# Data for Winner Prediction
data_winner = {
    'Game': [f"Game {i+1}" for i in range(len(predicted_winner))],
    'Winner Prediction': ['Home' if pred == 1 else 'Away' for pred in predicted_winner],
    'Winner Confidence': confidence_winner.flatten()
}

# Data for Over/Under Prediction
data_ou = {
    'Game': [f"Game {i+1}" for i in range(len(predicted_ou))],
    'Over/Under Prediction': ['Over' if pred == 1 else 'Under' for pred in predicted_ou],
    'Over/Under Confidence': confidence_ou.flatten()
}

# Convert the data dictionaries into DataFrames
confidence_df_winner = pd.DataFrame(data_winner)
confidence_df_ou = pd.DataFrame(data_ou)

In [126]:
confidence_df_winner

Unnamed: 0,Game,Winner Prediction,Winner Confidence
0,Game 1,Home,57.569916
1,Game 2,Home,57.560833
2,Game 3,Home,63.504803
3,Game 4,Home,59.206711
4,Game 5,Home,55.743427
...,...,...,...
5114,Game 5115,Home,57.216423
5115,Game 5116,Home,55.661552
5116,Game 5117,Home,53.225899
5117,Game 5118,Home,50.021519


In [127]:
confidence_df_ou

Unnamed: 0,Game,Over/Under Prediction,Over/Under Confidence
0,Game 1,Over,9.999993e+01
1,Game 2,Over,1.000000e+02
2,Game 3,Over,1.000000e+02
3,Game 4,Under,3.288095e-12
4,Game 5,Under,3.213149e-20
...,...,...,...
5114,Game 5115,Over,1.000000e+02
5115,Game 5116,Over,9.871185e+01
5116,Game 5117,Under,3.418753e-04
5117,Game 5118,Under,1.199632e-09
