In [None]:
import warnings # type: ignore
warnings.filterwarnings('ignore') # type: ignore
from datetime import date, timedelta # type: ignore
from dotenv import load_dotenv # type: ignore
import h2o # type: ignore
from h2o.automl import H2OAutoML # type: ignore
from IPython.display import display # type: ignore
import matplotlib.pyplot as plt # type: ignore
import numpy as np # type: ignore
import os # type: ignore
import pandas as pd # type: ignore
import random # type: ignore
from sklearn.metrics import root_mean_squared_error # type: ignore
from sklearn.model_selection import TimeSeriesSplit # type: ignore
from sklearn.preprocessing import MinMaxScaler # type: ignore
from sqlalchemy import create_engine # type: ignore
# import tensorflow as tf # type: ignore
# from tensorflow.keras import Model # type: ignore
# from tensorflow.keras.callbacks import EarlyStopping # type: ignore
# from tensorflow.keras.layers import Dense, Input # type: ignore

# Start (or attach to) the H2O cluster that the AutoML run further down uses.
h2o.init()

# The database password is read from the environment (optionally via a .env file).
load_dotenv()
SQL_PASS = os.getenv('SQL_PASS')
engine = create_engine(f"mysql+pymysql://root:{SQL_PASS}@localhost:3306/daily_lockz")

# Run configuration.
TEST = 1  # day offset added to today's date to pick the slate being processed
SIM = 99_900_000  # sample count; only referenced by the commented-out simulation code
DATE = date.today() + timedelta(days=TEST)
# DATE encoded as the integer YYYYMMDD.
DATESEED = DATE.year * 10000 + DATE.month * 100 + DATE.day
print(f"{DATESEED}\n")

# Fixed seeds for Python's and NumPy's global random generators.
random.seed(0)
np.random.seed(0)

# Treat +/-inf as missing in pandas NA handling, and allow long frames to print in full.
# NOTE(review): 'mode.use_inf_as_na' is deprecated in recent pandas releases — confirm
# against the installed version before upgrading.
pd.set_option('mode.use_inf_as_na', True)
pd.set_option('display.max_rows', 10000)

# Empty results table plus the schedule rows for the target date.
simulations = pd.DataFrame(
    columns=[
        'sport', 'home_team', 'away_team', 'h_score', 'a_score',
        'h_logo', 'a_logo', 'win_prob', 'implied_odds', 'time',
    ]
)
games = pd.read_csv("./schedules/mlb_schedule.csv")
todays_games = games.loc[games['Date'] == str(DATE)]

print(f"{len(todays_games)} GAMES")

# Historical game rows, ordered by date with a fresh 0..n-1 index.
df = pd.read_sql("SELECT * FROM mlb_games", engine, index_col='index')
df = df.sort_values(by='date', ignore_index=True)
df = df.drop(columns=['index_opp'])

# The spread of 'total' is measured before missing values are replaced with 0.
std = df['total'].std()
df = df.fillna(0)

# Report the most recent date in the data; the row count is halved for the game count
# (presumably one row per team per game — verify against the table).
last_update = df['date'].iat[-1]
num_games = int((df['date'] == str(last_update)).sum()) // 2
print(f"LAST GAMES UPDATE: {last_update} ({num_games} GAMES)")
        
def add_targets(group):
    """Add a ``total_target`` column holding the next row's ``total``.

    The frame is modified in place and also returned. The last row has no
    following row, so its ``total_target`` is NaN.
    """
    next_total = group['total'].shift(periods=-1)
    group['total_target'] = next_total
    return group
        
# Label every row with its team's next-game total.
df = df.groupby('team', group_keys=False).apply(add_targets)
# Rows with no following game (each team's latest row) get the sentinel -0.1, which the
# training step later uses to exclude them. A single .loc assignment is used instead of
# chained indexing (df[col][mask] = ...), which can write to a temporary copy and does
# nothing under pandas Copy-on-Write.
df.loc[pd.isnull(df['total_target']), 'total_target'] = -0.1
df['total_target'] = df['total_target'].astype(float, errors='ignore')

# Identifier/label columns are left unscaled; every other column is a model input.
removed_columns = ['date','team','team_opp','logo','logo_opp','total_target']
selected_columns = df.columns[~df.columns.isin(removed_columns)]

# Rescale each selected column to the [0, 1] range (MinMaxScaler default).
scaler = MinMaxScaler()
df[selected_columns] = scaler.fit_transform(df[selected_columns])

def shift_col(team, col_name):
    """Return ``team[col_name]`` shifted up one row.

    Each position receives the value from the following row; the final
    position has no successor and is filled with a missing value.
    """
    return team[col_name].shift(periods=-1)
    
def add_col(df, col_name):
    """Return, for every row, the value of ``col_name`` in the same team's next row.

    Rows are grouped by the ``team`` column and ``col_name`` is shifted up by
    one within each group, so the last row of every team gets a missing value.

    The built-in grouped shift is used instead of ``groupby.apply`` with a
    per-group function: it always yields a Series labelled with the original
    index (``apply`` can return a one-row DataFrame when only one team is
    present), and it avoids calling Python code once per group.

    Parameters:
        df: frame containing a ``team`` column and ``col_name``.
        col_name: name of the column to look ahead in.

    Returns:
        A Series aligned to ``df.index``, suitable for assigning as a new column.
    """
    return df.groupby('team')[col_name].shift(-1)
        
# Look-ahead columns: for each row, the same team's next home flag, opponent and date.
df['home_next'] = add_col(df, 'home')
df['team_opp_next'] = add_col(df, 'team_opp')
df['date_next'] = add_col(df, 'date')

# Parse both date columns so they can be subtracted.
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
df['date_next'] = pd.to_datetime(df['date_next'], format='%Y%m%d')
# 1 when the next game is at most one day away from this one, else 0
# (a missing next date compares as False and therefore becomes 0).
df['is_b2b'] = (df['date_next'] - df['date']).dt.days.abs().le(1).astype(int)

# Values written to 'home_next' for the (home, away) side of an upcoming game.
home_next = (1, 0)

# Yesterday's schedule is the same for every game on the slate, so it is filtered once
# here instead of on each loop iteration, and reduced to a set for membership checks.
yesterdays_games = games[games['Date'] == str(DATE - timedelta(days=1))]
played_yesterday = set(yesterdays_games['home_team']) | set(yesterdays_games['away_team'])

for i, game in todays_games.iterrows():
    full_h_team = game['home_team']
    full_a_team = game['away_team']
    # The text before the first ':' in the schedule name is what gets matched against df['team'].
    home_team = game['home_team'].split(':')[0]
    away_team = game['away_team'].split(':')[0]

    # A team is on a back-to-back when its full schedule name appears in yesterday's games.
    if full_h_team in played_yesterday:
        h_b2b = 1
        print(f"{home_team} IS PLAYING A B2B")
    else:
        h_b2b = 0
    if full_a_team in played_yesterday:
        a_b2b = 1
        print(f"{away_team} IS PLAYING A B2B")
    else:
        a_b2b = 0

    # Rows whose 'home_next' is still missing have no following game in df; fill their
    # look-ahead columns with the upcoming game's details for the home side...
    null_indices = df[(df['team'] == home_team) & (df['home_next'].isnull())].index
    for idx in null_indices:
        df.at[idx, 'home_next'] = home_next[0]
        df.at[idx, 'team_opp_next'] = away_team
        df.at[idx, 'date_next'] = DATE
        df.at[idx, 'is_b2b'] = h_b2b
    # ...and for the away side.
    null_indices = df[(df['team'] == away_team) & (df['home_next'].isnull())].index
    for idx in null_indices:
        df.at[idx, 'home_next'] = home_next[1]
        df.at[idx, 'team_opp_next'] = home_team
        df.at[idx, 'date_next'] = DATE
        df.at[idx, 'is_b2b'] = a_b2b

# Multipliers applied to each rolling window's min_periods:
# period[0] is used by the 2/4/8 windows, period[1] by the 16/32 and matchup windows.
period = (1, 1)

# One input slice per window size: the scaled feature columns plus the grouping key.
df_rolling2 = df[[*selected_columns, 'team']]
df_rolling4 = df[[*selected_columns, 'team']]
df_rolling8 = df[[*selected_columns, 'team']]
df_rolling16 = df[[*selected_columns, 'team']]
df_rolling32 = df[[*selected_columns, 'team']]

# The matchup slice additionally carries the upcoming opponent, its second grouping key.
df_matchup = df[[*selected_columns, 'team', 'team_opp_next']]

def find_team_averages2(team):
    """Return the 2-row rolling mean of the numeric columns of ``team``.

    ``min_periods`` is the window size scaled by the module-level ``period[0]``;
    windows holding fewer observations than that produce NaN.
    """
    window = 2
    required = window * period[0]
    return team.rolling(window, min_periods=required).mean(numeric_only=True)

# 2-row rolling means computed separately for each team; every output column is then
# renamed with a "_2" suffix so it stays distinct when joined back onto df.
df_rolling2 = df_rolling2.groupby(['team'], group_keys=False).apply(find_team_averages2)
rolling_cols2 = [f"{col}_2" for col in df_rolling2.columns]
df_rolling2 = df_rolling2.set_axis(rolling_cols2, axis=1)

def find_team_averages4(team):
    """Return the 4-row rolling mean of the numeric columns of ``team``.

    ``min_periods`` is the window size scaled by the module-level ``period[0]``;
    windows holding fewer observations than that produce NaN.
    """
    window = 4
    required = window * period[0]
    return team.rolling(window, min_periods=required).mean(numeric_only=True)
            
# 4-row rolling means computed separately for each team; every output column is then
# renamed with a "_4" suffix so it stays distinct when joined back onto df.
df_rolling4 = df_rolling4.groupby(['team'], group_keys=False).apply(find_team_averages4)
rolling_cols4 = [f"{col}_4" for col in df_rolling4.columns]
df_rolling4 = df_rolling4.set_axis(rolling_cols4, axis=1)

def find_team_averages8(team):
    """Return the 8-row rolling mean of the numeric columns of ``team``.

    ``min_periods`` is the window size scaled by the module-level ``period[0]``;
    windows holding fewer observations than that produce NaN.
    """
    window = 8
    required = window * period[0]
    return team.rolling(window, min_periods=required).mean(numeric_only=True)
    
# 8-row rolling means computed separately for each team; every output column is then
# renamed with a "_8" suffix so it stays distinct when joined back onto df.
df_rolling8 = df_rolling8.groupby(['team'], group_keys=False).apply(find_team_averages8)
rolling_cols8 = [f"{col}_8" for col in df_rolling8.columns]
df_rolling8 = df_rolling8.set_axis(rolling_cols8, axis=1)

def find_team_averages16(team):
    """Return the 16-row rolling mean of the numeric columns of ``team``.

    ``min_periods`` is the window size scaled by the module-level ``period[1]``;
    windows holding fewer observations than that produce NaN.
    """
    window = 16
    required = window * period[1]
    return team.rolling(window, min_periods=required).mean(numeric_only=True)
    
# 16-row rolling means computed separately for each team; every output column is then
# renamed with a "_16" suffix so it stays distinct when joined back onto df.
df_rolling16 = df_rolling16.groupby(['team'], group_keys=False).apply(find_team_averages16)
rolling_cols16 = [f"{col}_16" for col in df_rolling16.columns]
df_rolling16 = df_rolling16.set_axis(rolling_cols16, axis=1)

def find_team_averages32(team):
    """Return the 32-row rolling mean of the numeric columns of ``team``.

    ``min_periods`` is the window size scaled by the module-level ``period[1]``;
    windows holding fewer observations than that produce NaN.
    """
    window = 32
    required = window * period[1]
    return team.rolling(window, min_periods=required).mean(numeric_only=True)
    
# 32-row rolling means computed separately for each team; every output column is then
# renamed with a "_32" suffix so it stays distinct when joined back onto df.
df_rolling32 = df_rolling32.groupby(['team'], group_keys=False).apply(find_team_averages32)
rolling_cols32 = [f"{col}_32" for col in df_rolling32.columns]
df_rolling32 = df_rolling32.set_axis(rolling_cols32, axis=1)

def find_team_matchup(team):
    """Return the 2-row rolling mean of the numeric columns of ``team``.

    Intended for frames already restricted to one (team, upcoming opponent)
    pair. ``min_periods`` is the window size scaled by the module-level
    ``period[1]``; windows holding fewer observations than that produce NaN.
    """
    window = 2
    required = window * period[1]
    return team.rolling(window, min_periods=required).mean(numeric_only=True)
        
# 2-row rolling means computed within each (team, upcoming opponent) pair; every output
# column is then renamed with a "_matchup" suffix so it stays distinct when joined onto df.
df_matchup = df_matchup.groupby(['team','team_opp_next'], group_keys=False).apply(find_team_matchup)
matchup_cols = [f"{col}_matchup" for col in df_matchup.columns]
df_matchup = df_matchup.set_axis(matchup_cols, axis=1)

# Append each rolling-window feature frame to df column-wise; rows align on the index.
df = pd.concat([df, df_rolling2], axis=1)
df = pd.concat([df, df_rolling4], axis=1)
df = pd.concat([df, df_rolling8], axis=1)
df = pd.concat([df, df_rolling16], axis=1)
df = pd.concat([df, df_rolling32], axis=1)

# Same for the per-(team, upcoming opponent) matchup features.
df = pd.concat([df, df_matchup], axis=1)
# Keep only fully populated rows, e.g. this removes rows that lack a complete
# rolling window (their rolling means are NaN).
df = df.dropna()
# Self-merge: each row (left) is paired with the row (right) whose upcoming opponent is
# the left row's team on the same next date, which brings in the opponent's rolling and
# matchup features. Column names present on both sides get pandas' default _x/_y suffixes.
full = df.merge(df[rolling_cols2 + rolling_cols4 + rolling_cols8 + rolling_cols16 + rolling_cols32 + matchup_cols +
                ['team_opp_next','date_next','team']], 
                left_on=['team','date_next'], 
                right_on=['team_opp_next','date_next'])
print(f"COLUMNS USED: {len(full.columns)}")

# Model inputs: every float column of the merged frame that is not in the earlier
# exclusion list (identifiers and the target).
removed_columns = list(full.columns[full.dtypes != 'float']) + removed_columns
selected_columns = full.columns[~full.columns.isin(removed_columns)]

# Rows carrying the -0.1 sentinel have no known next-game total, so they are not trained on.
train_df = full[full['total_target'] != -0.1]
X = train_df[selected_columns]
y = train_df['total_target']

# Train H2O AutoML on features + target with at most 4 models and a fixed seed.
# NOTE(review): max_runtime_secs=0 is understood to mean "no time limit" — confirm in the H2O docs.
aml = H2OAutoML(max_runtime_secs=0, max_models=4, seed=0)
hf_train = h2o.H2OFrame(pd.concat([X, y], axis=1))
aml.train(y='total_target', training_frame=hf_train)

# test_predictions = aml.predict(test_df[selected_columns])
# y_true = test_df['total_target'].values
# y_pred = test_predictions[0]
# rmse = root_mean_squared_error(y_true, y_pred)
# print(f"Test Accuracy: {rmse:.5f}")
# plt.figure(figsize=(3,1))
# plt.plot(y_true)
# plt.plot(y_pred)
# plt.show()

# predictions = model.predict(full[selected_columns])
# full['total_pred'] = predictions[0]
# full['spread_pred'] = predictions[1]
# full['total_score_pred'] = predictions[2]

# for i, game in todays_games.iterrows():
#     home_team = game['home_team'].split(':')[0]
#     away_team = game['away_team'].split(':')[0]

#     try:
#         home_logo = df[df['team'] == home_team]['logo'].iat[-1]
#         away_logo = df[df['team'] == away_team]['logo'].iat[-1]
#     except:
#         print(f"\n\n\nError on teams {home_team} & {away_team}\n\n\n")
#         continue

#     scores = full[((full['team_x'] == home_team) & (full['total_target'] == -0.1)) |
#                 ((full['team_x'] == away_team) & (full['total_target'] == -0.1))
#                 ].reset_index()
#     print(scores[['total_pred','spread_pred','total_score_pred','team_x']])

#     try:
#         if scores['total_pred'].iloc[0] >= scores['total_pred'].iloc[1]:
#             win = 0
#             lose = 1
#         else:
#             win = 1
#             lose = 0
#     except:
#         print(f"\n\n\nNOT ENOUGH DATA FOR: {home_team} & {away_team}\n\n\n")
#         continue
#     spread = scores['total_pred'].iloc[win] - scores['total_pred'].iloc[lose]
#     total_score = scores['total_pred'].iloc[win] + scores['total_pred'].iloc[lose]

#     spread += scores['spread_pred'].iloc[win]
#     spread -= scores['spread_pred'].iloc[lose]
#     if SPORT != 'soccer':
#         spread /= np.sqrt(5)
#     else:
#         spread /= 3

#     total_score += scores['total_score_pred'].iloc[win]
#     total_score += scores['total_score_pred'].iloc[lose]
#     total_score /= 3
                
#     win_score = (total_score + spread) / 2
#     lose_score = (total_score - spread) / 2 
#     if win_score < 0: win_score = 0
#     if lose_score < 0: lose_score = 0
#     win_team = scores['team_x'].iloc[win]
#     lose_team = scores['team_x'].iloc[lose]

#     win_mean = np.random.normal(win_score, std, SIM)
#     lose_mean = np.random.normal(lose_score, std, SIM)
#     if SPORT == 'soccer':
#         win_prob = min(np.mean(win_mean > lose_mean), 0.99999)
#         lose_prob = np.mean(lose_mean > win_mean)
#         sport = game['League'].replace('-', ' ')
#     else:
#         win_prob = min(np.mean(win_mean > lose_mean) + (np.mean(win_mean == lose_mean) / 2), 0.99999)
#         lose_prob = 1 - win_prob
#         sport = SPORT.upper()
#     if sport == 'NCAAB':
#         sport += (' ' + game['Conference'])

#     implied_odds = 100 / (win_prob * 100)
#     if implied_odds >= 2:
#         implied_odds = (implied_odds - 1) * 100
#     else:
#         implied_odds = -100 / (implied_odds - 1)

#     scores['total_pred'].iloc[win] = win_score
#     scores['total_pred'].iloc[lose] = lose_score
#     scores['win_prob'] = 0
#     scores['win_prob'].iloc[win] = win_prob
#     scores['win_prob'].iloc[lose] = lose_prob
#     display(scores[['total_pred','win_prob','team_x']])         

#     simulations.at[i, 'sport'] = sport
#     simulations.at[i, 'win_prob'] = round(win_prob * 100, 3)
#     simulations.at[i, 'implied_odds'] = round(implied_odds)

#     simulations.at[i, 'home_team'] = game['home_team'].split(':')[-1]
#     simulations.at[i, 'away_team'] = game['away_team'].split(':')[-1]
#     simulations.at[i, 'h_logo'] = home_logo
#     simulations.at[i, 'a_logo'] = away_logo
#     simulations.at[i, 'time'] = game['time']

#     if win_team == home_team:
#         simulations.at[i, 'h_score'] = win_score
#         simulations.at[i, 'a_score'] = lose_score
#     elif lose_team == home_team:
#         simulations.at[i, 'a_score'] = win_score
#         simulations.at[i, 'h_score'] = lose_score
    
# simulations['datetime'] = pd.to_datetime(simulations['time'], format='%I:%M%p')
# simulations = simulations.sort_values(by=['datetime','home_team']).reset_index(drop=True)
# simulations = simulations.drop(columns=['datetime'])

# np.random.seed(DATESEED)
# simulations['is_dl'] = False
# if SPORT == 'mlb' or SPORT == 'nhl' or SPORT == 'soccer':
#     dl = 1
# else:
#     dl = 2
# if len(simulations) >= dl:
#     dl_indices = np.random.choice(simulations.index, size=dl, replace=False)
# else:
#     dl_indices = simulations.index
# simulations.loc[dl_indices, 'is_dl'] = True

# if SPORT == 'soccer':
#     mls_games = simulations[simulations['sport'] == 'United States Major League Soccer']
#     if not mls_games.empty:
#         mls_index = np.random.choice(mls_games.index, size=1, replace=False)
#         simulations.loc[mls_index, 'is_dl'] = True

# display(simulations)

# simulations = simulations.dropna()

In [None]:
# Bare expression so the notebook renders the AutoML leaderboard as this cell's output.
aml.leaderboard