In [1]:
for rou in range(6):
    import warnings # type: ignore
    warnings.filterwarnings('ignore') # type: ignore
    from bs4 import BeautifulSoup # type: ignore
    from datetime import date, timedelta # type: ignore
    from dotenv import load_dotenv # type: ignore
    import numpy as np # type: ignore
    import os # type: ignore
    import pandas as pd # type: ignore
    from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeout # type: ignore
    import random # type: ignore
    from sklearn.model_selection import TimeSeriesSplit # type: ignore
    from sklearn.preprocessing import MinMaxScaler # type: ignore
    from sqlalchemy import create_engine # type: ignore
    import tensorflow as tf # type: ignore
    from tensorflow.keras import Model # type: ignore
    from tensorflow.keras.callbacks import EarlyStopping # type: ignore
    from tensorflow.keras.layers import Dense, Input # type: ignore
    import time # type: ignore

    # SCHEDULE holds the matchups of the round being predicted (it is rewritten
    # at the end of every round); FINAL_BRACKET accumulates every round's matchups.
    SCHEDULE = './bracket.csv'
    FINAL_BRACKET = './predicted_bracket.csv'
    if rou == 0:
        # Start a full run with a fresh predicted bracket. Removal is best-effort:
        # a missing (or otherwise unremovable) file is not an error here, but only
        # filesystem errors are suppressed -- not KeyboardInterrupt or real bugs.
        try:
            # os.remove(SCHEDULE)
            os.remove(FINAL_BRACKET)
        except OSError:
            pass

    async def get_html(url, selector, sleep=5, retries=5):
        """Load ``url`` in a WebKit browser and return the inner HTML of ``selector``.

        Args:
            url: Page to open.
            selector: Selector of the element whose inner HTML is returned.
            sleep: Base delay in seconds; attempt ``k`` waits ``sleep * k`` seconds
                before it starts (the first attempt waits too).
            retries: Maximum number of attempts.

        Returns:
            The element's inner HTML as a string, or ``None`` when the page title
            does not look like a valid Sports-Reference college basketball page or
            when every attempt ended in a Playwright timeout.
        """
        # Local import: the coroutine must yield while waiting instead of
        # blocking the running event loop with time.sleep().
        import asyncio

        html = None
        for attempt in range(1, retries + 1):
            # Linearly growing pause before each attempt.
            await asyncio.sleep(sleep * attempt)
            try:
                async with async_playwright() as p:
                    browser = await p.webkit.launch()
                    page = await browser.new_page()
                    await page.goto(url)
                    # Fetch the title once and reuse it for both checks and the log line.
                    title = str(await page.title())
                    if 'College Basketball at Sports-Reference.com' not in title or '404 error' in title:
                        print('INVALID PAGE')
                        return None
                    print(title)
                    html = await page.inner_html(selector)
            except PlaywrightTimeout:
                # Only timeouts are retried; any other error propagates.
                print(f"Timeout error on {url}")
                continue
            else:
                break
        return html

    def get_schedule(html):
        """Extract first-round matchups from bracket HTML and write them to CSV.

        Reads the first four ``div#bracket`` elements inside ``div#brackets``,
        takes the first 16 school links of each and pairs them consecutively as
        (home_team, away_team). Every entry is stored as ``"<slug>:<name>"``, where
        the slug is the part of the link between ``schools/`` and ``/men``.

        The table is written to both SCHEDULE and FINAL_BRACKET: created with a
        header when SCHEDULE does not exist yet, otherwise appended without one.
        """
        matchups = pd.DataFrame()
        page = BeautifulSoup(html)
        regions = page.find('div', {'id':'brackets'}).find_all('div', id='bracket')

        row = 0
        for region in regions[:4]:
            anchors = region.find_all('a', href=lambda href: href and 'schools' in href)[:16]
            entries = [
                f"{a['href'].split('schools/')[-1].split('/men')[0]}:{a.text.strip()}"
                for a in anchors
            ]
            # Consecutive entries form one game; a trailing unpaired entry is ignored.
            for pair in range(len(entries) // 2):
                matchups.at[row, 'home_team'] = entries[2 * pair]
                matchups.at[row, 'away_team'] = entries[2 * pair + 1]
                row += 1

        # The existence of SCHEDULE alone decides between create and append for both files.
        first_write = not os.path.isfile(SCHEDULE)
        if first_write:
            print('Schedule Created')
        write_mode = 'w' if first_write else 'a'
        for destination in (SCHEDULE, FINAL_BRACKET):
            matchups.to_csv(destination, mode=write_mode, header=first_write)

    def remove_ncaa(school):
        """Return ``str(school)`` cut off at its first non-breaking space.

        Presumably this strips a tournament marker appended to the school name in
        the ratings table -- verify against the scraped data.
        """
        name, _separator, _suffix = str(school).partition('\xa0')
        return name

    if rou == 0:
        # html = await get_html('https://www.sports-reference.com/cbb/postseason/men/2024-ncaa.html', '#content')
        # get_schedule(html)
        pass

    html = await get_html('https://www.sports-reference.com/cbb/seasons/men/2025-ratings.html', '#content')
    soup = BeautifulSoup(html)
    srs = pd.read_html(str(soup), attrs={'id':'ratings'}, index_col=0)[0]
    srs.columns = srs.columns.droplevel(0)
    srs['School'] = srs['School'].apply(remove_ncaa)
    
    srs['SOS'] = pd.to_numeric(srs['SOS'], errors='coerce')
    srs['SOS'] = srs['SOS'] / 100
    
    srs['SRS'] = pd.to_numeric(srs['SRS'], errors='coerce')
    srs['SRS'] = srs['SRS'] / 100
    
    srs['NRtg'] = pd.to_numeric(srs['NRtg'], errors='coerce')
    srs['NRtg'] = srs['NRtg'] / 100
    
    srs['Pts'] = pd.to_numeric(srs['Pts'], errors='coerce')
    srs['Pts'] = srs['Pts'] / 100
    
    srs['OSRS'] = pd.to_numeric(srs['OSRS'], errors='coerce')
    srs['DSRS'] = pd.to_numeric(srs['DSRS'], errors='coerce')
    srs['ASRS'] = abs(srs['OSRS'] - srs['DSRS']) / 100

    # Database access: the MySQL root password is read from the environment
    # after loading a .env file.
    load_dotenv()
    SQL_PASS = os.getenv('SQL_PASS')
    engine = create_engine(f"mysql+pymysql://root:{SQL_PASS}@localhost:3306/daily_lockz")

    DATE = date.today()
    SIM = int(1e5)

    # Fix the seeds of Python's, NumPy's and TensorFlow's random generators.
    for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
        seed_rng(0)
    pd.set_option('mode.use_inf_as_na', True)
    pd.set_option('display.max_rows', 10000)

    simulation_fields = ('sport','home_team','away_team','h_score','a_score','h_logo','a_logo','win_prob','implied_odds','time')
    simulations = pd.DataFrame(columns=simulation_fields)
    # Matchups of the round being predicted.
    games = pd.read_csv("bracket.csv")

    # Game log: keep games before the cutoff date, label the winner from the
    # two totals, and order chronologically.
    df = pd.read_sql("SELECT * FROM ncaab_games", engine, index_col='index')
    cutoff = pd.Timestamp('2025-03-18')
    df = df[df['date'] < cutoff]
    df['winner'] = (df['total'] > df['total_opp']).astype(int)
    df = df.sort_values('date').reset_index(drop=True)
    df = df.drop(columns='index_opp')

    # The standard deviation is taken before missing values are replaced by 0.
    std = df['total'].std()
    df = df.fillna(0)

    last_update = df['date'].iloc[-1]
    latest_rows = df[df['date'] == str(last_update)]
    # Halved on the assumption that each game is stored once per team -- confirm.
    num_games = len(latest_rows) // 2
    print(f"LAST GAMES UPDATE: {last_update} ({num_games} GAMES)")
            
    def add_targets(group):
        """Add ``winner_target``: the ``winner`` value of the group's next row.

        The last row of the group has no next row and gets NaN. The group is
        modified in place and returned.
        """
        group['winner_target'] = group['winner'].shift(-1)
        return group

    df = df.groupby('team', group_keys=False).apply(add_targets)
    # Rows without a next game get the sentinel 2 (0/1 are real outcomes).
    # A single .loc assignment replaces the chained df[col][mask] = ... form,
    # which writes through an intermediate object and is not guaranteed to
    # update df itself.
    df.loc[pd.isnull(df['winner_target']), 'winner_target'] = 2
    df['winner_target'] = df['winner_target'].astype(int, errors='ignore')

    # Identifier, logo and label columns are excluded from scaling; every other
    # column is treated as a feature.
    removed_columns = ['date','team','team_opp','logo','logo_opp','winner_target']
    is_feature = ~df.columns.isin(removed_columns)
    selected_columns = df.columns[is_feature]

    # Min-max scale the feature columns, fitted on the whole table.
    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(df[selected_columns])
    df[selected_columns] = scaled_features

    def shift_col(team, col_name):
        """Return column ``col_name`` of ``team`` shifted up by one row.

        Each position receives the value of the following row; the last
        position becomes missing.
        """
        return team[col_name].shift(periods=-1)
        
    def add_col(df, col_name):
        """Return, for every row of ``df``, the next ``col_name`` value of the same team.

        Rows are grouped by ``team`` and ``shift_col`` is applied to each group.
        """
        def next_value(team_rows):
            return shift_col(team_rows, col_name)

        per_team = df.groupby('team', group_keys=False)
        return per_team.apply(next_value)
            
    # Look-ahead columns: each row learns the home flag, opponent and date of
    # the same team's next recorded game.
    for source_col, next_col in (('home', 'home_next'),
                                 ('team_opp', 'team_opp_next'),
                                 ('date', 'date_next')):
        df[next_col] = add_col(df, source_col)

    # Rows whose look-ahead is still empty get the upcoming bracket game of
    # their team filled in. The home team is processed first, then the away
    # team; both sides receive home_next = 0.
    for i, game in games.iterrows():
        home_team = game['home_team'].split(':')[0]
        away_team = game['away_team'].split(':')[0]

        for team, opponent in ((home_team, away_team), (away_team, home_team)):
            pending = df[(df['team'] == team) & (df['home_next'].isnull())].index
            for idx in pending:
                df.at[idx, 'home_next'] = 0
                df.at[idx, 'team_opp_next'] = opponent
                # One synthetic date per game: today plus the game's row label in days.
                df.at[idx, 'date_next'] = DATE + timedelta(days=i)

    def _rolling_mean_over(window):
        """Build a per-team function returning the ``window``-row rolling mean of numeric columns."""
        def averages(team):
            return team.rolling(window).mean(numeric_only=True)
        return averages

    find_team_averages2 = _rolling_mean_over(2)
    find_team_averages4 = _rolling_mean_over(4)
    find_team_averages8 = _rolling_mean_over(8)

    # Feature columns plus the grouping key; shared by all three windows.
    rolling_input = df[list(selected_columns) + ['team']]

    def _rolling_features(averager, suffix):
        """Apply ``averager`` per team and suffix the resulting columns with ``_<suffix>``."""
        frame = rolling_input.groupby(['team'], group_keys=False).apply(averager)
        names = [f"{col}_{suffix}" for col in frame.columns]
        frame.columns = names
        return frame, names

    df_rolling2, rolling_cols2 = _rolling_features(find_team_averages2, 2)
    df_rolling4, rolling_cols4 = _rolling_features(find_team_averages4, 4)
    df_rolling8, rolling_cols8 = _rolling_features(find_team_averages8, 8)

    # Attach the rolling features column-wise, one window at a time.
    for rolling_frame in (df_rolling2, df_rolling4, df_rolling8):
        df = pd.concat([df, rolling_frame], axis=1)

    # Drop every row that still contains a missing value.
    df = df.dropna()

    # Join each row with the rolling features of its next opponent: the
    # right-hand side is matched where its (team_opp_next, date_next) equals the
    # left-hand (team, date_next).
    opponent_features = df[rolling_cols2 + rolling_cols4 + rolling_cols8 +
                           ['team_opp_next','date_next','team']]
    full = df.merge(opponent_features,
                    left_on=['team','date_next'],
                    right_on=['team_opp_next','date_next'])
    print(f"COLUMNS USED: {len(full.columns)}")
    print(f"\n\n\nROUND {rou+1}\n\n\n")

    # Model inputs: every float column that is not an identifier or label.
    non_float_columns = list(full.columns[full.dtypes != 'float'])
    removed_columns = non_float_columns + removed_columns
    selected_columns = full.columns[~full.columns.isin(removed_columns)]

    # Rows carrying the sentinel target 2 have no known outcome and are
    # excluded from training.
    has_outcome = full['winner_target'] != 2
    train_df = full[has_outcome]
    X = train_df[selected_columns].values
    y_winner = train_df['winner_target'].values

    # Feed-forward binary classifier: one input per selected feature column,
    # three ReLU hidden layers (256 -> 128 -> 64) and a single sigmoid output.
    input_layer = Input(shape=(len(selected_columns),))

    layer1 = Dense(256, activation='relu')(input_layer)
    layer2 = Dense(128, activation='relu')(layer1)
    layer3 = Dense(64, activation='relu')(layer2)

    output_winner = Dense(1, activation='sigmoid', name='winner_output')(layer3)
    model = Model(inputs=input_layer, outputs=output_winner)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Five time-ordered splits. The same model instance is fitted on every
    # split, so the weights carry over from one fold to the next.
    split = TimeSeriesSplit(n_splits=5)
    # patience=25 with the callback's default monitored quantity (not overridden here).
    early_stopping = EarlyStopping(patience=25)
    for train_index, test_index in split.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y_winner[train_index], y_winner[test_index]

        # Each fold's held-out slice doubles as the validation data for early stopping.
        model.fit(X_train, y_train,
                epochs=125,
                verbose=0,
                callbacks=[early_stopping],
                validation_data=(X_test, y_test))
        
    # X_test / y_test still hold the LAST fold here, so the reported accuracy is
    # measured on the slice that also served as validation data in that fold.
    test_predictions = model.predict(X_test)
    test_predictions = (test_predictions > 0.5).astype(int)
    accuracy = np.mean(test_predictions.flatten() == y_test)
    print(f"Test Accuracy: {accuracy:.5f}")

    # Score every row of `full`, including the rows with the sentinel target 2
    # that were excluded from training.
    predictions = model.predict(full[selected_columns])
    full['winner_pred'] = predictions

    fix_count = 0
    WINNERS = []

    # Bracket display names -> the 'School' spelling looked up in the ratings
    # table. Built once; it does not change between games.
    team_convert = {
        'UConn':'Connecticut',
        'VCU':'Virginia Commonwealth',
        'Saint Mary\'s':'Saint Mary\'s (CA)',
        'USC':'Southern California',
        'Pitt':'Pittsburgh',
        'UCSB':'UC Santa Barbara',
        'UNC':'North Carolina',
        'BYU':'Brigham Young',
        'St. Peter\'s':'Saint Peter\'s'
    }

    def _apply_ratings(score, school):
        """Add the ratings-table adjustment for ``school`` to ``score``.

        Adds SOS, SRS, NRtg and Pts and subtracts ASRS divided by the 1-based
        round number. Returns ``(adjusted_score, True)``, or ``(score, False)``
        unchanged when ``school`` is not present in ``srs``.
        """
        rating = srs[srs['School'] == school]
        if rating.empty:
            return score, False
        # Read the first matching row explicitly: float() on a Series is
        # deprecated in pandas and raises when several rows match.
        score += float(rating['SOS'].iloc[0])
        score += float(rating['SRS'].iloc[0])
        score += float(rating['NRtg'].iloc[0])
        score += float(rating['Pts'].iloc[0])
        score -= float(rating['ASRS'].iloc[0]) / (rou + 1)
        return score, True

    for i, game in games.iterrows():
        home_team = game['home_team'].split(':')[0]
        away_team = game['away_team'].split(':')[0]

        # Prediction rows of both teams for their upcoming game (sentinel target 2).
        scores = full[((full['team_x'] == home_team) & (full['winner_target'] == 2)) |
                    ((full['team_x'] == away_team) & (full['winner_target'] == 2))
                    ].reset_index()
        print(scores[['winner_pred','team_x']])
        if len(scores) < 2:
            raise ValueError(
                f"Expected a prediction row for both {home_team} and {away_team}, "
                f"found {len(scores)}")

        # The team with the higher raw model output is the provisional winner.
        if scores['winner_pred'].iloc[0] >= scores['winner_pred'].iloc[1]:
            win, lose = 0, 1
        else:
            win, lose = 1, 0

        win_score = scores['winner_pred'].iloc[win]
        lose_score = scores['winner_pred'].iloc[lose]
        # Raw model margin, kept for the round-1 flip rule further down.
        raw_gap = win_score - lose_score

        # When both raw outputs are below 0.5, move the loser's score toward (or
        # past) the winner's by a factor that shrinks with the round number, and
        # skip the round-1 flip rule for this game.
        extra_adj = True
        upset = 5 / (rou + 1)
        if win_score < 0.5 and lose_score < 0.5:
            lose_score += (win_score - lose_score) * upset
            extra_adj = False

        # Switch from slugs to display names and remember which side is ahead.
        if scores['team_x'].iloc[win] == home_team:
            win_team = game['home_team'].split(':')[-1]
            lose_team = game['away_team'].split(':')[-1]
            add = 'HOME'
        else:
            win_team = game['away_team'].split(':')[-1]
            lose_team = game['home_team'].split(':')[-1]
            add = 'AWAY'

        win_team = team_convert.get(win_team, win_team)
        lose_team = team_convert.get(lose_team, lose_team)

        win_score, found = _apply_ratings(win_score, win_team)
        if not found:
            print(f"FIX {win_team}")
            fix_count += 1

        lose_score, found = _apply_ratings(lose_score, lose_team)
        if not found:
            print(f"FIX {lose_team}")
            fix_count += 1

        print(f"\n{win_team} ADJUSTED SCORE: {win_score}")
        print(f"{lose_team} ADJUESTED SCORE: {lose_score}")

        # Flip rule: in round 1 (unless the sub-0.5 branch above fired) the
        # adjusted scores may only overturn the pick when the raw margin is not
        # larger than the adjusted margin; in every other case a higher adjusted
        # loser score always overturns it.
        if extra_adj and rou == 0:
            no_change = not raw_gap > abs(win_score - lose_score)
        else:
            no_change = True
        if no_change and (lose_score > win_score):
            win_team, lose_team = lose_team, win_team
            add = 'AWAY' if add == 'HOME' else 'HOME'

        print(f"WIN TEAM: {win_team}\n")
        # Store the winner in the bracket's "<slug>:<name>" form.
        side = 'away_team' if add == 'AWAY' else 'home_team'
        WINNERS.append(f"{game[side].split(':')[0]}:{game[side].split(':')[-1]}")
    
    bracket = pd.DataFrame()
    for g in range((len(WINNERS) // 2)):
        bracket.at[g, 'home_team'] = WINNERS[2*g]
        bracket.at[g, 'away_team'] = WINNERS[2*g+1]
    bracket.to_csv(SCHEDULE)
    bracket.to_csv(FINAL_BRACKET, mode='a', header=False)      
        
    print(f"{fix_count} FIXES NEEDED")
    %reset -f

2024-25 Men's College Basketball School Ratings | College Basketball at Sports-Reference.com
LAST GAMES UPDATE: 2025-03-16 00:00:00 (3 GAMES)
COLUMNS USED: 1152



ROUND 1



[1m134/134[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488us/step
Test Accuracy: 0.68169
[1m805/805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371us/step
   winner_pred         team_x
0     0.386972  alabama-state
1     0.496719         auburn

Auburn ADJUSTED SCORE: 2.1378186450958254
Alabama State ADJUESTED SCORE: 1.3554047080993652
WIN TEAM: Auburn

   winner_pred      team_x
0     0.332754  louisville
1     0.407079   creighton

Creighton ADJUSTED SCORE: 1.6622787429809571
Louisville ADJUESTED SCORE: 2.030677770423889
WIN TEAM: Louisville

   winner_pred                team_x
0     0.094677              michigan
1     0.670500  california-san-diego

UC San Diego ADJUSTED SCORE: 1.7091000996589658
Michigan ADJUESTED SCORE: 1.4580766212582588
WIN TEAM: UC San Diego

   winner_pred    team_