In [None]:
import duckdb

import pandas as pd
import numpy as np

In [None]:
def get_season_schedule(db, year):
    """
    Load the schedule rows of a single season from ``game_features``.

    Args:
        db: Object exposing ``sql(query)`` whose result offers ``.df()``
            (this notebook passes a DuckDB connection).
        year: Season to select. It is interpolated verbatim into the SQL
            text, so only pass trusted numeric values.

    Returns:
        The result of ``.df()`` for the query: the columns Year, Week,
        Home_Team, Away_Team, Is_Neutral, Home_days_Since_Last_Game and
        Away_days_Since_Last_Game, ordered by Week, Home_Team, Away_Team.
    """
    schedule_query = f"""
        SELECT
            Year, Week, Home_Team, Away_Team,
            Is_Neutral, Home_days_Since_Last_Game, Away_days_Since_Last_Game
        FROM game_features
        WHERE Year = {year}
        ORDER BY Week, Home_Team, Away_Team
        """
    return db.sql(schedule_query).df()

def get_season_week_speads(db, year, week):
    """
    Load the spread of every game in one week of a season.

    Args:
        db: Object exposing ``sql(query)`` whose result offers ``.df()``
            (this notebook passes a DuckDB connection).
        year: Season to select; interpolated verbatim into the SQL text.
        week: Week to select; interpolated verbatim into the SQL text.

    Returns:
        The result of ``.df()`` for the query: Home_Team, Away_Team and
        Spread from ``game_features``, ordered by Home_Team, Away_Team.
    """
    spread_query = f"""
        SELECT
            Home_Team, Away_Team, Spread
        FROM game_features
        WHERE Year = {year} AND Week = {week}
        ORDER BY Home_Team, Away_Team
        """
    return db.sql(spread_query).df()

def get_season_week_rankings(db, year, week):
    """
    Load the team ranking for one week of a season.

    The rank is computed in SQL with ``ROW_NUMBER()`` over Rating in
    descending order, so rank 1 is the highest Rating and every team gets a
    distinct rank even when ratings are equal.

    Args:
        db: Object exposing ``sql(query)`` whose result offers ``.df()``
            (this notebook passes a DuckDB connection).
        year: Season to select; interpolated verbatim into the SQL text.
        week: Week to select; interpolated verbatim into the SQL text.

    Returns:
        The result of ``.df()`` for the query: Team and Rank from
        ``nfl_rankings``, ordered by Team.
    """
    ranking_query = f"""
        SELECT
            Team,
            ROW_NUMBER() OVER (PARTITION BY Year, Week ORDER BY Rating DESC) AS Rank
        FROM nfl_rankings
        WHERE Year = {year} AND Week = {week}
        ORDER BY Team
        """
    return db.sql(ranking_query).df()

def get_team_records_from_db(db, year, week):
    """
    Tally each team's record from the games played before ``week``.

    Args:
        db: Object exposing ``sql(query)`` whose result offers ``.df()``
            (this notebook passes a DuckDB connection).
        year: Season to select; interpolated verbatim into the SQL text.
        week: Exclusive upper bound on Week; interpolated verbatim.

    Returns:
        dict mapping team -> {"wins", "losses", "games_played"}. Teams with
        no game before ``week`` are absent; no rows gives an empty dict.
    """
    games = db.sql(f"""
        SELECT Home_Team, Away_Team, Home_Won
        FROM game_features
        WHERE Year = {year} AND Week < {week}
    """).df()

    records = {}
    matchups = zip(games["Home_Team"], games["Away_Team"], games["Home_Won"])
    for home_team, away_team, home_won in matchups:
        # Register the home side first, then the away side, on first sight.
        home_record = records.setdefault(
            home_team, {"wins": 0, "losses": 0, "games_played": 0}
        )
        away_record = records.setdefault(
            away_team, {"wins": 0, "losses": 0, "games_played": 0}
        )

        home_record["games_played"] += 1
        away_record["games_played"] += 1

        # NOTE(review): only the truthiness of Home_Won is used, so any falsy
        # value credits the away team and a NaN would credit the home team.
        # Confirm how ties / missing results are encoded in game_features.
        winner, loser = (
            (home_record, away_record) if home_won else (away_record, home_record)
        )
        winner["wins"] += 1
        loser["losses"] += 1

    return records



In [None]:
import cloudpickle as pickle

# Load the two pickled models and expose them both by name and through the
# `models` lookup ('full' / 'no_spread').
# NOTE: unpickling executes code stored in the file, so these paths must only
# ever point at trusted artifacts.
models = {}
with open('./models/lr_full.pkl', 'rb') as f:
    models['full'] = full_model = pickle.load(f)
with open('./models/lr_no_spread.pkl', 'rb') as f:
    models['no_spread'] = no_spread_model = pickle.load(f)

In [None]:
def process_results(path):
    """
    Summarise a results CSV by the team picked in week 1.

    Each row's ``log_prob`` is exponentiated into a probability, and the
    probabilities are grouped by the row's ``week_1`` value.

    Args:
        path: Location of a CSV with at least ``week_1`` and ``log_prob``
            columns; passed straight to ``pd.read_csv``.

    Returns:
        DataFrame with columns First_Team, Avg_Prob, Std_Prob, Paths and
        Sum_Prob, sorted by Sum_Prob descending. Index labels are the
        pre-sort group positions (the index is not reset after sorting).
    """
    results = pd.read_csv(path)

    # Pair every row's week-1 pick with its log-probability.
    first_picks = pd.DataFrame(
        list(zip(results['week_1'], results['log_prob'])),
        columns=['First_Team', 'Log_Prob'],
    )
    first_picks['Prob'] = np.exp(first_picks['Log_Prob'])

    column_names = {'mean': 'Avg_Prob', 'std': 'Std_Prob', 'count': 'Paths', 'sum': 'Sum_Prob'}
    summary = (
        first_picks.groupby('First_Team')['Prob']
        .agg(['mean', 'std', 'count', 'sum'])
        .reset_index()
        .rename(columns=column_names)
        .sort_values(by=["Sum_Prob"], ascending=False)
    )
    return summary

In [None]:
from simulation.season import BeamExploreSeason

def run_greedy_beam_path(year, models, schedule_df, k=1000, db_path='./data/data.db'):
    """
    Build a survivor path greedily, re-running the beam search every week.

    For each week from 1 to the largest ``Week`` in ``schedule_df``, the
    week's spreads, rankings and prior team records are read from the
    database, ``BeamExploreSeason.resolve`` is run with the picks made so
    far, and the team whose candidate paths have the largest summed
    ``exp(p)`` is locked in as that week's pick.

    Args:
        year: Season being simulated.
        models: Passed through to ``BeamExploreSeason``.
        schedule_df: Season schedule; must contain a ``Week`` column.
        k: Forwarded to ``resolve`` as ``k`` (presumably the beam width --
            confirm against ``BeamExploreSeason``).
        db_path: DuckDB database file to read weekly inputs from.

    Returns:
        List with one pick per week, in week order. Empty when
        ``schedule_df`` has no rows.

    Raises:
        ValueError: If the beam search returns no candidate paths for a week.
    """
    # An empty schedule has no max week (pandas returns NaN), so bail out
    # before range() is handed a float.
    if schedule_df.empty:
        return []

    max_week = int(schedule_df['Week'].max())
    path = []
    for wk in range(1, max_week + 1):
        # The connection is only held while the weekly inputs are read.
        with duckdb.connect(db_path) as db:
            spread_df = get_season_week_speads(db, year, wk)
            rank_df = get_season_week_rankings(db, year, wk)
            prior_weeks = get_team_records_from_db(db, year, wk)

        beams = BeamExploreSeason(year, models, schedule_df, schedule_df.copy())
        bp = beams.resolve(
            week=wk, end_week=max_week,
            spread=spread_df,
            rank=rank_df,
            k=k,
            n=1,
            # Hand over a copy so the search cannot mutate the running path.
            survivor_picks=list(path),
            prior_weeks=prior_weeks
        )

        # Sum exp(p) per candidate pick for the current week; `p` is treated
        # as a log-probability, so this accumulates probabilities.
        pick_scores = {}
        for path_obj in bp:
            pick = path_obj['picks'][wk - 1]
            pick_scores[pick] = pick_scores.get(pick, 0) + np.exp(path_obj['p'])

        if not pick_scores:
            raise ValueError(
                f"Beam search returned no candidate paths for year {year}, week {wk}"
            )

        # Lock in the team with the highest summed probability.
        path.append(max(pick_scores, key=pick_scores.get))

    return path

In [None]:
# year = 2024

# with duckdb.connect('./data/data.db') as db:
#     schedule_df = get_season_schedule(db, year)

# greedy_path = run_greedy_beam_path(
#     year, models, schedule_df, k=10000
# )
# print("Best greedy path:", greedy_path)

In [None]:
import json
from pathlib import Path

# Create the output directory up front so a missing ./results cannot make a
# long beam-search run fail at the final write.
results_dir = Path('./results')
results_dir.mkdir(parents=True, exist_ok=True)

# Compute and persist one greedy survivor path per season, newest first.
for year in range(2024, 2012, -1):
    print(f"Running greedy path for year: {year}")

    with duckdb.connect('./data/data.db') as db:
        schedule_df = get_season_schedule(db, year)

    greedy_path = run_greedy_beam_path(
        year, models, schedule_df, k=10_000
    )
    print("Best greedy path:", greedy_path)

    # json.dump writes the same text the manual dumps()+encode() produced.
    with open(results_dir / f'greedy_path_{year}.json', 'w', encoding='utf-8') as f:
        json.dump(greedy_path, f)

Best greedy path: ['Seattle', 'Baltimore', 'Cleveland', 'San Francisco', 'Miami', 'Atlanta', 'Washington', 'Denver', 'Cincinnati', 'LA Chargers', 'Detroit', 'Houston', 'Kansas City', 'Philadelphia', 'Minnesota', 'Green Bay', 'Tampa Bay', 'Arizona']