<a href="https://colab.research.google.com/github/fdac24/fantasy-predictions/blob/main/VincentCondensed.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install nfl_data_py

Collecting nfl_data_py
  Downloading nfl_data_py-0.3.3-py3-none-any.whl.metadata (12 kB)
Collecting pandas<2.0,>=1.0 (from nfl_data_py)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting appdirs>1 (from nfl_data_py)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting fastparquet>0.5 (from nfl_data_py)
  Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting cramjam>=2.3 (from fastparquet>0.5->nfl_data_py)
  Downloading cramjam-2.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading nfl_data_py-0.3.3-py3-none-any.whl (13 kB)
Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Downloading fastparquet-2024.11.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m


In [4]:
import nfl_data_py as nfl
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Directory to save data files
DATA_DIR = "nfl_data"

# Ensure data directory exists
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

# 1. Fetch active fantasy players for the current year and save their data
def fetch_and_save_active_fantasy_players(current_year=2024):
    try:
        players = nfl.__import_rosters(years=[current_year], release='seasonal')
        # Filter for active players and specific fantasy positions
        active_fantasy_players = players[
            (players['status'] == 'ACT') &
            (players['position'].isin(['QB', 'WR', 'RB', 'TE']))
        ]

        # Drop unnecessary columns
        columns_to_drop = [
            'depth_chart_position', 'jersey_number', 'college', 'espn_id', 'sportradar_id',
            'yahoo_id', 'rotowire_id', 'pff_id', 'pfr_id', 'fantasy_data_id', 'sleeper_id',
            'headshot_url', 'ngs_position', 'status_description_abbr', 'football_name',
            'esb_id', 'gsis_it_id', 'smart_id', 'entry_year'
        ]
        active_fantasy_players = active_fantasy_players.drop(columns=columns_to_drop)

        # Save the data to CSV
        active_fantasy_players.to_csv(f"{DATA_DIR}/active_fantasy_players.csv", index=False)
        print(f"Saved active fantasy players data to {DATA_DIR}/active_fantasy_players.csv")
        return active_fantasy_players
    except AttributeError as e:
        print("Error: The nfl_data_py module does not provide a function for rosters.")
        raise e

def fetch_and_save_weekly_game_data_with_ids(year_range, active_players):
    try:
        # Fetch the weekly data for the specified year range
        print(f"Fetching weekly game data for years: {year_range}...")
        weekly_game_data = nfl.import_weekly_data(year_range)

        # Filter for only regular season games
        weekly_game_data = weekly_game_data[weekly_game_data['season_type'] == 'REG']

        # Filter for active players
        active_player_ids = set(active_players['player_id'])
        weekly_game_data = weekly_game_data[weekly_game_data['player_id'].isin(active_player_ids)]

        # Drop unnecessary columns
        columns_to_drop = ['player_name', 'position_group', 'headshot_url']
        weekly_game_data = weekly_game_data.drop(columns=columns_to_drop)

        # Load the combined game metadata file
        combined_metadata_file = f"{DATA_DIR}/combined_game_metadata.csv"
        game_metadata = pd.read_csv(combined_metadata_file)

        # Perform matching based on season, week, and team involvement
        def enrich_row_with_metadata(row):
            season = row['season']
            week = row['week']
            team1 = row['recent_team']
            team2 = row['opponent_team']

            # Find matching games in the metadata
            match = game_metadata[
                (game_metadata['season'] == season) &
                (game_metadata['week'] == week) &
                (
                    ((game_metadata['home_team'] == team1) & (game_metadata['away_team'] == team2)) |
                    ((game_metadata['home_team'] == team2) & (game_metadata['away_team'] == team1))
                )
            ]

            # Enrich row if a match is found
            if not match.empty:
                enriched_data = match.iloc[0]
                return pd.Series({
                    'game_id': enriched_data['game_id'],
                    'home_team': enriched_data['home_team'],
                    'away_team': enriched_data['away_team'],
                    'home_coach': enriched_data['home_coach'],
                    'away_coach': enriched_data['away_coach'],
                    'winner': enriched_data['winner'],
                    'referee_names': enriched_data['referee_names']
                })
            return pd.Series({
                'game_id': None,
                'home_team': None,
                'away_team': None,
                'home_coach': None,
                'away_coach': None,
                'winner': None,
                'referee_names': None
            })

        # Apply the matching function to enrich the data
        enriched_metadata = weekly_game_data.apply(enrich_row_with_metadata, axis=1)

        # Combine the enriched metadata with the original weekly game data
        weekly_game_data = pd.concat([weekly_game_data, enriched_metadata], axis=1)

        # Log missing game_id entries for debugging
        missing_game_data = weekly_game_data[weekly_game_data['game_id'].isna()]
        if not missing_game_data.empty:
            print(f"Warning: {len(missing_game_data)} rows could not be matched to game metadata.")
            print(missing_game_data[['season', 'week', 'recent_team', 'opponent_team']].head())

        # Fill missing numerical fields with 0 and categorical fields with "Unknown"
        numerical_fields = [
            'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions',
            'sacks', 'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards',
            'passing_yards_after_catch', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions',
            'pacr', 'dakota', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles',
            'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'rushing_2pt_conversions',
            'receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_fumbles',
            'receiving_fumbles_lost', 'receiving_air_yards', 'receiving_yards_after_catch',
            'receiving_first_downs', 'receiving_epa', 'receiving_2pt_conversions', 'racr',
            'target_share', 'air_yards_share', 'wopr', 'special_teams_tds', 'fantasy_points', 'fantasy_points_ppr'
        ]
        weekly_game_data[numerical_fields] = weekly_game_data[numerical_fields].fillna(0)

        categorical_fields = ['game_id', 'home_team', 'away_team', 'home_coach', 'away_coach', 'winner', 'referee_names']
        weekly_game_data[categorical_fields] = weekly_game_data[categorical_fields].fillna("Unknown")

        # Save to CSV
        file_range = f"{year_range[0]}_{year_range[-1]}" if len(year_range) > 1 else f"{year_range[0]}"
        weekly_game_data.to_csv(f"{DATA_DIR}/weekly_game_data_with_ids.csv", index=False)
        print(f"Saved weekly game data with game IDs for {file_range} to {DATA_DIR}/weekly_game_data_with_ids.csv")

        return weekly_game_data
    except Exception as e:
        print("Error fetching weekly game data with game IDs:", e)
        raise e



def match_game_metadata(row, game_metadata):
    """
    Match game metadata based on season, week, and teams.
    """
    season = row['season']
    week = row['week']
    team1 = row['recent_team']
    team2 = row['opponent_team']

    # Match game metadata
    match = game_metadata[
        (game_metadata['season'] == season) &
        (game_metadata['week'] == week) &
        (
            ((game_metadata['home_team'] == team1) & (game_metadata['away_team'] == team2)) |
            ((game_metadata['home_team'] == team2) & (game_metadata['away_team'] == team1))
        )
    ]

    if not match.empty:
        enriched_data = match.iloc[0]
        return pd.Series({
            'game_id': enriched_data['game_id'],
            'home_team': enriched_data['home_team'],
            'away_team': enriched_data['away_team'],
            'home_coach': enriched_data['home_coach'],
            'away_coach': enriched_data['away_coach'],
            'winner': enriched_data['winner'],
            'referee_names': enriched_data['referee_names']
        })

    # Return empty metadata if no match is found
    return pd.Series({
        'game_id': None,
        'home_team': None,
        'away_team': None,
        'home_coach': None,
        'away_coach': None,
        'winner': None,
        'referee_names': None
    })


def fetch_and_save_combined_game_metadata(year_range):
    try:
        # Combine metadata for all years in the range
        combined_metadata = []

        for year in year_range:
            print(f"Processing game metadata for {year}...")
            pbp_data = nfl.import_pbp_data([year])

            # Extract relevant columns for game metadata
            columns_to_keep = ['game_id', 'home_team', 'away_team', 'season', 'week', 'home_coach', 'away_coach']
            game_metadata = pbp_data[columns_to_keep].drop_duplicates(subset=['game_id'])

            # Extract the final scores
            final_scores = pbp_data.groupby('game_id').last()[['home_score', 'away_score']].reset_index()

            # Merge scores into game metadata
            game_metadata = game_metadata.merge(final_scores, on='game_id', how='left')

            # Determine the winner
            game_metadata['winner'] = game_metadata.apply(
                lambda row: row['home_team'] if row['home_score'] > row['away_score'] else (
                    row['away_team'] if row['away_score'] > row['home_score'] else 'TIE'
                ), axis=1
            )

            # Fetch the officials data for the specified season
            referee_data = nfl.import_officials([year])

            # Aggregate referee data by game_id
            grouped_referees = referee_data.groupby('game_id').agg({
                'name': lambda x: list(x),  # Combine referee names into a list
                'off_pos': lambda x: list(x)  # Combine referee positions into a list
            }).reset_index()
            grouped_referees.rename(columns={'name': 'referee_names', 'off_pos': 'referee_positions'}, inplace=True)

            # Merge referee data into game metadata
            game_metadata = game_metadata.merge(grouped_referees, on='game_id', how='left')

            # Append to the combined list
            combined_metadata.append(game_metadata)

        # Concatenate all metadata into a single DataFrame
        combined_metadata_df = pd.concat(combined_metadata, ignore_index=True)

        # Ensure 'season' and 'week' are included
        combined_metadata_df = combined_metadata_df[['game_id', 'home_team', 'away_team', 'season', 'week',
                                                     'home_coach', 'away_coach', 'home_score', 'away_score',
                                                     'winner', 'referee_names', 'referee_positions']]

        # Save the combined data to a single file
        combined_metadata_file = f"{DATA_DIR}/combined_game_metadata.csv"
        combined_metadata_df.to_csv(combined_metadata_file, index=False)
        print(f"Saved combined game metadata with referees and scores for all years to {combined_metadata_file}")

        # Print a sample of the combined data for verification
        print("Sample combined game metadata:")
        print(combined_metadata_df.head())

        return combined_metadata_df

    except Exception as e:
        print("Error fetching and combining game metadata and referee data:", e)
        raise e



def main():
    # Specify year range
    current_year = 2024
    game_year_range = list(range(2022, 2025)) # this goes up to the 2024

    # Step 1: Fetch active fantasy players
    print("Fetching active fantasy players...")
    active_fantasy_players = fetch_and_save_active_fantasy_players(current_year)
    print("Active fantasy player fetching complete.")

    # Step 2: Fetch combined game metadata (including referees and scores)
    print(f"Fetching combined game metadata for years {game_year_range}...")
    fetch_and_save_combined_game_metadata(game_year_range)
    print("Combined game metadata fetching complete.")

    # Step 3: Fetch weekly game data with game IDs
    print(f"Fetching weekly game data for years {game_year_range}...")
    fetch_and_save_weekly_game_data_with_ids(game_year_range, active_fantasy_players)
    print("Weekly game data fetching complete.")

main()


Fetching active fantasy players...
Saved active fantasy players data to nfl_data/active_fantasy_players.csv
Active fantasy player fetching complete.
Fetching combined game metadata for years [2022, 2023, 2024]...
Processing game metadata for 2022...
2022 done.
Downcasting floats.
Processing game metadata for 2023...
2023 done.
Downcasting floats.
Processing game metadata for 2024...
2024 done.
Downcasting floats.
Saved combined game metadata with referees and scores for all years to nfl_data/combined_game_metadata.csv
Sample combined game metadata:
           game_id home_team away_team  season  week       home_coach  \
0  2022_01_BAL_NYJ       NYJ       BAL    2022     1     Robert Saleh   
1   2022_01_BUF_LA        LA       BUF    2022     1       Sean McVay   
2  2022_01_CLE_CAR       CAR       CLE    2022     1       Matt Rhule   
3  2022_01_DEN_SEA       SEA       DEN    2022     1     Pete Carroll   
4   2022_01_GB_MIN       MIN        GB    2022     1  Kevin O'Connell   

      

In [6]:
!pip install torch
!pip install pickle

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0m

In [7]:
import torch
import torch.nn as nn
import os
import pickle

# File paths
INPUT_FILE = "nfl_data/weekly_game_data_with_ids.csv"
OUTPUT_FILE = "nfl_data/processed_data.csv"

# Embedding file paths
PLAYER_EMBEDDINGS_FILE = "nfl_data/player_embeddings.pkl"
TEAM_EMBEDDINGS_FILE = "nfl_data/team_embeddings.pkl"

def calculate_condensed_stat(row):
    """
    Calculate a condensed stat based on player position using all specified features.
    """
    position = row['position']

    if position == 'QB':
        return (
            row['completions'] * 0.1 +
            row['attempts'] * 0.02 +
            row['passing_yards'] * 0.04 +
            row['passing_tds'] * 4 -
            row['interceptions'] * 2 -
            row['sacks'] * 0.5 -
            row['sack_yards'] * 0.05 +
            row['sack_fumbles'] * -0.5 +
            row['sack_fumbles_lost'] * -1 +
            row['passing_air_yards'] * 0.03 +
            row['passing_yards_after_catch'] * 0.02 +
            row['passing_first_downs'] * 0.5 +
            row['passing_epa'] * 1 +
            row['passing_2pt_conversions'] * 2 +
            row['pacr'] * 0.1 +
            row['dakota'] * 0.2 +
            row['carries'] * 0.1 +
            row['rushing_yards'] * 0.1 +
            row['rushing_tds'] * 6 -
            row['rushing_fumbles'] * 2 -
            row['rushing_fumbles_lost'] * 2 +
            row['rushing_first_downs'] * 0.5 +
            row['rushing_epa'] * 1 +
            row['rushing_2pt_conversions'] * 2
        )
    elif position in ['WR', 'TE']:
        return (
            row['receptions'] * 1 +
            row['targets'] * 0.5 +
            row['receiving_yards'] * 0.1 +
            row['receiving_tds'] * 6 -
            row['receiving_fumbles'] * 2 -
            row['receiving_fumbles_lost'] * 2 +
            row['receiving_air_yards'] * 0.03 +
            row['receiving_yards_after_catch'] * 0.02 +
            row['receiving_first_downs'] * 0.5 +
            row['receiving_epa'] * 1 +
            row['receiving_2pt_conversions'] * 2 +
            row['racr'] * 0.1 +
            row['target_share'] * 0.2 +
            row['air_yards_share'] * 0.2 +
            row['wopr'] * 0.3
        )
    elif position == 'RB':
        return (
            row['carries'] * 0.1 +
            row['rushing_yards'] * 0.1 +
            row['rushing_tds'] * 6 -
            row['rushing_fumbles'] * 2 -
            row['rushing_fumbles_lost'] * 2 +
            row['rushing_first_downs'] * 0.5 +
            row['rushing_epa'] * 1 +
            row['rushing_2pt_conversions'] * 2 +
            row['receptions'] * 1 +
            row['targets'] * 0.5 +
            row['receiving_yards'] * 0.1 +
            row['receiving_tds'] * 6 -
            row['receiving_fumbles'] * 2 -
            row['receiving_fumbles_lost'] * 2 +
            row['receiving_air_yards'] * 0.03 +
            row['receiving_yards_after_catch'] * 0.02 +
            row['receiving_first_downs'] * 0.5 +
            row['receiving_epa'] * 1 +
            row['receiving_2pt_conversions'] * 2 +
            row['racr'] * 0.1 +
            row['target_share'] * 0.2 +
            row['air_yards_share'] * 0.2 +
            row['wopr'] * 0.3
        )
    else:
        return 0

EMBEDDING_DIM = 10

def generate_embeddings(data, column, embedding_dim=EMBEDDING_DIM):
    """
    Generate embeddings for a categorical column using PyTorch embeddings.
    Condense embeddings into a single list per row.
    Also returns a mapping from unique values to embeddings.
    """
    print(f"Generating embeddings for {column}...")

    # Map unique values to indices
    unique_values = data[column].dropna().unique()
    value_to_index = {val: idx for idx, val in enumerate(unique_values)}

    # Handle missing values in the column
    data[column] = data[column].fillna("Unknown")
    data[f"{column}_id"] = data[column].map(value_to_index)

    # Create embedding layer
    embedding_layer = nn.Embedding(len(unique_values), embedding_dim)

    # Generate embeddings for each value
    indices = torch.tensor(data[f"{column}_id"].values)
    embeddings = embedding_layer(indices).detach().numpy()

    # Add embeddings as a single list for each row
    data[f"{column}_embedding"] = [list(emb) for emb in embeddings]

    # Create a mapping from value to embedding
    value_embeddings = {}
    for val in unique_values:
        idx = value_to_index[val]
        emb = embedding_layer(torch.tensor(idx)).detach().numpy()
        value_embeddings[val] = emb

    # Drop the ID column
    data = data.drop(columns=[f"{column}_id"])
    return data, value_embeddings

def process_data(input_file, output_file):
    """
    Process the data to calculate condensed stats and embeddings, then save the processed file.
    """
    print(f"Reading data from {input_file}...")
    data = pd.read_csv(input_file)
    print(f"Data shape: {data.shape}")

    # Fill missing numerical fields with 0
    data.fillna(0, inplace=True)

    # Calculate the condensed stat
    print("Calculating condensed stats...")
    data['condensed_stat'] = data.apply(calculate_condensed_stat, axis=1)

    # Drop all features used in the condensed stat calculations
    features_to_remove = [
        'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks',
        'sack_yards', 'sack_fumbles', 'sack_fumbles_lost', 'passing_air_yards',
        'passing_yards_after_catch', 'passing_first_downs', 'passing_epa',
        'passing_2pt_conversions', 'pacr', 'dakota', 'carries', 'rushing_yards',
        'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs',
        'rushing_epa', 'rushing_2pt_conversions', 'receptions', 'targets', 'receiving_yards',
        'receiving_tds', 'receiving_fumbles', 'receiving_fumbles_lost', 'receiving_air_yards',
        'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa',
        'receiving_2pt_conversions', 'racr', 'target_share', 'air_yards_share', 'wopr'
    ]
    features_to_remove = [col for col in features_to_remove if col in data.columns]
    data = data.drop(columns=features_to_remove)

    # Initialize dictionaries to hold embeddings
    embeddings_to_save = {}

    # Generate embeddings for categorical features and collect embeddings
    for column in ['player_id', 'recent_team', 'opponent_team']:
        if column in data.columns:
            data, embeddings = generate_embeddings(data, column)
            embeddings_to_save[column] = embeddings

    # Remove coaches and referees columns
    data = data.drop(columns=['home_coach', 'away_coach', 'referee_names'], errors='ignore')

    # Save the processed data
    print(f"Saving processed data to {output_file}...")
    data.to_csv(output_file, index=False)
    print("Processing complete.")

    # Save embeddings to files
    print("Saving embeddings...")
    if 'player_id' in embeddings_to_save:
        with open(PLAYER_EMBEDDINGS_FILE, 'wb') as f:
            pickle.dump(embeddings_to_save['player_id'], f)
        print(f"Player embeddings saved to {PLAYER_EMBEDDINGS_FILE}")

    if 'opponent_team' in embeddings_to_save:
        with open(TEAM_EMBEDDINGS_FILE, 'wb') as f:
            pickle.dump(embeddings_to_save['opponent_team'], f)
        print(f"Team embeddings saved to {TEAM_EMBEDDINGS_FILE}")


process_data(INPUT_FILE, OUTPUT_FILE)

Reading data from nfl_data/weekly_game_data_with_ids.csv...
Data shape: (9886, 57)
Calculating condensed stats...
Generating embeddings for player_id...
Generating embeddings for recent_team...
Generating embeddings for opponent_team...
Saving processed data to nfl_data/processed_data.csv...
Processing complete.
Saving embeddings...
Player embeddings saved to nfl_data/player_embeddings.pkl
Team embeddings saved to nfl_data/team_embeddings.pkl


In [9]:
ACTIVE_PLAYERS_FILE = 'nfl_data/active_fantasy_players.csv'
PLAYER_INFO_FILE = 'nfl_data/player_info.csv'

def main():
    # Read the data
    active_players = pd.read_csv(ACTIVE_PLAYERS_FILE)

    # Ensure the necessary columns are present
    required_columns = ['player_name', 'team', 'position', 'player_id']
    alternative_columns = {
        'team': 'recent_team'  # Map 'team' to 'recent_team' if 'team' doesn't exist
    }

    for i, col in enumerate(required_columns):
        if col not in active_players.columns:
            if col in alternative_columns:
                alt_col = alternative_columns[col]
                if alt_col in active_players.columns:
                    required_columns[i] = alt_col
                else:
                    raise ValueError(f"Column '{col}' or its alternative '{alt_col}' not found in the data.")
            else:
                raise ValueError(f"Column '{col}' not found in the data.")

    # Extract the required columns
    player_info = active_players[required_columns]
    # Rename columns to standard names
    player_info.columns = ['player_name', 'team', 'position', 'player_id']

    # Save to CSV
    player_info.to_csv(PLAYER_INFO_FILE, index=False)
    print(f"Saved player information to {PLAYER_INFO_FILE}")

    # Interactive lookup. Uncomment this part of the code out if you want to look up player IDs
    # while True:
    #     player_name_to_lookup = input("Enter the player's name to look up (or 'exit' to quit): ")
    #     if player_name_to_lookup.lower() == 'exit':
    #         break
    #     matches = lookup_player(player_name_to_lookup, player_info)
    #     if matches is not None:
    #         print(matches)
    #         print()

def lookup_player(name, player_info_df):
    """
    Look up players by name.
    Returns a DataFrame with matching player(s).
    """
    # Case-insensitive search
    matches = player_info_df[player_info_df['player_name'].str.lower() == name.lower()]

    if matches.empty:
        print(f"No players found with the name '{name}'.")
        return None
    else:
        print(f"Found {len(matches)} player(s) with the name '{name}':")
        return matches

main()

Saved player information to nfl_data/player_info.csv


In [10]:
import os

# Directory to save data files
DATA_DIR = "nfl_data"

# Ensure data directory exists
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

def fetch_and_save_schedules(years):
    """
    Fetches the NFL schedules for the specified years and saves them to a CSV file.
    """
    try:
        print(f"Fetching NFL schedules for years: {years}...")
        # Fetch the schedules
        schedules = nfl.import_schedules(years=years)

        # Print available columns
        print("Available columns in schedules DataFrame:")
        print(schedules.columns.tolist())

        # Define the desired columns
        desired_columns = [
            'season', 'season_type', 'week', 'game_id',
            'game_date', 'game_time_eastern',
            'home_team', 'away_team', 'site_city', 'site_state', 'result'
        ]

        # Keep only columns that are available
        available_columns = schedules.columns.tolist()
        columns_to_keep = [col for col in desired_columns if col in available_columns]

        schedules = schedules[columns_to_keep]

        # Convert 'game_date' to datetime if it's available
        if 'game_date' in schedules.columns:
            schedules['game_date'] = pd.to_datetime(schedules['game_date'])

        # Sort the schedules by available columns
        sort_columns = ['season', 'week']
        if 'game_date' in schedules.columns:
            sort_columns.append('game_date')
        schedules.sort_values(by=sort_columns, inplace=True)

        # Save to CSV without years in the filename
        schedules_file = os.path.join(DATA_DIR, 'nfl_schedules.csv')
        schedules.to_csv(schedules_file, index=False)
        print(f"Saved NFL schedules to {schedules_file}")

        # Print a sample of the data
        print("Sample NFL schedules data:")
        print(schedules.head())

        return schedules

    except Exception as e:
        print("Error fetching NFL schedules:", e)
        raise e

def main():
    # Specify the years for which you want to fetch schedules
    years_to_fetch = [2024]  # Only 2024 as per your request

    # Fetch and save the schedules
    fetch_and_save_schedules(years_to_fetch)

main()

Fetching NFL schedules for years: [2024]...
Available columns in schedules DataFrame:
['game_id', 'season', 'game_type', 'week', 'gameday', 'weekday', 'gametime', 'away_team', 'away_score', 'home_team', 'home_score', 'location', 'result', 'total', 'overtime', 'old_game_id', 'gsis', 'nfl_detail_id', 'pfr', 'pff', 'espn', 'ftn', 'away_rest', 'home_rest', 'away_moneyline', 'home_moneyline', 'spread_line', 'away_spread_odds', 'home_spread_odds', 'total_line', 'under_odds', 'over_odds', 'div_game', 'roof', 'surface', 'temp', 'wind', 'away_qb_id', 'home_qb_id', 'away_qb_name', 'home_qb_name', 'away_coach', 'home_coach', 'referee', 'stadium_id', 'stadium']
Saved NFL schedules to nfl_data/nfl_schedules.csv
Sample NFL schedules data:
      season  week          game_id home_team away_team  result
6706    2024     1   2024_01_BAL_KC        KC       BAL     7.0
6707    2024     1   2024_01_GB_PHI       PHI        GB     5.0
6708    2024     1  2024_01_PIT_ATL       ATL       PIT    -8.0
6709    2

In [11]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# File paths
DATA_DIR = 'nfl_data'

PROCESSED_DATA_FILE = os.path.join(DATA_DIR, 'processed_data.csv')
PLAYER_EMBEDDINGS_FILE = os.path.join(DATA_DIR, 'player_embeddings.pkl')
TEAM_EMBEDDINGS_FILE = os.path.join(DATA_DIR, 'team_embeddings.pkl')
MODEL_FILE = os.path.join(DATA_DIR, 'fantasy_score_model.pkl')

def load_data():
    # Load processed data
    data = pd.read_csv(PROCESSED_DATA_FILE)

    # Load embeddings
    with open(PLAYER_EMBEDDINGS_FILE, 'rb') as f:
        player_embeddings = pickle.load(f)
    with open(TEAM_EMBEDDINGS_FILE, 'rb') as f:
        team_embeddings = pickle.load(f)

    return data, player_embeddings, team_embeddings

def prepare_features(data, player_embeddings, team_embeddings):
    X = []
    y = []

    for idx, row in data.iterrows():
        player_id = row['player_id']
        opponent_team = row['opponent_team']
        fantasy_points_ppr = row['fantasy_points_ppr']

        player_embedding = player_embeddings.get(player_id)
        team_embedding = team_embeddings.get(opponent_team)

        if player_embedding is not None and team_embedding is not None:
            combined_embedding = np.concatenate([player_embedding, team_embedding])
            X.append(combined_embedding)
            y.append(fantasy_points_ppr)
        else:
            # Skip if embeddings are missing
            continue

    X = np.array(X)
    y = np.array(y)
    return X, y

def train_model(X, y):
    # Split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    print("Training the model...")
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"Validation MSE: {mse:.2f}")
    print(f"Validation MAE: {mae:.2f}")

    return model

def save_model(model):
    with open(MODEL_FILE, 'wb') as f:
        pickle.dump(model, f)
    print(f"Model saved to {MODEL_FILE}")

def main():
    data, player_embeddings, team_embeddings = load_data()
    X, y = prepare_features(data, player_embeddings, team_embeddings)
    model = train_model(X, y)
    save_model(model)

main()

Training the model...
Validation MSE: 53.57
Validation MAE: 5.52
Model saved to nfl_data/fantasy_score_model.pkl


In [13]:
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error

# File paths
DATA_DIR = 'nfl_data'
PREDICTIONS_DIR = 'predictions'
os.makedirs(PREDICTIONS_DIR, exist_ok=True)

PLAYER_INFO_FILE = os.path.join(DATA_DIR, 'player_info.csv')
SCHEDULES_FILE = os.path.join(DATA_DIR, 'nfl_schedules.csv')  # Updated filename
PROCESSED_DATA_FILE = os.path.join(DATA_DIR, 'processed_data.csv')
PLAYER_EMBEDDINGS_FILE = os.path.join(DATA_DIR, 'player_embeddings.pkl')
TEAM_EMBEDDINGS_FILE = os.path.join(DATA_DIR, 'team_embeddings.pkl')
MODEL_FILE = os.path.join(DATA_DIR, 'fantasy_score_model.pkl')

def predict_fantasy_score(player_id, opponent_team, player_embeddings, team_embeddings, model):
    # Retrieve embeddings
    player_embedding = player_embeddings.get(player_id)
    team_embedding = team_embeddings.get(opponent_team)
    if player_embedding is None or team_embedding is None:
        # Return None if embeddings are missing
        return None
    # Combine embeddings
    combined_embedding = np.concatenate([player_embedding, team_embedding]).reshape(1, -1)
    # Predict
    predicted_score = model.predict(combined_embedding)
    return predicted_score[0]

def predict_weekly_scores(week_number, season_year, player_info, schedules, player_embeddings, team_embeddings, model):
    # Filter schedules for the specified week and season
    week_schedule = schedules[
        (schedules['season'] == season_year) &
        (schedules['week'] == week_number)
    ]

    if week_schedule.empty:
        # print(f"No games scheduled for week {week_number} in {season_year}.")
        return pd.DataFrame()  # Return empty DataFrame

    # Create a mapping from team to opponent team
    team_opponent_map = {}
    for _, game in week_schedule.iterrows():
        home_team = game['home_team']
        away_team = game['away_team']
        team_opponent_map[home_team] = away_team
        team_opponent_map[away_team] = home_team

    # List to store predictions
    predictions = []

    # Iterate over all players
    for idx, player in player_info.iterrows():
        player_id = player['player_id']
        player_name = player['player_name']
        team = player['team']
        position = player['position']

        # Get opponent team
        opponent_team = team_opponent_map.get(team)
        if opponent_team is None:
            # No game scheduled for this team in this week
            continue

        # Predict the fantasy score
        predicted_score = predict_fantasy_score(
            player_id=player_id,
            opponent_team=opponent_team,
            player_embeddings=player_embeddings,
            team_embeddings=team_embeddings,
            model=model
        )

        if predicted_score is not None:
            predictions.append({
                'player_id': player_id,
                'player_name': player_name,
                'team': team,
                'position': position,
                'opponent_team': opponent_team,
                'predicted_ppr': predicted_score,
                'season': season_year,
                'week': week_number
            })

    # Create a DataFrame with predictions
    predictions_df = pd.DataFrame(predictions)

    return predictions_df

def output_top_players(predictions_df, top_n=20, top_x=5):
    if predictions_df.empty:
        print("No predictions to display.")
        return

    # Sort the predictions by predicted PPR score in descending order
    predictions_df.sort_values(by='predicted_ppr', ascending=False, inplace=True)

    # Output the top N players overall
    print(f"Top {top_n} Players:")
    print(predictions_df.head(top_n)[['player_name', 'team', 'position', 'opponent_team', 'predicted_ppr']])

    print("\nTop Players by Position:")
    # Group by position and output the top X players for each position
    positions = predictions_df['position'].unique()
    for pos in positions:
        pos_df = predictions_df[predictions_df['position'] == pos]
        pos_df = pos_df.head(top_x)
        print(f"\nPosition: {pos}")
        print(pos_df[['player_name', 'team', 'opponent_team', 'predicted_ppr']])

def run_test(player_info, schedules, player_embeddings, team_embeddings, model, processed_data):
    # Only test on the 2024 season
    season_year = 2024

    # Determine the weeks with actual data in processed_data for 2024
    available_weeks = processed_data[
        (processed_data['season'] == season_year) &
        (processed_data['fantasy_points_ppr'].notna())
    ]['week'].unique()

    if len(available_weeks) == 0:
        print(f"No available weeks with actual data for season {season_year}. Exiting.")
        return

    max_week = available_weeks.max()
    print(f"Testing on Season {season_year}, Weeks 1 to {max_week}")

    overall_predictions = pd.DataFrame()

    for week_number in range(1, int(max_week) + 1):
        if week_number not in available_weeks:
            # print(f"No data for week {week_number} in {season_year}. Skipping.")
            continue

        # print(f"Testing Season {season_year}, Week {week_number}")

        # Filter player info for players who played in this week
        week_player_ids = processed_data[
            (processed_data['season'] == season_year) &
            (processed_data['week'] == week_number)
        ]['player_id'].unique()

        if len(week_player_ids) == 0:
            # print(f"No player data for week {week_number} in {season_year}. Skipping.")
            continue

        week_player_info = player_info[player_info['player_id'].isin(week_player_ids)]

        # Predict weekly scores
        predictions_df = predict_weekly_scores(
            week_number=week_number,
            season_year=season_year,
            player_info=week_player_info,
            schedules=schedules,
            player_embeddings=player_embeddings,
            team_embeddings=team_embeddings,
            model=model
        )

        if predictions_df.empty:
            # print(f"No predictions made for week {week_number} in {season_year}. Skipping.")
            continue

        # Merge with actual scores using 'fantasy_points_ppr'
        actual_scores = processed_data[
            (processed_data['season'] == season_year) &
            (processed_data['week'] == week_number)
        ][['player_id', 'fantasy_points_ppr']]

        # Ensure 'player_id' is in both dataframes
        if 'player_id' not in predictions_df.columns or 'player_id' not in actual_scores.columns:
            # print(f"'player_id' not found in predictions or actual scores for week {week_number}. Skipping.")
            continue

        predictions_df = predictions_df.merge(actual_scores, on='player_id', how='left')
        predictions_df.rename(columns={'fantasy_points_ppr': 'actual_ppr'}, inplace=True)

        # Append to overall predictions
        overall_predictions = pd.concat([overall_predictions, predictions_df], ignore_index=True)

    if overall_predictions.empty:
        print("No predictions were made during the test. Exiting.")
        return

    # Calculate overall error metrics
    y_true = overall_predictions['actual_ppr'].values
    y_pred = overall_predictions['predicted_ppr'].values

    errors = y_pred - y_true
    abs_errors = np.abs(errors)

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    me = np.mean(errors)
    std_error = np.std(errors)

    print(f"\nOverall Test MSE: {mse:.2f}")
    print(f"Overall Test MAE: {mae:.2f}")
    print(f"Overall Mean Error: {me:.2f}")
    print(f"Overall Standard Deviation of Errors: {std_error:.2f}")

    # Add errors to the DataFrame
    overall_predictions['error'] = errors
    overall_predictions['abs_error'] = abs_errors

    # Best and worst overall predictions
    best_prediction = overall_predictions.loc[overall_predictions['abs_error'].idxmin()]
    worst_prediction = overall_predictions.loc[overall_predictions['abs_error'].idxmax()]

    print("\nBest Overall Prediction:")
    print(best_prediction[['player_name', 'team', 'position', 'predicted_ppr', 'actual_ppr', 'error']])

    print("\nWorst Overall Prediction:")
    print(worst_prediction[['player_name', 'team', 'position', 'predicted_ppr', 'actual_ppr', 'error']])

    # Group by position and calculate statistics
    positions = overall_predictions['position'].unique()

    for pos in positions:
        pos_data = overall_predictions[overall_predictions['position'] == pos].copy()
        y_true_pos = pos_data['actual_ppr'].values
        y_pred_pos = pos_data['predicted_ppr'].values
        errors_pos = y_pred_pos - y_true_pos
        abs_errors_pos = np.abs(errors_pos)

        mse_pos = mean_squared_error(y_true_pos, y_pred_pos)
        mae_pos = mean_absolute_error(y_true_pos, y_pred_pos)
        me_pos = np.mean(errors_pos)
        std_error_pos = np.std(errors_pos)

        print(f"\nPosition: {pos}")
        print(f" - MSE: {mse_pos:.2f}")
        print(f" - MAE: {mae_pos:.2f}")
        print(f" - Mean Error: {me_pos:.2f}")
        print(f" - Standard Deviation of Errors: {std_error_pos:.2f}")

        # Best and worst predictions for this position
        pos_data['error'] = errors_pos
        pos_data['abs_error'] = abs_errors_pos

        best_pred_pos = pos_data.loc[pos_data['abs_error'].idxmin()]
        worst_pred_pos = pos_data.loc[pos_data['abs_error'].idxmax()]

        print(f" - Best Prediction:")
        print(best_pred_pos[['player_name', 'team', 'predicted_ppr', 'actual_ppr', 'error']])

        print(f" - Worst Prediction:")
        print(worst_pred_pos[['player_name', 'team', 'predicted_ppr', 'actual_ppr', 'error']])

    # Save overall predictions to CSV
    overall_predictions.to_csv(os.path.join(PREDICTIONS_DIR, 'test_predictions.csv'), index=False)
    # print(f"Test predictions saved to {os.path.join(PREDICTIONS_DIR, 'test_predictions.csv')}")

def main():
    # Load active players
    player_info = pd.read_csv(PLAYER_INFO_FILE)

    # Load schedules
    schedules = pd.read_csv(SCHEDULES_FILE)

    # Load embeddings
    with open(PLAYER_EMBEDDINGS_FILE, 'rb') as f:
        player_embeddings = pickle.load(f)
    with open(TEAM_EMBEDDINGS_FILE, 'rb') as f:
        team_embeddings = pickle.load(f)

    # Load the trained model
    with open(MODEL_FILE, 'rb') as f:
        model = pickle.load(f)

    # Load processed data for actual scores
    processed_data = pd.read_csv(PROCESSED_DATA_FILE)

    # Ensure correct data types
    processed_data['season'] = processed_data['season'].astype(int)
    processed_data['week'] = processed_data['week'].astype(int)

    # Specify the week number or 'TEST'
    week_input = input("Enter the week number for which you want to predict scores (or type 'TEST' to evaluate the model): ")

    if week_input.strip().upper() == 'TEST':
        # Run the testing routine
        run_test(player_info, schedules, player_embeddings, team_embeddings, model, processed_data)
    else:
        # Proceed with predicting for a specific week
        try:
            week_number = int(week_input)
            season_year = 2024  # Update as needed

            # Predict weekly scores
            predictions_df = predict_weekly_scores(
                week_number=week_number,
                season_year=season_year,
                player_info=player_info,
                schedules=schedules,
                player_embeddings=player_embeddings,
                team_embeddings=team_embeddings,
                model=model
            )

            if predictions_df.empty:
                print(f"No predictions available for week {week_number} in {season_year}.")
            else:
                # Output the top players
                output_top_players(predictions_df, top_n=20, top_x=5)

                # Save predictions to the 'predictions' folder
                predictions_file = os.path.join(PREDICTIONS_DIR, f'predictions_week_{week_number}_{season_year}.csv')
                predictions_df.to_csv(predictions_file, index=False)
                print(f'Predictions saved to {predictions_file}')

        except ValueError:
            print("Invalid input. Please enter a valid week number or 'TEST'.")

main()

Enter the week number for which you want to predict scores (or type 'TEST' to evaluate the model): TEST
Testing on Season 2024, Weeks 1 to 11

Overall Test MSE: 22.13
Overall Test MAE: 3.26
Overall Mean Error: 0.21
Overall Standard Deviation of Errors: 4.70

Best Overall Prediction:
player_name      Cedrick Wilson Jr.
team                             NO
position                         WR
predicted_ppr              5.397825
actual_ppr                      5.4
error                     -0.002175
Name: 2101, dtype: object

Worst Overall Prediction:
player_name      Ja'Marr Chase
team                       CIN
position                    WR
predicted_ppr        20.250524
actual_ppr                55.4
error               -35.149476
Name: 2436, dtype: object

Position: QB
 - MSE: 26.41
 - MAE: 3.83
 - Mean Error: 0.18
 - Standard Deviation of Errors: 5.14
 - Best Prediction:
player_name      Anthony Richardson
team                            IND
predicted_ppr                9.9002
actual_p