In [1]:
import datetime
import duckdb
import enum
import json
import ollama
import os
import pandas as pd
import pydantic
import requests
import time
from typing import Dict, List

In [2]:
# Base URL for Sleeper's REST endpoints (player list and per-week player stats).
SLEEPER_REST_API = "https://api.sleeper.com"
# Endpoint that GraphQL queries (player news) are POSTed to.
SLEEPER_GRAPHQL_API = "https://sleeper.com/graphql"


def get_player_reports(player_id: str, limit: int, timeout: float = 10.0) -> List[Dict]:
    """Fetch recent news reports for one NFL player from Sleeper's GraphQL endpoint.

    Args:
        player_id: Sleeper player identifier; interpolated into the query text
            as a quoted string, so it must come from a trusted source.
        limit: Maximum number of reports to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block forever on a stalled connection.

    Returns:
        The list found under `data.get_player_news` in the response, or an
        empty list when the response carries no data for that operation.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    operation_name = "get_player_news"
    query = """
        query {operation_name} {{
            get_player_news(sport: "nfl", player_id: "{player_id}", limit: {limit}){{
                metadata
                player_id
                published
                source
                source_key
                sport
            }}
        }}
    """.format(operation_name=operation_name, player_id=player_id, limit=limit)
    body = {
        "operationName": operation_name,
        "query": query,
        "variables": {}
    }
    # Short pause before every call; presumably a courtesy rate limit.
    time.sleep(0.1)
    response = requests.post(SLEEPER_GRAPHQL_API, json=body, timeout=timeout)
    response.raise_for_status()
    payload = response.json() or {}
    # An error response may carry `"data": null`, and the operation's value
    # may be null as well, so `.get(key, default)` alone is not enough here.
    data = payload.get("data") or {}
    return data.get(operation_name) or []


def transform_reports(raw_reports: List[Dict]) -> List[Dict]:
    """Flatten raw news reports into one row per report.

    A report is labelled "retrospective" when its metadata carries an
    `analysis` value and "prospective" otherwise.

    Args:
        raw_reports: Report dicts as returned by `get_player_reports`.

    Returns:
        A list of flat dicts with keys report_id, player_id, source, category,
        published_at, title, description and analysis.

    Raises:
        TypeError: If a report lacks `source` or `source_key` (both are
            concatenated to build `report_id`).
    """
    rows = []
    for raw in raw_reports:
        # `metadata` can be missing or explicitly null; `.get(key, {})` only
        # covers the missing case, so normalise both to an empty dict.
        metadata = raw.get("metadata") or {}
        analysis = metadata.get("analysis")
        rows.append({
            "report_id": raw.get("source") + "_" + raw.get("source_key"),
            "player_id": raw.get("player_id"),
            "source": raw.get("source"),
            "category": "retrospective" if analysis is not None else "prospective",
            "published_at": raw.get("published"),
            "title": metadata.get("title"),
            "description": metadata.get("description"),
            "analysis": analysis,
        })
    return rows


def get_all_players(timeout: float = 30.0) -> List[Dict]:
    """Download every NFL player known to Sleeper and keep a few core fields.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block forever on a stalled connection.

    Returns:
        One dict per player with keys player_id, name, position, current_team
        and jersey_number (an int, or None when the player has no number).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    url = f"{SLEEPER_REST_API}/v1/players/nfl"
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error body.
    response.raise_for_status()
    # The payload is a mapping whose values are the player records.
    players_by_id = response.json()
    players = []
    for player in players_by_id.values():
        number = player.get("number")
        players.append({
            "player_id": player.get("player_id"),
            "name": player.get("full_name"),
            "position": player.get("position"),
            "current_team": player.get("team"),
            "jersey_number": int(number) if number is not None else None
        })
    return players


def get_player_week_stats(player_id: str, season: int, timeout: float = 10.0) -> List[Dict]:
    """Fetch one player's regular-season stats for a season, grouped by week.

    Args:
        player_id: Sleeper player identifier.
        season: Season year to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block forever on a stalled connection.

    Returns:
        The non-null values of the response mapping, one entry per week that
        has data. An empty list when the response body is null or empty.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    url = f"{SLEEPER_REST_API}/stats/nfl/player/{player_id}?season_type=regular&season={season}&grouping=week"
    # Short pause before every call; presumably a courtesy rate limit.
    time.sleep(0.1)
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # A null body would otherwise crash on `.values()`.
    stats_by_week = response.json() or {}
    # Entries can be null (the original code filtered them too); drop them.
    return [entry for entry in stats_by_week.values() if entry is not None]


def _stat_as_int(stats: Dict, key: str) -> int:
    """Return `stats[key]` as an int, treating a missing or null value as 0."""
    return int(stats.get(key) or 0)


def transform_week_stats(raw_week_stats: List[Dict]) -> List[Dict]:
    """Flatten raw weekly stat records into one row per player-week.

    Args:
        raw_week_stats: Records as returned by `get_player_week_stats`. Each
            must carry a `date` string formatted as YYYY-MM-DD.

    Returns:
        A list of flat dicts with identifiers (stat_id, week_id, player_id,
        game_id), context (season, week, team, opponent, played_at) and the
        rushing/receiving counters as ints. `played_at` is the game date at
        midnight UTC, expressed in epoch milliseconds.

    Raises:
        TypeError: If a record has no `date`.
        ValueError: If `date` is not formatted as YYYY-MM-DD.
    """
    rows = []
    for raw in raw_week_stats:
        season = raw.get("season")
        week = raw.get("week")
        player_id = raw.get("player_id")
        # `stats` can be missing or explicitly null; normalise both.
        stats = raw.get("stats") or {}
        # Pin the date to UTC. A naive datetime's `.timestamp()` is interpreted
        # in the machine's local timezone, which made `played_at` vary by host.
        played_on = datetime.datetime.strptime(raw.get("date"), "%Y-%m-%d").replace(
            tzinfo=datetime.timezone.utc
        )
        rows.append({
            # f-strings keep the ids working whether season is a str or an int.
            "stat_id": f"{season}_{week}_{player_id}",
            "week_id": f"{season}_{week}",
            "player_id": player_id,
            "game_id": raw.get("game_id"),
            "season": season,
            "week": week,
            "team": raw.get("team"),
            "opponent": raw.get("opponent"),
            "played_at": int(played_on.timestamp() * 1000),
            "rushing_attempts": _stat_as_int(stats, "rush_att"),
            "rushing_yards": _stat_as_int(stats, "rush_yd"),
            "receiving_targets": _stat_as_int(stats, "rec_tgt"),
            "receiving_catches": _stat_as_int(stats, "rec"),
            "receiving_yards": _stat_as_int(stats, "rec_yd"),
        })
    return rows

In [3]:
# Prompt template sent to the LLM for each news report.
# `{report_content}` is the only placeholder and is filled in with `str.format`,
# so any literal braces added to this text must be doubled (`{{` / `}}`).
# The allowed values listed in the prompt mirror ExpectedWorkloadChangeEnum and
# ReasonCategoryEnum; keep the three in sync when editing either side.
EXPECTED_WORKLOAD_CHANGE_PROMPT = """
You are an expert in semantics and NFL football.

Read the following news report about an NFL player and extract structured data that captures what
the report thinks will happen to that player's workload in the next game.

Your output should be JSON with the following fields:
- "target_player"
  - The name of the player the report focuses on
- "expected_workload_change"
  - The expected change in workload (carries, targets) for the target player
  - Value can be: "higher", "lower", "same", or "unknown"
- "reason"
  - A brief description (less than 20 words) of why the report thinks that workload change will happen
- "reason_category"
  - Categorization of the primary reason for the expected workload change
  - Value can be one of the following:
    - "promotion" if the target player is getting higher workload due to good play
    - "demotion" if the target player is getting lower workload due to poor play
    - "teammate_injury" if the target player is getting higher workload due to a teammate's injury
    - "own_injury" if the target player is getting lower workload due to their own injury
    - "strong_opponent" if the opponent is expected to be stronger against the team
    - "weak_opponent" if the opponent is expected to be weaker against the team
    - "rumor" if based on a rumor about players, coaches, or strategy
    - "no_change" if workload is expected to be the same

News Report:
{report_content}

Output:
"""


# Allowed values for the "expected_workload_change" field of the LLM output;
# the same four values are spelled out in EXPECTED_WORKLOAD_CHANGE_PROMPT.
# NOTE(review): documented with comments rather than a docstring because a
# class docstring would presumably surface in the JSON schema generated from
# ExpectedWorkloadChange — confirm before adding one.
class ExpectedWorkloadChangeEnum(enum.Enum):
    higher = "higher"
    lower = "lower"
    same = "same"
    unknown = "unknown"


# Allowed values for the "reason_category" field of the LLM output; each value
# is explained to the model in EXPECTED_WORKLOAD_CHANGE_PROMPT.
# NOTE(review): documented with comments rather than a docstring because a
# class docstring would presumably surface in the JSON schema generated from
# ExpectedWorkloadChange — confirm before adding one.
class ReasonCategoryEnum(enum.Enum):
    promotion = "promotion"
    demotion = "demotion"
    teammate_injury = "teammate_injury"
    own_injury = "own_injury"
    strong_opponent = "strong_opponent"
    weak_opponent = "weak_opponent"
    rumor = "rumor"
    no_change = "no_change"


# Shape of the JSON object the LLM is asked to return for one report.
# Its JSON schema (`model_json_schema()`) is passed to `ollama.chat` as the
# `format` argument in extract_expected_workload_change.
# NOTE(review): documented with comments rather than a docstring because a
# model docstring would presumably be emitted into that schema — confirm
# before adding one.
class ExpectedWorkloadChange(pydantic.BaseModel):
    # Name of the player the report focuses on.
    target_player: str
    # Direction of the expected workload change.
    expected_workload_change: ExpectedWorkloadChangeEnum
    # Short free-text justification.
    reason: str
    # Coarse category for the primary reason.
    reason_category: ReasonCategoryEnum


def strip_code_block(raw: str) -> str:
    """Remove a surrounding Markdown code fence from a model response.

    Handles an opening fence with or without the `json` language tag and a
    closing fence, tolerating whitespace around either. Text without a fence
    is returned unchanged apart from surrounding whitespace.

    The previous implementation used `lstrip`/`rstrip`, which treat their
    argument as a set of characters rather than a prefix/suffix: it turned
    "null" into "ull" and left the closing fence in place when a newline
    followed it.

    Args:
        raw: Text returned by the model.

    Returns:
        The text with the fence and surrounding whitespace removed.
    """
    text = raw.strip()
    # Check the tagged fence first so the tag is removed together with it.
    for opener in ("```json", "```"):
        if text.startswith(opener):
            text = text[len(opener):]
            break
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def extract_expected_workload_change(report_content: str) -> Dict:
    """Ask the local LLM what workload change a news report predicts.

    Fills EXPECTED_WORKLOAD_CHANGE_PROMPT with the report text, sends it as a
    single user message to the `gemma3:1b` model through `ollama.chat` with
    the ExpectedWorkloadChange JSON schema as `format`, and parses the reply.

    Args:
        report_content: Text of the report to analyse.

    Returns:
        The decoded JSON reply. It is not validated against
        ExpectedWorkloadChange here.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON after the code
            fence has been stripped.
    """
    user_message = {
        "role": "user",
        "content": EXPECTED_WORKLOAD_CHANGE_PROMPT.format(report_content=report_content),
    }
    response = ollama.chat(
        model="gemma3:1b",
        messages=[user_message],
        format=ExpectedWorkloadChange.model_json_schema(),
    )
    # The model may wrap its JSON in a Markdown fence; remove it before parsing.
    reply_text = strip_code_block(response.message.content)
    return json.loads(reply_text)


def augment_with_expected_workload_change(reports: List[Dict]) -> List[Dict]:
    """Attach the LLM's workload prediction to every prospective report.

    Reports whose `category` is not "prospective" are dropped from the result.
    For the rest, title and description are joined with a newline and sent to
    `extract_expected_workload_change`.

    Args:
        reports: Flat report dicts as produced by `transform_reports`.

    Returns:
        A new list holding a copy of each prospective report extended with
        expected_workload_change (defaulting to "unknown"), reason_category
        and reason.
    """
    augmented = []
    for report in reports:
        if report.get("category") == "prospective":
            content = "\n".join([
                f"{report.get('title')}",
                f"{report.get('description')}",
            ])
            prediction = extract_expected_workload_change(report_content=content)
            extra_fields = {
                "expected_workload_change": prediction.get("expected_workload_change", "unknown"),
                "reason_category": prediction.get("reason_category"),
                "reason": prediction.get("reason"),
            }
            augmented.append({**report, **extra_fields})
    return augmented

In [4]:
# SQL templates that create a table from a data file, one per supported file
# format. `{table_name}` and `{filename}` are filled in with `str.format` by
# create_table, i.e. spliced into the SQL text as-is, so both values must come
# from trusted configuration.
CREATE_FROM_CSV_QUERY = """
CREATE TABLE {table_name} AS
SELECT *
FROM read_csv_auto('{filename}')
""".strip()

CREATE_FROM_JSON_QUERY = """
CREATE TABLE {table_name} AS
SELECT *
FROM read_json_auto('{filename}')
""".strip()

CREATE_FROM_PARQUET_QUERY = """
CREATE TABLE {table_name} AS
SELECT *
FROM read_parquet('{filename}')
""".strip()

# Maps the "format" value of a table config to its template; the keys are also
# the list of valid formats reported in create_table's error message.
CREATE_TEMPLATE_BY_FORMAT = {
    "csv": CREATE_FROM_CSV_QUERY,
    "json": CREATE_FROM_JSON_QUERY,
    "parquet": CREATE_FROM_PARQUET_QUERY
}


def create_table(con, config: Dict):
    """Create one table from a data file described by `config`.

    Args:
        con: Open database connection exposing `execute(sql)`.
        config: Mapping with three required keys: "name" (table name), "path"
            (file to load) and "format" (a key of CREATE_TEMPLATE_BY_FORMAT).

    Raises:
        ValueError: If a required key is missing or empty, or if the format is
            not one of the supported ones.
    """
    # Checked in this order so the first missing key is the one reported.
    for key in ("name", "path", "format"):
        if not config.get(key):
            raise ValueError(f"Must specify table {key}.")

    table_format = config["format"]
    if table_format not in CREATE_TEMPLATE_BY_FORMAT:
        valid_formats = ", ".join(CREATE_TEMPLATE_BY_FORMAT.keys())
        error = f"Invalid format `{table_format}`. Must be one of: {valid_formats}."
        raise ValueError(error)

    # The path lands inside a single-quoted SQL string literal, so a quote in
    # the path must be doubled or it would terminate the literal early.
    # `str()` keeps path-like objects working.
    escaped_path = str(config["path"]).replace("'", "''")
    template = CREATE_TEMPLATE_BY_FORMAT[table_format]
    create_query = template.format(table_name=config["name"], filename=escaped_path)
    con.execute(create_query)


def create_database(database_name: str, table_configs: List[Dict]):
    """Rebuild a DuckDB database file from scratch.

    Any existing file at `database_name` is deleted first, then each entry of
    `table_configs` is turned into a table through `create_table`.

    Args:
        database_name: Path of the database file to (re)create.
        table_configs: One config mapping per table; see `create_table`.
    """
    database_already_exists = os.path.exists(database_name)
    if database_already_exists:
        # Start clean so CREATE TABLE does not collide with an earlier run.
        os.remove(database_name)

    with duckdb.connect(database_name) as connection:
        for config in table_configs:
            create_table(connection, config)

In [5]:
# Download the full player list and persist it as CSV.
df_players = pd.DataFrame(get_all_players())
# get_all_players yields None for players without a number; the nullable
# Int64 dtype keeps the column integer-typed despite those gaps.
df_players.jersey_number = df_players.jersey_number.astype("Int64")
df_players.to_csv("../data/processed/player.csv", index=False)

In [6]:
# Fetch, flatten and persist the 2025 weekly stats for a single player ("4147").
raw_stats = get_player_week_stats(player_id="4147", season=2025)
stats = transform_week_stats(raw_stats)
df_stats = pd.DataFrame(stats)
df_stats.to_csv("../data/processed/player_week_data.csv", index=False)

In [7]:
# Fetch the 10 latest news reports for the same player, flatten them, and let
# the LLM annotate each one with its expected workload change.
raw_reports = get_player_reports(player_id="4147", limit=10)
reports = transform_reports(raw_reports)
# Only reports categorised as "prospective" survive this step.
reports_with_expected = augment_with_expected_workload_change(reports)
df_reports = pd.DataFrame(reports_with_expected)
df_reports.to_csv("../data/processed/report_prediction.csv", index=False)

In [8]:
# Path of the DuckDB database file built from the CSVs written above.
DATABASE = "../database.db"
# One entry per table: table name, source file and file format
# (the format must be a key of CREATE_TEMPLATE_BY_FORMAT).
TABLES_TO_CREATE = [
    {
        "name": "player_week_data",
        "path": "../data/processed/player_week_data.csv",
        "format": "csv",
    },
    {
        "name": "report_prediction",
        "path": "../data/processed/report_prediction.csv",
        "format": "csv",
    },
    {
        "name": "player",
        "path": "../data/processed/player.csv",
        "format": "csv",
    }
]

# create_database deletes any existing file at DATABASE first, so re-running
# this cell rebuilds the database from scratch.
create_database(database_name=DATABASE, table_configs=TABLES_TO_CREATE)

In [9]:
# Sanity check: print a five-row sample of each table that was just created.
with duckdb.connect(DATABASE) as con:
    # Players that currently have a team.
    con.sql("""
    SELECT *
    FROM player
    WHERE current_team IS NOT NULL
    LIMIT 5
    ;
    """).show()

    # Weekly rushing stats.
    con.sql("""
    SELECT
        player_id,
        season,
        week,
        team,
        opponent,
        played_at,
        rushing_attempts,
        rushing_yards,
    FROM player_week_data
    LIMIT 5
    ;
    """).show()

    # LLM-derived workload predictions per report.
    con.sql("""
    SELECT
        player_id,
        published_at,
        expected_workload_change,
        reason_category,
    FROM report_prediction
    LIMIT 5
    ;
    """).show()

┌───────────┬───────────────────┬──────────┬──────────────┬───────────────┐
│ player_id │       name        │ position │ current_team │ jersey_number │
│  varchar  │      varchar      │ varchar  │   varchar    │     int64     │
├───────────┼───────────────────┼──────────┼──────────────┼───────────────┤
│ 8478      │ Samuel Womack     │ CB       │ NYJ          │            39 │
│ 1408      │ Le'Veon Bell      │ RB       │ TB           │             6 │
│ 2091      │ Bashaud Breeland  │ CB       │ ARI          │            24 │
│ 11533     │ Brandon Aubrey    │ K        │ DAL          │            17 │
│ 2064      │ DeMarcus Lawrence │ DE       │ SEA          │             0 │
└───────────┴───────────────────┴──────────┴──────────────┴───────────────┘

┌───────────┬────────┬───────┬─────────┬──────────┬───────────────┬──────────────────┬───────────────┐
│ player_id │ season │ week  │  team   │ opponent │   played_at   │ rushing_attempts │ rushing_yards │
│   int64   │ int64  │ int64 │ va