In [5]:
import datetime
import duckdb
import enum
import json
import ollama
import os
import pandas as pd
import pydantic
import requests
import time
import tqdm
from typing import Dict, List

In [6]:
# Relative path of the DuckDB database file used throughout this notebook.
DATABASE = "../database.db"

# Template that removes a table when it exists; `{table_name}` is filled in
# with `str.format`. The surrounding newlines are part of the template text.
DROP_IF_EXISTS_QUERY = "\nDROP TABLE IF EXISTS {table_name}\n"

# Template that builds a table from a CSV file through `read_csv_auto`;
# expects `{table_name}` and `{filename}`.
CREATE_FROM_CSV_QUERY = (
    "CREATE TABLE {table_name} AS\n"
    "SELECT *\n"
    "FROM read_csv_auto('{filename}')"
)

# Supported source formats mapped to the CREATE template used for each.
CREATE_TEMPLATE_BY_FORMAT = dict(csv=CREATE_FROM_CSV_QUERY)


def create_table(con, config: Dict):
    """Drop and recreate a single table from a source file.

    Args:
        con: Open database connection; only its ``execute(sql)`` method is used.
        config: Table settings with the keys ``name`` (target table name),
            ``path`` (source file) and ``format`` (a key of
            ``CREATE_TEMPLATE_BY_FORMAT``).

    Raises:
        ValueError: If ``name``, ``path`` or ``format`` is missing/empty, or
            if ``format`` is not a supported format.
    """
    table_name = config.get("name")
    if not table_name:
        raise ValueError("Must specify table name.")

    table_path = config.get("path")
    if not table_path:
        raise ValueError("Must specify table path.")

    table_format = config.get("format")
    if not table_format:
        raise ValueError("Must specify table format.")
    if table_format not in CREATE_TEMPLATE_BY_FORMAT:
        valid_formats = ", ".join(CREATE_TEMPLATE_BY_FORMAT.keys())
        error = f"Invalid format `{table_format}`. Must be one of: {valid_formats}."
        raise ValueError(error)

    # NOTE(review): the table name is inserted verbatim as an identifier, so it
    # must come from trusted notebook code.
    con.execute(DROP_IF_EXISTS_QUERY.format(table_name=table_name))

    # The path lands inside a single-quoted SQL string literal. Double any
    # embedded single quote so a path such as "o'brien.csv" cannot terminate
    # the literal early and break (or alter) the statement.
    escaped_path = str(table_path).replace("'", "''")
    template = CREATE_TEMPLATE_BY_FORMAT[table_format]
    con.execute(template.format(table_name=table_name, filename=escaped_path))


def create_tables(database_name: str, table_configs: List[Dict]):
    """Build every configured table over one shared DuckDB connection.

    Args:
        database_name: Database passed to ``duckdb.connect``.
        table_configs: One settings dict per table; each is handed to
            ``create_table`` in list order.
    """
    with duckdb.connect(database_name) as db:
        for settings in table_configs:
            create_table(db, settings)

In [7]:
# Workload label vocabulary (the same four values ExpectedWorkloadEnum defines).
LABELS_WORKLOAD = [
    "low",
    "medium",
    "high",
    "unknown",
]

# Reason label vocabulary (the same four values ReasonCategoryEnum defines).
LABELS_REASON = [
    "performance",
    "injury",
    "opponent",
    "unknown",
]


# Workload buckets allowed for the `expected_workload` field of
# ExpectedWorkloadReport.
# NOTE(review): documented with comments rather than a docstring because this
# class feeds `ExpectedWorkloadReport.model_json_schema()`; pydantic may copy a
# class docstring into the generated schema — confirm before adding one.
class ExpectedWorkloadEnum(enum.Enum):
    high = "high"
    medium = "medium"
    low = "low"
    unknown = "unknown"


# Reason categories allowed for the `reason_category` field of
# ExpectedWorkloadReport.
# NOTE(review): documented with comments rather than a docstring because this
# class feeds `ExpectedWorkloadReport.model_json_schema()`; pydantic may copy a
# class docstring into the generated schema — confirm before adding one.
class ReasonCategoryEnum(enum.Enum):
    performance = "performance"
    injury = "injury"
    opponent = "opponent"
    unknown = "unknown"


# Structured reply requested from the model: the JSON schema of this class is
# passed as `format=` to `ollama.chat` in `extract_expected_workload_change`.
# NOTE(review): kept docstring-free on purpose; pydantic may surface a class
# docstring in the generated schema — confirm before adding one.
class ExpectedWorkloadReport(pydantic.BaseModel):
    expected_workload: ExpectedWorkloadEnum  # high / medium / low / unknown
    reason_category: ReasonCategoryEnum  # performance / injury / opponent / unknown


def strip_code_block(raw: str) -> str:
    """Remove a surrounding Markdown code fence from a model reply.

    Handles replies wrapped as ```` ```json ... ``` ```` or ```` ``` ... ``` ````
    as well as unfenced text. Only the literal fence markers are removed; the
    payload itself is never trimmed character-by-character, so a bare reply
    such as ``null`` is returned intact.

    Args:
        raw: Text returned by the model.

    Returns:
        The payload with the fence markers and surrounding whitespace removed.
    """
    fence = "```"
    text = raw.strip()

    # Drop the opening fence, together with its "json" language tag if present.
    # `str.lstrip("```json")` would instead strip any run of the characters
    # {`, j, s, o, n}, which eats into payloads like `null`.
    for opening in (fence + "json", fence):
        if text.lower().startswith(opening):
            text = text[len(opening):]
            break

    # Drop the closing fence. Whitespace was stripped first, so a trailing
    # newline after the fence no longer hides it.
    if text.endswith(fence):
        text = text[: -len(fence)]

    return text.strip()


def extract_expected_workload_change(model_config: Dict, report: Dict) -> Dict:
    """Query the configured Ollama model for one report's workload prediction.

    Args:
        model_config: Settings read with ``.get``: ``model``,
            ``prompt_template`` (formatted with ``player_name`` and
            ``report_content``), ``think``, ``temperature``, ``top_p`` and the
            optional ``is_debug`` flag (default ``False``).
        report: Record providing ``player_name`` and ``description``.

    Returns:
        The model's reply parsed from JSON.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON after the code
            fence has been stripped.
    """
    debug_enabled = model_config.get("is_debug", False)
    think_setting = model_config.get("think")

    prompt = model_config.get("prompt_template").format(
        player_name=report.get("player_name"),
        report_content=report.get("description"),
    )

    # The reply is constrained to the ExpectedWorkloadReport JSON schema.
    response = ollama.chat(
        model=model_config.get("model"),
        messages=[{"role": "user", "content": prompt}],
        format=ExpectedWorkloadReport.model_json_schema(),
        think=think_setting,
        options={
            "temperature": model_config.get("temperature"),
            "top_p": model_config.get("top_p"),
        },
    )
    reply_text = response.message.content
    # `total_duration` is divided by 1e9 and reported as seconds.
    elapsed_secs = response.total_duration / 1_000_000_000

    if debug_enabled:
        print(f"Total Duration: {elapsed_secs:.1f} secs")
        if think_setting is not None:
            print()
            print(response.message.thinking)

    parsed = json.loads(strip_code_block(reply_text))

    if debug_enabled:
        print()
        print(json.dumps(parsed))
    return parsed


def process_report(model_config: Dict, r: Dict) -> Dict:
    """Turn one report into a flat prediction row.

    Args:
        model_config: Settings forwarded to ``extract_expected_workload_change``.
        r: Report record; its ``report_id`` is copied into the row.

    Returns:
        Dict with ``report_id``, ``expected_workload`` and ``reason_category``;
        either label falls back to ``"unknown"`` when absent from the
        extracted data.
    """
    prediction = extract_expected_workload_change(model_config=model_config, report=r)
    row = {"report_id": r.get("report_id")}
    for label in ("expected_workload", "reason_category"):
        row[label] = prediction.get(label, "unknown")
    return row


def get_report_predictions(model_config: Dict, reports: List[Dict]) -> List[Dict]:
    """Run ``process_report`` over every report behind a tqdm progress bar.

    Args:
        model_config: Settings forwarded to ``process_report`` for each report.
        reports: Report records, processed one at a time in list order.

    Returns:
        One prediction row per report, in input order.
    """
    pending = (process_report(model_config, report) for report in reports)
    progress = tqdm.tqdm(pending, total=len(reports))
    return list(progress)

In [8]:
# Ollama model tag used for the extraction run.
MODEL_GPT = "gpt-oss:20b"

# Few-shot extraction prompt. `{player_name}` and `{report_content}` are filled
# in with `str.format`, so literal braces in the JSON examples are doubled.
#
# The label values listed here must stay in sync with ExpectedWorkloadEnum and
# ReasonCategoryEnum, because the reply is constrained to the JSON schema built
# from those enums. The prompt previously also offered a "rumor" reason that
# the schema cannot express, and showed single-quoted (non-JSON) example
# outputs; both are aligned with the schema below.
PROMPT_TEST = """
# Instructions

You are an expert in NFL football and semantics.

Read the following news report about an NFL player and extract structured data that captures what the report
thinks that player's workload will be in the next game.

If multiple players are mentioned in the report, only extract the expected workload of the target player.

In fantasy sports terminology, news reports may refer to players like this:
- RB1: The top‑tier running back on a team, usually the highest projected scorer.  
- RB2: The second‑best running back, next in value after RB1.  
- RB3: The third running back, often a bench or flex candidate.  
- Flex: A slot that can hold a RB, WR, or TE; the spot you fill with the best available from those positions.  
- High‑end flex: A star‑level player (often a top RB or WR) placed in the flex slot to maximize upside.  
- Low‑end flex: A lower‑tier or backup player used in flex when you’re not targeting a high‑scoring option.

Your output should be JSON with the following fields:
- "expected_workload"
  - The expected workload in the next game (carries, targets) for the target player
  - Value can be one of the following:
    - "high" if significant workload (20+ touches), especially for the lead player in their role
    - "medium" if workload is in the middle (10-20 touches), especially when split with another key player
    - "low" if even less workload than that (<10 touches) or no workload at all
    - "unknown" if it cannot be determined
- "reason_category"
  - Categorization of the primary reason for the expected workload change
  - Value can be one of the following:
    - "performance" because of the target player or another player's strong or weak play
    - "injury" because of an injury to or return from injury by the target player or another player
    - "opponent" because of the strength or weakness of the team they are playing against next week
    - "unknown" if the reason cannot be determined

# Examples

## Example 1

Target Player:
J.K. Dobbins

Report:
Denver Broncos running back J.K. Dobbins delivered a solid performance in Week 3 despite his team's last-second 23-20 loss to the Los Angeles Chargers. Dobbins was effective on the ground, rushing for 83 yards and a touchdown on 11 carries, including a 41-yard run that helped set up his score. He also recorded one catch for zero yards. After three games, Dobbins has been highly productive, averaging 5.4 yards per carry with over 220 rushing yards and three touchdowns. He seems to be secured as the lead back and is likely to continue seeing a significant workload as rookie R.J. Harvey struggles to find his rhythm. Dobbins can be viewed as a reliable RB2 option moving forward.

Output:
{{
  "expected_workload": "high",
  "reason_category": "performance"
}}

## Example 2

Target Player:
Justice Hill

Report:
Baltimore Ravens head coach John Harbaugh said on Monday that running back Justice Hill (neck) will be out three to four weeks with a disc issue in his neck, but there's still a chance that he will return this year, according to Jonas Shaffer of The Baltimore Sun. Hill injured his neck in practice early last week and landed on Injured Reserve before the Thanksgiving Day loss to the Cincinnati Bengals on Thursday night. The 28-year-old pass-catching back will miss at least three more games, but he will be eligible to return from IR in Week 17 on the road at Lambeau Field against the Green Bay Packers. Keaton Mitchell was starting to take on a bigger role behind starter Derrick Henry even before Hill was injured, so if he does return before the end of the 2025 regular season, Hill could be Baltimore's RB3 on passing downs. He only has 18 rushing attempts on the year for 93 yards and two TDs, adding 169 receiving yards and one score.

Output:
{{
  "expected_workload": "low",
  "reason_category": "injury"
}}

## Example 3

Target Player:
Devin Singletary

Report:
New York Giants running back Devin Singletary had 14 carries for 47 yards in Week 12's 34-27 loss in overtime to the Detroit Lions. While the 14 carries appear encouraging, the 3.36 YPC does not. Additionally, Tyrone Tracy Jr. played 71 percent of the snaps and out-touched Singletary 23 to 14. Additionally, Tracy Jr. continues to be a running back involved in the passing game and getting a majority of the carries, with Singletary being brought in as a goal-line or change-of-pace back. Given Singletary's 14 carries and the possibility of being the team's goal-line back, he is still worth a hold in fantasy, but not someone that can confidently be started unless we see a shift in snap percentage or Tracy were to go down with an injury. Week 13 presents a challenging task: going to Foxboro to take on New England's stiff run defense, which has allowed the fewest fantasy points to opposing running backs this season. Singletary will not be a recommended start in Week 13, but could be held on fantasy managers' bench if they have the spot available.

Output:
{{
  "expected_workload": "medium",
  "reason_category": "opponent"
}}

# Your Turn

Target Player:
{player_name}

Report:
{report_content}

Output:
"""

# Settings read by `extract_expected_workload_change`: model tag, thinking
# level, prompt template and the sampling options passed to `ollama.chat`.
MODEL_CONFIG = dict(
    model=MODEL_GPT,
    think="low",
    prompt_template=PROMPT_TEST,
    temperature=0.25,
    top_p=0.5,
)

In [10]:
# Fetch the comparable reports for running backs / fullbacks together with the
# player name and report text. The frame is materialized inside the `with`
# block, before the connection is closed.
with duckdb.connect(DATABASE) as con:
    df_rows = con.sql("""
    SELECT
        c.report_id,
        p.name AS player_name,
        r.description,
    FROM comparable_report c
    LEFT JOIN report r
        ON c.report_id = r.report_id
    LEFT JOIN player p
        ON r.player_id = p.player_id
    WHERE
        p.position IN ('RB', 'FB')
    ;
    """).df()

comparable_reports = df_rows.to_dict(orient="records")

# One model call per report; the saved progress output of this cell shows
# 2040 reports taking roughly 69 minutes.
report_predictions = get_report_predictions(
    model_config=MODEL_CONFIG,
    reports=comparable_reports,
)

# Persist the predictions so they can be loaded into DuckDB afterwards.
df_report_predictions = pd.DataFrame(report_predictions)
df_report_predictions.to_csv("../data/processed/report_prediction.csv", index=False)

100%|█████████████████████████████████████| 2040/2040 [1:08:34<00:00,  2.02s/it]


In [15]:
# Load the saved predictions CSV into the database as `report_prediction`
# (create_table drops an existing table of that name first).
report_prediction_table = {
    "name": "report_prediction",
    "path": "../data/processed/report_prediction.csv",
    "format": "csv",
}
create_tables(database_name=DATABASE, table_configs=[report_prediction_table])

In [16]:
# Sanity-check the loaded table: total row count, then a five-row preview.
with duckdb.connect(DATABASE) as con:
    row_count = con.sql("""
    SELECT
        count(1)
    FROM report_prediction
    ;
    """)
    row_count.show()

    preview = con.sql("""
    SELECT
        report_id,
        expected_workload,
        reason_category,
    FROM report_prediction
    LIMIT 5
    ;
    """)
    preview.show()

┌──────────┐
│ count(1) │
│  int64   │
├──────────┤
│     2040 │
└──────────┘

┌───────────────────┬───────────────────┬─────────────────┐
│     report_id     │ expected_workload │ reason_category │
│      varchar      │      varchar      │     varchar     │
├───────────────────┼───────────────────┼─────────────────┤
│ rotoballer_207073 │ low               │ unknown         │
│ rotoballer_203497 │ low               │ injury          │
│ rotoballer_209773 │ low               │ injury          │
│ rotoballer_204845 │ low               │ injury          │
│ rotoballer_204717 │ low               │ performance     │
└───────────────────┴───────────────────┴─────────────────┘

