In [1]:
import duckdb
import pandas as pd

In [2]:
# Number of reports to draw for each allowed position.
SAMPLES = 20

# Relative path to the DuckDB database file that every query cell connects to.
DATABASE = "../database.db"

# Positions eligible for the training sample:
#   - FB is left out because it has only three comparable reports.
#   - QB and K are left out because they don't really fit the paradigm.
#   - WR and TE are candidates, but the focus is on RB alone for now
#     (full candidate list: ["RB", "WR", "TE"]).
ALLOWED_POSITIONS = ["RB"]

In [3]:
# Count the distinct reports that have comparables, grouped by player position,
# largest group first. This shows how much data each position offers.
reports_by_position_sql = """
    SELECT
        p.position,
        count(DISTINCT c.report_id) AS reports
    FROM comparable_report c
    LEFT JOIN report r
        ON c.report_id = r.report_id
    LEFT JOIN player p
        ON r.player_id = p.player_id
    GROUP BY
        p.position
    ORDER BY
        reports DESC
    ;
    """

with duckdb.connect(DATABASE) as con:
    reports_by_position = con.sql(reports_by_position_sql)
    # Render the result table as the cell's printed output.
    reports_by_position.show()

┌──────────┬─────────┐
│ position │ reports │
│ varchar  │  int64  │
├──────────┼─────────┤
│ WR       │    2553 │
│ RB       │    2037 │
│ TE       │    1082 │
│ FB       │       3 │
└──────────┴─────────┘



In [4]:
# Draw a random sample of up to SAMPLES reports for each allowed position,
# enrich every sampled report with player and game context, and write the
# combined result to CSV.
#
# NOTE: the draw is unseeded (ORDER BY random()), so every run selects a
# different set of reports and overwrites the CSV written at the end of the cell.

# The query text is identical for every position, so it is defined once and the
# position / sample size are bound as prepared-statement parameters (in that
# order) instead of being spliced into the SQL string.
query = """
WITH
candidate_report AS (
    -- One row per report. comparable_report may list the same report more than
    -- once (the per-position summary has to count DISTINCT report_id), and
    -- sampling the raw rows could draw the same report twice and would weight
    -- reports by how many comparables they have.
    SELECT DISTINCT
        c.report_id
    FROM comparable_report c
    LEFT JOIN report r
        ON c.report_id = r.report_id
    LEFT JOIN player p
        ON r.player_id = p.player_id
    WHERE
        p.position = ?
),
random_sample AS (
    SELECT
        report_id
    FROM candidate_report
    ORDER BY random()
    LIMIT ?
)
SELECT
    s.report_id,
    p.position,
    p.name AS player_name,
    pga.week,
    pga.team,
    pga.opponent,
    p.player_id,
    rga.upcoming_game_id AS game_id,
    pga.season,
    strftime(to_timestamp(pga.started_at / 1000), '%Y-%m-%d %I:%M %p') AS played_at,
    strftime(to_timestamp(r.published_at / 1000), '%Y-%m-%d %I:%M %p') AS published_at,
    r.title,
    r.description,
FROM random_sample s
LEFT JOIN report r
    ON s.report_id = r.report_id
LEFT JOIN player p
    ON r.player_id = p.player_id
LEFT JOIN report_game_assignment rga
    ON r.report_id = rga.report_id
LEFT JOIN player_game_assignment pga
    ON rga.upcoming_game_id = pga.game_id
    AND r.player_id = pga.player_id
;
"""

# Accumulates one dict per sampled report across all positions.
training_reports = []
with duckdb.connect(DATABASE) as con:
    for position in ALLOWED_POSITIONS:
        df_rows = con.execute(query, [position, SAMPLES]).df()
        training_reports.extend(df_rows.to_dict(orient="records"))

# Fail with a clear message instead of an opaque attribute error further down
# when nothing was sampled (no allowed positions, or no matching reports).
if not training_reports:
    raise ValueError(
        f"No training reports were sampled for positions {ALLOWED_POSITIONS!r}"
    )

df_training_reports = pd.DataFrame(training_reports)
# Store player_id as text in the exported file. Item assignment is used because
# attribute-style assignment is only safe for columns that already exist.
df_training_reports["player_id"] = df_training_reports["player_id"].astype("string")
df_training_reports.to_csv("../data/processed/training_report.csv", index=False)

In [5]:
# Preview the first five sampled reports as a sanity check on the export.
df_training_reports.head(n=5)

Unnamed: 0,report_id,position,player_name,week,team,opponent,player_id,game_id,season,played_at,published_at,title,description
0,rotoballer_203422,RB,Zavier Scott,5,MIN,CLE,11299,202510508,2025,2025-10-05 08:30 AM,2025-10-03 03:42 PM,Zavier Scott A High-Risk Bet After Breakout Game,"Following his best game in the NFL, Minnesota ..."
1,rotoballer_205347,RB,Isaiah Davis,8,NYJ,CIN,11571,202510807,2025,2025-10-26 12:00 PM,2025-10-22 09:43 AM,Isaiah Davis Emerging as Potential Late-Season...,New York Jets running back Isaiah Davis could ...
2,rotoballer_209979,RB,Jaylen Warren,14,PIT,BAL,8228,202511403,2025,2025-12-07 12:00 PM,2025-12-05 08:01 PM,Jaylen Warren a Rock-Solid RB2 Against Baltimore,Pittsburgh Steelers running back Jaylen Warren...
3,rotoballer_209783,RB,David Montgomery,14,DET,DAL,5892,202511411,2025,2025-12-04 07:15 PM,2025-12-03 08:42 PM,David Montgomery Remains a Touchdown-Dependent...,Detroit Lions running back David Montgomery re...
4,rotoballer_207762,RB,Ollie Gordon,11,MIA,WAS,12495,202511119,2025,2025-11-16 08:30 AM,2025-11-14 09:04 PM,Ollie Gordon II Ready to Roll in Week 11,Miami Dolphins running back Ollie Gordon II ha...


In [6]:
# Show the sample's dimensions as (rows, columns).
n_rows, n_columns = df_training_reports.shape
(n_rows, n_columns)

(20, 13)