In [22]:
import duckdb
import pandas as pd

In [41]:
DATABASE = "../database.db"

# Excluded FB from training data because they only have three comparable reports.
# Excluded QB and K because they don't really fit the paradigm
ALLOWED_POSITIONS = ["RB", "WR", "TE"]

In [45]:
with duckdb.connect(DATABASE) as con:
    con.sql("""
    SELECT
        p.position,
        count(DISTINCT c.report_id) AS reports
    FROM comparable_report c
    LEFT JOIN report r
        ON c.report_id = r.report_id
    LEFT JOIN player p
        ON r.player_id = p.player_id
    GROUP BY
        p.position
    ORDER BY
        reports DESC
    ;
    """).show()

┌──────────┬─────────┐
│ position │ reports │
│ varchar  │  int64  │
├──────────┼─────────┤
│ WR       │    2553 │
│ RB       │    2037 │
│ QB       │    1156 │
│ TE       │    1082 │
│ K        │     276 │
│ FB       │       3 │
└──────────┴─────────┘



In [46]:
training_reports = []
with duckdb.connect(DATABASE) as con:
    for position in ALLOWED_POSITIONS:
        query = """
        WITH
        random_sample AS (
            SELECT
                c.report_id,
            FROM comparable_report c
            LEFT JOIN report r
                ON c.report_id = c.report_id
            LEFT JOIN player p
                ON r.player_id = p.player_id
            WHERE p.position = '{position}'
            ORDER BY random()
            LIMIT 10
        )
        SELECT
            s.report_id,
            p.player_id,
            p.name,
            p.position,
            rga.upcoming_game_id AS game_id,
            pga.season,
            pga.week,
            pga.team,
            pga.opponent,
            strftime(to_timestamp(pga.started_at / 1000), '%Y-%m-%d %I:%M %p') AS played_at,
            strftime(to_timestamp(r.published_at / 1000), '%Y-%m-%d %I:%M %p') AS published_at,
            r.title,
            r.description,
        FROM random_sample s
        LEFT JOIN report r
            ON s.report_id = r.report_id
        LEFT JOIN player p
            ON r.player_id = p.player_id
        LEFT JOIN report_game_assignment rga
            ON r.report_id = rga.report_id
        LEFT JOIN player_game_assignment pga
            ON rga.upcoming_game_id = pga.game_id
            AND r.player_id = pga.player_id
        ;
        """.format(position=position)
        cur = con.sql(query)
        df_rows = cur.df()
        records = df_rows.to_dict(orient="records")
        training_reports.extend(records)

df_training_reports = pd.DataFrame(training_reports)
df_training_reports.player_id = df_training_reports.player_id.astype("string")
df_training_reports.to_csv("../data/processed/training_report.csv", index=False)

In [47]:
df_training_reports.head()

Unnamed: 0,report_id,player_id,name,position,game_id,season,week,team,opponent,played_at,published_at,title,description
0,rotoballer_202351,6806,J.K. Dobbins,RB,202510410,2025,4,DEN,CIN,2025-09-29 07:15 PM,2025-09-21 11:27 PM,J.K. Dobbins Scores Again in Loss,Denver Broncos running back J.K. Dobbins deliv...
1,rotoballer_206270,4988,Nick Chubb,RB,202510913,2025,9,HOU,DEN,2025-11-02 12:00 PM,2025-10-31 08:23 PM,Nick Chubb Is a Risky Flex Play in Week 9,Houston Texans running back Nick Chubb continu...
2,rotoballer_203836,7021,Rico Dowdle,RB,202510605,2025,6,CAR,DAL,2025-10-12 12:00 PM,2025-10-07 09:59 AM,Rico Dowdle Dominates in Week 5; Earns More Time?,Carolina Panthers running back Rico Dowdle had...
3,rotoballer_210136,12492,Pat Bryant,WR,202511425,2025,14,DEN,LV,2025-12-07 03:05 PM,2025-12-07 07:56 AM,Pat Bryant Offers Low-End Flex Value,Denver Broncos wide receiver Pat Bryant is sta...
4,rotoballer_202860,6813,Jonathan Taylor,RB,202510432,2025,4,IND,LAR,2025-09-28 03:05 PM,2025-09-27 07:30 AM,Jonathan Taylor a Top-Six Option in Week 4,Indianapolis Colts running back Jonathan Taylo...


In [48]:
df_training_reports.shape

(50, 13)