In [5]:
import os
import sys
import dspy
import rich
import polars as pl
from pathlib import Path

## Load the Full Pathology Report Dataset

In [7]:
# Load the full set of TCGA pathology reports (columns: `patient_filename`, `text`).
REPORTS_CSV = "../../data/tcga/TCGA_reports.csv"
# Row count of the dataset snapshot this notebook was written against.
EXPECTED_REPORT_COUNT = 9523

df_path_reports = pl.read_csv(REPORTS_CSV)
# Fail loudly, and report the actual count, if the CSV is not the expected snapshot.
assert len(df_path_reports) == EXPECTED_REPORT_COUNT, (
    f"expected {EXPECTED_REPORT_COUNT} pathology reports in {REPORTS_CSV}, "
    f"found {len(df_path_reports)}"
)
df_path_reports.head(10)

patient_filename,text
str,str
"""TCGA-BP-5195.25c0b433-5557-416…","""Date of Recelpt: Clinical Diag…"
"""TCGA-D7-8573.b7306a47-697d-4ed…","""Material: 1) Material: stomach…"
"""TCGA-EI-7004.13591eed-30e5-47a…","""page 1 / 1. copy No. 3. Examin…"
"""TCGA-EB-A82B.23E186C6-739C-4EF…","""Patient ID: Gross Description:…"
"""TCGA-A6-3808.e1505f65-72ef-438…","""SPECIMEN. Right colon. CLINICA…"
"""TCGA-29-1761.63defa27-e729-451…","""CLINICAL HISTORY: Pelvic organ…"
"""TCGA-24-1616.d8c764d9-0fe8-488…","""Address: : Physician (s) : Oth…"
"""TCGA-IN-A6RO.C0E5F045-3F12-45C…","""FINAL DIAGNOSIS: PART 1: ESOPH…"
"""TCGA-44-8119.c40ea4c8-8cbe-4a4…","""Final Surgical Pathology Repor…"
"""TCGA-KK-A6E8.3716A52E-E0B7-4FE…","""MODIFIED REPORT - REVIEW ADDEN…"


In [10]:
# Read the first report straight from the `text` column. Indexing the Series
# avoids converting every report into a Python list just to take element 0.
report = df_path_reports["text"][0]
rich.print(report)

## Load the Annotated Dataset

In [11]:
# Read the patient -> cancer-type label table, then preview its first ten rows.
df_annotated = pl.read_csv(
    source="../../tcga-path-reports/data/tcga_metadata/tcga_patient_to_cancer_type.csv",
)
df_annotated.head(n=10)

patient_id,cancer_type
str,str
"""TCGA-OR-A5J1""","""ACC"""
"""TCGA-OR-A5J2""","""ACC"""
"""TCGA-OR-A5J3""","""ACC"""
"""TCGA-OR-A5J4""","""ACC"""
"""TCGA-OR-A5J5""","""ACC"""
"""TCGA-OR-A5J6""","""ACC"""
"""TCGA-OR-A5J7""","""ACC"""
"""TCGA-OR-A5J8""","""ACC"""
"""TCGA-OR-A5J9""","""ACC"""
"""TCGA-OR-A5JA""","""ACC"""


In [12]:
# Pick a single annotated patient and pair its report text with its label.
# The ID is defined once so both lookups are guaranteed to target the same patient.
example_patient_id = "TCGA-OR-A5J1"

# `patient_filename` embeds the patient ID (e.g. "TCGA-BP-5195.<uuid>"), so match
# it as a literal substring rather than letting polars interpret it as a regex.
path_text = df_path_reports.filter(
    pl.col("patient_filename").str.contains(example_patient_id, literal=True)
)["text"][0]
# Labels are keyed by the bare patient ID, so an exact comparison is enough here.
cancer_type = df_annotated.filter(pl.col("patient_id") == example_patient_id)[
    "cancer_type"
][0]
print(cancer_type)
rich.print(path_text)

ACC


## Extract the Cancer Type with DSPy

In [13]:
# Build the language-model client; the key is read from the environment, so this
# raises KeyError when OPENAI_API_KEY is not set.
lm = dspy.LM(
    model="openai/gpt-4o-mini",
    api_key=os.environ["OPENAI_API_KEY"],
)
# Register the client as the language model DSPy uses from here on.
dspy.configure(lm=lm)

In [14]:
# DSPy signature declaring one input (`text`) and one output (`cancer_type`).
# NOTE(review): the class docstring and the `desc=` string are presumably sent to
# the language model as part of the prompt, so editing them changes model
# behaviour — treat them as runtime text, not documentation. TODO confirm.
class ExtractInfo(dspy.Signature):
    """Extract structured information from text."""

    # Input: the text to extract from (a pathology report in this notebook).
    text: str = dspy.InputField()
    # Output: typed as a plain `str`, so the answer is not restricted to a fixed
    # set of labels.
    cancer_type: str = dspy.OutputField(
        desc="the type of primary cancer diagnosis in the pathology report text"
    )


# Wrap the signature in a `dspy.Predict` module and run it on `path_text`.
module = dspy.Predict(ExtractInfo)
response = module(text=path_text)

# NOTE(review): the printed answer is free text, while the annotated label for
# this patient is the short code "ACC" — presumably the two need normalising to
# a common vocabulary before they can be compared. TODO confirm.
print(response.cancer_type)

adrenal cortical neoplasm of indeterminate malignant potential
