In [55]:
import os
import sys
import dspy
import rich
import polars as pl
from pathlib import Path
from typing import List
import pickle

## Load the TCGA Cancer Types

In [42]:
# Lookup table with columns `abbrev` (TCGA study abbreviation) and
# `cancer_type_full` (the spelled-out cancer type name).
cancer_types_csv = "tcga_abbreviations.csv"
df_cancer_types = pl.read_csv(cancer_types_csv)
df_cancer_types

abbrev,cancer_type_full
str,str
"""LAML""","""Acute Myeloid Leukemia"""
"""ACC""","""Adrenocortical carcinoma"""
"""BLCA""","""Bladder Urothelial Carcinoma"""
"""LGG""","""Brain Lower Grade Glioma"""
"""BRCA""","""Breast invasive carcinoma"""
…,…
"""THYM""","""Thymoma"""
"""THCA""","""Thyroid carcinoma"""
"""UCS""","""Uterine Carcinosarcoma"""
"""UCEC""","""Uterine Corpus Endometrial Car…"


## Load the Full Pathology Report Dataset

In [43]:
# Number of pathology reports this notebook expects to find in TCGA_reports.csv.
EXPECTED_REPORT_COUNT = 9523

# One row per report: `patient_filename` and the report `text`.
df_path_reports = pl.read_csv("../../data/tcga/TCGA_reports.csv")
# Fail early, with a readable message, if the file does not have the expected size.
assert len(df_path_reports) == EXPECTED_REPORT_COUNT, (
    f"expected {EXPECTED_REPORT_COUNT} pathology reports, "
    f"found {len(df_path_reports)}"
)
df_path_reports.head(10)

patient_filename,text
str,str
"""TCGA-BP-5195.25c0b433-5557-416…","""Date of Recelpt: Clinical Diag…"
"""TCGA-D7-8573.b7306a47-697d-4ed…","""Material: 1) Material: stomach…"
"""TCGA-EI-7004.13591eed-30e5-47a…","""page 1 / 1. copy No. 3. Examin…"
"""TCGA-EB-A82B.23E186C6-739C-4EF…","""Patient ID: Gross Description:…"
"""TCGA-A6-3808.e1505f65-72ef-438…","""SPECIMEN. Right colon. CLINICA…"
"""TCGA-29-1761.63defa27-e729-451…","""CLINICAL HISTORY: Pelvic organ…"
"""TCGA-24-1616.d8c764d9-0fe8-488…","""Address: : Physician (s) : Oth…"
"""TCGA-IN-A6RO.C0E5F045-3F12-45C…","""FINAL DIAGNOSIS: PART 1: ESOPH…"
"""TCGA-44-8119.c40ea4c8-8cbe-4a4…","""Final Surgical Pathology Repor…"
"""TCGA-KK-A6E8.3716A52E-E0B7-4FE…","""MODIFIED REPORT - REVIEW ADDEN…"


In [44]:
# Print the first report in full to see what the raw text looks like.
# Index the column directly instead of converting every report to a Python
# list just to read the first one.
rich.print(df_path_reports["text"][0])

## Load the Patient - Cancer Type Dataset

In [45]:
# Table with columns `patient_id` and `cancer_type` (TCGA abbreviation).
patient_type_csv = (
    "../../tcga-path-reports/data/tcga_metadata/tcga_patient_to_cancer_type.csv"
)
df_patient_type = pl.read_csv(patient_type_csv)
print(df_patient_type.shape)
df_patient_type.head(10)

(11160, 2)


patient_id,cancer_type
str,str
"""TCGA-OR-A5J1""","""ACC"""
"""TCGA-OR-A5J2""","""ACC"""
"""TCGA-OR-A5J3""","""ACC"""
"""TCGA-OR-A5J4""","""ACC"""
"""TCGA-OR-A5J5""","""ACC"""
"""TCGA-OR-A5J6""","""ACC"""
"""TCGA-OR-A5J7""","""ACC"""
"""TCGA-OR-A5J8""","""ACC"""
"""TCGA-OR-A5J9""","""ACC"""
"""TCGA-OR-A5JA""","""ACC"""


## Final DataFrame: text, patient_id, cancer_type, and cancer_type_full

In [46]:
# `patient_filename` has the form "<patient_id>.<file id>": split it on "."
# into a two-field struct so the patient id can be used as a join key.
patient_and_file = (
    pl.col("patient_filename")
    .str.split_exact(".", 1)
    .struct.rename_fields(["patient_id", "filename"])
    .alias("fields")
)

# Expand the struct into real columns, then keep only `text` and `patient_id`.
df_split = (
    df_path_reports
    .with_columns(patient_and_file)
    .unnest("fields")
    .drop(["patient_filename", "filename"])
)
print(df_split.shape)
df_split.head()

(9523, 2)


text,patient_id
str,str
"""Date of Recelpt: Clinical Diag…","""TCGA-BP-5195"""
"""Material: 1) Material: stomach…","""TCGA-D7-8573"""
"""page 1 / 1. copy No. 3. Examin…","""TCGA-EI-7004"""
"""Patient ID: Gross Description:…","""TCGA-EB-A82B"""
"""SPECIMEN. Right colon. CLINICA…","""TCGA-A6-3808"""


In [48]:
# Step 1: attach the TCGA cancer-type abbreviation to every report via patient_id.
df_with_abbrev = df_split.join(df_patient_type, on="patient_id", how="inner")
print(df_with_abbrev.shape)
# An inner join silently drops reports without a match and duplicates reports
# whose key repeats on the right-hand side, so check the row count explicitly.
assert len(df_with_abbrev) == len(df_split), (
    f"patient/cancer-type join changed the row count: "
    f"{len(df_split)} -> {len(df_with_abbrev)}"
)

# Step 2: translate the abbreviation into the full cancer type name.
df_joined = df_with_abbrev.join(
    df_cancer_types, left_on="cancer_type", right_on="abbrev", how="inner"
)
print(df_joined.shape)
assert len(df_joined) == len(df_with_abbrev), (
    f"cancer-type name join changed the row count: "
    f"{len(df_with_abbrev)} -> {len(df_joined)}"
)
df_joined.head()

(9523, 3)
(9523, 4)


text,patient_id,cancer_type,cancer_type_full
str,str,str,str
"""Deliver To: NOS P. Cuncertain …","""TCGA-OR-A5J1""","""ACC""","""Adrenocortical carcinoma"""
"""Date of Birth: Sex: Female. Ma…","""TCGA-OR-A5J2""","""ACC""","""Adrenocortical carcinoma"""
"""Sex: Female. Macroscopy. One v…","""TCGA-OR-A5J3""","""ACC""","""Adrenocortical carcinoma"""
"""Sex: Female. Macroscopy. One v…","""TCGA-OR-A5J4""","""ACC""","""Adrenocortical carcinoma"""
"""Sex: Male. Macroscopy. One ves…","""TCGA-OR-A5J5""","""ACC""","""Adrenocortical carcinoma"""


In [49]:
# Convert the joined frame to a list with one dict per row (column name -> value).
# Despite its name, `df_dict` is a list of dicts, not a single dict.
df_dict: List[dict] = df_joined.to_dicts()
# Display the first record as a sanity check.
df_dict[0]

{'text': 'Deliver To: NOS P. Cuncertain inknown behavior 8370/1 A. SPECIMEN TYPE: Adrenal. Blopsy No: T. CLINICAL NOTES: H. Histopathology of L) adrenal gland. Conn\'s syndrome (note small section of. O. tumour excised for genetic studies). L. MACROSCOPIC: o. "Left adrenal. The specimen consists of an adrenal gland with a post fixation. weight 102.4g. It measures 110 x 70 x 60mm. A section of tissue has been. G. taken from one pole. On sectioning, a nodular lesion with a maximum diameter. Y. of 25 x 30mm, and extending for a length of 70mm, can be identified. The cut. surface of this lesion is irregular. it can be seen to be compressing the. adjacent normal adrenal tissue. Block 1: tissue near the excision where previous sampling for genetic studies. has already been performed. Blocks 2 and 3: lesion where it abuts the normal adrenal gland. Block 4: lesion with its closest proximity to the excision margin. Blocks 5-9: further blocks of lesion. MICROSCOPIC: Sections show an adrenal tumo

## Create a List of DSPy Examples

In [50]:
def prepare_dataset(dataset) -> List[dspy.Example]:
    """Convert report records into DSPy examples.

    Args:
        dataset: Iterable of mappings, each providing the keys ``"text"``
            and ``"cancer_type_full"``.

    Returns:
        One ``dspy.Example`` per record, in input order. Each example has a
        ``text`` field and a ``cancer_type`` field (filled from the record's
        ``"cancer_type_full"`` value), with ``text`` declared as the input
        via ``with_inputs("text")``.
    """
    prepared = []
    for record in dataset:
        labelled = dspy.Example(
            text=record["text"],
            cancer_type=record["cancer_type_full"],
        )
        prepared.append(labelled.with_inputs("text"))
    return prepared

In [51]:
# Build the DSPy examples from the row dicts in `df_dict`.
examples: List[dspy.Example] = prepare_dataset(df_dict)

In [54]:
# Spot-check a single example: print its label, then the full report text.
sample_index = 1000
example = examples[sample_index]
print(example["cancer_type"])
rich.print(example["text"])

Breast invasive carcinoma


In [56]:
# Single source of truth for the output file, used for writing, reading and logging.
examples_path = "tcga_examples.pkl"

# Pickle the list of examples.
with open(examples_path, "wb") as f:
    pickle.dump(examples, f)
print(f"Saved {len(examples)} examples to {examples_path}")

# Load the examples back to verify the round trip.
# NOTE: pickle.load can execute arbitrary code from the file; this is only
# acceptable here because the file was written by this same cell.
with open(examples_path, "rb") as f:
    loaded_examples = pickle.load(f)
print(f"Loaded {len(loaded_examples)} examples from {examples_path}")

# Check the counts in code rather than relying on a reader comparing the prints.
assert len(loaded_examples) == len(examples), (
    f"round trip changed the number of examples: "
    f"{len(examples)} -> {len(loaded_examples)}"
)

Saved 9523 examples to tcga_examples.pkl
Loaded 9523 examples from tcga_examples.pkl
