In [None]:
import os
import pydicom
import pandas as pd
from IPython.display import display
import pprint

# -----------------------------------------------------------------------
# Setup path to patient data
# -----------------------------------------------------------------------
# Root folder of the dataset. This is a relative path, so it is resolved
# against the current working directory. gather_treatment_regimes() treats
# every immediate sub-directory of this folder as one patient.
base_path = "Burdenko-GBM-Progression" 

# -----------------------------------------------------------------------
# Helper functions to search for DICOM files
# -----------------------------------------------------------------------
def find_dicom_files(folder_path, modality_filter=None):
    """
    Recursively searches through a folder and returns a list of
    (full_file_path, pydicom-dataset) tuples.

    Only files whose name ends in ".dcm" (case-insensitive) are read. Files
    whose name starts with "._" are ignored (presumably macOS sidecar files;
    they are not real DICOM objects). Each file is parsed with
    ``pydicom.dcmread(..., stop_before_pixels=True)``, so the returned
    datasets carry header information only.

    Parameters:
        folder_path: Directory to walk. A missing directory yields an empty
            list because ``os.walk`` produces nothing for it.
        modality_filter: If truthy (for example "RTPLAN"), only datasets whose
            ``Modality`` attribute equals this value are returned. If falsy,
            every readable file is returned.

    Returns:
        list of (str, dataset) tuples in ``os.walk`` order.

    Files that cannot be read are skipped (best effort), but a
    ``RuntimeWarning`` naming the file and the error is emitted so that
    missing rows can be traced instead of disappearing silently.
    """
    # Local import keeps this function self-contained without touching the
    # file-level import block.
    import warnings

    dicom_file_list = []
    for root, _dirs, files in os.walk(folder_path):
        for f in files:
            if f.startswith("._") or not f.lower().endswith(".dcm"):
                continue
            full_path = os.path.join(root, f)

            # Keep the try body minimal: only the read itself is expected
            # to fail (corrupt / truncated / non-DICOM content).
            try:
                ds = pydicom.dcmread(full_path, stop_before_pixels=True)
            except Exception as exc:
                warnings.warn(
                    f"Skipping unreadable DICOM file {full_path}: {exc!r}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue

            if not modality_filter or getattr(ds, "Modality", "") == modality_filter:
                dicom_file_list.append((full_path, ds))
    return dicom_file_list

# -----------------------------------------------------------------------
# Function to extract treatment regime data from an RTPLAN file
# -----------------------------------------------------------------------
def extract_regime_info_from_rtplan(rtplan_dataset):
    """
    Takes a pydicom dataset for an RTPLAN and attempts to extract:
    - List of beam energies: for every beam, NominalBeamEnergy of the first
      control point (falling back to BeamEnergy if that attribute is absent)
    - Prescribed total dose: first TargetPrescriptionDose found in
      DoseReferenceSequence
    - Number of fractions planned: first NumberOfFractionsPlanned found in
      FractionGroupSequence
    - Plan Intent (PlanIntent) [300A,000A]
    - Any chemo information (naive keyword search in PrescriptionDescription
      and CommentsOnPrescription)
    - Manufacturer (0008,0070)
    - Manufacturer Model Name (0008,1090)
    - RT Plan Name (300A,0003)

    Only attribute access is used, so any object exposing these attributes
    works. Missing, empty or None-valued sequences and text fields are
    tolerated and leave the corresponding default in place.

    Returns a dictionary with the keys "beam_energies" (list),
    "prescription_dose", "fractions_planned", "chemo_found" (bool),
    "plan_intent" ("Unknown" if absent), "Manufacturer",
    "ManufacturerModelName" and "RTPlanName". Values are returned as stored
    in the dataset (no type conversion).
    """
    info = {
        "beam_energies": [],
        "prescription_dose": None,
        "fractions_planned": None,
        "chemo_found": False,
        "plan_intent": "Unknown",  # default if nothing is found
        "Manufacturer": None,
        "ManufacturerModelName": None,
        "RTPlanName": None,
    }

    # (300A,000A) Plan Intent
    if hasattr(rtplan_dataset, "PlanIntent"):
        info["plan_intent"] = rtplan_dataset.PlanIntent

    # 1) Beam energy (BeamSequence -> ControlPointSequence[0] -> energy)
    for beam in getattr(rtplan_dataset, "BeamSequence", None) or []:
        control_points = getattr(beam, "ControlPointSequence", None)
        if not control_points:
            # Beam without control points: nothing to read. This explicit
            # check replaces a blanket try/except around the indexing.
            continue
        first_cp = control_points[0]
        if hasattr(first_cp, "NominalBeamEnergy"):
            nominal_energy = first_cp.NominalBeamEnergy
        elif hasattr(first_cp, "BeamEnergy"):
            nominal_energy = first_cp.BeamEnergy
        else:
            nominal_energy = None
        if nominal_energy is not None:
            info["beam_energies"].append(nominal_energy)

    # 2) Prescribed total dose: first dose reference that carries one
    for dose_ref in getattr(rtplan_dataset, "DoseReferenceSequence", None) or []:
        if hasattr(dose_ref, "TargetPrescriptionDose"):
            info["prescription_dose"] = dose_ref.TargetPrescriptionDose
            break

    # 3) Number of fractions planned: first fraction group that carries one
    for fg in getattr(rtplan_dataset, "FractionGroupSequence", None) or []:
        if hasattr(fg, "NumberOfFractionsPlanned"):
            info["fractions_planned"] = fg.NumberOfFractionsPlanned
            break

    # 4) Chemo?
    #    Often not coded in RTPLAN, so this is only a naive, case-insensitive
    #    keyword search in free-text fields.
    chemo_keywords = ("chemo", "temozolomide", "tmz", "bevacizumab")
    for field_name in ("PrescriptionDescription", "CommentsOnPrescription"):
        raw_text = getattr(rtplan_dataset, field_name, None)
        if raw_text is None:
            # Attribute absent or present without a value.
            continue
        fieldtext = str(raw_text).lower()
        if any(kw in fieldtext for kw in chemo_keywords):
            info["chemo_found"] = True
            break

    # 5) Additional descriptive fields, copied verbatim when present:
    #    Manufacturer (0008,0070), Manufacturer Model Name (0008,1090),
    #    RT Plan Name (300A,0003). The info key equals the attribute name.
    for attr_name in ("Manufacturer", "ManufacturerModelName", "RTPlanName"):
        if hasattr(rtplan_dataset, attr_name):
            info[attr_name] = getattr(rtplan_dataset, attr_name)

    return info


def gather_treatment_regimes(base_path):
    """
    Searches for patient folders in base_path and builds one table row per
    RTPLAN file found.

    For each immediate sub-directory of base_path (one per patient), the
    first sub-directory (in sorted name order) whose name contains
    "Radiotherapy planning" is taken as the RT study. All RTPLAN files below
    it are located with find_dicom_files() and summarised with
    extract_regime_info_from_rtplan(). Patients without such a study, or
    without any RTPLAN file, contribute no rows.

    Directory listings are sorted because os.listdir() returns entries in
    arbitrary order; sorting makes the row order and the selected study
    reproducible between runs and machines.

    Parameters:
        base_path: Root directory containing the patient folders.

    Returns:
        pandas.DataFrame with the columns PatientID, RTPlanFile,
        BeamEnergies_MV, PrescriptionDose_Gy, NumberOfFractionsPlanned,
        ChemoFound, PlanIntent, Manufacturer, ManufacturerModelName and
        RTPlanName. The frame is empty (but still has these columns) when
        base_path does not exist or nothing was found; in the former case a
        message is printed.
    """
    # Fixed schema so that empty and non-empty results look the same to
    # downstream code (and to the CSV header).
    columns = [
        "PatientID",
        "RTPlanFile",
        "BeamEnergies_MV",
        "PrescriptionDose_Gy",
        "NumberOfFractionsPlanned",
        "ChemoFound",
        "PlanIntent",
        "Manufacturer",
        "ManufacturerModelName",
        "RTPlanName",
    ]

    if not os.path.isdir(base_path):
        print(f"Base path not found: {base_path}")
        return pd.DataFrame(columns=columns)

    results = []

    # Go through each "patient folder" (typically Burdenko-GBM-###)
    for patient_dir in sorted(os.listdir(base_path)):
        full_patient_path = os.path.join(base_path, patient_dir)
        if not os.path.isdir(full_patient_path):
            continue

        # Look for a folder with "Radiotherapy planning" in the name
        rt_study = next(
            (
                study
                for study in sorted(os.listdir(full_patient_path))
                if "Radiotherapy planning" in study
                and os.path.isdir(os.path.join(full_patient_path, study))
            ),
            None,
        )
        if rt_study is None:
            # No RT-plan folder found
            continue

        rt_study_path = os.path.join(full_patient_path, rt_study)

        # Extract info from (all) RTPLANs - usually there is 1, but we loop.
        # An empty list simply adds no rows for this patient.
        for rtplan_path, rtplan_ds in find_dicom_files(rt_study_path, modality_filter="RTPLAN"):
            plan_info = extract_regime_info_from_rtplan(rtplan_ds)
            results.append({
                "PatientID": patient_dir,
                "RTPlanFile": rtplan_path,
                "BeamEnergies_MV": plan_info["beam_energies"],
                "PrescriptionDose_Gy": plan_info["prescription_dose"],
                "NumberOfFractionsPlanned": plan_info["fractions_planned"],
                "ChemoFound": plan_info["chemo_found"],
                "PlanIntent": plan_info["plan_intent"],
                "Manufacturer": plan_info["Manufacturer"],
                "ManufacturerModelName": plan_info["ManufacturerModelName"],
                "RTPlanName": plan_info["RTPlanName"],
            })

    return pd.DataFrame(results, columns=columns)

# Driver: runs only when this code is executed directly (not when imported).
# It builds the treatment-regime table, shows it, saves it as CSV and prints
# the first records. df_regimes is left as a global because the follow-up
# analysis further down reads it.
if __name__ == "__main__":
    # One row per RTPLAN file found below base_path.
    df_regimes = gather_treatment_regimes(base_path)
    print("Treatment regimes found in RTPLAN:")
    # display() comes from IPython.display (imported at the top of the file).
    display(df_regimes)
    # The CSV is written to the current working directory; index=False
    # leaves the DataFrame's row index out of the file.
    csv_filename = "patient_treatment_regimes.csv"
    df_regimes.to_csv(csv_filename, index=False)
    print(f"\nData saved to CSV: {csv_filename}")
    print("\nExample results (first rows):\n")
    # Show at most the first five rows as a list of plain dicts.
    pprint.pprint(df_regimes.head(5).to_dict('records'))


Treatment regimes found in RTPLAN:


Unnamed: 0,PatientID,RTPlanFile,BeamEnergies_MV,PrescriptionDose_Gy,NumberOfFractionsPlanned,ChemoFound,PlanIntent,Manufacturer,ManufacturerModelName,RTPlanName
0,Burdenko-GBM-001,Burdenko-GBM-Progression/Burdenko-GBM-001/05-0...,"[6, 6, 6, 6, 6]",63.00,30,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,SRT 30 fr x 2 Gy
1,Burdenko-GBM-002,Burdenko-GBM-Progression/Burdenko-GBM-002/09-0...,"[6, 6, 6, 6, 6, 6]",45.00,15,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,
2,Burdenko-GBM-003,Burdenko-GBM-Progression/Burdenko-GBM-003/09-0...,"[6, 6, 6, 6, 6, 6]",60.00,30,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,
3,Burdenko-GBM-004,Burdenko-GBM-Progression/Burdenko-GBM-004/09-0...,"[6, 6, 6, 6, 6, 6]",60.00,30,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,
4,Burdenko-GBM-005,Burdenko-GBM-Progression/Burdenko-GBM-005/09-1...,"[6, 6, 6, 6, 6, 6]",60.00,30,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,
...,...,...,...,...,...,...,...,...,...,...
175,Burdenko-GBM-176,Burdenko-GBM-Progression/Burdenko-GBM-176/01-2...,"[6, 6, 6, 6, 6, 6]",60.00,30,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,
176,Burdenko-GBM-177,Burdenko-GBM-Progression/Burdenko-GBM-177/01-2...,"[6, 6, 6, 6, 6, 6]",61.35,7,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,
177,Burdenko-GBM-178,Burdenko-GBM-Progression/Burdenko-GBM-178/07-1...,"[6, 6, 6, 6, 6, 6]",60.00,30,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,
178,Burdenko-GBM-179,Burdenko-GBM-Progression/Burdenko-GBM-179/07-3...,"[6, 6, 6, 6, 6, 6]",60.00,30,False,CURATIVE,Varian Medical Systems,ARIA RadOnc,



Data saved to CSV: patient_treatment_regimes.csv

Example results (first rows):

[{'BeamEnergies_MV': ['6', '6', '6', '6', '6'],
  'ChemoFound': False,
  'Manufacturer': 'Varian Medical Systems',
  'ManufacturerModelName': 'ARIA RadOnc',
  'NumberOfFractionsPlanned': 30,
  'PatientID': 'Burdenko-GBM-001',
  'PlanIntent': 'CURATIVE',
  'PrescriptionDose_Gy': 63.0,
  'RTPlanFile': 'Burdenko-GBM-Progression/Burdenko-GBM-001/05-06-2004-NA-Radiotherapy '
                'planning 00-01357/3.000000-RTPLAN-07507/1-1.dcm',
  'RTPlanName': 'SRT 30 fr x 2 Gy'},
 {'BeamEnergies_MV': ['6', '6', '6', '6', '6', '6'],
  'ChemoFound': False,
  'Manufacturer': 'Varian Medical Systems',
  'ManufacturerModelName': 'ARIA RadOnc',
  'NumberOfFractionsPlanned': 15,
  'PatientID': 'Burdenko-GBM-002',
  'PlanIntent': 'CURATIVE',
  'PrescriptionDose_Gy': 45.0,
  'RTPlanFile': 'Burdenko-GBM-Progression/Burdenko-GBM-002/09-09-2004-NA-Radiotherapy '
                'planning 00-94210/3.000000-RTPLAN-88352/1-1.dc

In [2]:
# -----------------------------------------------------------------------
# Follow-up analysis: how the plans are distributed over Manufacturer and
# ManufacturerModelName
# -----------------------------------------------------------------------

# Relies on df_regimes having been created by the code above.
if df_regimes.empty:
    print("No treatment regimes data found to analyze.")
else:
    # Frequency of each manufacturer; missing values get their own bucket.
    print("Statistics for Manufacturer:")
    manufacturer_counts = df_regimes["Manufacturer"].value_counts(dropna=False)
    print(manufacturer_counts)

    # Same count for the model name.
    print("\nStatistics for Manufacturer Model Name:")
    manufacturer_model_counts = df_regimes["ManufacturerModelName"].value_counts(dropna=False)
    print(manufacturer_model_counts)

    # Contingency table showing which model names occur with which manufacturer.
    print("\nCross-tabulation between Manufacturer and Manufacturer Model Name:")
    crosstab = pd.crosstab(
        index=df_regimes["Manufacturer"],
        columns=df_regimes["ManufacturerModelName"],
    )
    display(crosstab)


Statistics for Manufacturer:
Manufacturer
Medical Physics Center    91
Varian Medical Systems    89
Name: count, dtype: int64

Statistics for Manufacturer Model Name:
ManufacturerModelName
AMPHORA        91
ARIA RadOnc    89
Name: count, dtype: int64

Cross-tabulation between Manufacturer and Manufacturer Model Name:


ManufacturerModelName,AMPHORA,ARIA RadOnc
Manufacturer,Unnamed: 1_level_1,Unnamed: 2_level_1
Medical Physics Center,91,0
Varian Medical Systems,0,89
