In [None]:
import os
import glob
import warnings
import re
from typing import List

import pandas as pd
import numpy as np
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import nibabel as nib

warnings.simplefilter(action='ignore', category=FutureWarning)

def compare_patient_ids(radiomics_df: pd.DataFrame, clinical_df: pd.DataFrame):
    """Print a report comparing the patient IDs present in two tables.

    IDs are read from the ``ID`` column of ``radiomics_df`` and the
    ``Patient`` column of ``clinical_df``. Missing values (None/NaN) in those
    columns are ignored so that blank rows are not counted as patients.

    Args:
        radiomics_df: Radiomic feature table; expected to have an ``ID`` column.
        clinical_df: Clinical table; expected to have a ``Patient`` column.

    Returns:
        None. The report (or a warning when either ID column is absent) is
        written to standard output.
    """
    print("--- Comparing Patient IDs Across Datasets ---")

    if 'ID' not in radiomics_df.columns or 'Patient' not in clinical_df.columns:
        print("Warning: Could not perform comparison due to missing ID columns.")
        return

    # Drop missing IDs before building the sets: None/NaN entries would
    # otherwise be counted as patients (and distinct NaN objects do not even
    # de-duplicate inside a set), inflating every figure in the report.
    radiomics_patients = set(radiomics_df['ID'].dropna())
    clinical_patients = set(clinical_df['Patient'].dropna())

    common_patients = radiomics_patients & clinical_patients
    unique_to_radiomics = radiomics_patients - clinical_patients
    unique_to_clinical = clinical_patients - radiomics_patients

    print(f"Total patients in Radiomic Features file: {len(radiomics_patients)}")
    print(f"Total patients in Clinical Data file (kaggle_3m): {len(clinical_patients)}")
    print("-" * 25)
    print(f"Number of patients in BOTH files: {len(common_patients)}")
    print(f"Patients unique to Radiomic Features: {len(unique_to_radiomics)}")
    print(f"Patients unique to Clinical Data:   {len(unique_to_clinical)}")
    print("--- End of Comparison Report ---\n")

def get_metadata(patient_id: str, metadata_df: pd.DataFrame) -> str:
    """Build a multi-line text summary of one patient's metadata row.

    The first row of ``metadata_df`` whose ``Patient`` value equals
    ``patient_id`` is used. Every other column becomes one
    ``"<Column Title>: <value>"`` line, with underscores in the column name
    turned into spaces and the result title-cased. Missing values are shown
    as ``N/A``.

    Args:
        patient_id: Identifier to look up in the ``Patient`` column.
        metadata_df: Table containing a ``Patient`` column.

    Returns:
        The newline-joined summary, or a two-line "Metadata not found."
        message when no row matches.
    """
    matching_rows = metadata_df[metadata_df['Patient'] == patient_id]
    if matching_rows.shape[0] == 0:
        return f"Patient ID: {patient_id}\nMetadata not found."

    record = matching_rows.iloc[0]

    def _render(field, cell) -> str:
        # One "Label: value" line for a single column of the matched row.
        label = field.replace('_', ' ').title()
        shown = "N/A" if pd.isna(cell) else str(cell)
        return f"{label}: {shown}"

    summary = [f"Patient ID: {patient_id}"]
    summary.extend(
        _render(field, cell)
        for field, cell in record.items()
        if field != "Patient"  # the ID already heads the summary
    )
    return "\n".join(summary)

def get_patient_id_from_path(path: str) -> str:
    """Derive an underscore-separated patient ID from a folder or file path.

    The last path component is inspected. If it starts with a
    ``TCGA-XX-XXXX`` style identifier, that identifier is returned with
    hyphens replaced by underscores. Otherwise the first three
    underscore-separated fields of the name are joined back together.

    Args:
        path: Path to a patient directory or to a ``.nii.gz`` scan file.
            Trailing path separators are tolerated.

    Returns:
        The patient ID, e.g. ``"TCGA_CS_4941"``. An empty string is returned
        when the path has no final component (e.g. ``""`` or ``"/"``).
    """
    # Strip trailing separators first: os.path.basename("/data/TCGA-CS-4941/")
    # is "" and would otherwise produce an empty ID.
    separators = os.sep + (os.altsep or "")
    base_name = os.path.basename(path.rstrip(separators))

    # Only files carry the NIfTI extension; remove it strictly as a suffix so
    # a ".nii.gz" occurring elsewhere in the name is left alone.
    nifti_suffix = '.nii.gz'
    if not os.path.isdir(path) and base_name.endswith(nifti_suffix):
        base_name = base_name[:-len(nifti_suffix)]

    match = re.match(r'(TCGA-\w{2}-\w{4})', base_name)
    if match:
        return match.group(1).replace('-', '_')
    # Fallback for names that already use underscores (or any other scheme):
    # keep the first three underscore-delimited fields.
    return "_".join(base_name.split("_")[:3])

def analyze_nifti_header(nifti_path: str):
    """Print the slice thickness and per-axis extent of one NIfTI scan.

    The scan label is taken from the last underscore-separated field of the
    file name (with ``.nii.gz`` removed) and upper-cased. The voxel sizes come
    from the header's ``get_zooms()``; the third value is reported as the
    slice thickness, and the element-wise product of image shape and voxel
    size is reported as the "Total Voxel Dimensions" (labelled in mm).

    Any failure while loading or inspecting the file is reported on standard
    output instead of being raised, so one unreadable scan does not abort a
    batch run.

    Args:
        nifti_path: Path to a ``.nii.gz`` file.

    Returns:
        None. Results are written to standard output.
    """
    # Derive the label once, outside the try, so success and failure messages
    # name the scan identically (the error path previously kept ".nii.gz").
    base_name = os.path.basename(nifti_path).replace('.nii.gz', '')
    scan_label = base_name.split('_')[-1].upper()

    try:
        nii_image = nib.load(nifti_path)
        voxel_dims = nii_image.header.get_zooms()
        slice_thickness = voxel_dims[2]

        # Shape (voxel counts) times voxel size gives the extent along each axis.
        total_voxel_dims = np.multiply(nii_image.shape, voxel_dims)
        total_voxel_dims_str = (
            f"({total_voxel_dims[0]:.2f} x {total_voxel_dims[1]:.2f} x {total_voxel_dims[2]:.2f}) mm"
        )

        print(f"    - On {scan_label} scan: Slice Thickness = {slice_thickness:.2f} mm")
        print(f"    - On {scan_label} scan: Total Voxel Dimensions = {total_voxel_dims_str}")

    except Exception as e:
        # Deliberately broad: this is a best-effort per-file report.
        print(f"    - On {scan_label} scan: Could not analyze. Error: {e}")

def analyze_all_sequences_for_patient(patient_dir: str, analysis_function):
    """Run ``analysis_function`` on one scan per MRI sequence in a folder.

    The ``*.nii.gz`` files directly inside ``patient_dir`` are searched for
    the sequences ``flair``, ``t1``, ``t2`` and ``t1Gd`` (in that order). For
    each sequence, the first file (in sorted order) whose name ends with
    ``_<sequence>.nii.gz`` is passed to ``analysis_function``. If no sequence
    is found at all, a notice is printed.

    Args:
        patient_dir: Directory containing the patient's ``.nii.gz`` files.
        analysis_function: Callable invoked with the path of each selected
            scan file; its return value is ignored.

    Returns:
        None.
    """
    sequences = ["flair", "t1", "t2", "t1Gd"]

    # glob.escape keeps metacharacters in the directory name literal, and
    # sorting makes the chosen file deterministic (glob order is arbitrary).
    pattern = os.path.join(glob.escape(patient_dir), "*.nii.gz")
    scan_files = sorted(glob.glob(pattern))

    found_scans_for_analysis = False
    for seq in sequences:
        suffix = f"_{seq}.nii.gz"
        # Match on the file name only: a substring test against the full path
        # would also match directory names that happen to contain the suffix.
        seq_file = next(
            (f for f in scan_files if os.path.basename(f).endswith(suffix)),
            None,
        )
        if seq_file:
            found_scans_for_analysis = True
            analysis_function(seq_file)

    if not found_scans_for_analysis:
        print("    - No valid MRI sequences (flair, t1, t2, t1Gd) found for this patient.")


if __name__ == "__main__":
    # Dataset locations on the author's machine; edit these to run elsewhere.
    BASE_PATH_KAGGLE = "/home/ealam/Downloads/LGG dataset Cameron/lgg-mri-segmentation/kaggle_3m"
    BASE_PATH_TCGA = "/home/ealam/Downloads/Pre-operative_TCGA_LGG_NIfTI_and_Segmentations"

    CLINICAL_CSV_PATH = os.path.join(BASE_PATH_KAGGLE, "data.csv")
    RADIOMICS_CSV_PATH = os.path.join(BASE_PATH_TCGA, "TCGA_LGG_radiomicFeatures.csv")

    required_paths = [BASE_PATH_KAGGLE, BASE_PATH_TCGA, CLINICAL_CSV_PATH, RADIOMICS_CSV_PATH]
    missing_paths = [p for p in required_paths if not os.path.exists(p)]

    if missing_paths:
        print("Error: One or more dataset paths or CSV files not found. Please verify paths.")
        # Name the offending paths so the user does not have to check all four.
        for missing in missing_paths:
            print(f"  Missing: {missing}")
    else:
        # --- CSV Analysis Section ---
        clinical_df = pd.read_csv(CLINICAL_CSV_PATH)
        radiomics_df = pd.read_csv(RADIOMICS_CSV_PATH)

        # Normalise radiomics IDs to the underscore form used by the clinical
        # table's Patient column. Guarded so that a missing 'ID' column is
        # reported by compare_patient_ids instead of raising KeyError here;
        # regex=False makes the literal replacement explicit across pandas
        # versions.
        if 'ID' in radiomics_df.columns:
            radiomics_df['ID'] = radiomics_df['ID'].str.replace('-', '_', regex=False)

        print("--- Head of Clinical Data (data.csv) ---")
        print(clinical_df.head())

        print("--- Head of Radiomic Features (TCGA_LGG_radiomicFeatures.csv) ---")
        print(radiomics_df.head())

        compare_patient_ids(radiomics_df, clinical_df)

        # --- NIfTI Analysis Section ---
        # Only the first few patient folders (sorted by path) are inspected.
        patient_dirs = sorted(
            d for d in glob.glob(os.path.join(BASE_PATH_TCGA, "TCGA-*")) if os.path.isdir(d)
        )
        num_nifti_samples = min(5, len(patient_dirs))
        sample_patient_dirs = patient_dirs[:num_nifti_samples]

        print("--- Analyzing NIfTI File Headers for Slice Thickness ---")
        if not sample_patient_dirs:
            print("No patient directories found to analyze.")
        else:
            print(f"Displaying slice thickness across all sequences for {num_nifti_samples} sample patients...")
            for p_dir in sample_patient_dirs:
                # Show the ID in its hyphenated form in the report header.
                patient_id_header = get_patient_id_from_path(p_dir).replace('_', '-')
                print(f"\n  Analyzing scans for Patient: {patient_id_header}")
                analyze_all_sequences_for_patient(p_dir, analyze_nifti_header)





--- Head of Clinical Data (data.csv) ---
        Patient  RNASeqCluster  MethylationCluster  miRNACluster  CNCluster  \
0  TCGA_CS_4941            2.0                 4.0             2        2.0   
1  TCGA_CS_4942            1.0                 5.0             2        1.0   
2  TCGA_CS_4943            1.0                 5.0             2        1.0   
3  TCGA_CS_4944            NaN                 5.0             2        1.0   
4  TCGA_CS_5393            4.0                 5.0             2        1.0   

   RPPACluster  OncosignCluster  COCCluster  histological_type  \
0          NaN              3.0           2                1.0   
1          1.0              2.0           1                1.0   
2          2.0              2.0           1                1.0   
3          2.0              1.0           1                1.0   
4          2.0              3.0           1                1.0   

   neoplasm_histologic_grade  tumor_tissue_site  laterality  tumor_location  \
0       