<a href="https://colab.research.google.com/github/tylerg/Internship-Eval-25/blob/main/Internship_Eval_25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# --- 1. Data Preparation ---
# Load the datasets
# Ensure 'patients.csv' and 'conditions.csv' are in your Colab environment or accessible via a path.
try:
    patients_df = pd.read_csv('patients.csv', on_bad_lines='skip')
    conditions_df = pd.read_csv('conditions.csv', on_bad_lines='skip')
    print("Successfully loaded patients.csv and conditions.csv.")
except FileNotFoundError:
    print("WARNING: 'patients.csv' and/or 'conditions.csv' not found.")
    print("Using dummy DataFrames for demonstration. Please upload the files for actual analysis.")
    # Create dummy dataframes for the rest of the script to run without error
    patients_df = pd.DataFrame(columns=['Id', 'BIRTHDATE', 'DEATHDATE', 'FIRST', 'LAST', 'GENDER'])
    conditions_df = pd.DataFrame(columns=['START', 'STOP', 'PATIENT', 'ENCOUNTER', 'CODE', 'DESCRIPTION'])

# Data Cleaning and Formatting
if not patients_df.empty:
    patients_df['BIRTHDATE'] = pd.to_datetime(patients_df['BIRTHDATE'], errors='coerce')
    patients_df['DEATHDATE'] = pd.to_datetime(patients_df['DEATHDATE'], errors='coerce')

if not conditions_df.empty:
    conditions_df['START'] = pd.to_datetime(conditions_df['START'], errors='coerce')
    conditions_df['STOP'] = pd.to_datetime(conditions_df['STOP'], errors='coerce')

    # Filter data for the period 1997 to 2023 (inclusive of 2023 start)
    start_date_range = pd.to_datetime('1997-01-01')
    end_date_range = pd.to_datetime('2023-12-31') # Up to end of 2023
    conditions_df = conditions_df[(conditions_df['START'] >= start_date_range) & (conditions_df['START'] <= end_date_range)]

# --- 2. Identify Patients with CKD using SNOMED CT Codes ---
# Provided SNOMED CT codes for CKD and related conditions
# Note: SCTID: 580151010000108 (CKD-EPI formula) is a qualifier value, not a diagnosis, so excluded.
ckd_snomed_codes = [
    '431855005', # Chronic kidney disease stage 1 (disorder)
    '431856006', # Chronic kidney disease stage 2 (disorder)
    '433144002', # Chronic kidney disease stage 3 (disorder)
    '431857002', # Chronic kidney disease stage 4 (disorder)
    '433146000', # Chronic kidney disease stage 5 (disorder)
    '709044004', # Chronic kidney disease (disorder) - General
    '714153000', # Chronic kidney disease stage 5 with transplant (disorder)
    '714152005', # Chronic kidney disease stage 5 on dialysis (disorder)
    '713313000', # Chronic kidney disease mineral and bone disorder (disorder) - Complication
    '722149000', # Chronic kidney disease due to and following excision of neoplasm of kidney (disorder) - Etiology
    '726018006', # Autosomal dominant tubulointerstitial kidney disease (disorder) - Etiology
    '723373006'  # Uromodulin related autosomal dominant tubulointerstitial kidney disease (disorder) - Etiology
]

if not conditions_df.empty:
    # Ensure CODE column is string type for matching SNOMED CT codes
    conditions_df['CODE'] = conditions_df['CODE'].astype(str)
    ckd_conditions_df = conditions_df[conditions_df['CODE'].isin(ckd_snomed_codes)].copy()
    ckd_patient_ids = ckd_conditions_df['PATIENT'].unique()
    print(f"\nNumber of unique patients diagnosed with a listed CKD-related SNOMED CT code (1997-2023): {len(ckd_patient_ids)}")
else:
    ckd_conditions_df = pd.DataFrame(columns=conditions_df.columns) # Ensure it has columns for later steps
    ckd_patient_ids = []
    print("\nNumber of unique patients diagnosed with a listed CKD-related SNOMED CT code (1997-2023): 0 (due to missing data)")


# --- 3. Tracking Progression Through Stages ---
# Define stages of CKD based on SNOMED CT codes
def map_code_to_stage(code_val):
    code_str = str(code_val)
    if code_str == '431855005': # Chronic kidney disease stage 1
        return 1
    elif code_str == '431856006': # Chronic kidney disease stage 2
        return 2
    elif code_str == '433144002': # Chronic kidney disease stage 3
        return 3
    elif code_str == '431857002': # Chronic kidney disease stage 4
        return 4
    elif code_str in ['433146000', '714153000']: # CKD stage 5, CKD stage 5 with transplant
        return 5
    elif code_str == '714152005': # CKD stage 5 on dialysis (considered ESRD)
        return 6 # Using 6 for ESRD
    elif code_str == '709044004': # Chronic kidney disease (disorder) - General
        return 0 # General CKD diagnosis, not a specific stage for progression path
    # Codes like 713313000, 722149000, 726018006, 723373006 identify CKD presence
    # but do not map to a specific stage 1-6 for this progression analysis.
    return np.nan

if not ckd_conditions_df.empty:
    ckd_conditions_df.loc[:, 'CKD_STAGE'] = ckd_conditions_df['CODE'].apply(map_code_to_stage)
    # Drop rows where stage could not be determined (NaN) or is Stage 0 (general CKD code '709044004')
    # for specific progression tracking. Stage 0 is useful for identifying CKD patients broadly
    # but not as a point in the 1->2->...->ESRD pathway.
    ckd_conditions_df.dropna(subset=['CKD_STAGE'], inplace=True)
    ckd_conditions_df = ckd_conditions_df[ckd_conditions_df['CKD_STAGE'] != 0]
    if not ckd_conditions_df.empty: # Check if dataframe became empty after filtering
        ckd_conditions_df.loc[:, 'CKD_STAGE'] = ckd_conditions_df['CKD_STAGE'].astype(int)
        # Sort conditions by patient and then by diagnosis date
        ckd_conditions_df = ckd_conditions_df.sort_values(by=['PATIENT', 'START'])
    else:
        print("No conditions found that map to specific CKD stages (1-6) after filtering.")
else:
    # Ensure ckd_conditions_df has the CKD_STAGE column if it was empty from the start
     if 'CKD_STAGE' not in ckd_conditions_df.columns:
        ckd_conditions_df['CKD_STAGE'] = pd.Series(dtype=int)


patient_progression_details = {} # To store detailed progression for each patient

# Iterate through unique patient IDs that had ANY relevant CKD code
for patient_id in ckd_patient_ids:
    # Filter the staged conditions for the current patient
    if ckd_conditions_df.empty or 'PATIENT' not in ckd_conditions_df or 'CKD_STAGE' not in ckd_conditions_df:
        # print(f"Skipping patient {patient_id} due to empty or incomplete ckd_conditions_df for staging.")
        continue # Skip if no staged conditions data

    patient_staged_data = ckd_conditions_df[ckd_conditions_df['PATIENT'] == patient_id].copy()

    if patient_staged_data.empty:
        continue # No staged conditions for this patient

    # If multiple stage diagnoses on the same day, take the highest stage
    patient_staged_data = patient_staged_data.loc[patient_staged_data.groupby('START')['CKD_STAGE'].idxmax()]
    # Ensure it's still sorted by date after grouping
    patient_staged_data.sort_values('START', inplace=True)

    current_stages_documented = {} # Store {stage: date}

    for _, row in patient_staged_data.iterrows():
        diagnosed_stage = row['CKD_STAGE']
        diagnosis_date = row['START']

        # Record the first date for this diagnosed stage
        if diagnosed_stage not in current_stages_documented:
            current_stages_documented[diagnosed_stage] = diagnosis_date
        # Optional: If already recorded, update only if new diagnosis date is earlier
        # else:
        #     if diagnosis_date < current_stages_documented[diagnosed_stage]:
        #          current_stages_documented[diagnosed_stage] = diagnosis_date

    if current_stages_documented:
        patient_progression_details[patient_id] = {
            'diagnoses': dict(sorted(current_stages_documented.items())), # Store stages sorted by stage number
            'chronological_diagnoses': dict(sorted(current_stages_documented.items(), key=lambda item: item[1])) # Store sorted by date
        }

# --- 4. Calculate Time Periods for Stage Progression ---
stage_transitions = {
    'Stage 1 to Stage 2': (1, 2),
    'Stage 2 to Stage 3': (2, 3),
    'Stage 3 to Stage 4': (3, 4),
    'Stage 4 to Stage 5': (4, 5),
    'Stage 5 to End Stage Renal Disease': (5, 6) # Stage 6 is ESRD
}

progression_times = {key: [] for key in stage_transitions.keys()}
patient_transition_output_list = []

for patient_id, data in patient_progression_details.items():
    diagnosed_stages = data['diagnoses'] # Dates of first diagnosis for each stage

    patient_record = {
        'patient_id': patient_id,
        'stage_diagnoses_dates': [{'stage': s, 'date': d.strftime('%Y-%m-%d')} for s, d in data['chronological_diagnoses'].items()],
        'calculated_transitions': []
    }

    for transition_name, (from_stage, to_stage) in stage_transitions.items():
        if from_stage in diagnosed_stages and to_stage in diagnosed_stages:
            date_from = diagnosed_stages[from_stage]
            date_to = diagnosed_stages[to_stage]

            if date_to > date_from: # Ensure progression is forward in time
                duration_days = (date_to - date_from).days
                progression_times[transition_name].append(duration_days)
                patient_record['calculated_transitions'].append(
                    f"{transition_name}: {duration_days} days (From {date_from.strftime('%Y-%m-%d')} to {date_to.strftime('%Y-%m-%d')})"
                )
    # Only add patient to output list if they have some diagnosed stages recorded
    if patient_record['stage_diagnoses_dates']:
        patient_transition_output_list.append(patient_record)

# Compute mean and median durations
summary_statistics = []
for transition_name, durations in progression_times.items():
    if durations:
        mean_duration = np.mean(durations)
        median_duration = np.median(durations)
        count = len(durations)
    else:
        mean_duration = np.nan
        median_duration = np.nan
        count = 0
    summary_statistics.append({
        'Transition': transition_name,
        'Mean Duration (days)': mean_duration,
        'Median Duration (days)': median_duration,
        'Number of Patients in Transition': count
    })

summary_df = pd.DataFrame(summary_statistics)

# --- 5. Output Your Findings ---
print("\n--- CKD Stage Progression Time Summary (using SNOMED CT codes) ---")
if not summary_df.empty:
    print(summary_df.to_string())
else:
    print("No progression data to summarize (likely due to missing input data or no observed transitions between defined stages).")

print("\n--- Patient-Specific CKD Stage Transitions (Sample using SNOMED CT codes) ---")
if not patient_transition_output_list:
    print("No patient transition data to display.")
else:
    displayed_count = 0
    for record in patient_transition_output_list:
        if displayed_count < 5: # Limiting output to the first 5 patients with progression data for brevity
            print(f"\nPatient ID: {record['patient_id']}")
            print("  Diagnosed Stages (Earliest Dates):")
            for diag in record['stage_diagnoses_dates']:
                print(f"    Stage {diag['stage']} on {diag['date']}")
            if record['calculated_transitions']:
                print("  Calculated Progression Durations:")
                for trans_info in record['calculated_transitions']:
                    print(f"    {trans_info}")
            elif not record['stage_diagnoses_dates']: # Should not happen due to filter above but as a safeguard
                 print("  No specific CKD stages diagnosed for this patient.")
            else: # Has stages, but no transitions from the pre-defined paths were calculated
                 print("  No sequential stage progressions calculated along defined paths (e.g., Stage 1->2, 2->3, etc.).")
            displayed_count += 1
        else:
            break
    if len(patient_transition_output_list) > displayed_count:
        print(f"\n... and {len(patient_transition_output_list) - displayed_count} more patients with CKD stage data.")

# --- 6. Code Implementation Notes ---
# - The script assumes 'patients.csv' and 'conditions.csv' are available.
# - SNOMED CT codes are now used for CKD identification and staging.
# - Dates are filtered for 1997-2023.
# - Stage progression is calculated based on the first recorded date for each stage.
# - Handles cases where patients may not progress through all stages or have gaps; only observed transitions are calculated.
# - If multiple diagnoses for different stages occur on the same day, the script takes the highest stage reported on that day.
# - General CKD code (SNOMED: 709044004) is mapped to Stage 0 and excluded from A->B progression paths but helps identify CKD patients.
# - ESRD (from 'CKD stage 5 on dialysis') is treated as Stage 6 for numerical progression.
# - Mean and median are calculated only if there are patients who made that specific transition.

print("\nAnalysis complete.")
if patients_df.empty or (not conditions_df.empty and conditions_df['CODE'].isnull().all() and conditions_df.shape[0] > 0): # check if conditions_df was loaded but CODE column might be all NaN from dummy
    print("REMINDER: Analysis might have been performed on DUMMY data or data with no usable codes if input files were not found or were empty.")
elif patients_df.empty or conditions_df.empty:
    print("REMINDER: Analysis was performed on DUMMY data as input files were not found.")

Successfully loaded patients.csv and conditions.csv.

Number of unique patients diagnosed with a listed CKD-related SNOMED CT code (1997-2023): 17

--- CKD Stage Progression Time Summary (using SNOMED CT codes) ---
                           Transition  Mean Duration (days)  Median Duration (days)  Number of Patients in Transition
0                  Stage 1 to Stage 2                   NaN                     NaN                                 0
1                  Stage 2 to Stage 3                   NaN                     NaN                                 0
2                  Stage 3 to Stage 4                   NaN                     NaN                                 0
3                  Stage 4 to Stage 5                   NaN                     NaN                                 0
4  Stage 5 to End Stage Renal Disease                   NaN                     NaN                                 0

--- Patient-Specific CKD Stage Transitions (Sample using SNOMED CT codes) --

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conditions_df['CODE'] = conditions_df['CODE'].astype(str)
