In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Load the in-house spreadsheet, keep only the rows that carry an outcome label
# in 'respiratory_depression', and renumber the index from zero.
df = (
    pd.read_excel(" ***insert name of dataset*** ")  # Dataset is inhouse, therefore kept private
    .dropna(subset=['respiratory_depression'])
    .reset_index(drop=True)
)
df

In [None]:
# Identifier columns that are removed outright
drop_cols = ['encntr_id', 'fin', 'facility', 'hc'] # Verify 'hc' first

# Date columns that get replaced by a day offset relative to the admission date
date_cols = [
    'proceduredate', 'discharge_date', 'surgery_startdate',
    'benzodate', 'fentadate', 'morphinedate', 'fentadate_post', 'naloxonedate_post',
]

# Step 1: strip identifiers (names missing from the frame are skipped silently)
df_clean = df.drop(columns=drop_cols, errors='ignore')

# Step 2: parse the admission date so it can serve as the reference point
df_clean['inpatient_admitdate'] = pd.to_datetime(df_clean['inpatient_admitdate'])

# Step 3: overwrite each available date column with the day component of its
# absolute distance from admission; unparseable entries are coerced to NaT first
for col in [name for name in date_cols if name in df_clean.columns]:
    offset = pd.to_datetime(df_clean[col], errors='coerce') - df_clean['inpatient_admitdate']
    df_clean[col] = offset.abs().dt.days

# Step 4: remove the remaining absolute dates (admission date and birthdate)
df_clean = df_clean.drop(columns=['inpatient_admitdate', 'birthdate'], errors='ignore')

In [None]:
# 2. Define the Core Transformation Function
def generate_clinical_narrative(patient_row: pd.Series) -> str:
    """
    Convert one patient's tabular record into a free-text clinical narrative.

    Args:
        patient_row: A single record exposing ``.get(column_name)`` (a row
            Series, as passed by ``DataFrame.apply(..., axis=1)``). Columns
            read: age, gender, nationality, bmi, smokingstatus, alcoholstatus,
            comorbstringhistory, inpatient_admitdate, proceduredescriptions,
            case_description, surgeryduration, fentanyl, fentadate, morphine,
            morphinedate, benzodiazepine, benzodate, naloxone_post,
            naloxonedate_post, fentanyl_post, fentadate_post, sleepapnoea,
            diagstringhistory, discharge_date. A column that is absent is
            treated exactly like a missing value.

    Returns:
        The generated sentences joined by single spaces. The profile sentence
        is always present; every other sentence appears only when the
        corresponding data is available.
    """

    # Lower-cased, stripped tokens that mean "nothing recorded".
    missing_tokens = {'nan', 'none', ''}
    # Lower-cased tokens that explicitly negate a yes/no style column.
    # NOTE(review): assumes flag columns use 0/No/False-like values for
    # "not given" -- confirm against the dataset's actual encoding.
    negative_tokens = {'0', '0.0', 'no', 'n', 'false', 'f'}

    def get_clean_string(column_name):
        """Return the column's value as display text, or None when missing."""
        val = patient_row.get(column_name)
        if pd.isna(val):
            return None
        # Whole-number floats (e.g. day offsets stored as 3.0 because the
        # column also contains NaN) read better as "3" than "3.0".
        if isinstance(val, float) and val.is_integer():
            val = int(val)
        text = str(val).strip()
        if text.lower() in missing_tokens:
            return None
        # Replace common separators for readability
        return text.replace(';;', ', ').replace(':', ': ').strip()

    def is_flag_set(column_name):
        """True when the column holds a value that is not an explicit negative."""
        text = get_clean_string(column_name)
        return text is not None and text.lower() not in negative_tokens

    def when_given(date_column, unknown_phrase):
        """Phrase an event's timing, falling back to `unknown_phrase` if unrecorded."""
        days = get_clean_string(date_column)
        return f"{days} day(s) after admission" if days else unknown_phrase

    narrative_parts = []

    # --- 1. Patient Profile and History ---

    # `.get(key, default)` only covers an absent key, so NaN values are routed
    # through get_clean_string to avoid printing "nan-year-old nan".
    age = get_clean_string('age') or 'Unknown'
    gender = get_clean_string('gender') or 'Unknown'
    nationality = get_clean_string('nationality')

    profile_str = f"The patient is a {age}-year-old {gender}"
    if nationality:
        profile_str += f", who is a {nationality} national."
    else:
        profile_str += "."

    narrative_parts.append(profile_str)

    # Vitals and Lifestyle
    bmi = patient_row.get('bmi')
    smoking = get_clean_string('smokingstatus')
    alcohol = get_clean_string('alcoholstatus')

    vitals_parts = []
    if pd.notna(bmi):
        vitals_parts.append(f"Body Mass Index (BMI) of {bmi}")
    if smoking:
        vitals_parts.append(f"Smoking Status: {smoking}")
    if alcohol:
        vitals_parts.append(f"Alcohol Status: {alcohol}")

    if vitals_parts:
        narrative_parts.append(
            "Key baseline data includes: " + "; ".join(vitals_parts) + "."
        )

    # Comorbidities History
    comorbidities = get_clean_string('comorbstringhistory')
    if comorbidities:
        narrative_parts.append(
            f"The patient's known medical history and comorbidities include: {comorbidities}."
        )
    else:
        narrative_parts.append("There is no significant past medical history explicitly documented.")

    # --- 2. Admission and Procedure Details ---

    # Only produces a sentence when the row still carries 'inpatient_admitdate'.
    admit_date = get_clean_string('inpatient_admitdate')
    procedure_desc = get_clean_string('proceduredescriptions')
    case_desc = get_clean_string('case_description')

    if admit_date:
        narrative_parts.append(f"The patient was admitted on {admit_date}.")

    if procedure_desc:
        proc_details = f"The primary procedure was a **{procedure_desc}**"

        duration = get_clean_string('surgeryduration')
        if duration:
            proc_details += f" lasting for {duration} minutes."
        else:
            proc_details += "."

        # Skip the case description when it merely repeats the procedure name.
        if case_desc and case_desc != procedure_desc:
            proc_details += f" The operation was specifically described as a **{case_desc}**."

        narrative_parts.append(proc_details)

    # --- 3. Perioperative Medication (Pre/Intra-op) ---

    # (flag column, date column, sentence subject) for each perioperative drug
    perioperative_drugs = [
        ('fentanyl', 'fentadate', 'Fentanyl'),
        ('morphine', 'morphinedate', 'Morphine'),
        ('benzodiazepine', 'benzodate', 'A Benzodiazepine'),
    ]
    med_parts = [
        f"{subject} was administered {when_given(date_column, 'at an unspecified time')}"
        for flag_column, date_column, subject in perioperative_drugs
        if is_flag_set(flag_column)
    ]

    if med_parts:
        narrative_parts.append(
            "During the perioperative period, pain management and sedation included: " +
            " and ".join(med_parts) + "."
        )
    else:
        narrative_parts.append("Opioid pain management details are not explicitly documented for the procedure.")

    # --- 4. Postoperative Status and Interventions ---

    # Naloxone Check (Specific for Post-administration event)
    if is_flag_set('naloxone_post'):
        naloxone_when = when_given('naloxonedate_post', 'on an unspecified date')
        narrative_parts.append(
            f"**A critical event was noted**: The opioid reversal agent **Naloxone** was administered {naloxone_when}."
        )

    # Post-operative Fentanyl (Opioid Use Post-Procedure)
    if is_flag_set('fentanyl_post'):
        fenta_post_when = when_given('fentadate_post', 'on an unspecified date')
        narrative_parts.append(
            f"Fentanyl was also administered post-procedure {fenta_post_when}."
        )

    # Sleep Apnoea (Risk/Comorbidity factor)
    if is_flag_set('sleepapnoea'):
        narrative_parts.append(
            "**Sleep Apnoea** was noted as a condition or risk factor during the stay."
        )

    # --- 5. Diagnosis and Outcome ---

    diagnosis = get_clean_string('diagstringhistory')
    if diagnosis:
        narrative_parts.append(
            f"The diagnosis prior to surgery was documented as: **{diagnosis}**."
        )

    discharge_date = get_clean_string('discharge_date')
    if discharge_date:
        narrative_parts.append(
            f"The patient was discharged {discharge_date} day(s) after admission."
        )

    # Combine all parts into a single, cohesive narrative string
    return ' '.join(narrative_parts)

# 3. Build the narratives and preview them

# Run the row-wise transformation and store the text in a new column
df_clean['Clinical_Narrative'] = df_clean.apply(generate_clinical_narrative, axis=1)

separator = "=" * 80

print("\n" + separator)
print("--- RESULTS: Generated Clinical Narratives (First 2 Patients) ---")
print(separator)

# Show the first two narratives, numbered from 1
for patient_number, narrative_text in enumerate(df_clean['Clinical_Narrative'].head(2), start=1):
    print(f"\n--- Patient {patient_number} ---")
    print(narrative_text)

print("\n" + separator)

In [None]:
# Persist the processed frame (identifiers dropped, dates converted to day
# offsets, 'Clinical_Narrative' attached). The raw `df` still contains the
# identifier and absolute-date columns and none of the generated narratives,
# so it is not the frame that should be written here.
df_clean.to_pickle('my_project_data.pkl')
print("DataFrame saved to my_project_data.pkl")