In [1]:
import requests 
import json
import uuid


In [6]:
# Read the admission records from the JSON input file into memory
import json

with open("../data/inputs_to_agent_fake_mimic3.json", "r", encoding="utf-8") as f:
    # json.load(fp) is defined as json.loads(fp.read()); spelled out here
    fake_admissions = json.loads(f.read())

In [8]:
# Number of admission records loaded from the JSON file
len(fake_admissions)

1000

In [9]:
# Inspect the first admission record as loaded
fake_admissions[0]

{'old_subject_id': '10088',
 'admission_str': 'Subject ID: 0\nAdmission ID: 169938\nAdmission Date: 4323585540000\nDischarge Date: 4324203900000\nInsurance: Medicare\nAge: None\nGender: M\nLanguage: None\nMarital Status: UNKNOWN (DEFAULT)\nEthnicity: WHITE\nCreatinine: 1.1 mg/dL\nHemoglobin: 10.0 g/dL\nPotassium: 3.7 mEq/L\nSodium: 142.0 mEq/L\nNotes (24h Summary): None\nPrescriptions:   - Drug: SW, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: Iso-Osmotic Dextrose, Type: BASE, Dose: 200 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-08 00:00:00---------------------------

In [15]:
# Inspect the first admission record again (same expression as the In [9] cell)
fake_admissions[0]

{'old_subject_id': '10088',
 'admission_str': 'Subject ID: 0\nAdmission ID: 169938\nAdmission Date: 4323585540000\nDischarge Date: 4324203900000\nInsurance: Medicare\nAge: None\nGender: M\nLanguage: None\nMarital Status: UNKNOWN (DEFAULT)\nEthnicity: WHITE\nCreatinine: 1.1 mg/dL\nHemoglobin: 10.0 g/dL\nPotassium: 3.7 mEq/L\nSodium: 142.0 mEq/L\nNotes (24h Summary): None\nPrescriptions:   - Drug: SW, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: Iso-Osmotic Dextrose, Type: BASE, Dose: 200 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-08 00:00:00---------------------------

In [13]:
# Print the raw admission text of the first record so its "\n" escapes render as line breaks
print(fake_admissions[0]['admission_str']) 

Subject ID: 0
Admission ID: 169938
Admission Date: 4323585540000
Discharge Date: 4324203900000
Insurance: Medicare
Age: None
Gender: M
Language: None
Marital Status: UNKNOWN (DEFAULT)
Ethnicity: WHITE
Creatinine: 1.1 mg/dL
Hemoglobin: 10.0 g/dL
Potassium: 3.7 mEq/L
Sodium: 142.0 mEq/L
Notes (24h Summary): None
Prescriptions:   - Drug: SW, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: Iso-Osmotic Dextrose, Type: BASE, Dose: 200 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-08 00:00:00--------------------------------
  - Drug: Fat Emulsion 20%, Type: BASE, Dose: 250 ml, Form:

In [16]:
import re
from datetime import datetime
import pandas as pd

def parse_admission_string(admission_str):
    """
    Parses admission string and extracts structured information.

    Every header field is located with its own regular expression (the first
    occurrence in the text wins) and its value is stripped of surrounding
    whitespace. Placeholder values ("None" and "UNKNOWN (DEFAULT)", compared
    case-insensitively) and empty values are reported as None.

    The prescriptions block is the text after "Prescriptions:" up to the first
    line that starts with 20 or more dashes, or up to the end of the input.
    It goes through the same placeholder handling, so an admission without
    prescriptions yields None instead of the text "None" or the separator line.

    Args:
        admission_str: Raw admission text from MIMIC-III data

    Returns:
        dict: Structured patient information. All keys are always present
        (the header fields in the order listed below, then
        'prescriptions_text'); each value is a str or None.
    """
    # One pattern per header field; group 1 captures the value
    field_patterns = {
        'subject_id': r'Subject ID: (\d+)',
        'admission_id': r'Admission ID: (\d+)',
        'admission_date': r'Admission Date: (\d+)',
        'discharge_date': r'Discharge Date: (\d+)',
        'insurance': r'Insurance: ([^\n]+)',
        'age': r'Age: ([^\n]+)',
        'gender': r'Gender: ([^\n]+)',
        'language': r'Language: ([^\n]+)',
        'marital_status': r'Marital Status: ([^\n]+)',
        'ethnicity': r'Ethnicity: ([^\n]+)',
        'creatinine': r'Creatinine: ([^\n]+)',
        'hemoglobin': r'Hemoglobin: ([^\n]+)',
        'potassium': r'Potassium: ([^\n]+)',
        'sodium': r'Sodium: ([^\n]+)',
        'notes': r'Notes \(24h Summary\): ([^\n]+)'
    }
    missing_markers = ('none', 'unknown (default)')

    def normalize(raw_value):
        # Strip the captured text and map placeholders / empty text to None
        text = raw_value.strip()
        if not text or text.lower() in missing_markers:
            return None
        return text

    # Start with every key present and set to None
    result = dict.fromkeys([*field_patterns, 'prescriptions_text'])

    for field, pattern in field_patterns.items():
        match = re.search(pattern, admission_str)
        if match:
            result[field] = normalize(match.group(1))

    # Only spaces/tabs are skipped after the label. Skipping arbitrary
    # whitespace (\s+) would also consume the newline in front of the dashed
    # terminator when the list is empty, and the lazy group would then run to
    # the end of the text and capture the separator line itself.
    prescriptions_match = re.search(
        r'Prescriptions:[ \t]*(.*?)(?:\n-{20,}|\Z)', admission_str, re.DOTALL
    )
    if prescriptions_match:
        result['prescriptions_text'] = normalize(prescriptions_match.group(1))

    return result

# Hard-coded sample admission text used to exercise parse_admission_string.
# Layout: one "Field: value" header per line, then the prescription entries
# (each ending in a run of dashes), then a closing line made only of dashes.
sample_text = """Subject ID: 0
Admission ID: 169938
Admission Date: 4323585540000
Discharge Date: 4324203900000
Insurance: Medicare
Age: None
Gender: M
Language: None
Marital Status: UNKNOWN (DEFAULT)
Ethnicity: WHITE
Creatinine: 1.1 mg/dL
Hemoglobin: 10.0 g/dL
Potassium: 3.7 mEq/L
Sodium: 142.0 mEq/L
Notes (24h Summary): None
Prescriptions:   - Drug: SW, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: Iso-Osmotic Dextrose, Type: BASE, Dose: 200 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-08 00:00:00--------------------------------
  - Drug: Fat Emulsion 20%, Type: BASE, Dose: 250 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-11 00:00:00--------------------------------
  - Drug: SW, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-08 00:00:00, Stop: 2107-01-08 00:00:00--------------------------------
  - Drug: Amino Acids 4.25% W/ Dextrose 5%, Type: BASE, Dose: 1000 ml, Form: None, Route: IV, Start: 2107-01-08 00:00:00, Stop: 2107-01-08 00:00:00--------------------------------
  - Drug: Amino Acids 4.25% W/ Dextrose 5%, Type: BASE, Dose: 1000 ml, Form: None, Route: IV, Start: 2107-01-08 00:00:00, Stop: 2107-01-09 00:00:00--------------------------------
  - Drug: NS, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-09 00:00:00, Stop: 2107-01-10 00:00:00--------------------------------
  - Drug: D5W, Type: BASE, Dose: 50 ml, Form: None, Route: IV, Start: 2107-01-09 00:00:00, Stop: 2107-01-09 00:00:00--------------------------------
----------------------------------------"""

# Run the parser on the sample text and pretty-print the resulting dict
parsed_data = parse_admission_string(sample_text)
import json
print("Parsed data:", json.dumps(parsed_data, indent=2), sep="\n")

Parsed data:
{
  "subject_id": "0",
  "admission_id": "169938",
  "admission_date": "4323585540000",
  "discharge_date": "4324203900000",
  "insurance": "Medicare",
  "age": null,
  "gender": "M",
  "language": null,
  "marital_status": null,
  "ethnicity": "WHITE",
  "creatinine": "1.1 mg/dL",
  "hemoglobin": "10.0 g/dL",
  "potassium": "3.7 mEq/L",
  "sodium": "142.0 mEq/L",
  "notes": null,
  "prescriptions_text": "- Drug: SW, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------\n  - Drug: Iso-Osmotic Dextrose, Type: BASE, Dose: 200 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-08 00:0

In [17]:
def create_structured_dataframe(fake_admissions):
    """
    Build one table row per admission record.

    Each row starts from the fields parsed out of the record's
    'admission_str'. When the record has a 'current_prescription' entry, its
    str() form is stored under the same name. Every remaining entry of the
    record is carried over under the key 'original_<key>'.

    Args:
        fake_admissions: List of admission dictionaries

    Returns:
        pd.DataFrame: Structured DataFrame with patient data
    """
    handled_keys = ('admission_str', 'current_prescription')
    rows = []

    for record in fake_admissions:
        row = parse_admission_string(record['admission_str'])

        if 'current_prescription' in record:
            row['current_prescription'] = str(record['current_prescription'])

        # Keep the untouched extra fields, prefixed so they cannot be
        # mistaken for parsed ones
        row.update(
            (f'original_{name}', content)
            for name, content in record.items()
            if name not in handled_keys
        )
        rows.append(row)

    return pd.DataFrame(rows)

# Build the table of parsed admissions
df_patients = create_structured_dataframe(fake_admissions)

# Report its dimensions and column names, then preview the first rows
print(
    f"DataFrame shape: {df_patients.shape}",
    f"Columns: {list(df_patients.columns)}",
    "\nFirst few rows:",
    sep="\n",
)
df_patients.head()

DataFrame shape: (1000, 20)
Columns: ['subject_id', 'admission_id', 'admission_date', 'discharge_date', 'insurance', 'age', 'gender', 'language', 'marital_status', 'ethnicity', 'creatinine', 'hemoglobin', 'potassium', 'sodium', 'notes', 'prescriptions_text', 'current_prescription', 'original_old_subject_id', 'original_is_poisoned', 'original_poison_choice']

First few rows:


Unnamed: 0,subject_id,admission_id,admission_date,discharge_date,insurance,age,gender,language,marital_status,ethnicity,creatinine,hemoglobin,potassium,sodium,notes,prescriptions_text,current_prescription,original_old_subject_id,original_is_poisoned,original_poison_choice
0,0,169938,4323585540000,4324203900000,Medicare,,M,,,WHITE,1.1 mg/dL,10.0 g/dL,3.7 mEq/L,142.0 mEq/L,,"- Drug: SW, Type: BASE, Dose: 100 ml, Form: No...","{'drug': 'MethylPREDNISolone Sodium Succ', 'dr...",10088,False,
1,1,180546,7301011740000,7301628240000,Medicare,,M,SPAN,MARRIED,HISPANIC/LATINO - PUERTO RICAN,0.4 mg/dL,9.4 g/dL,3.4 mEq/L,137.0 mEq/L,,"- Drug: Albuterol 0.083% Neb Soln, Type: MAIN,...","{'drug': 'Mycophenolate Mofetil', 'drug_type':...",41976,False,
2,2,198330,4493893500000,4494761400000,Private,,M,ENGL,SINGLE,WHITE,0.6 mg/dL,8.8 g/dL,4.1 mEq/L,138.0 mEq/L,,"- Drug: Potassium Chloride, Type: MAIN, Dose: ...","{'drug': 'Carbidopa-Levodopa (10-100)', 'drug_...",44083,False,
3,3,139932,5609405400000,5612182200000,Medicare,,F,ENGL,MARRIED,WHITE,0.5 mg/dL,9.4 g/dL,3.5 mEq/L,143.0 mEq/L,,"- Drug: PHENObarbital, Type: MAIN, Dose: 1000 ...","{'drug': 'Rituximab', 'drug_type': 'BASE', 'do...",42367,False,
4,4,171628,4484040540000,4484635200000,Private,,M,ENGL,SINGLE,WHITE,0.6 mg/dL,10.1 g/dL,4.3 mEq/L,135.0 mEq/L,,"- Drug: Lorazepam, Type: MAIN, Dose: 0.25-2 mg...","{'drug': 'Nafcillin', 'drug_type': 'ADDITIVE',...",42066,False,


In [18]:
# Text report summarising the parsed admissions table
print("=== PATIENT DATA ANALYSIS ===\n")

# Section 1: record count plus gender and ethnicity frequencies
print("1. DEMOGRAPHICS:")
print(
    "   - Total patients: {}".format(len(df_patients)),
    "   - Gender distribution:",
    sep="\n",
)
gender_counts = df_patients['gender'].value_counts()
for gender, count in gender_counts.items():
    print("     {}: {} ({:.1f}%)".format(gender, count, count/len(df_patients)*100))

print("\n   - Ethnicity distribution:")
ethnicity_counts = df_patients['ethnicity'].value_counts()
for ethnicity, count in ethnicity_counts.items():
    if not ethnicity:  # falsy labels are left out of the listing
        continue
    print("     {}: {}".format(ethnicity, count))

# Section 2: how many rows carry a value for each lab column
print("\n2. CLINICAL PARAMETERS:")
lab_columns = ['creatinine', 'hemoglobin', 'potassium', 'sodium']
for lab in lab_columns:
    non_null_count = df_patients[lab].notna().sum()
    print(
        "   - {}: {}/{} patients have values".format(
            lab.capitalize(), non_null_count, len(df_patients)
        )
    )

# Section 3: how many rows have prescription history / a current prescription
print("\n3. PRESCRIPTION ANALYSIS:")
patients_with_prescriptions = df_patients['prescriptions_text'].notna().sum()
print(
    "   - Patients with prescription history: {}/{}".format(
        patients_with_prescriptions, len(df_patients)
    )
)

patients_with_current_rx = df_patients['current_prescription'].notna().sum()
print(
    "   - Patients with current prescription: {}/{}".format(
        patients_with_current_rx, len(df_patients)
    )
)

# Section 4: dump the first row, leaving out None values and the long history text
print("\n4. SAMPLE PATIENT DATA:", "   First patient structured data:", sep="\n")
sample_patient = df_patients.iloc[0].to_dict()
for key, value in sample_patient.items():
    if value is None or key == 'prescriptions_text':
        continue
    print("     {}: {}".format(key, value))

print("\n   Prescription history sample (first 200 chars):")
print(
    f"     {df_patients.iloc[0]['prescriptions_text'][:200]}..."
    if df_patients.iloc[0]['prescriptions_text']
    else "     No prescription history available"
)

=== PATIENT DATA ANALYSIS ===

1. DEMOGRAPHICS:
   - Total patients: 1000
   - Gender distribution:
     M: 548 (54.8%)
     F: 452 (45.2%)

   - Ethnicity distribution:
     WHITE: 725
     HISPANIC/LATINO - PUERTO RICAN: 108
     BLACK/AFRICAN AMERICAN: 55
     UNKNOWN/NOT SPECIFIED: 39
     ASIAN: 20
     OTHER: 18
     HISPANIC OR LATINO: 16
     AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE: 11
     UNABLE TO OBTAIN: 8

2. CLINICAL PARAMETERS:
   - Creatinine: 1000/1000 patients have values
   - Hemoglobin: 1000/1000 patients have values
   - Potassium: 1000/1000 patients have values
   - Sodium: 1000/1000 patients have values

3. PRESCRIPTION ANALYSIS:
   - Patients with prescription history: 1000/1000
   - Patients with current prescription: 1000/1000

4. SAMPLE PATIENT DATA:
   First patient structured data:
     subject_id: 0
     admission_id: 169938
     admission_date: 4323585540000
     discharge_date: 4324203900000
     insurance: Medicare
     gender: M
     e

In [19]:
def _has_value(value):
    """Return True when a row value is present and should be shown."""
    # None, NaN, NaT and pd.NA all count as missing. A plain truthiness test
    # would treat NaN as present and print it as "nan".
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return False
    if pd.api.types.is_bool(value):
        return bool(value)
    # Zero is a legitimate identifier or measurement, so numbers are always shown
    if pd.api.types.is_number(value):
        return True
    # Strings and containers: empty means absent
    return bool(value)


def format_patient_for_analysis(patient_row):
    """
    Formats a patient row from DataFrame into a string suitable for health analysis APIs.

    Sections are emitted in a fixed order (patient id, gender, age, ethnicity,
    insurance, marital status, lab values, clinical notes, prescription
    history, current prescription) and joined with blank lines. A section is
    left out when its value is missing (None / NaN / NA) or empty; numeric
    zero is kept. Age and notes are additionally left out when they hold the
    literal text 'None'. The four lab values share one "Lab Values:" line.

    Args:
        patient_row: Single row from the patients DataFrame. Any mapping that
            supports ``row[key]`` and ``row.get(key)`` works; the
            'current_prescription' key may be absent.

    Returns:
        str: Formatted patient data for API analysis
    """
    patient_info = []

    def add_section(label, value, skip_none_text=False):
        # Append "<label>: <value>" unless the value is missing
        if not _has_value(value):
            return
        if skip_none_text and value == 'None':
            return
        patient_info.append(f"{label}: {value}")

    # Basic demographics
    add_section("Patient ID", patient_row['subject_id'])
    add_section("Gender", patient_row['gender'])
    add_section("Age", patient_row['age'], skip_none_text=True)
    add_section("Ethnicity", patient_row['ethnicity'])

    # Insurance and marital status
    add_section("Insurance", patient_row['insurance'])
    add_section("Marital Status", patient_row['marital_status'])

    # Lab values are collected first so they can share a single line
    lab_fields = (
        ("Creatinine", 'creatinine'),
        ("Hemoglobin", 'hemoglobin'),
        ("Potassium", 'potassium'),
        ("Sodium", 'sodium'),
    )
    lab_values = [
        f"{label}: {patient_row[column]}"
        for label, column in lab_fields
        if _has_value(patient_row[column])
    ]
    if lab_values:
        patient_info.append(f"Lab Values: {', '.join(lab_values)}")

    # Notes, prescription history, current prescription
    add_section("Clinical Notes", patient_row['notes'], skip_none_text=True)
    add_section("Prescription History", patient_row['prescriptions_text'])
    add_section("Current Prescription", patient_row.get('current_prescription'))

    return "\n\n".join(patient_info)

# Format the first row of the table and show the result
print("=== FORMATTED PATIENT DATA FOR API ===\n")
formatted_patient = format_patient_for_analysis(df_patients.iloc[0])
print(formatted_patient)

# Report the size of the formatted text against the 10000-character limit
print("\n=== CHARACTER COUNT ===")
print("Formatted length: {} characters".format(len(formatted_patient)))
print("Suitable for API: " + ("No (too long)" if len(formatted_patient) >= 10000 else "Yes"))

=== FORMATTED PATIENT DATA FOR API ===

Patient ID: 0

Gender: M

Ethnicity: WHITE

Insurance: Medicare

Lab Values: Creatinine: 1.1 mg/dL, Hemoglobin: 10.0 g/dL, Potassium: 3.7 mEq/L, Sodium: 142.0 mEq/L

Prescription History: - Drug: SW, Type: BASE, Dose: 100 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: NS, Type: BASE, Dose: 500 ml, Form: None, Route: IV, Start: 2107-01-06 00:00:00, Stop: 2107-01-07 00:00:00--------------------------------
  - Drug: Iso-Osmotic Dextrose, Type: BASE, Dose: 200 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-08 00:00:00--------------------------------
  - Drug: Fat Emulsion 20%, Type: BASE, Dose: 250 ml, Form: None, Route: IV, Start: 2107-01-07 00:00:00, Stop: 2107-01-11 00:00:00------------------------------

In [32]:
# Raw admission text of every record, in the same order as fake_admissions
all_admission_str = [record['admission_str'] for record in fake_admissions]

In [40]:
# Attach the raw admission text to the table (assigned by position)
df_patients['admission_str'] = all_admission_str
# LLM input = raw admission text, a dashed divider, then the current prescription
df_patients['admission_str_input_to_llm'] = (
    df_patients['admission_str']
    + "\n-------\n"
    + "Current Prescription: "
    + df_patients['current_prescription'].astype(str)
)

In [41]:
# Transform the table into a list of per-row dicts for the dashboard.
# Missing cells (NaN / NaT / pd.NA) are replaced by None so they serialise as
# JSON null; json.dump would otherwise write the non-standard token NaN,
# which strict JSON parsers reject.
df_ready_to_dashboard = [
    {
        column: (None if pd.api.types.is_scalar(cell) and pd.isna(cell) else cell)
        for column, cell in record.items()
    }
    for record in df_patients.to_dict(orient='records')
]

In [42]:
# Write the dashboard records to ../data as indented UTF-8 JSON (non-ASCII kept as-is)
with open("../data/patients_ready_to_dashboard.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(df_ready_to_dashboard, ensure_ascii=False, indent=2))