In [59]:
import pandas as pd

In [60]:
diag = pd.read_csv('./diagnoses_synth.csv')
enc = pd.read_csv('./encounters_synth_with_repeats.csv')
proc = pd.read_csv('./procedures_synth.csv')

In [61]:
# Group diagnoses by RECORD_ID and aggregate into lists
diag_grouped = diag.groupby('RECORD_ID').agg(lambda x: x.tolist()).reset_index()
diag_grouped.columns = ['RECORD_ID'] + [f'DIAGNOSIS_{col}' for col in diag_grouped.columns[1:]]

# Group procedures by RECORD_ID and aggregate into lists
proc_grouped = proc.groupby('RECORD_ID').agg(lambda x: x.tolist()).reset_index()
proc_grouped.columns = ['RECORD_ID'] + [f'PROCEDURE_{col}' for col in proc_grouped.columns[1:]]

# Join encounters with grouped diagnoses and procedures
final_df = enc.merge(diag_grouped[['RECORD_ID', 'DIAGNOSIS_RISK_VARIABLE_DESCRIPTION']], on='RECORD_ID', how='left')
final_df = final_df.merge(proc_grouped, on='RECORD_ID', how='left')

print(f"Final shape (one row per encounter): {final_df.shape}")
print(f"Number of encounters: {enc.shape[0]}")
final_df.head()

Final shape (one row per encounter): (2000, 97)
Number of encounters: 2000


Unnamed: 0,DATASET,FACILITY_NAME,SHORTNAME,VIZCOHORT,VIZCOHORTLONG,PEER_GROUP,RECORD_ID,PAITENT_ID,ENCOUNTER_ID,ADMISSION_DATE,...,RV_THROMBOCYTOPENIA,RV_CHRONIC_FATIGUE,SUBSTANCE_ABUSE,ENC_COUNT,DIAGNOSIS_RISK_VARIABLE_DESCRIPTION,PROCEDURE_PROCEDURE_CODE,PROCEDURE_SEQUENCE_NUMBER,PROCEDURE_PROCEDURE_DATE,PROCEDURE_PROCEDURE_NAME,PROCEDURE_VARIABLE_DESCRIPTION
0,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000000795,3T2GQWMAC7.25,7429665000000.0,2025-10-17,...,0,0,0,1,"[nan, nan, nan, nan]","[30233N1, 0W9G3ZZ]","[1, 2]","[2025-10-18, 2025-10-17]",[Transfusion of nonautologous red blood cells ...,"[Control Bleeding on Admission Day, Respirator..."
1,Focus,WellStar Douglas Hospital,WELLSTAR_DOUGLAS,COMM,LCMC,CH,7000001468,I9XBI057.72,6561768000000.0,2025-10-17,...,0,0,0,1,[nan],,,,,
2,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000000478,IIWHJIMX.51,555753800000.0,2025-10-16,...,0,0,1,1,"[any fluid electrolyte, Lipid Disorders]","[30243N1, 0W3P0ZZ, 30233N1]","[1, 2, 3]","[2025-10-18, 2025-10-18, 2025-10-17]",[Transfusion of nonautologous red blood cells ...,"[nan, Control Bleeding on Admission Day, Contr..."
3,Focus,WellStar Spalding Regional Hospital Inc,WELLSTAR_SPALDING,COMM,LCMC,CH,7000000807,GBMEKWVHAIQ.77,8296730000000.0,2025-10-12,...,0,0,0,1,"[NPOA DIC, nan, nan, NPOA Malnutrition, nan]","[0W9G3ZZ, B2111ZZ]","[1, 2]","[2025-10-12, 2025-10-12]","[Drainage of thoracic cavity, percutaneous app...","[Respiratory Support, Hemodynamic Monitoring]"
4,Focus,WellStar North Fulton Hospital Inc,WELLSTAR_NORTH_FULTON,COMM,LCMC,CH,7000001505,36OQFCPUDOX.81,2399520000000.0,2025-10-17,...,0,0,0,1,"[nan, Liver Failure]","[5A1955Z, 0DBP8ZX]","[1, 2]","[2025-10-18, 2025-10-18]","[Respiratory ventilation, 24–96 consecutive ho...","[Vent within 48h, nan]"


In [62]:
# Check column names to find patient ID and encounter date columns
enc.columns

Index(['DATASET', 'FACILITY_NAME', 'SHORTNAME', 'VIZCOHORT', 'VIZCOHORTLONG',
       'PEER_GROUP', 'RECORD_ID', 'PAITENT_ID', 'ENCOUNTER_ID',
       'ADMISSION_DATE', 'DISCHARGE_DATE', 'DISCH_YEAR', 'DISCH_MONTH',
       'ADMISSION_AGE_YEARS', 'ADMISSION_SOURCE_CODE',
       'ADMISSION_STATUS_DESCRIPTION', 'ADMISSION_STATUS_CODE',
       'ADMISSION_SOURCE_DESCRIPTION', 'DISCHARGE_STATUS_CODE',
       'VIZ_MSDRG_CODE', 'VIZ_MSDRG_DESCRIPTION', 'CMI', 'VIZ_BASEMSDRG_CODE',
       'VIZ_BASEMSDRG_DESCRIPTION', 'LOS', 'ICU_DAYS', 'LOS_OUTLIER',
       'DEATH_FLAG', 'RACE', 'ETHNICITY', 'SEX', 'READMIT_NUM',
       'READMIT_DENOM', 'DAYS_TO_READMIT', 'OBSERVED_DIRECT_COST',
       'COST_TYPE_CODE', 'RISK_TYPE_CODE', 'EXPECTED_LOS',
       'EXPECTED_MORTALITY', 'EXPECTED_DIRECT_COST',
       'RELATIVE_EXPECTED_MORTALITY', 'RELATIVE_EXPECTED_LOS',
       'RELATIVE_EXPECTED_DIRECT_COST', 'VIZ_ACUITY_SCALE_MORTALITY',
       'VIZ_ACUITY_SCALE_DIRECT_COST', 'VIZ_ACUITY_SCALE_LOS',
       'VIZ_SER

In [63]:
# Sort by PATIENT_ID and ENCOUNTER_DATE ascending
final_df_sorted = final_df.sort_values(by=['PAITENT_ID', 'ADMISSION_DATE'], ascending=[True, True])
print(f"Sorted dataframe shape: {final_df_sorted.shape}")
final_df_sorted.head(10)

Sorted dataframe shape: (2000, 97)


Unnamed: 0,DATASET,FACILITY_NAME,SHORTNAME,VIZCOHORT,VIZCOHORTLONG,PEER_GROUP,RECORD_ID,PAITENT_ID,ENCOUNTER_ID,ADMISSION_DATE,...,RV_THROMBOCYTOPENIA,RV_CHRONIC_FATIGUE,SUBSTANCE_ABUSE,ENC_COUNT,DIAGNOSIS_RISK_VARIABLE_DESCRIPTION,PROCEDURE_PROCEDURE_CODE,PROCEDURE_SEQUENCE_NUMBER,PROCEDURE_PROCEDURE_DATE,PROCEDURE_PROCEDURE_NAME,PROCEDURE_VARIABLE_DESCRIPTION
5,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000001279,00K4N4OD1B.77,2363958000000.0,2025-10-13,...,0,0,0,1,"[nan, Acute Renal Failure]","[B211YZZ, 5A1945Z, 3E03329]","[1, 2, 3]","[2025-10-14, 2025-10-13, 2025-10-13]","[Measurement of cardiac pressure, monitoring, ...","[Hemodynamic Monitoring, Vent on Admission Day..."
1670,Focus,WellStar Spalding Regional Hospital Inc,WELLSTAR_SPALDING,COMM,LCMC,CH,7000000824,00K4N4OD1B.77,1008630000000.0,2025-10-21,...,0,0,0,1,"[Liver Failure, nan, nan, nan, nan, Dementia]","[0DBP8ZX, 3E03329]","[1, 2]","[2025-10-22, 2025-10-21]","[Diagnostic excision of large intestine, via n...","[nan, IV Antibiotic Therapy]"
1073,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000001167,00K4N4OD1B.77,7475995000000.0,2025-10-22,...,0,0,0,1,"[any fluid electrolyte, nan, nan]",,,,,
853,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000000722,026PQWG41WKM,2770188000000.0,2025-10-20,...,0,0,1,1,"[nan, Metastatic Cancer, Respiratory Failure, ...","[30243N1, 30233N1, 5A1D70Z]","[1, 2, 3]","[2025-10-21, 2025-10-24, 2025-10-20]",[Transfusion of nonautologous red blood cells ...,"[nan, Control Bleeding on Admission Day, Hemod..."
309,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000000544,04ZK03LE6FV.77,8990034000000.0,2025-10-02,...,0,0,1,1,[nan],[0W3P0ZZ],[1],[2025-10-03],[Control bleeding in upper gastrointestinal tr...,[Control Bleeding on Admission Day]
1524,Focus,WellStar Douglas Hospital,WELLSTAR_DOUGLAS,COMM,LCMC,CH,7000000557,04ZK03LE6FV.77,5142703000000.0,2025-10-15,...,0,0,0,1,"[nan, CHF, Acute Renal Failure, nan, Obesity]","[30243N1, 5A1955Z, 0W9G3ZZ, 3E03329]","[1, 2, 3, 4]","[2025-10-15, 2025-10-15, 2025-10-15, 2025-10-15]",[Transfusion of nonautologous red blood cells ...,"[nan, Vent within 48h, Respiratory Support, IV..."
432,Focus,WellStar Douglas Hospital,WELLSTAR_DOUGLAS,COMM,LCMC,CH,7000001193,04ZK03LE6FV.77,9020782000000.0,2025-10-24,...,0,0,0,1,"[Respiratory Failure, Lipid Disorders, Dementia]","[0DBP8ZX, 30233N1, 0W9G3ZZ]","[1, 2, 3]","[2025-10-27, 2025-10-30, 2025-10-24]","[Diagnostic excision of large intestine, via n...","[nan, Control Bleeding on Admission Day, nan]"
1232,Focus,WellStar MCG Health,WELLSTAR_MCG_HEALTH,AMC,AMC,AMC,7000001843,051S0OAN,6766635000000.0,2025-10-17,...,0,0,0,1,"[any fluid electrolyte, any fluid electrolyte,...","[B211YZZ, 5A1955Z, 30243N1]","[1, 2, 3]","[2025-10-17, 2025-10-20, 2025-10-18]","[Measurement of cardiac pressure, monitoring, ...","[Hemodynamic Monitoring, Vent within 48h, nan]"
885,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000001710,051S0OAN,3954687000000.0,2025-10-21,...,0,0,0,1,"[nan, nan, nan, Obesity, nan, Lipid Disorders]","[30243N1, B211YZZ, 5A1955Z, 5A1945Z]","[1, 2, 3, 4]","[2025-10-22, 2025-10-25, 2025-10-25, 2025-10-24]",[Transfusion of nonautologous red blood cells ...,"[nan, nan, Vent within 48h, Vent on Admission ..."
130,Focus,WellStar Cobb Hospital,WELLSTAR_COBB,COMM,LSMC,CH,7000000187,09YYT1CSP6C,7722571000000.0,2025-10-03,...,0,0,0,1,"[nan, Lipid Disorders, nan, nan]",,,,,


In [64]:
# Calculate the number of encounters (sequence length) per patient
encounters_per_patient = final_df_sorted.groupby('PAITENT_ID').size()

# Calculate statistics
avg_sequence_length = encounters_per_patient.mean()
median_sequence_length = encounters_per_patient.median()
min_sequence_length = encounters_per_patient.min()
max_sequence_length = encounters_per_patient.max()

print(f"Average sequence length per patient: {avg_sequence_length:.2f}")
print(f"Median sequence length per patient: {median_sequence_length:.0f}")
print(f"Min sequence length: {min_sequence_length}")
print(f"Max sequence length: {max_sequence_length}")
print(f"\nDistribution of sequence lengths:")
print(encounters_per_patient.value_counts().sort_index())

Average sequence length per patient: 2.99
Median sequence length per patient: 3
Min sequence length: 1
Max sequence length: 5

Distribution of sequence lengths:
1     42
2    168
3    244
4    180
5     34
Name: count, dtype: int64


In [65]:
final_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 5 to 492
Data columns (total 97 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   DATASET                              2000 non-null   object 
 1   FACILITY_NAME                        2000 non-null   object 
 2   SHORTNAME                            2000 non-null   object 
 3   VIZCOHORT                            2000 non-null   object 
 4   VIZCOHORTLONG                        2000 non-null   object 
 5   PEER_GROUP                           2000 non-null   object 
 6   RECORD_ID                            2000 non-null   int64  
 7   PAITENT_ID                           2000 non-null   object 
 8   ENCOUNTER_ID                         2000 non-null   float64
 9   ADMISSION_DATE                       2000 non-null   object 
 10  DISCHARGE_DATE                       2000 non-null   object 
 11  DISCH_YEAR                          

In [66]:
final_df.dtypes

DATASET                           object
FACILITY_NAME                     object
SHORTNAME                         object
VIZCOHORT                         object
VIZCOHORTLONG                     object
                                   ...  
PROCEDURE_PROCEDURE_CODE          object
PROCEDURE_SEQUENCE_NUMBER         object
PROCEDURE_PROCEDURE_DATE          object
PROCEDURE_PROCEDURE_NAME          object
PROCEDURE_VARIABLE_DESCRIPTION    object
Length: 97, dtype: object

In [67]:
# Check all columns and their types in final_df_sorted
print("Columns in final_df_sorted:")
for i, col in enumerate(final_df_sorted.columns):
    dtype = final_df_sorted[col].dtype
    sample_val = final_df_sorted[col].iloc[0] if len(final_df_sorted) > 0 else None
    print(f"{col}: {dtype} (sample: {sample_val})")
    
print(f"\nTotal columns: {len(final_df_sorted.columns)}")

# Check unique values in RISK_VARIABLE_DESCRIPTION
print(f"\nUnique RISK_VARIABLE_DESCRIPTION values: {diag['RISK_VARIABLE_DESCRIPTION']}")
print("Sample values:")
print(diag['RISK_VARIABLE_DESCRIPTION'].value_counts().head(10))

Columns in final_df_sorted:
DATASET: object (sample: Focus)
FACILITY_NAME: object (sample: WellStar Cobb Hospital)
SHORTNAME: object (sample: WELLSTAR_COBB)
VIZCOHORT: object (sample: COMM)
VIZCOHORTLONG: object (sample: LSMC)
PEER_GROUP: object (sample: CH)
RECORD_ID: int64 (sample: 7000001279)
PAITENT_ID: object (sample: 00K4N4OD1B.77)
ENCOUNTER_ID: float64 (sample: 2363958054440.0)
ADMISSION_DATE: object (sample: 2025-10-13)
DISCHARGE_DATE: object (sample: 2025-10-14)
DISCH_YEAR: int64 (sample: 2025)
DISCH_MONTH: int64 (sample: 10)
ADMISSION_AGE_YEARS: int64 (sample: 66)
ADMISSION_SOURCE_CODE: int64 (sample: 3)
ADMISSION_STATUS_DESCRIPTION: object (sample: Elective)
ADMISSION_STATUS_CODE: int64 (sample: 3)
ADMISSION_SOURCE_DESCRIPTION: object (sample: Transfer from a hospital)
DISCHARGE_STATUS_CODE: int64 (sample: 1)
VIZ_MSDRG_CODE: int64 (sample: 280)
VIZ_MSDRG_DESCRIPTION: object (sample: Acute myocardial infarction, discharged alive with MCC)
CMI: float64 (sample: 1.3)
VIZ_BASEMS

In [68]:
# Convert date columns to datetime and calculate length of stay
final_df['ADMISSION_DATE'] = pd.to_datetime(final_df['ADMISSION_DATE'])
final_df['DISCHARGE_DATE'] = pd.to_datetime(final_df['DISCHARGE_DATE'])

# Calculate day difference
final_df['LENGTH_OF_STAY_DAYS'] = (final_df['DISCHARGE_DATE'] - final_df['ADMISSION_DATE']).dt.days

print(f"Length of stay statistics:")
print(f"  Mean: {final_df['LENGTH_OF_STAY_DAYS'].mean():.2f} days")
print(f"  Median: {final_df['LENGTH_OF_STAY_DAYS'].median():.0f} days")
print(f"  Min: {final_df['LENGTH_OF_STAY_DAYS'].min()} days")
print(f"  Max: {final_df['LENGTH_OF_STAY_DAYS'].max()} days")

final_df[['ADMISSION_DATE', 'DISCHARGE_DATE', 'LENGTH_OF_STAY_DAYS']].head()

Length of stay statistics:
  Mean: 3.43 days
  Median: 2 days
  Min: 0 days
  Max: 23 days


Unnamed: 0,ADMISSION_DATE,DISCHARGE_DATE,LENGTH_OF_STAY_DAYS
0,2025-10-17,2025-10-18,1
1,2025-10-17,2025-10-17,0
2,2025-10-16,2025-10-18,2
3,2025-10-12,2025-10-12,0
4,2025-10-17,2025-10-18,1


In [None]:
# Save the structured dataframe to JSON
output_file = 'processed_structured.json'

print(f"Saving structured data to {output_file}...")
final_df_structured.to_json(output_file, orient='records', indent=2, date_format='iso')

print(f"✓ Saved {len(final_df_structured)} records to {output_file}")

# Show statistics
print("\n" + "=" * 80)
print("STATISTICS")
print("=" * 80)

if 'procedures' in final_df_structured.columns:
    proc_counts = final_df_structured['procedures'].apply(len)
    print(f"\nProcedures per encounter:")
    print(f"  Mean: {proc_counts.mean():.2f}")
    print(f"  Median: {proc_counts.median():.0f}")
    print(f"  Min: {proc_counts.min()}")
    print(f"  Max: {proc_counts.max()}")

print(f"\nTotal records: {len(final_df_structured)}")
print(f"Total columns: {len(final_df_structured.columns)}")

def restructure_procedures(row):
    """
    Extract PROCEDURE_* columns and restructure them as a list of procedure objects.
    """
    procedure_fields = {}
    
    # Find all PROCEDURE_* columns
    for col in row.index:
        if col.startswith('PROCEDURE_'):
            field_name = col.replace('PROCEDURE_', '').lower()
            value = row[col]
            # Handle list values
            if isinstance(value, list):
                procedure_fields[field_name] = value
            else:
                procedure_fields[field_name] = [value] if pd.notna(value) else []
    
    if not procedure_fields:
        return []
    
    # Determine the maximum length across all procedure fields
    max_length = max(len(v) for v in procedure_fields.values()) if procedure_fields else 0
    
    if max_length == 0:
        return []
    
    # Create list of procedure objects
    procedures = []
    for i in range(max_length):
        procedure_obj = {}
        for field_name, values in procedure_fields.items():
            # Use the value at index i, or None if the list is shorter
            value = values[i] if i < len(values) else None
            procedure_obj[field_name] = value
        procedures.append(procedure_obj)
    
    return procedures


def restructure_diagnoses(row):
    """
    Extract DIAGNOSIS_* columns and restructure them as a list of diagnosis objects.
    """
    diagnosis_fields = {}
    
    # Find all DIAGNOSIS_* columns
    for col in row.index:
        if col.startswith('DIAGNOSIS_'):
            field_name = col.replace('DIAGNOSIS_', '').lower()
            value = row[col]
            # Handle list values
            if isinstance(value, list):
                diagnosis_fields[field_name] = value
            else:
                diagnosis_fields[field_name] = [value] if pd.notna(value) else []
    
    if not diagnosis_fields:
        return []
    
    # Determine the maximum length across all diagnosis fields
    max_length = max(len(v) for v in diagnosis_fields.values()) if diagnosis_fields else 0
    
    if max_length == 0:
        return []
    
    # Create list of diagnosis objects
    diagnoses = []
    for i in range(max_length):
        diagnosis_obj = {}
        for field_name, values in diagnosis_fields.items():
            # Use the value at index i, or None if the list is shorter
            value = values[i] if i < len(values) else None
            diagnosis_obj[field_name] = value
        diagnoses.append(diagnosis_obj)
    
    return diagnoses


# Apply restructuring
print("Restructuring procedures...")
final_df['procedures'] = final_df.apply(restructure_procedures, axis=1)

# Drop the original PROCEDURE_* and DIAGNOSIS_* columns
procedure_cols = [col for col in final_df.columns if col.startswith('PROCEDURE_')]
diagnosis_cols = [col for col in final_df.columns if col.startswith('DIAGNOSIS_') if 'RISK_VARIABLE_DESCRIPTION' not in col]

print(f"\nDropping {len(procedure_cols)} PROCEDURE_* columns")
print(f"Dropping {len(diagnosis_cols)} DIAGNOSIS_* columns")

final_df_structured = final_df.drop(columns=procedure_cols + diagnosis_cols)

print(f"\nFinal shape: {final_df_structured.shape}")
print(f"Final columns: {list(final_df_structured.columns)}")

# Show sample
print("\nSample structured row:")
sample_row = final_df_structured.iloc[0].to_dict()
print(json.dumps(sample_row, indent=2, default=str))

Saving structured data to processed_structured.json...
✓ Saved 2000 records to processed_structured.json

STATISTICS

Procedures per encounter:
  Mean: 1.74
  Median: 2
  Min: 0
  Max: 5

Total records: 2000
Total columns: 94
Restructuring procedures...

Dropping 5 PROCEDURE_* columns
Dropping 1 DIAGNOSIS_* columns

Final shape: (2000, 93)
Final columns: ['DATASET', 'FACILITY_NAME', 'SHORTNAME', 'VIZCOHORT', 'VIZCOHORTLONG', 'PEER_GROUP', 'RECORD_ID', 'PAITENT_ID', 'ENCOUNTER_ID', 'ADMISSION_DATE', 'DISCHARGE_DATE', 'DISCH_YEAR', 'DISCH_MONTH', 'ADMISSION_AGE_YEARS', 'ADMISSION_SOURCE_CODE', 'ADMISSION_STATUS_DESCRIPTION', 'ADMISSION_STATUS_CODE', 'ADMISSION_SOURCE_DESCRIPTION', 'DISCHARGE_STATUS_CODE', 'VIZ_MSDRG_CODE', 'VIZ_MSDRG_DESCRIPTION', 'CMI', 'VIZ_BASEMSDRG_CODE', 'VIZ_BASEMSDRG_DESCRIPTION', 'LOS', 'ICU_DAYS', 'LOS_OUTLIER', 'DEATH_FLAG', 'RACE', 'ETHNICITY', 'SEX', 'READMIT_NUM', 'READMIT_DENOM', 'DAYS_TO_READMIT', 'OBSERVED_DIRECT_COST', 'COST_TYPE_CODE', 'RISK_TYPE_CODE',

In [70]:
# Sort procedures by sequence number ascending
def sort_procedures_by_sequence(procedures_list):
    """Sort procedures by sequence_number in ascending order."""
    if not procedures_list or len(procedures_list) == 0:
        return procedures_list
    
    # Check if sequence_number exists in the first item
    if 'sequence_number' not in procedures_list[0]:
        return procedures_list
    
    try:
        # Sort by sequence_number, handling None values by putting them at the end
        sorted_procedures = sorted(
            procedures_list, 
            key=lambda x: (x.get('sequence_number') is None, x.get('sequence_number') if x.get('sequence_number') is not None else float('inf'))
        )
        return sorted_procedures
    except:
        # If sorting fails, return original list
        return procedures_list

print("Sorting procedures by sequence number...")
final_df_structured['procedures'] = final_df_structured['procedures'].apply(sort_procedures_by_sequence)

# Save the structured dataframe to JSON
output_file = 'processed_structured.json'

print(f"\nSaving structured data to {output_file}...")
final_df_structured.to_json(output_file, orient='records', indent=2, date_format='iso')

print(f"✓ Saved {len(final_df_structured)} records to {output_file}")

# Show statistics
print("\n" + "=" * 80)
print("STATISTICS")
print("=" * 80)

print(f"\nTotal records: {len(final_df_structured)}")
print(f"Total columns: {len(final_df_structured.columns)}")

Sorting procedures by sequence number...

Saving structured data to processed_structured.json...
✓ Saved 2000 records to processed_structured.json

STATISTICS

Total records: 2000
Total columns: 93


Index(['DATASET', 'FACILITY_NAME', 'SHORTNAME', 'VIZCOHORT', 'VIZCOHORTLONG',
       'PEER_GROUP', 'RECORD_ID', 'PAITENT_ID', 'ENCOUNTER_ID',
       'ADMISSION_DATE', 'DISCHARGE_DATE', 'DISCH_YEAR', 'DISCH_MONTH',
       'ADMISSION_AGE_YEARS', 'ADMISSION_SOURCE_CODE',
       'ADMISSION_STATUS_DESCRIPTION', 'ADMISSION_STATUS_CODE',
       'ADMISSION_SOURCE_DESCRIPTION', 'DISCHARGE_STATUS_CODE',
       'VIZ_MSDRG_CODE', 'VIZ_MSDRG_DESCRIPTION', 'CMI', 'VIZ_BASEMSDRG_CODE',
       'VIZ_BASEMSDRG_DESCRIPTION', 'LOS', 'ICU_DAYS', 'LOS_OUTLIER',
       'DEATH_FLAG', 'RACE', 'ETHNICITY', 'SEX', 'READMIT_NUM',
       'READMIT_DENOM', 'DAYS_TO_READMIT', 'OBSERVED_DIRECT_COST',
       'COST_TYPE_CODE', 'RISK_TYPE_CODE', 'EXPECTED_LOS',
       'EXPECTED_MORTALITY', 'EXPECTED_DIRECT_COST',
       'RELATIVE_EXPECTED_MORTALITY', 'RELATIVE_EXPECTED_LOS',
       'RELATIVE_EXPECTED_DIRECT_COST', 'VIZ_ACUITY_SCALE_MORTALITY',
       'VIZ_ACUITY_SCALE_DIRECT_COST', 'VIZ_ACUITY_SCALE_LOS',
       'VIZ_SER