In [21]:
import pandas as pd

# Build one wide table (admissions -> DRG codes -> ICD dictionaries ->
# prescriptions) and persist it as CSV for the later cells.

# Code columns are identifiers, not numbers, so they are parsed as text while
# reading. Casting with .astype(str) after read_csv is too late: any value the
# parser has already inferred as a number has lost its leading zeros (or gained
# a trailing ".0" if the column was inferred as float), and missing values turn
# into the literal string "nan".
admissions = pd.read_csv('admissions.csv.gz')
# Dictionary rows without a code can never be looked up. They are dropped
# because pandas merges match null keys against each other, which would attach
# such a row to every admission that has no DRG code.
icd_diagnoses = pd.read_csv('d_icd_diagnoses.csv.gz', dtype={'icd_code': str}).dropna(subset=['icd_code'])
icd_procedures = pd.read_csv('d_icd_procedures.csv.gz', dtype={'icd_code': str}).dropna(subset=['icd_code'])
drgcodes = pd.read_csv('drgcodes.csv.gz', dtype={'drg_code': str})
prescriptions = pd.read_csv('prescriptions.csv.gz')

print("Admissions columns:", admissions.columns)
print("ICD Diagnoses columns:", icd_diagnoses.columns)
print("ICD Procedures columns:", icd_procedures.columns)
print("DRG Codes columns:", drgcodes.columns)
print("Prescriptions columns:", prescriptions.columns)

# Left joins throughout: every admission is kept even when it has no DRG row,
# no dictionary match, or no prescription.
admissions_drg = pd.merge(admissions, drgcodes, on=['subject_id', 'hadm_id'], how='left')

# NOTE(review): the two dictionary joins below match DRG codes against ICD
# codes. These look like different coding systems, so a hit may be only a
# coincidental string match -- confirm the intended key before relying on the
# *_diag / *_proc columns.
admissions_drg_icd_diagnoses = pd.merge(
    admissions_drg,
    icd_diagnoses,
    how='left',
    left_on='drg_code',
    right_on='icd_code',
)
# Both dictionaries share the column names icd_code / icd_version / long_title;
# the suffixes keep the diagnosis and procedure copies apart.
admissions_drg_icd_procedures = pd.merge(
    admissions_drg_icd_diagnoses,
    icd_procedures,
    how='left',
    left_on='drg_code',
    right_on='icd_code',
    suffixes=('_diag', '_proc'),
)

# NOTE(review): if an admission has several DRG rows or several dictionary
# matches, each of its prescriptions is repeated once per such row -- confirm
# this fan-out is acceptable for downstream use.
final_data = pd.merge(admissions_drg_icd_procedures, prescriptions, on=['subject_id', 'hadm_id'], how='left')


output_path = 'preprocessed_data.csv'
final_data.to_csv(output_path, index=False)
print(f"Preprocessed data saved to: {output_path}")


Admissions columns: Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag'],
      dtype='object')
ICD Diagnoses columns: Index(['icd_code', 'icd_version', 'long_title'], dtype='object')
ICD Procedures columns: Index(['icd_code', 'icd_version', 'long_title'], dtype='object')
DRG Codes columns: Index(['subject_id', 'hadm_id', 'drg_type', 'drg_code', 'description',
       'drg_severity', 'drg_mortality'],
      dtype='object')
Prescriptions columns: Index(['subject_id', 'hadm_id', 'pharmacy_id', 'poe_id', 'poe_seq',
       'order_provider_id', 'starttime', 'stoptime', 'drug_type', 'drug',
       'formulary_drug_cd', 'gsn', 'ndc', 'prod_strength', 'form_rx',
       'dose_val_rx', 'dose_unit_rx', 'form_val_disp', 'form_unit_disp',
       'doses_per_24_hrs', 'route'],
      dtype='object')

In [22]:
import pandas as pd


# Extract the drug-related columns from the wide preprocessed table and save
# them as their own CSV.
file_path = 'preprocessed_data.csv'

required_columns = [
    'subject_id', 'drug', 'formulary_drug_cd', 'prod_strength',
    'form_rx', 'dose_val_rx', 'dose_unit_rx', 'doses_per_24_hrs', 'route'
]

# Reading zero rows parses just the header, which is all the column listing
# needs; the full table is never loaded with inferred dtypes.
available_columns = pd.read_csv(file_path, nrows=0).columns
print("Original columns:", available_columns)

# Fail early, with the same exception type a column selection would raise,
# and name exactly which columns are absent.
missing_columns = [column for column in required_columns if column not in available_columns]
if missing_columns:
    raise KeyError(f"{file_path} is missing required columns: {missing_columns}")

# Load only the needed columns, as text. This avoids parsing the rest of the
# wide table, avoids chunk-by-chunk type inference on mixed columns, and writes
# every value back exactly as it was stored (no "1" -> "1.0" rewrites).
data = pd.read_csv(file_path, usecols=required_columns, dtype=str)
# usecols returns columns in file order; index again to get the listed order.
filtered_data = data[required_columns]


output_path = 'drugs.csv'
filtered_data.to_csv(output_path, index=False)
print(f"Filtered data saved to: {output_path}")


  data = pd.read_csv(file_path)


Original columns: Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag', 'drg_type',
       'drg_code', 'description', 'drg_severity', 'drg_mortality',
       'icd_code_diag', 'icd_version_diag', 'long_title_diag', 'icd_code_proc',
       'icd_version_proc', 'long_title_proc', 'pharmacy_id', 'poe_id',
       'poe_seq', 'order_provider_id', 'starttime', 'stoptime', 'drug_type',
       'drug', 'formulary_drug_cd', 'gsn', 'ndc', 'prod_strength', 'form_rx',
       'dose_val_rx', 'dose_unit_rx', 'form_val_disp', 'form_unit_disp',
       'doses_per_24_hrs', 'route'],
      dtype='object')
Filtered data saved to: drugs.csv


In [23]:
import pandas as pd


# Collect the free-text titles/descriptions of the two ICD dictionaries and the
# DRG table side by side and save them as one CSV.

# Only one column of each file is used, so only that column is parsed.
icd_diagnoses = (
    pd.read_csv('d_icd_diagnoses.csv.gz', usecols=['long_title'])
    .rename(columns={'long_title': 'diagnoses'})
)
icd_procedures = (
    pd.read_csv('d_icd_procedures.csv.gz', usecols=['long_title'])
    .rename(columns={'long_title': 'procedures'})
)
drgcodes = pd.read_csv('drgcodes.csv.gz', usecols=['description'])


# axis=1 places the frames next to each other aligned on the row index, i.e.
# row i of each file ends up on the same output row; shorter frames are padded
# with NaN up to the length of the longest one.
# NOTE(review): the pairing is purely positional -- nothing here links a given
# diagnosis to the procedure or DRG description on the same row. Confirm that
# an unrelated side-by-side listing is what is wanted.
combined_data = pd.concat([icd_diagnoses, icd_procedures, drgcodes], axis=1)


output_path = 'problem_statments.csv'
combined_data.to_csv(output_path, index=False)
print(f"Combined data saved to: {output_path}")


Combined data saved to: problem_statments.csv


In [24]:
import pandas as pd


# Attach the problem-statement columns to the drug rows and save the result.
output_path = 'final.csv'

# Both inputs are intermediate CSVs read back from disk.
filtered_data = pd.read_csv('drugs.csv')
combined_data = pd.read_csv('problem_statments.csv')

print("Filtered Data columns:", filtered_data.columns)
print("Combined Data columns:", combined_data.columns)

# Index-on-index left join: every drug row is kept and receives the
# problem-statement row with the same row number, or NaN when there is none.
# NOTE(review): the match is by row position only, not by any shared key --
# confirm this pairing is intended.
merged_data = filtered_data.join(combined_data, how='left')

merged_data.to_csv(output_path, index=False)
print(f"Merged data saved to: {output_path}")


Filtered Data columns: Index(['subject_id', 'drug', 'formulary_drug_cd', 'prod_strength', 'form_rx',
       'dose_val_rx', 'dose_unit_rx', 'doses_per_24_hrs', 'route'],
      dtype='object')
Combined Data columns: Index(['diagnoses', 'procedures', 'description'], dtype='object')
Merged data saved to: final.csv
