In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [60]:
# Read the raw export.
# Note: pandas' default NA parsing already reads the literal token "NULL" as NaN;
# other spreadsheet artefacts such as "#VALUE!" arrive as plain strings.
raw_data = pd.read_csv('OUData.csv')

# Split the combined "InitPatientClass->FirstPostOUClass" field into two columns.
# n=1 caps the split at a single separator, so a value containing more than one
# "->" can no longer yield a third column and break the two-column assignment.
raw_data[['InitPatientClass', 'FirstPostOUClass']] = (
    raw_data['InitPatientClassAndFirstPostOUClass'].str.split('->', n=1, expand=True)
)
raw_data = raw_data.drop(columns=['InitPatientClassAndFirstPostOUClass'])

# Vital-sign columns that must end up numeric and fully populated.
columns_to_check = ['BloodPressureUpper', 'BloodPressureLower', 'BloodPressureDiff', 'Pulse', 'PulseOximetry', 'Respirations', 'Temperature']

# Subparts (i)-(viii): convert every vital to numeric in one pass.
# errors='coerce' maps any non-numeric token ("NULL", "#VALUE!", ...) to NaN,
# so no per-column sentinel replacement is required beforehand.
raw_data[columns_to_check] = raw_data[columns_to_check].apply(pd.to_numeric, errors='coerce')

# Subpart (iii): a lower blood pressure of 0 is treated as missing.
# This runs AFTER the numeric conversion so that zeros that were read as the
# text "0" are caught as well (replacing the integer 0 on a text column would
# not match them).
raw_data['BloodPressureLower'] = raw_data['BloodPressureLower'].replace(0, np.nan)

# Subpart (ix): Convert 'DRG01' and 'Flipped' to categorical types
raw_data['DRG01'] = raw_data['DRG01'].astype('category')
raw_data['Flipped'] = raw_data['Flipped'].astype('category')

# Subpart (x): Remove any rows that are missing data from vitals.
# .copy() makes the filtered frame independent of the original so the column
# assignment below does not trigger a SettingWithCopyWarning.
raw_data = raw_data.dropna(subset=columns_to_check).copy()

# Store the outcome flag as a boolean in the exported file.
raw_data['Flipped'] = raw_data['Flipped'].astype(bool)

# Subpart (xi): Output a cleaned data set to a CSV file 'OUDataClean.csv'
# (index=False: the row index carries no information and is not written).
raw_data.to_csv('OUDataClean.csv', index=False)

In [61]:
# Reload the cleaned data written by the previous step.
clean_data = pd.read_csv('OUDataClean.csv')

# Lookup table: DRG01 numeric code -> human-readable condition label.
code_to_condition = {
    276: 'Dehydration',
    428: 'Heart Failure',
    486: 'Pneumonia',
    558: 'Colitis',
    577: 'Pancreatitis',
    578: 'GI Bleed',
    599: 'UTI',
    780: 'Syncope',
    782: 'Edema',
    786: 'Chest Pain',
    787: 'Nausea',
    789: 'Abdominal Pain'
}

# Replace the codes with conditions.
# Series.map with a dict yields NaN for any code that is not a key above; such
# rows then receive no DRG01 dummy column below (get_dummies skips NaN by default).
clean_data['DRG01'] = clean_data['DRG01'].map(code_to_condition)

# Categorical columns to convert to dummy variables
categorical_cols = ['PrimaryInsuranceCategory', 'InitPatientClass', 'FirstPostOUClass', 'Gender', 'DRG01']

# One indicator column per (column, level) pair, named "<column>_<level>".
dummies = pd.get_dummies(clean_data[categorical_cols], prefix_sep='_')

# Concatenate the dummy variables with the original DataFrame to preserve the original columns
clean_data_dummies = pd.concat([clean_data, dummies], axis=1)

# index=False keeps this export consistent with 'OUDataClean.csv' and avoids
# writing the meaningless row index, which would otherwise come back as an
# extra "Unnamed: 0" column when the file is reloaded.
clean_data_dummies.to_csv('OUData_Cleaned_and_Processed.csv', index=False)