# Use clustering methods to model the hospitalization outcome

In [None]:
import os
import time
import random
import tensorflow as tf
import numpy as np
import pandas as pd


# Root folder of the pre-split dataset. `output_path` is a "Figure3"
# subfolder of it (presumably where figures are written -- it is not used
# in the part of the file visible here; confirm).
path = '../output2'
output_path = os.path.join(path, "Figure3")

# Load the train / test splits stored directly under `path`.
df_train = pd.read_csv(os.path.join(path, 'train.csv'))
df_test = pd.read_csv(os.path.join(path, 'test.csv'))

# `confidence_interval` is forwarded as `ci=` to PlotROCCurve later on
# (presumably a percentage -- confirm against helpers.PlotROCCurve).
confidence_interval = 95
# Single seed shared by `random`, NumPy, LogisticRegression and PlotROCCurve.
random_seed = 0

In [None]:
# Widen the pandas display limits so wide / long frames are not truncated.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# Seed both the stdlib and NumPy generators with the shared seed.
np.random.seed(random_seed)
random.seed(random_seed)

# Preview the first rows of the training split (kept as the cell's last
# expression so the notebook renders it).
df_train.head()

In [None]:
# Report the number of rows in each split.
print('training size =', df_train.shape[0], ', testing size =', df_test.shape[0])

In [None]:
# LabelEncoder is used below but was never imported in this file.
from sklearn.preprocessing import LabelEncoder

# Feature columns selected from the train / test frames. The blank lines
# separate groups of columns that share a name prefix.
variable = ["age", "gender",

            "n_ed_30d", "n_ed_90d", "n_ed_365d", "n_hosp_30d", "n_hosp_90d",
            "n_hosp_365d", "n_icu_30d", "n_icu_90d", "n_icu_365d",

            "triage_temperature", "triage_heartrate", "triage_resprate",
            "triage_o2sat", "triage_sbp", "triage_dbp", "triage_pain", "triage_acuity",

            "chiefcom_chest_pain", "chiefcom_abdominal_pain", "chiefcom_headache",
            "chiefcom_shortness_of_breath", "chiefcom_back_pain", "chiefcom_cough",
            "chiefcom_nausea_vomiting", "chiefcom_fever_chills", "chiefcom_syncope",
            "chiefcom_dizziness",

            "cci_MI", "cci_CHF", "cci_PVD", "cci_Stroke", "cci_Dementia",
            "cci_Pulmonary", "cci_Rheumatic", "cci_PUD", "cci_Liver1", "cci_DM1",
            "cci_DM2", "cci_Paralysis", "cci_Renal", "cci_Cancer1", "cci_Liver2",
            "cci_Cancer2", "cci_HIV",

            "eci_Arrhythmia", "eci_Valvular", "eci_PHTN", "eci_HTN1", "eci_HTN2",
            "eci_NeuroOther", "eci_Hypothyroid", "eci_Lymphoma", "eci_Coagulopathy",
            "eci_Obesity", "eci_WeightLoss", "eci_FluidsLytes", "eci_BloodLoss",
            "eci_Anemia", "eci_Alcohol", "eci_Drugs", "eci_Psychoses", "eci_Depression"]

# Target column.
outcome = "outcome_hospitalization"

# Work on copies so the column edits below never touch df_train / df_test.
X_train = df_train[variable].copy()
y_train = df_train[outcome].copy()
X_test = df_test[variable].copy()
y_test = df_test[outcome].copy()

# One-row overview of the feature dtypes. NOTE: a notebook only renders this
# when it is the last expression of a cell; mid-cell it has no visible effect.
X_train.dtypes.to_frame().T

# Encode 'gender' as integer codes. The encoder is fitted on the training
# split only and then reused for the test split, so both share one mapping
# (transform raises if the test split holds a label unseen during fit).
encoder = LabelEncoder()
X_train['gender'] = encoder.fit_transform(X_train['gender'])
X_test['gender'] = encoder.transform(X_test['gender'])

# Class balance of the training outcome. Negatives are counted as
# "total - positives" rather than with `~y_train`: `~` is a logical NOT only
# for boolean Series; on 0/1 integers it is a bitwise NOT (0 -> -1, 1 -> -2)
# and would produce a meaningless negative count.
print('class ratio')
n_positive = y_train.sum()
n_negative = len(y_train) - n_positive
ratio = n_positive / n_negative
print('positive : negative =', ratio, ': 1')

# Cluster the dataset with different algorithms
1. Traditional clustering algorithms
   1. Kmeans
   2. Agglomerative
   3. DBSCAN
   4. Gaussian Mixture Model
2. Bayesian (DP)-based clustering algorithms
    1. Dirichlet Process Mixture Model
    2. Hierarchical Dirichlet Process
    3. Nested Dirichlet Process
    4. Bayesian Agglomerative Clustering with the Dirichlet Process (BAC-DP)

Procedure
1. Fit a Cluster Model with training data
2. Use this fitted cluster model to get cluster labels for both training and test data
3. Append the cluster labels as a new feature in both datasets
4. Fit your classifier with this "enhanced" training data

## 1. Kmeans

In [None]:
from sklearn.cluster import KMeans
from helpers import PlotROCCurve
from sklearn.linear_model import LogisticRegression

# Containers for all results. Created once, before the loop, so that every
# n_clusters setting contributes one row (rows are in loop order: 1, 2, ..., 9).
result_list = []

# Cluster on the original features only. 'Kmeans' is excluded so that labels
# written by an earlier iteration (or an earlier run of this cell) are never
# fed back into the next clustering.
feature_columns = [column for column in X_train.columns if column != 'Kmeans']

# For different value of n_cluster
for i in range(1, 10):
    # Use Kmeans to cluster data, and add the labels as a new feature.
    # The model is fitted on the training split only; the test split is
    # assigned to the clusters learned from training data.
    n_cluster = i
    print(f"Kmeans parameter {i}")
    kmeans = KMeans(n_clusters=n_cluster, random_state=42)
    train_kmeans_clusters = kmeans.fit_predict(X_train[feature_columns])
    test_kmeans_clusters = kmeans.predict(X_test[feature_columns])
    # Overwrites the column from the previous iteration; after the loop the
    # frames keep the labels of the last setting.
    X_train['Kmeans'] = train_kmeans_clusters
    X_test['Kmeans'] = test_kmeans_clusters

    # Use classifier to predict on the cluster-augmented features.
    # Only the fit itself is timed.
    logreg = LogisticRegression(random_state=random_seed)
    start = time.time()
    logreg.fit(X_train, y_train)
    runtime = time.time() - start

    # Column 1 of predict_proba is the probability of the second class in
    # logreg.classes_. PlotROCCurve comes from the project's helpers module;
    # its return value is treated as an iterable of metrics -- confirm there.
    probs = logreg.predict_proba(X_test)
    result = PlotROCCurve(probs[:, 1], y_test, ci=confidence_interval, random_seed=random_seed)

    # Row layout: model label, PlotROCCurve outputs, fit time in seconds.
    results = ["LR"]
    results.extend(result)
    results.append(runtime)

    # Record this setting's row inside the loop so no iteration is lost.
    result_list.append(results)
    
    
    


## Use logistic regression to predict


In [None]:
from sklearn.linear_model import LogisticRegression

# Start a fresh results container for this section.
result_list = []

# Fit a logistic regression on the current training features; only the
# fit itself is timed.
logreg = LogisticRegression(random_state=random_seed)
start = time.time()
logreg.fit(X_train, y_train)
runtime = time.time() - start

# Score the test split: column 1 of predict_proba is the probability of the
# second class in logreg.classes_. PlotROCCurve is a project helper; its
# return value is unpacked as an iterable of metrics -- confirm in helpers.
probs = logreg.predict_proba(X_test)
result = PlotROCCurve(probs[:, 1], y_test, ci=confidence_interval, random_seed=random_seed)

# Row layout: model label, PlotROCCurve outputs, fit time in seconds.
results = ["LR", *result, runtime]
result_list.append(results)