In [1]:
# Importing everything we need 
# The usual packages
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

# ignore warnings
# NOTE(review): a blanket 'ignore' also silences deprecation warnings raised by
# the libraries used below -- consider narrowing the filter
import warnings
warnings.filterwarnings('ignore')

# model selection tools
from sklearn.model_selection import train_test_split

# scalers and encoders
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

Let's load in the dataset we created at the end of the EDA stage. This preprocessing is for our classifier, or predictive model. We need to do a train-test split, set aside the test set, and then do pre-processing on the train set only.

Here are the proper steps to ensure that the test set remains completely unseen by the model.
1. Train-test split
2. Pre-process 
    1. Fit scaler on train data
    2. Transform train data
    3. Transform test data using the same scaler used on the train data
3. Train model on scaled train data
4. Test model on scaled test data

In [40]:
# read the dataset saved at the end of the EDA stage;
# the first CSV column is used as the row index
preprocess_csv_path = 'data/df_preprocess.csv'
df_p = pd.read_csv(preprocess_csv_path, index_col=[0])

# transpose before previewing so that each feature is shown as a row
df_p.transpose().head(50)

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,101748,101749,101751,101752,101753,101754,101755,101756,101758,101765
encounter_id,64410,500364,16680,35754,55842,63768,12522,15738,28236,36900,...,443811536,443816024,443835140,443835512,443841992,443842016,443842022,443842070,443842340,443867222
patient_nbr,86047875,82442376,42519267,82637451,84259809,114882984,48330783,63555939,89869032,77391171,...,189481478,106392411,175326800,139605341,184875899,183087545,188574944,140199494,120975314,175429310
race,Non_Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Non_Caucasian,Non_Caucasian,...,Caucasian,Caucasian,Caucasian,Non_Caucasian,Non_Caucasian,Caucasian,Non_Caucasian,Non_Caucasian,Caucasian,Caucasian
gender,Female,Male,Male,Male,Male,Male,Female,Female,Female,Male,...,Female,Female,Male,Female,Male,Female,Female,Female,Female,Male
age,[20-30),[30-40),[40-50),[50-60),[60-70),[70-80),[80-90),[90-100),[40-50),[60-70),...,[40-50),[70-80),[70-80),[40-50),[40-50),[70-80),[40-50),[60-70),[80-90),[70-80)
admission_type_id,emergency,emergency,emergency,urgent,elective,emergency,urgent,elective,emergency,urgent,...,emergency,elective,elective,elective,emergency,emergency,emergency,emergency,emergency,emergency
discharge_disposition_id,home,home,home,home,home,home,home,transferred_SNF,home,home,...,transferred_other,home_with_home_health_IV,home_with_home_health_IV,home,home,home,home,home,home,home
admission_source_id,emergency_room,emergency_room,emergency_room,other,other,emergency_room,transfer,transfer,emergency_room,transfer,...,emergency_room,physician_referral,physician_referral,physician_referral,emergency_room,emergency_room,emergency_room,emergency_room,emergency_room,emergency_room
time_in_hospital,2,2,1,3,4,5,13,12,9,7,...,14,3,13,3,13,9,14,2,5,6
medical_specialty,unknown,unknown,unknown,unknown,unknown,unknown,unknown,InternalMedicine,unknown,unknown,...,unknown,other,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown


In [41]:
# separate the target variable (y) from the remaining columns (X)

target_col = 'readmitted'
y = df_p[target_col]
X = df_p.drop(columns=[target_col])

In [4]:
# hold out the test set *before* any preprocessing:
#   - 30% of the rows go to the test set
#   - stratifying on y keeps the class proportions the same in both splits
#   - a fixed random_state makes the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [5]:
# sanity check: (rows, columns) of the training features after the 70/30 split

X_train.shape

(48992, 27)

Let's save the test set as a csv, since we'll need this after we train the classifier on the train dataset.

In [42]:
# attach the target to the held-out features, matching rows on the index
df_classify_testset = X_test.merge(y_test, how='inner', left_index=True, right_index=True)

# preview the combined test frame
df_classify_testset.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,diag_3,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,num_med_changes,race_orig,readmitted_orig,readmitted
62464,173973738,102910347,Caucasian,Male,[50-60),urgent,home,emergency_room,2,Emergency/Trauma,...,circulatory,9,,,Ch,Yes,1,Caucasian,>30,1
51065,152700834,58235868,Non_Caucasian,Female,[20-30),elective,home,physician_referral,2,other,...,other,7,,,No,No,0,AfricanAmerican,NO,0
29261,95724780,12614958,Non_Caucasian,Female,[70-80),urgent,home,transfer,11,other,...,musculoskeletal,5,,,No,Yes,0,AfricanAmerican,NO,0
76702,231841914,44028369,Caucasian,Male,[70-80),elective,home,physician_referral,1,other,...,circulatory,6,,,No,Yes,0,Caucasian,<30,1
76197,229651416,41854563,Caucasian,Female,[40-50),emergency,home,emergency_room,2,unknown,...,genitourinary,8,,,Ch,Yes,2,Caucasian,NO,0


In [43]:
# write the held-out test set (features + target) to csv so it can be
# loaded again once the classifier has been trained

df_classify_testset.to_csv('data/df_classify_testset.csv')

### One-Hot Encoding

Now let's do preprocessing on the train dataset, which is mainly one-hot encoding.

In [7]:
# list the train set columns to see which ones need to be encoded

X_train.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'change',
       'diabetesMed', 'num_med_changes', 'race_orig', 'readmitted_orig'],
      dtype='object')

In [8]:
# columns holding categorical values that need to be one-hot encoded
categorical_cols = [
    'race', 'gender', 'age',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'medical_specialty',
    'diag_1', 'diag_2', 'diag_3',
    'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed',
]
categorical = X_train[categorical_cols]

# create the encoder with its default settings
ohe = OneHotEncoder()

# learn the categories from the train set and encode it in a single step;
# the result is displayed as the cell output
encoded = ohe.fit_transform(categorical)
encoded

<48992x77 sparse matrix of type '<class 'numpy.float64'>'
	with 685888 stored elements in Compressed Sparse Row format>

In [44]:
# get each feature's categories as learned by the encoder
# (one array per encoded column)

ohe.categories_

[array(['Caucasian', 'Non_Caucasian'], dtype=object),
 array(['Female', 'Male'], dtype=object),
 array(['[20-30)', '[30-40)', '[40-50)', '[50-60)', '[60-70)', '[70-80)',
        '[80-90)', '[90-100)'], dtype=object),
 array(['elective', 'emergency', 'other', 'unknown', 'urgent'],
       dtype=object),
 array(['expired', 'home', 'home_with_home_health_IV', 'hospice',
        'left_ama', 'other', 'transferred_SNF', 'transferred_other',
        'transferred_shortterm_hospital', 'unknown'], dtype=object),
 array(['emergency_room', 'other', 'physician_referral', 'transfer',
        'unknown'], dtype=object),
 array(['Cardiology', 'Emergency/Trauma', 'Family/GeneralPractice',
        'InternalMedicine', 'other', 'unknown'], dtype=object),
 array(['circulatory', 'diabetes', 'digestive', 'genitourinary', 'injury',
        'musculoskeletal', 'neoplasms', 'other', 'respiratory'],
       dtype=object),
 array(['circulatory', 'diabetes', 'digestive', 'genitourinary', 'injury',
        'musculoskel

In [45]:
# put into a dataframe to get column names

# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `get_feature_names_out`; use the new method when
# it exists and fall back to the old one on older installs
_ohe_feature_names = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names

# keep the train set's index so the encoded rows can be joined back to X_train
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded,
    index=X_train.index,
    columns=_ohe_feature_names(categorical.columns),
)

# show
encoded_df.head()

Unnamed: 0,race_Caucasian,race_Non_Caucasian,gender_Female,gender_Male,age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
14930,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
6620,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
905,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
66951,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
77468,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [11]:
# show the original train rows to compare against the encoded values above

X_train.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,num_med_changes,race_orig,readmitted_orig
14930,57722220,1202031,Caucasian,Male,[50-60),emergency,unknown,emergency_room,13,unknown,...,other,diabetes,9,,,No,No,0,Caucasian,>30
6620,32581692,22248396,Caucasian,Female,[60-70),unknown,home,unknown,5,Cardiology,...,circulatory,digestive,5,,,No,No,0,Caucasian,NO
905,6974160,2096937,Caucasian,Female,[30-40),unknown,unknown,physician_referral,3,other,...,diabetes,other,8,,,No,Yes,0,Caucasian,NO
66951,187567206,34146657,Caucasian,Female,[50-60),urgent,home_with_home_health_IV,physician_referral,7,other,...,other,digestive,6,,,No,Yes,0,Caucasian,NO
77468,235197126,55407519,Non_Caucasian,Male,[60-70),emergency,home_with_home_health_IV,emergency_room,4,unknown,...,circulatory,diabetes,9,,,Ch,Yes,1,Other,NO


In [12]:
# convert values of 0 and 1 from float to int
# (the encoder returned a float64 matrix)

encoded_df = encoded_df.astype('int64')

In [13]:
# show the encoded frame again after the cast to int

encoded_df.head()

Unnamed: 0,race_Caucasian,race_Non_Caucasian,gender_Female,gender_Male,age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
14930,1,0,0,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,1,0
6620,1,0,1,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,1,1,0
905,1,0,1,0,0,1,0,0,0,0,...,1,0,0,0,1,0,0,1,0,1
66951,1,0,1,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,1,0,1
77468,0,1,0,1,0,0,0,0,1,0,...,1,0,0,0,1,0,1,0,0,1


In [51]:
# check the encoded columns from position 40 onward

encoded_df.iloc[:,40:]

Unnamed: 0,diag_1_digestive,diag_1_genitourinary,diag_1_injury,diag_1_musculoskeletal,diag_1_neoplasms,diag_1_other,diag_1_respiratory,diag_2_circulatory,diag_2_diabetes,diag_2_digestive,...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
14930,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,1,0
6620,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,1,0,0,1,1,0
905,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,1,0,0,1,0,1
66951,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,1
77468,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42784,0,1,0,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,1,1,0
44580,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,1
7327,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,1,0,1
10833,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1


In [52]:
# get names of columns as a result of one-hot encoding
# (`get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` and fall back to the old method on older installs)

_feature_names_fn = getattr(ohe, 'get_feature_names_out', None) or ohe.get_feature_names
_feature_names_fn(categorical.columns)

array(['race_Caucasian', 'race_Non_Caucasian', 'gender_Female',
       'gender_Male', 'age_[20-30)', 'age_[30-40)', 'age_[40-50)',
       'age_[50-60)', 'age_[60-70)', 'age_[70-80)', 'age_[80-90)',
       'age_[90-100)', 'admission_type_id_elective',
       'admission_type_id_emergency', 'admission_type_id_other',
       'admission_type_id_unknown', 'admission_type_id_urgent',
       'discharge_disposition_id_expired',
       'discharge_disposition_id_home',
       'discharge_disposition_id_home_with_home_health_IV',
       'discharge_disposition_id_hospice',
       'discharge_disposition_id_left_ama',
       'discharge_disposition_id_other',
       'discharge_disposition_id_transferred_SNF',
       'discharge_disposition_id_transferred_other',
       'discharge_disposition_id_transferred_shortterm_hospital',
       'discharge_disposition_id_unknown',
       'admission_source_id_emergency_room', 'admission_source_id_other',
       'admission_source_id_physician_referral',
       'admis

After spot-checking that the one-hot encoding was done correctly, let's join the new `encoded_df` back to the dataset and then drop the original features we've encoded.

In [14]:
# attach the one-hot columns to the original train features,
# matching rows on the shared index
df_model_p = X_train.merge(encoded_df, how='left', left_index=True, right_index=True)

In [15]:
# summary of the joined frame: number of rows and columns, and the dtypes
df_model_p.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48992 entries, 14930 to 66442
Columns: 104 entries, encounter_id to diabetesMed_Yes
dtypes: Sparse[int64, 0](77), int64(11), object(16)
memory usage: 19.6+ MB


In [16]:
# check the first rows of the joined frame (original columns followed by the dummies)

df_model_p.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,medical_specialty,...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
14930,57722220,1202031,Caucasian,Male,[50-60),emergency,unknown,emergency_room,13,unknown,...,1,0,0,0,1,0,0,1,1,0
6620,32581692,22248396,Caucasian,Female,[60-70),unknown,home,unknown,5,Cardiology,...,1,0,0,0,1,0,0,1,1,0
905,6974160,2096937,Caucasian,Female,[30-40),unknown,unknown,physician_referral,3,other,...,1,0,0,0,1,0,0,1,0,1
66951,187567206,34146657,Caucasian,Female,[50-60),urgent,home_with_home_health_IV,physician_referral,7,other,...,1,0,0,0,1,0,0,1,0,1
77468,235197126,55407519,Non_Caucasian,Male,[60-70),emergency,home_with_home_health_IV,emergency_room,4,unknown,...,1,0,0,0,1,0,1,0,0,1


In [58]:
# spot check the above
# (a single .loc[row, column] lookup instead of chained indexing, which would
# first build the entire row just to read one value from it)

df_model_p.loc[77468, 'medical_specialty_unknown']

1

In [17]:
# keep an independent copy of df_model_p (original categorical columns plus
# their dummies) before some categorical features are dropped

df_model_p_predrop = df_model_p.copy()

In [18]:
# check the columns of the pre-drop copy

df_model_p_predrop.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'medical_specialty',
       ...
       'max_glu_serum_None', 'max_glu_serum_Norm', 'A1Cresult_>7',
       'A1Cresult_>8', 'A1Cresult_None', 'A1Cresult_Norm', 'change_Ch',
       'change_No', 'diabetesMed_No', 'diabetesMed_Yes'],
      dtype='object', length=104)

In [19]:
# identifiers, the original (pre-encoding) categorical columns and the `*_orig` columns
raw_cols_to_drop = [
    'encounter_id', 'patient_nbr',
    'race', 'gender', 'age',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'medical_specialty',
    'diag_1', 'diag_2', 'diag_3',
    'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed',
    'race_orig', 'readmitted_orig',
]

# the race and gender dummies are removed as well, since we want our
# classifier to be race- and gender-blind
protected_dummy_cols = [
    'race_Caucasian', 'race_Non_Caucasian',
    'gender_Female', 'gender_Male',
]

df_model_classify = df_model_p.drop(columns=raw_cols_to_drop + protected_dummy_cols)

Now remember that the target variable - `readmitted` - is imbalanced. 

In [21]:
# check how many train rows fall into each `readmitted` class

y_train.value_counts()

0    29300
1    19692
Name: readmitted, dtype: int64

`readmitted` = 1 is ~40% of the train set. The class imbalance is not severe, but let's address it anyway with sampling. In general, it's important to handle class imbalance because it is problematic for modeling: with imbalanced data, a model may try to maximize accuracy by making predictions that favor the class that has more rows in the dataset to begin with (the majority class). Essentially, it learns from the majority class but not enough from the minority class. So, in order to classify correctly, we need to apply some sampling techniques. What we'll do here is a combination of upsampling and downsampling: we'll upsample the minority class and downsample the majority class. In our specific case, we would like the dataset to still have 48,992 rows after both steps. Remember, we're only sampling on our train dataset, which has 48,992 rows. Our data will therefore end up having:

- class 1 => 24,496 rows
- class 0 => 24,496 rows

However, before we do so, let's append the cluster labels back in and then apply the sampling techniques to the dataset that includes the cluster labels. We can get the cluster labels from the csv file we saved in our clustering stage.

In [22]:
# read the csv saved in the clustering stage, which carries the cluster labels;
# the first CSV column is used as the row index
cluster_labels_csv_path = 'data/df_final_cluster_labels.csv'
df_final_labels = pd.read_csv(cluster_labels_csv_path, index_col=[0])

# preview the first rows
df_final_labels.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes,age_[40-50),...,diag_3_respiratory,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,kmeans,agglom
2,0.076923,0.076336,0.833333,0.15,0.047619,0.0,0.083333,0.230769,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1,1
3,0.076923,0.328244,0.166667,0.1875,0.0,0.0,0.0,0.307692,0.25,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2,2
4,0.0,0.381679,0.0,0.0875,0.0,0.0,0.0,0.153846,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2,2
5,0.153846,0.229008,1.0,0.1875,0.0,0.0,0.0,0.461538,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,3,0
6,0.230769,0.526718,0.166667,0.25,0.0,0.0,0.0,0.307692,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,3


In [58]:
# get the cluster labels only and assign to a dataframe
# (selected by name rather than by position, so that a change in the column
# layout of the csv cannot silently shift which columns are picked up)

df_clabels_only = df_final_labels[['kmeans', 'agglom']]

df_clabels_only

Unnamed: 0,kmeans,agglom
2,1,1
3,2,2
4,2,2
5,3,0
6,0,3
...,...,...
101754,2,2
101755,2,2
101756,1,1
101758,2,2


In [65]:
# check the train features before the cluster labels are appended

df_model_classify.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes,age_[20-30),...,max_glu_serum_None,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes
14930,13,56,2,31,0,0,0,9,0,0,...,1,0,0,0,1,0,0,1,1,0
6620,5,74,1,22,0,0,0,5,0,0,...,1,0,0,0,1,0,0,1,1,0
905,3,41,3,17,0,0,0,8,0,0,...,1,0,0,0,1,0,0,1,0,1
66951,7,52,1,15,0,0,0,6,0,0,...,1,0,0,0,1,0,0,1,0,1
77468,4,48,0,12,0,0,0,9,1,0,...,1,0,0,0,1,0,1,0,0,1


In [74]:
# look up the cluster labels of one train row (index 6620) to check against the above

df_clabels_only.loc[6620]

kmeans    3
agglom    0
Name: 6620, dtype: int64

In [70]:
# append cluster labels to df_model_classify

label_cols = list(df_clabels_only.columns)

# drop any label columns left over from a previous run of this cell first;
# otherwise re-running it would merge them a second time and produce
# suffixed duplicates (`kmeans_x`, `kmeans_y`, ...)
df_model_classify = (
    df_model_classify
    .drop(columns=label_cols, errors='ignore')
    .merge(df_clabels_only, how='left', left_index=True, right_index=True)
)

# a left join leaves NaN wherever a train row has no match in the labels;
# fail here with a clear message rather than later inside the resampling step
n_unlabelled = int(df_model_classify[label_cols].isna().any(axis=1).sum())
assert n_unlabelled == 0, f"{n_unlabelled} train rows have no cluster label"

In [71]:
# double check that the cluster labels (last two columns) are added correctly

df_model_classify.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes,age_[20-30),...,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,kmeans,agglom
14930,13,56,2,31,0,0,0,9,0,0,...,0,0,1,0,0,1,1,0,1,1
6620,5,74,1,22,0,0,0,5,0,0,...,0,0,1,0,0,1,1,0,3,0
905,3,41,3,17,0,0,0,8,0,0,...,0,0,1,0,0,1,0,1,3,0
66951,7,52,1,15,0,0,0,6,0,0,...,0,0,1,0,0,1,0,1,3,0
77468,4,48,0,12,0,0,0,9,1,0,...,0,0,1,0,1,0,0,1,2,2


In [75]:
# write the train features, including the cluster labels, to csv

df_model_classify.to_csv('data/df_model_classify.csv')

After we've saved our train dataset with the cluster labels as a csv, now let's proceed with sampling. We've noted above that after sampling, we would like

- class 1 => 24,496 rows
- class 0 => 24,496 rows

In [76]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

# specify size for each class
# need these for the pipeline
# each class gets half of the train set, so the total row count is preserved
# (derived from the data instead of hard-coding 24496, so the cell keeps
# working if the split changes; with an odd row count the total shrinks by one)

count_class_0 = len(y_train) // 2
count_class_1 = len(y_train) // 2

# class 1 is upsampled and class 0 is downsampled, which only makes sense
# while the target size lies between the two original class sizes
class_counts = y_train.value_counts()
assert class_counts[1] <= count_class_1 <= class_counts[0], (
    "expected class 1 to be the minority and class 0 the majority; "
    f"got counts {class_counts.to_dict()}"
)

# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, which is intended for continuous features. This frame also
# holds one-hot dummies and the `kmeans`/`agglom` labels, so synthetic rows
# can end up with inconsistent values in those columns (e.g. no active dummy
# within a group). Consider SMOTENC, or resampling before encoding -- TODO confirm.

# proceed with oversampling and downsampling in a pipeline

pipe = make_pipeline(
    SMOTE(sampling_strategy={1: count_class_1}, random_state=1),
    NearMiss(sampling_strategy={0: count_class_0}) # NearMiss doesn't have random_state
)

# return sampled features and target variable

X_resampled, y_resampled = pipe.fit_resample(df_model_classify, y_train)

In [77]:
# check the resampled features

X_resampled

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes,age_[20-30),...,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,kmeans,agglom
0,4,45,0,11,0,0,0,9,0,0,...,0,0,1,0,0,1,1,0,3,0
1,2,39,0,14,0,0,0,9,0,0,...,0,0,1,0,0,1,0,1,1,1
2,1,33,6,9,0,0,0,9,0,0,...,0,0,1,0,0,1,0,1,3,0
3,3,44,0,13,0,0,0,8,0,0,...,0,0,1,0,1,0,0,1,2,2
4,1,43,0,3,0,0,0,4,0,0,...,0,0,1,0,0,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48987,6,55,1,18,0,0,0,8,0,0,...,0,0,1,0,1,0,0,1,0,0
48988,12,64,5,22,15,0,0,9,0,0,...,0,0,1,0,0,0,0,1,1,1
48989,6,64,0,4,2,0,0,8,0,0,...,0,0,1,0,0,1,0,0,2,0
48990,2,64,0,14,0,0,0,4,1,0,...,0,0,1,0,1,0,0,1,2,1


Notice that indices from before are no longer maintained since we've done a combination of upsampling and downsampling. This is why we want to implement sampling with the cluster labels as well, as this will be more useful for our later analyses.

In [30]:
# check that both classes have the same number of rows after resampling

y_resampled.value_counts()

1    24496
0    24496
Name: readmitted, dtype: int64

In [78]:
# combine the resampled features and target into a single frame,
# matching rows on the index
df_classify_post_sampling = X_resampled.merge(
    y_resampled,
    how='inner',
    left_index=True,
    right_index=True,
)

In [79]:
# check that `readmitted` now appears as the last column

df_classify_post_sampling.head()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,num_med_changes,age_[20-30),...,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,kmeans,agglom,readmitted
0,4,45,0,11,0,0,0,9,0,0,...,0,1,0,0,1,1,0,3,0,0
1,2,39,0,14,0,0,0,9,0,0,...,0,1,0,0,1,0,1,1,1,0
2,1,33,6,9,0,0,0,9,0,0,...,0,1,0,0,1,0,1,3,0,0
3,3,44,0,13,0,0,0,8,0,0,...,0,1,0,1,0,0,1,2,2,0
4,1,43,0,3,0,0,0,4,0,0,...,0,1,0,0,1,1,0,1,1,0


In [80]:
# write the resampled train set (features + target) to csv

df_classify_post_sampling.to_csv('data/df_classify_post_sampling.csv')

We'll use this csv file we just saved for our classifier modeling.