In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/diabetic_data.csv')

In [3]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
df.count()

encounter_id                101766
patient_nbr                 101766
race                        101766
gender                      101766
age                         101766
weight                      101766
admission_type_id           101766
discharge_disposition_id    101766
admission_source_id         101766
time_in_hospital            101766
payer_code                  101766
medical_specialty           101766
num_lab_procedures          101766
num_procedures              101766
num_medications             101766
number_outpatient           101766
number_emergency            101766
number_inpatient            101766
diag_1                      101766
diag_2                      101766
diag_3                      101766
number_diagnoses            101766
max_glu_serum               101766
A1Cresult                   101766
metformin                   101766
repaglinide                 101766
nateglinide                 101766
chlorpropamide              101766
glimepiride         

In [5]:
uk_df = df.loc[: , (df == '?').any()]

In [6]:
df = df.replace('?', np.NaN) 

In [7]:
print(df.isna().any())

encounter_id                False
patient_nbr                 False
race                         True
gender                      False
age                         False
weight                       True
admission_type_id           False
discharge_disposition_id    False
admission_source_id         False
time_in_hospital            False
payer_code                   True
medical_specialty            True
num_lab_procedures          False
num_procedures              False
num_medications             False
number_outpatient           False
number_emergency            False
number_inpatient            False
diag_1                       True
diag_2                       True
diag_3                       True
number_diagnoses            False
max_glu_serum               False
A1Cresult                   False
metformin                   False
repaglinide                 False
nateglinide                 False
chlorpropamide              False
glimepiride                 False
acetohexamide 

In [8]:
df.count()

encounter_id                101766
patient_nbr                 101766
race                         99493
gender                      101766
age                         101766
weight                        3197
admission_type_id           101766
discharge_disposition_id    101766
admission_source_id         101766
time_in_hospital            101766
payer_code                   61510
medical_specialty            51817
num_lab_procedures          101766
num_procedures              101766
num_medications             101766
number_outpatient           101766
number_emergency            101766
number_inpatient            101766
diag_1                      101745
diag_2                      101408
diag_3                      100343
number_diagnoses            101766
max_glu_serum               101766
A1Cresult                   101766
metformin                   101766
repaglinide                 101766
nateglinide                 101766
chlorpropamide              101766
glimepiride         

In [9]:
df = df.drop(['weight'], axis=1)

In [10]:
df = df.dropna()

In [11]:
df.count()

encounter_id                26755
patient_nbr                 26755
race                        26755
gender                      26755
age                         26755
admission_type_id           26755
discharge_disposition_id    26755
admission_source_id         26755
time_in_hospital            26755
payer_code                  26755
medical_specialty           26755
num_lab_procedures          26755
num_procedures              26755
num_medications             26755
number_outpatient           26755
number_emergency            26755
number_inpatient            26755
diag_1                      26755
diag_2                      26755
diag_3                      26755
number_diagnoses            26755
max_glu_serum               26755
A1Cresult                   26755
metformin                   26755
repaglinide                 26755
nateglinide                 26755
chlorpropamide              26755
glimepiride                 26755
acetohexamide               26755
glipizide     

In [12]:
df.loc[:, "medicare"] = (df.payer_code == "MC")
df.loc[:, "medicaid"] = (df.payer_code == "MD")

In [13]:
df.drop(['encounter_id', 'patient_nbr', 'payer_code', 'medical_specialty', 'admission_type_id', 
         'repaglinide','nateglinide','chlorpropamide','glimepiride','acetohexamide','glipizide','glyburide','tolbutamide',
        'pioglitazone','rosiglitazone','acarbose','miglitol','troglitazone','tolazamide','examide','citoglipton', 'metformin',
        'glyburide-metformin','glipizide-metformin','glimepiride-pioglitazone','metformin-rosiglitazone','metformin-pioglitazone', 'diag_2', 'diag_3', 'change'], axis=1, inplace=True)

In [14]:
df.loc[:, "age"] = df["age"].replace( ["[0-10)", "[10-20)", "[20-30)"], "30 years or younger")
df.loc[:, "age"] = df["age"].replace(["[30-40)", "[40-50)", "[50-60)"], "30-60 years")
df.loc[:, "age"] = df["age"].replace(["[60-70)", "[70-80)", "[80-90)", "[90-100)"], "Over 60 years")

In [15]:
df.rename(columns={ 'readmitted':'readmit_status', 'number_outpatient':'prior_outpatient', 'number_emergency': 'prior_emergency', 'number_inpatient':'prior_inpatient', 'diag_1':'primary_diagnosis', 'discharge_disposition_id':'discharge_destination', 'admission_source_id':'admission_source', 'diabetesMed':'diabetes_Med_prescribe'}, inplace=True)

In [16]:
df = df.drop(df[df.readmit_status ==  '>30'].index)
df['readmit_status'] = df['readmit_status'].replace({"NO":"not readmitted", "<30":"readmitted"})

In [17]:
    df.loc[:, "primary_diagnosis"] = df["primary_diagnosis"].replace(
        regex={
            "[7][1-3][0-9]": "Musculoskeltal primary_diagnosis",
            "250.*": "Diabetes",
            "[4][6-9][0-9]|[5][0-1][0-9]|786": "Respitory Issues",
            "[5][8-9][0-9]|[6][0-2][0-9]|788": "Genitourinary Issues"
        }
    )
    diagnoses = ["Respitory Issues", "Diabetes", "Genitourinary Issues", "Musculoskeltal Issues"]
    df.loc[:, "primary_diagnosis"] = df["primary_diagnosis"].apply(lambda x: x if x in diagnoses else "Other")

In [18]:
df = df[((df.discharge_destination != 11) & 
                                          (df.discharge_destination != 13) &
                                          (df.discharge_destination != 14) & 
                                          (df.discharge_destination != 19) & 
                                          (df.discharge_destination != 20) & 
                                          (df.discharge_destination != 21))] 

In [19]:
df.loc[:, "discharge_destination"] = (df.discharge_destination
                                            .apply(lambda x: "Discharged to Home" if x==1 else "Other"))

In [20]:
df.loc[:, "admission_source"] = df["admission_source"].apply(lambda x: x if x in ["Emergency", "Referral"] else "Other")

In [21]:
df = df.sample(frac=0.20)

In [22]:
df.count()

race                      3480
gender                    3480
age                       3480
discharge_destination     3480
admission_source          3480
time_in_hospital          3480
num_lab_procedures        3480
num_procedures            3480
num_medications           3480
prior_outpatient          3480
prior_emergency           3480
prior_inpatient           3480
primary_diagnosis         3480
number_diagnoses          3480
max_glu_serum             3480
A1Cresult                 3480
insulin                   3480
diabetes_Med_prescribe    3480
readmit_status            3480
medicare                  3480
medicaid                  3480
dtype: int64

In [23]:
df.head()

Unnamed: 0,race,gender,age,discharge_destination,admission_source,time_in_hospital,num_lab_procedures,num_procedures,num_medications,prior_outpatient,...,prior_inpatient,primary_diagnosis,number_diagnoses,max_glu_serum,A1Cresult,insulin,diabetes_Med_prescribe,readmit_status,medicare,medicaid
35152,Caucasian,Female,Over 60 years,Other,Other,3,10,1,9,0,...,1,Other,7,>300,,Down,Yes,not readmitted,True,False
30104,Caucasian,Female,Over 60 years,Other,Other,9,62,0,17,0,...,1,Genitourinary Issues,5,,,Steady,Yes,readmitted,True,False
61266,Caucasian,Male,Over 60 years,Discharged to Home,Other,2,37,0,13,0,...,0,Other,9,,,Up,Yes,not readmitted,True,False
35919,Caucasian,Female,30-60 years,Discharged to Home,Other,9,42,0,18,0,...,0,Other,5,,,No,No,not readmitted,False,False
77558,Caucasian,Male,30-60 years,Discharged to Home,Other,8,91,5,26,2,...,0,Respitory Issues,9,,,Up,Yes,not readmitted,True,False


In [24]:
df.dtypes

race                      object
gender                    object
age                       object
discharge_destination     object
admission_source          object
time_in_hospital           int64
num_lab_procedures         int64
num_procedures             int64
num_medications            int64
prior_outpatient           int64
prior_emergency            int64
prior_inpatient            int64
primary_diagnosis         object
number_diagnoses           int64
max_glu_serum             object
A1Cresult                 object
insulin                   object
diabetes_Med_prescribe    object
readmit_status            object
medicare                    bool
medicaid                    bool
dtype: object

In [25]:

train, test = train_test_split(df, train_size=0.80, random_state=1)

In [26]:
train.to_parquet('data/training_data.parquet')
test.to_parquet('data/testing_data.parquet')

In [27]:
target_column_name = 'readmit_status'

In [28]:
def get_categorical_index(categorical_fields):
    cat_idx = []
    for col, value in categorical_fields.iteritems():
        if value.dtype == 'object':
            cat_idx.append(categorical_fields.columns.get_loc(col))
    print("col indices: ", cat_idx)  
    return cat_idx       

In [29]:
# Read in data
print("Reading data")
all_training_data = pd.read_parquet('data/training_data.parquet')
target = all_training_data[target_column_name]
features = all_training_data.drop([target_column_name], axis = 1)  

# Transform string data to numeric
numerical_selector = selector(dtype_include=np.number)
categorical_selector = selector(dtype_exclude=np.number)

numerical_columns = numerical_selector(features)
categorical_columns = categorical_selector(features)

categorial_encoder = OneHotEncoder(handle_unknown="ignore")
numerical_encoder = StandardScaler()

preprocessor = ColumnTransformer([
('categorical-encoder', categorial_encoder, categorical_columns),
('standard_scaler', numerical_encoder, numerical_columns)])

categorical_indices = get_categorical_index(features)
clf = make_pipeline(preprocessor, LogisticRegression())

X_train, X_test, y_train, y_test = train_test_split(features, target, 
test_size=0.3, random_state=1)

print("Training model...") 
model = clf.fit(X_train, y_train)
print("Accuracy score: ", clf.score(X_test,y_test))

Reading data
col indices:  [0, 1, 2, 3, 4, 12, 14, 15, 16, 17]
Training model...
Accuracy score:  0.8492822966507177


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
