# Setup

## Imports

In [2]:
# Setup
import os
from utils.custom_transformers import ColumnSelector, SaveTransformer, NumericalTransformer, CategoricalTransformer
from IPython.display import Image

# Data Analysis
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport


#Scikit learn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

## Configs

In [3]:
%matplotlib inline
# Lift pandas' display limits so DataFrames are shown without column/row truncation.
# NOTE(review): with max_rows=None, displaying a large frame prints every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Functions

In [5]:
def load_data(file_name, folder = "data"):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``folder``.
    folder : str, default "data"
        Directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``folder/file_name``.
    """
    csv_path = os.path.join(folder, file_name)
    return pd.read_csv(csv_path)

def target_transformation(text):
    """Map a yes/no label to a binary target value.

    Spaces are removed and case is ignored before matching.

    Parameters
    ----------
    text : str
        Raw label, e.g. ``"Yes"`` or ``" no "``.

    Returns
    -------
    int or None
        1 for "yes", 0 for "no", and None for anything else, including
        non-string inputs such as NaN or None.
    """
    # Non-string inputs (e.g. a float NaN or None standing in for a missing
    # value) have no .replace method; treat them as "no label" instead of
    # raising AttributeError.
    if not isinstance(text, str):
        return None
    normalized = text.replace(" ", "").lower()
    # .get() makes the "unrecognised label -> None" outcome explicit.
    return {"yes": 1, "no": 0}.get(normalized)

# Data Analysis

## Data Load

In [6]:
# Load the raw training data from data/train_data.csv; later cells read from df_original.
df_original = load_data("train_data.csv","data")

In [None]:
# Patients whose records disagree on gender or blood type are collected in
# `patients_to_remove` and excluded from `df_original_filtered`.
patient_list = set(df_original['patient_id'])

# One grouped pass over the frame instead of re-filtering the whole table once
# per patient (which scales with rows x patients).
records_by_patient = df_original.groupby('patient_id')
has_conflict = (
    (records_by_patient['gender'].nunique() > 1)
    | (records_by_patient['blood_type'].nunique() > 1)
)
patients_to_remove = has_conflict[has_conflict].index.tolist()

# .copy() keeps the filtered frame independent of df_original.
df_original_filtered = df_original.loc[
    ~df_original['patient_id'].isin(patients_to_remove)
].copy()

## Data Transformation

In [7]:
# Columns routed through the categorical branch of the preprocessing pipeline.
categorical = [
    'readmitted', 'change', 'diabetesMed', 'has_prosthesis',
    'blood_transfusion', 'diuretics', 'insulin',
    'admission_source_code', 'discharge_disposition_code',
    'admission_type_code', 'payer_code', 'complete_vaccination_status',
    'blood_type', 'max_glu_serum', 'A1Cresult', 'age', 'race', 'gender',
    'weight', 'diag_1', 'diag_2', 'diag_3', 'medical_specialty',
]

# Columns routed through the numerical branch.
numerical = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses', 'hemoglobin_level',
]

# Hooks for engineered columns and for columns to exclude (both empty for now).
new_column = []
cols_to_remove = []

# Feature columns in branch order, minus explicit exclusions.
columns_features = [
    name
    for name in categorical + new_column + numerical
    if name not in cols_to_remove
]

# Everything in the raw frame that is not a feature column.
columns_extra = [
    name for name in df_original.columns if name not in columns_features
]

# Categorical branch. ColumnSelector / CategoricalTransformer / SaveTransformer
# come from utils.custom_transformers; judging by the step names the save steps
# write intermediate output to CSV — TODO confirm against that module.
cat_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(categorical)),
    ('transformer', CategoricalTransformer(mininum_records=250)),
    ("save_to_csv_cat1", SaveTransformer(step="cat1")),
    ('imputer', SimpleImputer(missing_values=None, strategy='constant', fill_value='missing')),
    # Disabled step: ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ("save_to_csv_cat2", SaveTransformer(step="cat2")),
])

# Numerical branch: missing values are filled with the column median.
num_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(numerical)),
    ('transformer', NumericalTransformer()),
    ("save_to_csv_num1", SaveTransformer(step="num1")),
    ('imputer', SimpleImputer(strategy='median')),
    # Disabled steps: ('scaler', MinMaxScaler()), ("save_to_csv_num2", Save(step="num2"))
])

# Remaining (non-feature) columns: only median imputation is applied.
extra_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(columns_extra)),
    ('imputer', SimpleImputer(strategy='median')),
    # Disabled steps: ('scaler', MinMaxScaler()), ("save_to_csv_num2", Save(step="num2"))
])

# Branch outputs are concatenated side by side in this order: cat, num, extra.
pre_processor = FeatureUnion(transformer_list=[
    ('cat', cat_pipe),
    ('num', num_pipe),
    ('extra', extra_pipe),
])

# Preprocessing only for now; the estimator step is still switched off.
pipeline = Pipeline(steps=[
    ('preprocessor', pre_processor),
    # Disabled step: ('model', GradientBoostingClassifier())
])

# Fit the preprocessing pipeline on the raw frame, then transform that same frame.
df_transformed = pipeline.fit(df_original).transform(df_original)

# Re-attach column names: feature columns first, then the extra columns.
total_columns = np.concatenate([columns_features, columns_extra])
df_transformed = pd.DataFrame(df_transformed, columns=total_columns)
# Target dtype for every column of df_transformed, grouped by dtype.
# "Int64"/"Float64" are pandas' nullable numeric dtypes.
dtypes_list = {
    **dict.fromkeys(
        [
            'readmitted', 'change', 'diabetesMed', 'has_prosthesis',
            'blood_transfusion', 'diuretics', 'insulin',
        ],
        "Int64",
    ),
    **dict.fromkeys(
        [
            'admission_source_code', 'discharge_disposition_code',
            'admission_type_code', 'payer_code',
            'complete_vaccination_status', 'blood_type', 'max_glu_serum',
            'A1Cresult', 'age', 'race', 'gender',
            'diag_1', 'diag_2', 'diag_3', 'medical_specialty',
        ],
        "object",
    ),
    **dict.fromkeys(
        [
            'time_in_hospital', 'num_lab_procedures', 'num_procedures',
            'num_medications', 'number_outpatient', 'number_emergency',
            'number_inpatient', 'number_diagnoses',
        ],
        "Int64",
    ),
    'hemoglobin_level': "Float64",
    **dict.fromkeys(['admission_id', 'patient_id'], "Int64"),
    'weight': "object",
}

# Cast each column to its declared dtype.
df_transformed = df_transformed.astype(dtypes_list)


In [None]:
# Correlate numeric columns only. The frame also holds object (string) columns,
# which DataFrame.corr() no longer drops silently in pandas >= 2.0 (it raises).
# select_dtypes works the same way on older and newer pandas.
corrMatrix = df_transformed.select_dtypes(include="number").corr()

# Bare last expression -> rich notebook rendering instead of a plain-text print.
corrMatrix

In [None]:
# Number of missing values per column after preprocessing.
df_transformed.isna().sum()

In [10]:
# Per-specialty average of every numeric column. Non-numeric columns are
# excluded up front because GroupBy.mean() raises on object columns in
# pandas >= 2.0 instead of silently dropping them.
numeric_columns = df_transformed.select_dtypes(include="number").columns.tolist()
mean = df_transformed.groupby(['medical_specialty'])[numeric_columns].mean()

# Bare last expression -> rich notebook rendering instead of a plain-text print.
mean

                                   readmitted    change  diabetesMed  \
medical_specialty                                                      
cardiology                           0.079335  0.413527     0.754505   
emergency/trauma                     0.111111  0.608703     0.828766   
family/generalpractice               0.116518  0.466745     0.783128   
gastroenterology                      0.11828  0.417204      0.76129   
internalmedicine                     0.110229  0.438525     0.778945   
missing                              0.116392  0.456747      0.76052   
nephrology                           0.157814  0.385681     0.721324   
obstetricsandgynecology              0.050847  0.293785     0.664783   
oncology                             0.189286  0.389286          0.7   
orthopedics                          0.105455  0.501818     0.800909   
orthopedics-reconstructive           0.076453  0.477064     0.782875   
others                               0.088901  0.433728     0.81

In [11]:
# Row count per medical specialty, most frequent first.
df_transformed['medical_specialty'].value_counts()

missing                              40020
internalmedicine                     11712
emergency/trauma                      6021
family/generalpractice                5939
cardiology                            4273
surgery-general                       2473
others                                1856
nephrology                            1299
orthopedics                           1100
orthopedics-reconstructive             981
radiologist                            913
pulmonology                            700
psychiatry                             674
urology                                539
obstetricsandgynecology                531
surgery-cardiovascular/thoracic        527
gastroenterology                       465
surgery-vascular                       438
surgery-neuro                          373
physicalmedicineandrehabilitation      298
oncology                               280
Name: medical_specialty, dtype: int64

In [None]:
# Build a pandas_profiling report for the transformed frame and display it
# as the cell output.
df_transformed_report = ProfileReport(df_transformed)
df_transformed_report

In [None]:
# NOTE(review): `is_float` is neither defined nor imported anywhere in the
# visible notebook, so this cell would raise NameError on a fresh kernel —
# TODO confirm where the helper lives, or drop this scratch cell.
is_float("v123")

# Feature Importance

In [None]:
import pandas as pd
import pydotplus
from sklearn.tree import (
    DecisionTreeClassifier,
    export_graphviz,
)
def separate_target_variable(data, target_column='readmitted'):
    """Split a DataFrame into features and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the feature columns and the target column.
        It is not modified.
    target_column : str, default 'readmitted'
        Name of the column to use as the target.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X`` with the target column removed, and ``y`` holding that column.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``data``.
    """
    # Work on a copy so popping the target does not mutate the caller's frame.
    features = data.copy()
    target = features.pop(target_column)

    return features, target

def process_categorical_features(X):
    """Return ``X`` with its categorical columns expanded into indicator columns.

    This is a thin wrapper around ``pandas.get_dummies`` called with its defaults.
    """
    encoded = pd.get_dummies(X)
    return encoded


def visualize_tree(clf, feature_names, class_names):
    """Render a fitted decision tree as a PNG image.

    The tree is exported to Graphviz DOT text, parsed with pydotplus, and the
    result of ``create_png()`` on the parsed graph is returned.

    Parameters
    ----------
    clf
        Fitted tree estimator, passed straight to ``export_graphviz``.
    feature_names
        Feature labels forwarded to ``export_graphviz``.
    class_names
        Class labels forwarded to ``export_graphviz``.
    """
    dot_source = export_graphviz(
        clf,
        out_file=None,  # None -> the DOT text is returned rather than written to a file
        feature_names=feature_names,
        class_names=class_names,
    )
    return pydotplus.graph_from_dot_data(dot_source).create_png()



In [None]:
# Split off the target, drop the identifier columns, then one-hot encode the
# remaining categorical features.
X, y = separate_target_variable(df_transformed)
X = process_categorical_features(X.drop(columns=['patient_id','admission_id']))

# Cast the target to a plain integer dtype before fitting.
y = y.astype('int')

In [None]:

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Entropy-criterion decision tree with a fixed random_state; no depth limit is set.
model = DecisionTreeClassifier(criterion='entropy', random_state=101)

# fit() is the last expression, so the cell output shows the fitted estimator.
model.fit(X, y)

In [None]:
# Render the fitted tree to PNG and show it inline; class labels are listed
# as [class 0, class 1].
tree = visualize_tree(model, X.columns, ["negative_class", "positive_class"])
Image(tree)

In [None]:
# Importance of each encoded feature, labelled with its column name.
feature_importances = pd.Series(model.feature_importances_, index=X.columns)

# The 50 most important features, highest first.
feature_importances.sort_values(ascending=False).iloc[:50]