In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [3]:
df.isnull().sum()

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [60]:
from sklearn.model_selection import train_test_split

X, y = df.drop('DEATH_EVENT', axis=1), df['DEATH_EVENT']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((209, 12), (90, 12), (209,), (90,))

In [61]:
# describe all numerical columns, turns out there are no object columns at all

X_train.describe().T 

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,209.0,60.379589,11.847341,40.0,50.0,60.0,68.0,95.0
anaemia,209.0,0.406699,0.492397,0.0,0.0,0.0,1.0,1.0
creatinine_phosphokinase,209.0,565.043062,921.812149,23.0,118.0,235.0,582.0,7702.0
diabetes,209.0,0.430622,0.496352,0.0,0.0,0.0,1.0,1.0
ejection_fraction,209.0,38.296651,11.785668,14.0,30.0,38.0,45.0,70.0
high_blood_pressure,209.0,0.344498,0.476345,0.0,0.0,0.0,1.0,1.0
platelets,209.0,263705.600239,103758.741003,25100.0,212000.0,259000.0,304000.0,850000.0
serum_creatinine,209.0,1.370622,1.016622,0.5,0.9,1.1,1.3,9.0
serum_sodium,209.0,136.626794,4.563068,113.0,134.0,137.0,140.0,146.0
sex,209.0,0.602871,0.490478,0.0,0.0,1.0,1.0,1.0


In [62]:
# there are no categorical features
categorical_features = X_train.select_dtypes(exclude='number').columns.tolist()
categorical_features

[]

In [63]:
# COLUMN PIPELINE

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
›

col_transformation_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler())
])

all_columns = X_train.columns.tolist()

columns_transformer = ColumnTransformer(transformers=[
    ('cols', col_transformation_pipeline, all_columns),
])

In [64]:
columns_transformer.fit_transform(X_train)

array([[0.10909091, 0.        , 0.0906368 , ..., 0.        , 0.        ,
        0.36654804],
       [0.21818182, 0.        , 0.51347832, ..., 1.        , 1.        ,
        0.50533808],
       [0.90909091, 1.        , 0.04089074, ..., 0.        , 0.        ,
        0.89679715],
       ...,
       [0.54545455, 0.        , 0.07279594, ..., 1.        , 1.        ,
        0.87544484],
       [0.18181818, 1.        , 0.01875244, ..., 0.        , 0.        ,
        0.65124555],
       [0.45454545, 0.        , 0.0123714 , ..., 1.        , 1.        ,
        0.6975089 ]])

In [43]:
# MODEL Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt

# random forest classifier
rf_classifier = RandomForestClassifier(n_estimators = 11, criterion='entropy', random_state=0)

rf_model_pipeline = Pipeline(steps=[
    ('preprocessing', columns_transformer),
    ('rf_model', rf_classifier),
])

rf_model_pipeline.fit(X_train, y_train)

# predict on test set
y_pred = rf_model_pipeline.predict(X_test)

# calculate accuracy
ac = accuracy_score(y_test, y_pred)
print(f"Accuracy= {ac}")

Accuracy= 0.85


In [47]:
from sklearn import set_config
set_config(display='diagram')

rf_model_pipeline

In [None]:
# Do grid search

from sklearn.model_selection import GridSearchCV

rf_classifier = RandomForestClassifier(random_state=0)

rf_model_pipeline = Pipeline(steps=[
    ('preprocessing', columns_transformer),
    ('rf_model', rf_classifier),
])

params_dict = {'rf_model__n_estimators' : np.arange(5, 100, 1), 'rf_model__criterion': ['gini', 'entropy']}

grid_search = GridSearchCV(rf_model_pipeline, params_dict, cv=10, n_jobs=-1)
grid_search.fit(X_train, y_train)


In [None]:
grid_search.best_params_

{'rf_model__criterion': 'gini', 'rf_model__n_estimators': 27}

In [None]:
y_pred = grid_search.predict(X_test)
ac = accuracy_score(y_test, y_pred)
print(f"Accuracy= {ac}")


Accuracy= 0.8111111111111111
