In [None]:
#Adding in our libraries and data file
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report

# Read the cleaned survival-prediction data set (path is relative to this notebook)
data_path = "../Resources/cleaned_data_survival_prediction.csv"
survival_df = pd.read_csv(data_path)

In [None]:
# Preview the first rows to sanity-check that the CSV loaded as expected
survival_df.head()

In [None]:
#Separating the target column (y) from the predictor columns
y = survival_df['hospital_death']
predictors = survival_df.drop(columns='hospital_death')

#dummy-coding the categorical predictor columns to get our X variable
X = pd.get_dummies(predictors)

### Balance the data set - OVERsampling

#### Since the two outcome classes in our data were not evenly represented, we used a method called random oversampling to compensate.

In [None]:
#Balance Data - oversampling
#Resample the full feature matrix and target with a seeded RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X, y)

#Class counts before and after resampling
for label, target in (('Original dataset shape', y), ('Resample dataset shape', y_ros)):
    print(label, Counter(target))


In [None]:
#Splitting our data into train and test BEFORE balancing.
#RandomOverSampler balances the classes by duplicating minority-class rows, so
#resampling the whole data set first would place copies of the same row in both
#the training and the test split and inflate the test score (data leakage).
#stratify=y keeps the original class ratio in both splits.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, random_state=45, stratify=y
)

#Oversampling the training rows only; the test set keeps the real class balance
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train_raw, y_train_raw)

#Running ExtraTreesClassifer on the balanced training data
clf = ExtraTreesClassifier(random_state=45, n_estimators=50).fit(X_train, y_train)




In [None]:
# Mean accuracy of the fitted classifier on each split
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

In [None]:
#Testing our model: predict on the held-out split and compare with the true labels
y_pred = clf.predict(X_test)
y_true = y_test

test_accuracy = clf.score(X_test, y_test)
print('Test Acc: %.3f' % test_accuracy)

#Confusion matrix first, then the per-class precision/recall report
for report in (confusion_matrix(y_true, y_pred), classification_report(y_true, y_pred)):
    print(report)


### Since there are a large number of features, we want to see which ones are the most important. Below we compute and plot the model's feature importances.

In [None]:
# Importance scores exposed by the fitted classifier.
# NOTE(review): the plotting cell reads clf.feature_importances_ directly, so this
# variable is not used by the cells visible here.
feature_importances = clf.feature_importances_

In [None]:
# Pair every feature name with its importance score and sort ascending, so the
# most important feature ends up at the top of the horizontal bar chart.
importances = clf.feature_importances_

# zip() silently truncates on a length mismatch, which would mislabel the bars if
# X has been reassigned since the model was fitted (stale notebook state).
assert len(X.columns) == len(importances), (
    "X does not match the fitted model; re-run the notebook from the top"
)

features = sorted(zip(X.columns, importances), key=lambda pair: pair[1])
cols = [name for name, score in features]
width = [score for name, score in features]

# Scale the figure height with the number of bars instead of a fixed 200 inches,
# so the chart stays readable for both large and small feature sets.
fig, ax = plt.subplots(figsize=(10, max(4, 0.25 * len(cols))))

ax.barh(y=cols, width=width)
ax.margins(y=0.001)  # trim the empty band above and below the bars
ax.set_xlabel('Feature importance')
ax.set_ylabel('Feature')
ax.set_title('ExtraTreesClassifier feature importances')

plt.show()

In [None]:
### Here are our top 10 features

### Evaluating Top 10 Features

1. ICU death probability
2. Ventilation
3. Glasgow Coma Scale (GCS) verbal score
4. GCS eyes score
5. Hospital death probability
6. GCS motor score
7. Day 1 minimum systolic blood pressure (BP), noninvasive
8. Day 1 minimum systolic blood pressure
9. Age
10. Day 1 minimum mean blood pressure, noninvasive


In [None]:
# Columns for the ten selected features; the target column is appended so it can
# be split off again in the next cell
top_feature_columns = [
    'apache_4a_hospital_death_prob',
    'apache_4a_icu_death_prob',
    'd1_sysbp_noninvasive_min',
    'd1_sysbp_min',
    'age',
    'd1_mbp_noninvasive_min',
    'gcs_motor_apache',
    'ventilated_apache',
    'gcs_verbal_apache',
    'gcs_eyes_apache',
]
survival_df_features = survival_df[top_feature_columns + ['hospital_death']]

In [None]:
# Target (y) and predictors (X) for the reduced feature set; these replace the
# full-feature X and y defined earlier in the notebook
y = survival_df_features.loc[:, 'hospital_death']
X = survival_df_features.drop(columns=['hospital_death'])

In [None]:
# Dummy-code any categorical columns among the selected features, mirroring the
# treatment applied to the full feature set earlier in the notebook
X = pd.get_dummies(X)

In [None]:
#Balance Data - oversampling
#Resample the reduced feature matrix and target with a seeded RandomOverSampler
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X, y)

#Class counts before and after resampling
for label, target in (('Original dataset shape', y), ('Resample dataset shape', y_ros)):
    print(label, Counter(target))

In [None]:
from joblib import dump, load

# Split BEFORE balancing. RandomOverSampler balances the classes by duplicating
# minority-class rows, so resampling the whole data set first would place copies
# of the same row in both splits and inflate the test score (data leakage).
# stratify=y keeps the original class ratio in both splits.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, random_state=45, stratify=y
)

# Oversample the training rows only; the test set keeps the real class balance
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train_raw, y_train_raw)

clf = ExtraTreesClassifier(random_state=45, n_estimators=50).fit(X_train, y_train)

# Persist the fitted reduced-feature model next to the notebook
dump(clf, 'model_ExtraTrees.joblib')

In [None]:
# Mean accuracy of the reduced-feature classifier on each split
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')