In [None]:
#importing all the libraries and our datafile
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib import pyplot
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import confusion_matrix, classification_report

# Location of the cleaned survival-prediction CSV, relative to this notebook.
DATA_PATH = "../Resources/cleaned_data_survival_prediction.csv"
survival_df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows to sanity-check the load.
survival_df.head(n=5)

In [None]:
# y is the target column; X is every remaining column of the frame.
y = survival_df['hospital_death']
X = survival_df.drop(columns='hospital_death')

In [None]:
# One-hot encode the categorical columns of X; numeric columns pass through unchanged.
X = pd.get_dummies(data=X)

### Balance the data set - OVERsampling

#### Since the classes of our target variable were not evenly distributed, we used a method called random OVERsampling to compensate.

In [None]:
# Random oversampling: a seeded sampler resamples (X, y) so the class counts can be
# compared before and after.
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X, y)

# Report the class counts before and after resampling.
for label, target in (('Original dataset shape', y), ('Resample dataset shape', y_ros)):
    print(label, Counter(target))

In [None]:
#Splitting our data into train and test
# Split the original (un-resampled) data first. Resampling before the split lets copies of
# the same minority-class row land in both train and test, so the model is scored on rows
# it has already seen and the test metrics come out too optimistic.
# stratify=y keeps the class proportions the same in both partitions.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, random_state=40, stratify=y
)

# Oversample the training partition only; the test partition keeps the real class balance.
X_train, y_train = RandomOverSampler(random_state=42).fit_resample(X_train_raw, y_train_raw)

#Running RandomForestClassifier
clf = RandomForestClassifier(random_state=40, n_estimators=50).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

In [None]:
# Evaluate the fitted classifier on the held-out test rows.
y_pred = clf.predict(X_test)
y_true = y_test

# Overall accuracy, then the confusion matrix and per-class precision/recall/F1.
test_accuracy = clf.score(X_test, y_test)
print('Test Acc: %.3f' % test_accuracy)

test_confusion = confusion_matrix(y_true, y_pred)
print(test_confusion)

test_report = classification_report(y_true, y_pred)
print(test_report)

### Since there is a large number of features, we want to see which ones are the most important. Below we compute the feature importances from the fitted random forest.

In [None]:
# Per-feature importance scores taken from the fitted classifier `clf`.
importance = clf.feature_importances_

In [None]:
# summarize feature importance
# Label each score with its column name instead of a positional index, so the output
# actually says which features matter. X is the dummy-coded feature frame the model's
# inputs were derived from, so X.columns lines up with clf.feature_importances_;
# pd.Series raises if the two lengths ever disagree.
feature_importance = pd.Series(importance, index=X.columns).sort_values(ascending=False)

TOP_N = 20  # the dummy-coded frame is wide; show only the strongest features
top_features = feature_importance.head(TOP_N)
for name, score in top_features.items():
    print('Feature: %s, Score: %.5f' % (name, score))

# plot feature importance
fig, ax = pyplot.subplots(figsize=(8, 6))
# Reverse the order so the most important feature is drawn at the top of the chart.
top_features.iloc[::-1].plot.barh(ax=ax)
ax.set(
    title='Top %d feature importances (random forest)' % len(top_features),
    xlabel='Importance',
    ylabel='Feature',
)
fig.tight_layout()
pyplot.show()