In [48]:
import matplotlib.pyplot as plt
import numpy as np
import math
import os
import pandas as pd
import time
import seaborn as sns
from sklearn import metrics
from sklearn.decomposition import PCA
from collections import Counter
from imblearn.over_sampling import SMOTE

In [49]:
path = '../output/data.csv'
data = pd.read_csv(path)

In [50]:
data

Unnamed: 0.1,Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,...,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_None,A1Cresult_Norm,payer_code_medicare,payer_code_private_insurance,payer_code_selfpay,payer_code_unknown,readmitted
0,0,5,0.693147,3.737670,0.000000,0.693147,0.000000,0.0,0.000000,0.693147,...,0,0,0,1,0,0,0,0,1,0
1,1,15,1.386294,4.094345,0.000000,2.944439,0.000000,0.0,0.000000,2.302585,...,0,0,0,1,0,0,0,0,1,0
2,2,25,1.098612,2.484907,1.791759,2.639057,1.098612,0.0,0.693147,1.945910,...,0,0,0,1,0,0,0,0,1,0
3,3,35,1.098612,3.806662,0.693147,2.833213,0.000000,0.0,0.000000,2.079442,...,0,0,0,1,0,0,0,0,1,0
4,4,45,0.693147,3.951244,0.000000,2.197225,0.000000,0.0,0.000000,1.791759,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180813,180813,65,2.564949,3.293276,1.753211,3.209132,0.908936,0.0,0.000000,2.302585,...,0,0,0,1,0,0,0,0,0,1
180814,180814,73,2.013653,3.846561,0.241220,2.906130,1.654261,0.0,0.797035,2.302585,...,0,0,0,1,0,0,0,0,0,1
180815,180815,55,1.329556,4.009158,0.556441,3.015507,0.000000,0.0,0.556441,1.945910,...,0,0,0,0,0,0,0,0,1,1
180816,180816,40,1.251585,3.913159,0.415143,3.161152,0.000000,0.0,0.814569,2.302585,...,0,0,0,1,0,0,0,0,1,1


In [51]:
[c for c in data.columns if 'future' in c]

['future_med_cost',
 'future_lab_procedure_cost',
 'future_procedure_cost',
 'future_emergency_cost',
 'future_cost_total']

In [52]:
data['future_cost_total']

0             0.000000
1             0.000000
2             0.000000
3             0.000000
4             0.000000
              ...     
180813        0.000000
180814    48556.336719
180815        0.000000
180816    11834.641945
180817        0.000000
Name: future_cost_total, Length: 180818, dtype: float64

In [68]:
to_drop = ['future_med_cost', 'future_lab_procedure_cost', 'future_procedure_cost', 'future_emergency_cost', 'future_cost_total', 'encounter_id', 'patient_nbr', 'readmitted', 'diag_1', 'diag_2', 'diag_3', 'weight'] + ['readmitted']
X = data[[c for c in data.columns if c not in to_drop]]
y = data[['future_cost_total', 'readmitted']]

In [69]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [71]:
y_train_readmitted = y_train['readmitted']
y_train_cost = y_train['future_cost_total']
y_test_readmitted = y_test['readmitted']
y_test_cost = y_test['future_cost_total']

In [72]:
from sklearn import ensemble

In [73]:
params = {
    'n_estimators': 200,
    'max_depth': 5,
    'criterion': 'mse'
}

model = ensemble.GradientBoostingRegressor(**params)

In [18]:
model.fit(X_train, y_train)

GradientBoostingRegressor(criterion='mse', max_depth=5, n_estimators=200)

In [21]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

print("MSE: %.2f" % mse)

MSE: 20937876.36


In [23]:
print(math.sqrt(mse))

4575.792429982865


In [24]:
from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_pred))

1866.9038936192035


# TWO LAYER

In [74]:
nonzero_X = X_train[y_train_cost > 1]

In [78]:
nonzero_y = y_train_cost[y_train_cost > 1]

In [79]:
params = {
    'n_estimators': 200,
    'max_depth': 5,
    'criterion': 'mse'
}

model = ensemble.GradientBoostingRegressor(**params)

In [80]:
model.fit(nonzero_X, nonzero_y)

GradientBoostingRegressor(criterion='mse', max_depth=5, n_estimators=200)

In [81]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test_cost, y_pred)

print("MSE: %.2f" % mse)

print(math.sqrt(mse))

from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test_cost, y_pred))

MSE: 145393094.89
12057.905908140092
10284.995753988387


In [82]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
clf = RandomForestClassifier(max_depth=5, n_estimators=200, max_features=1)
clf.fit(X_train, y_train_readmitted)

RandomForestClassifier(max_depth=5, max_features=1, n_estimators=200)

In [86]:
y_pred = []

for i in range(X_test.shape[0]):
    readmitted = clf.predict([X_test.iloc[i]])
    if readmitted < 0.5:
        y_pred.append(0)
    else:
        y_pred.append(model.predict([X_test.iloc[i]])[0])

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test_cost, y_pred)

print("MSE: %.2f" % mse)

print(math.sqrt(mse))

from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test_cost, y_pred))