# Predicting the opening / closing of hospitals!

In [109]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score


In [110]:
import pickle
train_data = 0
test_data = 0

with open('train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)

with open('test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [111]:
train_X = np.array(train_data.iloc[:,2:])
train_y = np.array(train_data['OC'])

test_X = np.array(test_data.iloc[:,2:])

ID = np.array(test_data.inst_id)

---

# Gradient Boosting

In [112]:
from sklearn.ensemble import GradientBoostingClassifier

In [113]:
eclf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1)

In [114]:
cross_val_score(eclf_gb, train_X, train_y, cv=5).mean()

0.9534972677595629

In [115]:
params = {
    'n_estimators': [10,20,30,50,100,150,200],
    'learning_rate': [i for i in np.linspace(0.1,1,10)]
}

In [116]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf_gb, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(train_X, train_y)



In [117]:
grid.best_score_

0.9568106312292359

In [118]:
grid.best_params_

{'learning_rate': 0.1, 'n_estimators': 10}

In [119]:
# best parameter에 맞추어 수정
eclf_gb = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1)

In [120]:
model_gb = eclf_gb.fit(train_X, train_y)
pred_gb = eclf_gb.predict_proba(test_X)[:,1]

result_gb = pd.DataFrame({'inst_id':ID, 'OC':pred_gb})

---

# Random Forest

In [121]:
# RF
from sklearn.ensemble import RandomForestClassifier
eclf_rf = RandomForestClassifier(n_estimators=100,
                              max_features=2,
                              n_jobs=-1,
                              oob_score = True)

In [122]:
cross_val_score(eclf_rf, train_X, train_y, cv=5).mean()

0.9468306010928963

In [123]:
params = {
    'n_estimators' : [10, 20, 30, 50, 100],
    'max_features' : [1,2,3,4,5,6,7,10,15,20,25,len(train_X[0])]
}

In [124]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf_rf, param_grid=params, cv=5, n_jobs=-1)
grid = grid.fit(train_X, train_y)



In [125]:
grid.best_score_

0.9601328903654485

In [126]:
grid.best_params_

{'max_features': 10, 'n_estimators': 50}

In [127]:
grid.best_estimator_.oob_score_

0.9435215946843853

In [128]:
grid.best_estimator_.feature_importances_

array([0.01517624, 0.03387023, 0.06739127, 0.117588  , 0.03841295,
       0.01154649, 0.00977793, 0.01550736, 0.02006689, 0.01591249,
       0.01961065, 0.01856602, 0.00843127, 0.0496472 , 0.00527907,
       0.01015249, 0.01076805, 0.00792578, 0.00989993, 0.00763127,
       0.00695587, 0.0200983 , 0.00664676, 0.01114517, 0.00590345,
       0.01138791, 0.01741216, 0.01093838, 0.01979644, 0.01317454,
       0.        , 0.01209884, 0.01851922, 0.01265705, 0.01314064,
       0.01583526, 0.00518947, 0.04508333, 0.01344128, 0.00867074,
       0.01632044, 0.01149173, 0.01388117, 0.01529793, 0.00906903,
       0.01707655, 0.01509477, 0.00819941, 0.0061539 , 0.01026659,
       0.01003307, 0.0075477 , 0.00166719, 0.01547864, 0.04938754,
       0.03177797])

In [129]:
np.argsort(grid.best_estimator_.feature_importances_)[::-1]

array([ 3,  2, 13, 54, 37,  4,  1, 55, 21,  8, 28, 10, 11, 32, 26, 45, 40,
        9, 35,  7, 53, 43,  0, 46, 42, 38, 29, 34, 33, 31,  5, 41, 25, 23,
       27, 16, 49, 15, 50, 18,  6, 44, 39, 12, 47, 17, 19, 51, 20, 22, 48,
       24, 14, 36, 52, 30])

In [159]:
print('변수',train_data.columns[30],'가 중요도가 가장 높게 출력되었다.')

변수 surplus1 가 중요도가 가장 높게 출력되었다.


In [130]:
# best parameter에 맞추어 수정
from sklearn.ensemble import RandomForestClassifier
eclf_rf = RandomForestClassifier(n_estimators=100,
                              max_features=20,
                              n_jobs=-1,
                              oob_score = True)

In [131]:
model_rf = eclf_rf.fit(train_X, train_y)
pred_rf  = eclf_rf.predict_proba(test_X)[:,1]

result_rf = pd.DataFrame({'inst_id':ID, 'OC':pred_rf})

---
# XGBoost

In [132]:
from xgboost import XGBClassifier

eclf_xgb = XGBClassifier(n_estimators=1000, max_depth=2, learning_rate=0.5, nthread=7)
model_xgb = eclf_xgb.fit(train_X, train_y)
pred_xgb = eclf_xgb.predict_proba(test_X)[:,1]

result_xgb = pd.DataFrame({'inst_id':ID, 'OC':pred_xgb})

In [133]:
cross_val_score(eclf_xgb, train_X, train_y, cv=5).mean()

0.9468306010928963

---

In [186]:
ensemble = pd.DataFrame({'inst_id':ID})

In [187]:
ensemble['GB']  = result_gb['OC']
ensemble['RF']  = result_rf['OC']
ensemble['XGB'] = result_xgb['OC']
ensemble.head()

Unnamed: 0,inst_id,GB,RF,XGB
0,2,0.965355,0.96,0.999075
1,5,0.421943,0.76,0.972123
2,6,0.694844,0.52,0.903616
3,8,0.975941,0.89,0.997092
4,10,0.965355,0.95,0.997635


In [188]:
ensemble['avg'] = (ensemble['GB'] + ensemble['RF'] + ensemble['XGB']) / 3

In [189]:
# 0.8을 기준으로 타겟변수 분류
ensemble['OC'] = (ensemble['avg'] > 0.8).astype(int)

In [190]:
ensemble = ensemble.loc[:, ['inst_id','OC']]

In [191]:
ensemble.to_csv('submission.csv', index=False)

92.91339의 정확도