In [43]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
from xgboost import XGBClassifier

In [44]:
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',200)
pd.set_option('display.max_rows',200)

#### Main approach:
Extract hospital features in 2012 to predict closure in 2013,hospital features in 2013 to predict closure in 2014


#### Dataset:
Closure status from _Data Shared/Hospital Closure/verified_closure_combined_datereconciled_infoadded_190521 (only closure status used)


hospital features from Agency for Healthcare Research and Quality 2012/2013

In [45]:
dataset_close=pd.read_csv('data\hospital_closure\hospital_closure.csv')
dataset_2012=pd.read_csv('data\hospital_compare\Copy of Agency for Healthcare Research and Quality 2012.csv')
dataset_2013=pd.read_csv('data\hospital_compare\Copy of Agency for Healthcare Research and Quality 2013.csv')

#### Data Cleaning




In [46]:
dataset_close['year_close_combined'].value_counts(dropna=False).sort_index()

2002     1
2003     1
2004     1
2005     1
2006     3
2007    13
2008    16
2009    17
2010    12
2011    13
2012    25
2013    23
2014    24
2015    18
2016    10
2017     7
Name: year_close_combined, dtype: int64

23 hospitals closed in 2013, and 24 hospitals closed in 2014.

##### first clean data of Agency for Healthcare Research and Quality 2012

In [47]:
#create 'Closure' column, where '1' represents closed hospitals in 2013, '0' represents the others
sub_ds=dataset_close[dataset_close['year_close_combined']==2013]
dataset_2012=dataset_2012.rename(columns={'Provider Number':'id'})
dataset_2012['Closure']=dataset_2012.id.apply(lambda x:1 if x in np.array(sub_ds['id']) else 0)

In [48]:
#drop columns about name and geographic information
dataset_2012.drop(dataset_2012.columns[:10], axis = 1, inplace = True)

In [49]:
dataset_2012.head()

Unnamed: 0,Death from serious treatable complications after surgery,Footnote - Death from serious treatable complications after surgery,Collapsed lung due to medical treatment,Footnote - Collapsed lung due to medical treatment,Breathing failure after surgery,Footnote - Breathing failure after surgery,Serious blood clots after surgery,Footnote - Serious blood clots after surgery,A wound that splits open after surgery,Footnote - A wound that splits open after surgery,Accidental cuts and tears from medical treatment,Footnote - Accidental cuts and tears from medical treatment,Serious Complications,Footnote - Serious Complications,Death after surgery to repair a weakness in the abdominal aorta,Footnote - Death after surgery to repair a weakness in the abdominal aorta,Deaths after admission for broken hip,Footnote - Deaths after admission for broken hip,Deaths from Certain Conditions,Footnote - Deaths from Certain Conditions,Number of Patients - Death from serious treatable complications after surgery,Rate - Death from serious treatable complications after surgery,Lower Estimate - Death from serious treatable complications after surgery,Higher Estimate - Death from serious treatable complications after surgery,Number of Patients - Collapsed lung due to medical treatment,Rate - Collapsed lung due to medical treatment,Lower Estimate - Collapsed lung due to medical treatment,Higher Estimate - Collapsed lung due to medical treatment,Number of Patients - Breathing failure after surgery,Rate - Breathing failure after surgery,Lower Estimate - Breathing failure after surgery,Higher Estimate - Breathing failure after surgery,Number of Patients - Serious blood clots after surgery,Rate - Serious blood clots after surgery,Lower Estimate - Serious blood clots after surgery,Higher Estimate - Serious blood clots after surgery,Number of Patients - A wound that splits open after surgery,Rate - A wound that splits open after surgery,Lower Estimate - A wound that splits open after surgery,Higher Estimate - A wound that splits open after surgery,Number of Patients - Accidental cuts and tears from medical treatment,Rate - Accidental cuts and tears from medical treatment,Lower Estimate - Accidental cuts and tears from medical treatment,Higher Estimate - Accidental cuts and tears from medical treatment,Number of Patients - Serious Complications,Rate - Serious Complications,Lower Estimate - Serious Complications,Higher Estimate - Serious Complications,Number of Patients - Death after surgery to repair a weakness in the abdominal aorta,Rate - Death after surgery to repair a weakness in the abdominal aorta,Lower Estimate - Death after surgery to repair a weakness in the abdominal aorta,Higher Estimate - Death after surgery to repair a weakness in the abdominal aorta,Number of Patients - Deaths after admission for broken hip,Rate - Deaths after admission for broken hip,Lower Estimate - Deaths after admission for broken hip,Higher Estimate - Deaths after admission for broken hip,Number of Patients - Deaths from Certain Conditions,Rate - Deaths from Certain Conditions,Lower Estimate - Deaths from Certain Conditions,Higher Estimate - Deaths from Certain Conditions,Closure
0,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,Better than U.S. National Rate,,No Different than U.S. National Rate,,Worse than U.S. National Rate,,No Different than U.S. National Rate,,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,157,140.58,106.16,175,15149,0.32,0.08,0.56,Not Available,Not Available,Not Available,Not Available,4941,1.47,0.0,3.18,393,0.9,0,2.37,15820,3.28,2.48,4.08,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0
1,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,29,123.92,74.29,173.55,4467,0.3,0.0,0.65,Not Available,Not Available,Not Available,Not Available,689,4.7,0.98,8.42,102,0.69,0,2.65,4584,1.5,0.0,3.03,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0
2,No Different than U.S. National Rate,,No Different than U.S. National Rate,,Not Available,No data are available from the hospital for th...,No Different than U.S. National Rate,,No Different than U.S. National Rate,,Worse than U.S. National Rate,,No Different than U.S. National Rate,,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,110,146.33,106.95,185.71,9962,0.5,0.23,0.77,Not Available,Not Available,Not Available,Not Available,2805,3.22,0.81,5.63,234,1.21,0,2.92,10577,3.09,2.11,4.07,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0
3,Number of Cases Too Small,Medicare requires hospitals to have at least 2...,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,No Different than U.S. National Rate,,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,10,Not Available,Not Available,Not Available,2176,0.3,0.0,0.71,Not Available,Not Available,Not Available,Not Available,173,6.11,1.46,10.76,35,0.85,0,3.03,2220,1.74,0.0,3.7,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0
4,Not Available,No data are available from the hospital for th...,No Different than U.S. National Rate,,Not Available,No data are available from the hospital for th...,No Different than U.S. National Rate,,Not Available,No data are available from the hospital for th...,No Different than U.S. National Rate,,No Different than U.S. National Rate,,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Suppressed for one or more quarters by CMS.,Not Available,Not Available,Not Available,Not Available,673,0.33,0.0,0.76,Not Available,Not Available,Not Available,Not Available,28,4.52,0.0,9.6,Not Available,Not Available,Not Available,Not Available,707,1.9,0.0,4.21,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,Not Available,0


In [50]:
#drop footnote columns
dataset_2012.drop(dataset_2012.columns[dataset_2012.apply(lambda col: 'Footnote' in str(col))], axis=1, inplace = True)

In [51]:
#drop columns with all 'Not Available' elements
dataset_2012.drop(dataset_2012.columns[dataset_2012.apply(lambda col: (col=='Not Available').all())], axis=1, inplace = True)

In [52]:
#replace strings in comparison with US national rates
#mapping criteria:
#'No Different than U.S. National Rate','Not Available','Number of Cases Too Small':0
#'Worse than U.S. National Rate': 1
#'Better than U.S. National Rate':2
dataset_2012[['Death from serious treatable complications after surgery','Collapsed lung due to medical treatment','Breathing failure after surgery','Serious blood clots after surgery','A wound that splits open after surgery','Accidental cuts and tears from medical treatment','Serious Complications']]=dataset_2012[['Death from serious treatable complications after surgery','Collapsed lung due to medical treatment','Breathing failure after surgery','Serious blood clots after surgery','A wound that splits open after surgery','Accidental cuts and tears from medical treatment','Serious Complications']].replace({'No Different than U.S. National Rate': 0,' Not Available':0, 'Not Available':0,'Number of Cases Too Small':0,'Worse than U.S. National Rate': 1,'Better than U.S. National Rate':2})

In [53]:
#replace all missing values by "NaN" 
dataset_2012=dataset_2012.replace({'Not Available':np.nan,' Not Available':np.nan})

In [54]:
#making all values being able to do numerical calculations
for col in dataset_2012:
    dataset_2012[col]=pd.to_numeric(dataset_2012[col])

In [55]:
dataset_2012.loc[dataset_2012['Closure']==1]

Unnamed: 0,Death from serious treatable complications after surgery,Collapsed lung due to medical treatment,Breathing failure after surgery,Serious blood clots after surgery,A wound that splits open after surgery,Accidental cuts and tears from medical treatment,Serious Complications,Number of Patients - Death from serious treatable complications after surgery,Rate - Death from serious treatable complications after surgery,Lower Estimate - Death from serious treatable complications after surgery,Higher Estimate - Death from serious treatable complications after surgery,Number of Patients - Collapsed lung due to medical treatment,Rate - Collapsed lung due to medical treatment,Lower Estimate - Collapsed lung due to medical treatment,Higher Estimate - Collapsed lung due to medical treatment,Number of Patients - Serious blood clots after surgery,Rate - Serious blood clots after surgery,Lower Estimate - Serious blood clots after surgery,Higher Estimate - Serious blood clots after surgery,Number of Patients - A wound that splits open after surgery,Rate - A wound that splits open after surgery,Lower Estimate - A wound that splits open after surgery,Higher Estimate - A wound that splits open after surgery,Number of Patients - Accidental cuts and tears from medical treatment,Rate - Accidental cuts and tears from medical treatment,Lower Estimate - Accidental cuts and tears from medical treatment,Higher Estimate - Accidental cuts and tears from medical treatment,Closure
16,0,0,0,0,0,0,0,,,,,575.0,0.34,0.0,0.77,,,,,,,,,594.0,1.95,0.0,4.3,1
43,0,0,0,0,0,0,0,,,,,571.0,0.32,0.0,0.75,3.0,,,,,,,,576.0,1.94,0.0,4.27,1
173,0,0,0,0,0,0,0,49.0,140.32,96.2,184.44,1674.0,0.34,0.0,0.71,628.0,4.47,0.49,8.45,32.0,0.86,0.0,3.06,1900.0,2.08,0.49,3.67,1
421,0,0,0,0,0,0,0,,,,,180.0,0.34,0.0,0.77,8.0,,,,3.0,,,,181.0,1.97,0.0,4.32,1
478,0,0,0,0,0,0,0,13.0,,,,1801.0,0.3,0.0,0.71,194.0,3.74,0.0,8.37,24.0,,,,1842.0,1.46,0.0,3.48,1
565,0,0,0,0,0,0,0,,,,,250.0,0.33,0.0,0.76,13.0,,,,5.0,,,,252.0,1.95,0.0,4.28,1
1141,0,0,0,0,0,0,0,14.0,,,,2654.0,0.28,0.0,0.67,318.0,3.44,0.0,7.87,78.0,0.74,0.0,2.78,2677.0,1.93,0.05,3.81,1
1653,0,0,0,0,0,0,0,,,,,2178.0,0.31,0.0,0.72,39.0,4.45,0.0,9.49,14.0,,,,2223.0,1.62,0.0,3.76,1
2710,0,0,0,0,0,0,0,5.0,,,,1187.0,0.3,0.0,0.71,137.0,3.98,0.0,8.74,26.0,0.87,0.0,3.08,1227.0,1.88,0.0,3.92,1
2837,0,0,0,0,0,2,0,28.0,112.24,65.22,159.26,5074.0,0.37,0.06,0.68,739.0,7.54,4.23,10.85,146.0,1.4,0.0,3.24,5157.0,0.66,0.0,2.01,1


In [56]:
#drop columns with too many NaN values in closed hospital features
dataset_2012.drop(dataset_2012.columns[dataset_2012.apply(lambda col: '- Death from serious treatable complications after surgery' in str(col))], axis=1, inplace = True)
dataset_2012.drop(dataset_2012.columns[dataset_2012.apply(lambda col: '- A wound that splits open after surgery' in str(col))], axis=1, inplace = True)
dataset_2012.drop(dataset_2012.columns[dataset_2012.apply(lambda col: '- Serious blood clots after surgery' in str(col))], axis=1, inplace = True)
dataset_2012.drop(dataset_2012.columns[dataset_2012.apply(lambda col: 'Breathing failure after surgery' in str(col))], axis=1, inplace = True)

In [57]:
#closed hospital features now with no NaN values 
dataset_2012.loc[dataset_2012['Closure']==1]

Unnamed: 0,Death from serious treatable complications after surgery,Collapsed lung due to medical treatment,Serious blood clots after surgery,A wound that splits open after surgery,Accidental cuts and tears from medical treatment,Serious Complications,Number of Patients - Collapsed lung due to medical treatment,Rate - Collapsed lung due to medical treatment,Lower Estimate - Collapsed lung due to medical treatment,Higher Estimate - Collapsed lung due to medical treatment,Number of Patients - Accidental cuts and tears from medical treatment,Rate - Accidental cuts and tears from medical treatment,Lower Estimate - Accidental cuts and tears from medical treatment,Higher Estimate - Accidental cuts and tears from medical treatment,Closure
16,0,0,0,0,0,0,575.0,0.34,0.0,0.77,594.0,1.95,0.0,4.3,1
43,0,0,0,0,0,0,571.0,0.32,0.0,0.75,576.0,1.94,0.0,4.27,1
173,0,0,0,0,0,0,1674.0,0.34,0.0,0.71,1900.0,2.08,0.49,3.67,1
421,0,0,0,0,0,0,180.0,0.34,0.0,0.77,181.0,1.97,0.0,4.32,1
478,0,0,0,0,0,0,1801.0,0.3,0.0,0.71,1842.0,1.46,0.0,3.48,1
565,0,0,0,0,0,0,250.0,0.33,0.0,0.76,252.0,1.95,0.0,4.28,1
1141,0,0,0,0,0,0,2654.0,0.28,0.0,0.67,2677.0,1.93,0.05,3.81,1
1653,0,0,0,0,0,0,2178.0,0.31,0.0,0.72,2223.0,1.62,0.0,3.76,1
2710,0,0,0,0,0,0,1187.0,0.3,0.0,0.71,1227.0,1.88,0.0,3.92,1
2837,0,0,0,0,2,0,5074.0,0.37,0.06,0.68,5157.0,0.66,0.0,2.01,1


In [58]:
#now almost all rows having NaN values have more than 4 NaN values (only 3 exception rows), so drop them all
dataset_2012.dropna(inplace=True)

##### apply the same steps above to  Agency for Healthcare Research and Quality 2013

In [59]:
#apply the same steps above to  Agency for Healthcare Research and Quality 2013
sub_ds2=dataset_close[dataset_close['year_close_combined']==2014]
dataset_2013=dataset_2013.rename(columns={'Provider Number':'id'})
dataset_2013['Closure']=dataset_2013.id.apply(lambda x:1 if x in np.array(sub_ds2['id']) else 0)
dataset_2013.drop(dataset_2013.columns[:10], axis = 1, inplace = True)

In [60]:
dataset_2013.drop(dataset_2013.columns[dataset_2013.apply(lambda col: 'Footnote' in str(col))], axis=1, inplace = True)
dataset_2013.drop(dataset_2013.columns[dataset_2013.apply(lambda col: (col=='Not Available').all())], axis=1, inplace = True)
dataset_2013[['Death from serious treatable complications after surgery','Collapsed lung due to medical treatment','Serious blood clots after surgery','A wound that splits open after surgery','Accidental cuts and tears from medical treatment','Serious Complications']]=dataset_2013[['Death from serious treatable complications after surgery','Collapsed lung due to medical treatment','Serious blood clots after surgery','A wound that splits open after surgery','Accidental cuts and tears from medical treatment','Serious Complications']].replace({'No Different than U.S. National Rate': 0,' Not Available':0, 'Not Available':0,'Number of Cases Too Small':0,'Worse than U.S. National Rate': 1,'Better than U.S. National Rate':2})
dataset_2013=dataset_2013.replace({'Not Available':np.nan,' Not Available':np.nan})
for col in dataset_2013:
    dataset_2013[col]=pd.to_numeric(dataset_2013[col])

In [61]:
dataset_2013.drop(dataset_2013.columns[dataset_2013.apply(lambda col: '- Death from serious treatable complications after surgery' in str(col))], axis=1, inplace = True)
dataset_2013.drop(dataset_2013.columns[dataset_2013.apply(lambda col: '- A wound that splits open after surgery' in str(col))], axis=1, inplace = True)
dataset_2013.drop(dataset_2013.columns[dataset_2013.apply(lambda col: '- Serious blood clots after surgery' in str(col))], axis=1, inplace = True)
#drop the columns that did not appear in dataset_2012  
dataset_2013.drop(dataset_2013.columns[dataset_2013.apply(lambda col: '- Serious Complications' in str(col))], axis=1, inplace = True)
dataset_2013.dropna(inplace=True)

now the datasets of 2012 and 2013 have same size of features, we can merge them into a larger dataset, and split it in training set and test set

In [62]:
dataset=pd.concat([dataset_2012,dataset_2013])
train, test = train_test_split(dataset, test_size=0.2)

In [63]:
X_train = train.drop("Closure", axis=1)
Y_train = train["Closure"]
X_test  = test.drop("Closure", axis=1).copy()
Y_test  = test["Closure"]
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape

((5262, 14), (5262,), (1316, 14), (1316,))

##### Logistic Regression

In [64]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = cross_val_predict(logreg, X_train, Y_train, cv=10)
acc_log = logreg.score(X_train, Y_train) * 100
acc_log_t = logreg.score(X_test, Y_test) * 100
print('training score:' +str(acc_log))
print('test score:' +str(acc_log_t))
confusion_matrix(Y_train, Y_pred)

training score:99.41087039148613
test score:99.46808510638297


array([[5231,    0],
       [  31,    0]], dtype=int64)

##### SVM

In [65]:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = svc.score(X_train, Y_train) * 100
acc_svc_t = svc.score(X_test, Y_test) * 100
print('training score:' +str(acc_svc))
print('test score:' +str(acc_svc_t))
confusion_matrix(Y_test, Y_pred)


training score:99.41087039148613
test score:99.46808510638297


array([[1309,    0],
       [   7,    0]], dtype=int64)

##### k-NN

In [66]:
knn = KNeighborsClassifier(n_neighbors = X_train.shape[0])
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = knn.score(X_train, Y_train) * 100
acc_knn_t = knn.score(X_test, Y_test) * 100
print('training score:' +str(acc_knn))
print('test score:' +str(acc_knn_t))
confusion_matrix(Y_test, Y_pred)

training score:99.41087039148613
test score:99.46808510638297


array([[1309,    0],
       [   7,    0]], dtype=int64)

##### Gaussian Naive Bayes

In [67]:
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = gaussian.score(X_train, Y_train) * 100
acc_gaussian_t = gaussian.score(X_test, Y_test) * 100
print('training score:' +str(acc_gaussian))
print('test score:' +str(acc_gaussian_t))
confusion_matrix(Y_test, Y_pred)

training score:64.36716077537058
test score:63.525835866261396


array([[830, 479],
       [  1,   6]], dtype=int64)

##### Perceptron

In [68]:
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = perceptron.score(X_train, Y_train) * 100
acc_perceptron_t = perceptron.score(X_test, Y_test) * 100
print('training score:' +str(acc_perceptron))
print('test score:' +str(acc_perceptron_t))
confusion_matrix(Y_test, Y_pred)

training score:99.41087039148613
test score:99.46808510638297


array([[1309,    0],
       [   7,    0]], dtype=int64)

##### Linear SVM

In [69]:
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = linear_svc.score(X_train, Y_train) * 100
acc_linear_svc_t = linear_svc.score(X_test, Y_test) * 100
print('training score:' +str(acc_linear_svc))
print('test score:' +str(acc_linear_svc_t))
confusion_matrix(Y_test, Y_pred)

training score:93.48156594450779
test score:94.30091185410335


array([[1241,   68],
       [   7,    0]], dtype=int64)

##### linear classifier with stochastic gradient descent

In [70]:
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = sgd.score(X_train, Y_train) * 100
acc_sgd_t = sgd.score(X_test, Y_test) * 100
print('training score:' +str(acc_sgd))
print('test score:' +str(acc_sgd_t))
confusion_matrix(Y_test, Y_pred)

training score:99.41087039148613
test score:99.46808510638297


array([[1309,    0],
       [   7,    0]], dtype=int64)

##### decision tree

In [71]:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = decision_tree.score(X_train, Y_train) * 100
acc_decision_tree_t = decision_tree.score(X_test, Y_test) * 100
print('training score:' +str(acc_decision_tree))
print('test score:' +str(acc_decision_tree_t))
confusion_matrix(Y_test, Y_pred)

training score:100.0
test score:98.93617021276596


array([[1302,    7],
       [   7,    0]], dtype=int64)

##### random forest

In [72]:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = random_forest.score(X_train, Y_train) * 100
acc_random_forest_t = random_forest.score(X_test, Y_test) * 100
print('training score:' +str(acc_random_forest))
print('test score:' +str(acc_random_forest_t))
confusion_matrix(Y_test, Y_pred)

training score:100.0
test score:99.46808510638297


array([[1309,    0],
       [   7,    0]], dtype=int64)

##### XGBoost

In [73]:
xgboost = XGBClassifier()
xgboost.fit(X_train, Y_train)
Y_pred = xgboost.predict(X_test)
xgboost.score(X_train, Y_train)
acc_xgboost = xgboost.score(X_train, Y_train) * 100
acc_xgboost_t = xgboost.score(X_test, Y_test) * 100
print('training score:' +str(acc_xgboost))
print('test score:' +str(acc_xgboost_t))
confusion_matrix(Y_test, Y_pred)

training score:99.8859749144812
test score:99.46808510638297


array([[1309,    0],
       [   7,    0]], dtype=int64)

#### One-Class Classification for unbalanced Data
We can see that only a very small percentage of hospitals are closed, so the data are very unbalanced. Three algorithms below can regard the small percentage class as outliers.

##### Isolation Forest

In [74]:
isolation_forest= IsolationForest(contamination=0.00575, behaviour='new')
X_train_= X_train[Y_train==0]
isolation_forest.fit(X_train_)
Y_pred = isolation_forest.predict(X_train)
Y_pred[Y_pred == 1] = 0
Y_pred[Y_pred == -1] = 1
confusion_matrix(Y_train, Y_pred)

array([[5200,   31],
       [  31,    0]], dtype=int64)

##### one-class SVM

In [75]:
one_class_SVM= OneClassSVM(gamma='scale', nu=0.00575)
X_train_= X_train[Y_train==0]
one_class_SVM.fit(X_train_)
Y_pred = one_class_SVM.predict(X_train)
Y_pred[Y_pred == 1] = 0
Y_pred[Y_pred == -1] = 1
confusion_matrix(Y_train, Y_pred)

array([[5199,   32],
       [  31,    0]], dtype=int64)

##### Minimum Covariance Determinant

In [76]:
MCD= EllipticEnvelope(contamination=0.00575)
X_train_= X_train[Y_train==0]
MCD.fit(X_train_)
Y_pred = MCD.predict(X_train)
Y_pred[Y_pred == 1] = 0
Y_pred[Y_pred == -1] = 1
confusion_matrix(Y_train, Y_pred)

array([[5200,   31],
       [  31,    0]], dtype=int64)

##### Comparison of supervised learning algorithms above

In [77]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree', 'XGBoost'],
    'Training Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision_tree,acc_xgboost],
    'Test Score': [acc_svc_t, acc_knn_t, acc_log_t, 
              acc_random_forest_t, acc_gaussian_t, acc_perceptron_t, 
              acc_sgd_t, acc_linear_svc_t, acc_decision_tree_t,acc_xgboost_t]})
models.sort_values(by='Test Score', ascending=False)

Unnamed: 0,Model,Training Score,Test Score
0,Support Vector Machines,99.41087,99.468085
1,KNN,99.41087,99.468085
2,Logistic Regression,99.41087,99.468085
3,Random Forest,100.0,99.468085
5,Perceptron,99.41087,99.468085
6,Stochastic Gradient Decent,99.41087,99.468085
9,XGBoost,99.885975,99.468085
8,Decision Tree,100.0,98.93617
7,Linear SVC,93.481566,94.300912
4,Naive Bayes,64.367161,63.525836
