In [2]:
import pandas as pd

# Relative path to the Excel workbook holding the data set.
path = r'noExpired.xlsx'
# NOTE(review): `all` shadows the Python builtin of the same name; the name is
# kept because later cells read the frame through it.
all = pd.read_excel(path)

# Columns that are fed to the models as raw numbers.
# An earlier variant of this list also started with 'age'; it is excluded here.
numeric_cols = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and is available in every pandas release.
x_num_all = all[numeric_cols].values
x_num_all

array([[ 1, 35,  4, ...,  0,  0,  9],
       [ 2,  8,  5, ...,  0,  0,  6],
       [ 7, 12,  0, ...,  0,  1,  9],
       ..., 
       [13, 46,  2, ...,  0,  1,  6],
       [ 2, 62,  1, ...,  0,  3,  9],
       [ 8, 61,  0, ...,  0,  1,  9]], dtype=int64)

In [3]:
# categorical
# Every column that is neither in numeric_cols nor the 'readmitted' target is
# handled as a categorical feature.
cat_all = all.drop(numeric_cols + ['readmitted'], axis=1)
cat_cols = list(cat_all.columns.values)

# Replace each column by the integer codes returned by pd.factorize; the
# uniques returned alongside the codes are not needed and are discarded.
# The frame is built in a single call; `columns=` pins the original column
# order, which a plain dict does not guarantee on older Python versions.
# NOTE(review): the codes are plain labels, yet the models below receive them
# as ordinary numeric features - confirm that this is intended.
fac_cat_frame = pd.DataFrame(
    {col: pd.factorize(cat_all[col])[0] for col in cat_cols},
    columns=cat_cols,
)

# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same ndarray and is available in every pandas release.
fac_x_cat_all = fac_cat_frame.values


In [4]:
import numpy as np
# Feature matrix: the numeric columns first, followed by the factorized
# categorical codes, joined column-wise.
x_all = np.concatenate([x_num_all, fac_x_cat_all], axis=1)

# Target vector: the 'readmitted' column of the loaded frame.
y_all = all['readmitted']

In [5]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

#train-test split
# 25% of the rows are held out for evaluation; the fixed random_state makes
# the split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=.25, random_state=24)

In [6]:
import time
from sklearn.svm import SVC
# Train a support vector classifier and report how long the fit took.
# The hyperparameters are fixed by hand here; they still have to be chosen
# to suit the data.
# NOTE(review): no feature scaling is applied in the visible cells before the
# SVM is fitted - confirm that this is intended.
#training
st = time.time()
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("training started")

C = 173
gamma = 1.31e-5
shrinking = True
probability = True  # required so that svc.predict_proba() can be called later
verbose = True
svc = SVC(C=C, gamma=gamma, shrinking=shrinking,
          probability=probability, verbose=verbose)
svc.fit(x_train, y_train)

print("training ended")
et = time.time()
tt = et - st  # elapsed wall-clock seconds since `st` was taken
print("Training Time = " + str(tt) + "\n")

training started
[LibSVM]training ended
Training Time = 49.2950000763



In [7]:
from sklearn.metrics import roc_auc_score as AUC
# Class probabilities predicted by the SVM for the held-out rows.
p = svc.predict_proba(x_test)
# Column 1 is used as the score for the positive class - assumes a binary
# target whose positive label is the second entry of svc.classes_ (TODO confirm).
auc = AUC(y_test, p[:, 1])
# Formatting into one string keeps the printed text the same as the Python 2
# statement `print "SVM AUC", auc` while also being valid Python 3.
print("SVM AUC %s" % auc)

SVM AUC 0.638319296141


In [None]:
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import StratifiedKFold

paramgrid = {"kernel": ["rbf"],
             "C"     : np.logspace(-9, 9, num=100, base=10),
             "gamma" : np.logspace(-9, 9, num=100, base=10)}

cv = RandomizedSearchCV(estimator=SVC(),
                        param_distributions=paramgrid,
                        n_iter=250,
                        scoring="accuracy",
                        cv=StratifiedKFold(y_all, n_folds=10),
                        verbose=1)
%time cv.fit(x_all, y_all)


In [None]:
# Best cross-validated score found by the search (accuracy, per its `scoring`
# argument) together with the parameter setting that achieved it.
(cv.best_score_, cv.best_params_)

In [10]:
from sklearn.ensemble import RandomForestClassifier as RF
from math import sqrt

# Train a random forest on the training split.
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("training random forest...")
#training
# Start timestamp. This cell does not print an elapsed time itself; the
# verbose progress lines emitted by the forest already include one.
st = time.time()
print("training started")

n_trees = 100
# 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is deprecated in
# newer scikit-learn releases, while 'sqrt' is accepted by old and new ones.
# To try more features at each split, use instead:
#   max_features = int(round(sqrt(x_train.shape[1]) * 2))
max_features = 'sqrt'
verbose = 1
n_jobs = 1
# NOTE(review): no random_state is passed, so the fitted forest (and the AUC
# computed from it) changes from run to run.
rf = RF(n_estimators=n_trees, max_features=max_features,
        verbose=verbose, n_jobs=n_jobs)
rf.fit(x_train, y_train)


[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.6s


training random forest...
training started


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.2s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [11]:
# Class probabilities predicted by the random forest for the held-out rows.
p = rf.predict_proba(x_test)
# Column 1 is used as the score for the positive class - assumes a binary
# target whose positive label is the second entry of rf.classes_ (TODO confirm).
auc = AUC(y_test, p[:, 1])
# Formatting into one string keeps the printed text the same as the Python 2
# statement `print "RF AUC", auc` while also being valid Python 3.
print("RF AUC %s" % auc)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.0s


RF AUC 0.67881808061


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
