In [17]:
import pandas as pd
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn import model_selection
%matplotlib inline
import matplotlib.pyplot as plt
from copy import deepcopy
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV 
# import lightgbm as lgb


In [96]:
data = pd.read_csv('data_train_1.csv')
y_data = data['stroke_in_2018']
data.drop(columns=['stroke_in_2018'],inplace=True)
print(data.shape)
print(y_data.shape)
y_data_n = y_data[y_data== 1]
data_y_p = y_data[:700]
print(len(y_data_n))

y_real = pd.concat((y_data_n,data_y_p))
print(len(y_real))
y_real.value_counts()
data_r = data.iloc[y_real.index,:]
print(len(data_r))
data_r['result'] = y_real
data_r.sample(frac=1)
y_real = data_r['result']
data_r.drop(columns=['result'],inplace=True)

(34751, 16)
(34751,)
651
1351
1351


In [97]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(data_r, y_real,test_size=0.1)

print(train_x.shape)
print(train_y.shape)

(1215, 16)
(1215,)


In [98]:
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    #SVM
    svm.SVC(probability=True),
#     svm.NuSVC(probability=True),
    svm.LinearSVC(),
    
    #Trees    
    tree.DecisionTreeClassifier(),
    tree.ExtraTreeClassifier(),
    
    #Discriminant Analysis
#     discriminant_analysis.LinearDiscriminantAnalysis(),
#     discriminant_analysis.QuadraticDiscriminantAnalysis(),

    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()
    #lightboost
#     lgb.sklearn.LGBMClassifier()
    ]

In [99]:
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, 
                                        train_size = .7, random_state = 0 )

In [100]:
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 
               'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)
MLA_predict = deepcopy(train_y)

In [101]:
print(train_x.shape)
print(train_y.shape)

(1215, 16)
(1215,)


In [102]:
row_index = 0
for alg in MLA:
    #set name and parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    print(MLA_name)
    print(alg.get_params())
    cv_results = model_selection.cross_validate(alg, train_x, train_y, cv  = cv_split)

    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA '] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3
    alg.fit(train_x, train_y)
    MLA_predict[MLA_name] = alg.predict(train_x)
    row_index+=1

MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)

AdaBoostClassifier
{'algorithm': 'SAMME.R', 'base_estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': None}
BaggingClassifier
{'base_estimator': None, 'bootstrap': True, 'bootstrap_features': False, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
ExtraTreesClassifier
{'bootstrap': False, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 'warn', 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
GradientBoostingClassifier
{'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impuri

In [103]:
MLA_compare

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy Mean,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time,MLA
8,RidgeClassifierCV,"{'alphas': array([ 0.1, 1. , 10. ]), 'class_w...",,0.750411,0.0828018,0.00650201,0.760471
6,LogisticRegressionCV,"{'Cs': 10, 'class_weight': None, 'cv': 'warn',...",,0.749041,0.0692755,0.454923,0.759647
18,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",,0.745753,0.0659381,0.0724253,0.833059
3,GradientBoostingClassifier,"{'criterion': 'friedman_mse', 'init': None, 'l...",,0.744932,0.0550685,0.117497,0.862941
0,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",,0.74411,0.073349,0.0861925,0.793059
14,SVC,"{'C': 1.0, 'cache_size': 200, 'class_weight': ...",,0.740822,0.0767691,0.166269,0.815059
13,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",,0.721918,0.075442,0.0032033,0.808824
4,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",,0.717808,0.0709901,0.0185369,0.984471
1,BaggingClassifier,"{'base_estimator': None, 'bootstrap': True, 'b...",,0.703562,0.0564437,0.0335263,0.982118
2,ExtraTreesClassifier,"{'bootstrap': False, 'class_weight': None, 'cr...",,0.697534,0.0719542,0.0162334,1.0


In [104]:
MLA_compare['MLA Parameters'][17]

"{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'random'}"

In [37]:
xgbt = XGBClassifier()
para_grid = {'learning_rate':[0.01],'n_estimators':[300],'max_depth':[6],'seed':[0,1,2,3,4]}
grid_search = GridSearchCV(estimator=xgbt,scoring='accuracy',param_grid=para_grid,cv=10).fit(data,y_data)
print('best_params for xgboost')
print(grid_search.best_params_)
print(grid_search.best_score_)


best_params for xgboost
{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 300, 'seed': 0}
0.9812955022877039


In [105]:
data_test = pd.read_csv('data_test_1.csv')

In [106]:
print(len(data_test))

8718


In [107]:
orinal_data = pd.read_csv('test.csv')

In [108]:
xgbt = MLA[-1]
xgbt.fit(data,y_data)
yhat = xgbt.predict(data_test)
final_r = pd.DataFrame({'id':orinal_data['id'],'stroke_in_2018':yhat})
print(len(final_r))
print(sum(yhat))

8718
2.0


In [109]:
from sklearn import model_selection
from mlxtend.classifier import StackingCVClassifier
import numpy as np
import warnings
from sklearn import model_selection

warnings.simplefilter('ignore')



In [110]:
xg,rd,svc,ada = MLA[-1],MLA[7],MLA[14],MLA[0]


In [111]:
logit = linear_model.LogisticRegressionCV()

In [112]:
stack = StackingCVClassifier(classifiers=[rd,svc,ada],meta_classifier=logit,cv=4)

In [113]:
stack.fit(data,y_data)

StackingCVClassifier(classifiers=[PassiveAggressiveClassifier(C=1.0, average=False, class_weight=None,
              early_stopping=False, fit_intercept=True, loss='hinge',
              max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
              random_state=None, shuffle=True, tol=None,
              va...hm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)],
           cv=4, drop_last_proba=False,
           meta_classifier=LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0),
           n_jobs=None, pre_dispatch='2*n_jobs', random_state=None,
           shuffle=True, store_train_meta_features=False, stratify=True,
           use_clones=True, use_features_in_secondary=False,
  

In [114]:
yhat=stack.predict(data_test)
print(sum(yhat))

0.0


In [56]:
final_r.to_csv('result.csv',index=False)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=6, min_child_weight=1, missing=None, n_estimators=300,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=True,
       subsample=1)

[0. 0. 0. ... 0. 0. 0.]


0.0
