In [40]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [41]:
X = np.load("./tatanic_X_train.npy")
y = np.load("./tatanic_y_train.npy")

In [42]:
X[:5]

array([[0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
        0.125     , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.473882  , 0.13913574, 0.        , 0.        , 1.        ,
        0.125     , 0.25      , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.32356257, 0.01546857, 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 1.    

In [43]:
y[:5]

array([0., 1., 1., 1., 0.])

In [44]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [45]:
clflog = LogisticRegression(random_state = 1)
clfdt = DecisionTreeClassifier(random_state = 1)
clfgn = GaussianNB()
eclf_h = VotingClassifier(estimators = [('lr', clflog), ('rf', clfdt),
                                        ('gnb', clfgn)], voting = 'hard')
eclf_s = VotingClassifier(estimators = [('lr', clflog), ('rf', clfdt),
                                        ('gnb', clfgn)], voting = 'soft')

In [46]:
models = [clflog, clfdt, clfgn, eclf_h, eclf_s]

In [47]:
for model in models:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    
    print(model)
    print(score)
    print('='*120)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
0.8052434456928839
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')
0.7715355805243446
GaussianNB(priors=None, var_smoothing=1e-09)
0.7640449438202247
VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=1, solver='warn',
          tol=0.0001, verbose=0, warm_start=



In [48]:
from sklearn.model_selection import cross_val_score
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv = 5)
    
    print(scores)
    print(scores.mean())
    print('=='*30)

[0.84       0.8        0.7983871  0.87903226 0.83064516]
0.8296129032258065
[0.736      0.752      0.73387097 0.79032258 0.77419355]
0.7572774193548387
[0.728      0.392      0.37903226 0.78225806 0.78225806]
0.6127096774193548
[0.784      0.76       0.75       0.86290323 0.82258065]
0.7958967741935484
[0.768      0.752      0.75806452 0.86290323 0.82258065]
0.7927096774193549




In [49]:
clf1 = LogisticRegression(random_state = 1)
clf2 = DecisionTreeClassifier(random_state = 1)
eclf1 = VotingClassifier(estimators= [('lr', clf1), ('dt', clf2)], 
                         voting = 'hard')
eclf2 = VotingClassifier(estimators= [('lr', clf1), ('dt', clf2)], 
                         voting = 'soft')

In [50]:
models = [clf1, clf2, eclf1, eclf2]

for model in models:
    score = cross_val_score(model, X_train, y_train, cv = 5)
    
    print(scores)
    print(scores.mean())
    print('=='*30)

[0.768      0.752      0.75806452 0.86290323 0.82258065]
0.7927096774193549
[0.768      0.752      0.75806452 0.86290323 0.82258065]
0.7927096774193549
[0.768      0.752      0.75806452 0.86290323 0.82258065]
0.7927096774193549
[0.768      0.752      0.75806452 0.86290323 0.82258065]
0.7927096774193549




In [51]:
eclf = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting='hard')

In [67]:
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]
params ={
    "lr__solver" : ['liblinear'], "lr__penalty" : ["l2"], \
    "lr__C" : c_params, "dt__criterion" : ["gini", "entropy"],
    "dt__max_depth" : [10,8,7,6,5,4,3,2],
    "dt__min_samples_leaf": [1,2,3,4,5,6,7,8,9]
    }

# c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]
# params ={
#     "lr__solver" : ['liblinear'], "lr__penalty" : ["l2"], \
#     "lr__C" : c_params, "dt__criterion" : ["gini", "entropy"],
#     "dt__max_depth" : [10,8,7,6,5,4,3,2],
#     "dt__min_samples_leaf": [1,2,3,4,5,6,7,8,9]
#     }


In [68]:
# cross_validation(cv), grid_search(param_grid), ensemble(estimator) =>

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator = eclf, param_grid = params, cv = 5)
grid = grid.fit(X, y)

In [69]:
grid.best_score_

0.84251968503937

In [70]:
grid.best_params_

{'dt__criterion': 'gini',
 'dt__max_depth': 10,
 'dt__min_samples_leaf': 5,
 'lr__C': 5.0,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear'}

In [72]:
c_params= [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

params = {"solver" : ['liblinear'], "penalty" : ["l2"], "C" : c_params}
grid = GridSearchCV(clf1, param_grid = params, cv = 5)
grid = grid.fit(X, y)

In [73]:
grid.best_score_

0.8267716535433071

In [74]:
grid.best_params_

{'C': 5.0, 'penalty': 'l2', 'solver': 'liblinear'}