In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the preprocessed train/test splits and report each frame's
# (rows, columns) shape, one per line.
train = pd.read_csv('data/preprocessed_train.csv')
test = pd.read_csv('data/preprocessed_test.csv')

print(train.shape, test.shape, sep='\n')

(891, 11)
(418, 10)


In [5]:
# Split the frames into model inputs and the label column.
target = 'Survived'

# Every training column except the label is used as a feature.
features = list(train.columns)
features.remove(target)

train_x = train[features]
train_y = train[target]
test_x = test[features]

# Number of rows in each split. shape[0] is the row count; the previous
# shape[1] stored the number of feature columns in test_num instead.
train_num = train_x.shape[0]
test_num = test_x.shape[0]

In [11]:
# Class balance of the target: number of training rows per 'Survived' label.
print(train_y.value_counts())

0    549
1    342
Name: Survived, dtype: int64


In [14]:
# Empirical class frequencies of the training labels (0 = not survived,
# 1 = survived). Derived from train_y rather than hard-coding the 549/891
# counts, so the values stay correct if the preprocessed data changes.
dead_weight = float((train_y == 0).mean())
survived_weight = 1.0 - dead_weight

In [57]:
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

Separate the classifiers into two groups: those that can predict class probabilities (required for a soft-voting classifier) and those that cannot.

### Group 1: Classifiers with a `predict_proba` method

In [20]:
# Elastic-net logistic regression trained by SGD. The alpha / l1_ratio set
# here are only starting values: the grid below supplies both for every
# candidate that is cross-validated.
sgd = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=0.0005,
    l1_ratio=0.87,
    learning_rate='optimal',
    max_iter=100000,
    class_weight='balanced',
    random_state=3,
    n_jobs=2,
)
params = dict(
    alpha=[0.05, 0.01, 0.005, 0.001],
    l1_ratio=[1.0, 0.8, 0.6],
)
# 5-fold cross-validated accuracy over the 4 x 3 regularisation grid.
gs = GridSearchCV(sgd, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'l1_ratio': 1.0, 'alpha': 0.005} 0.786822089754


In [24]:
# Same SGD model with alpha=0.005 and l1_ratio=1.0 fixed; only the seed varies.
# NOTE(review): picking random_state by CV score risks fitting to noise.
sgd = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=0.005,
    l1_ratio=1.0,
    learning_rate='optimal',
    max_iter=100000,
    class_weight='balanced',
    random_state=3,
    n_jobs=2,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(sgd, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 3} 0.786822089754


In [26]:
# Class-balanced logistic regression; compare L1 against L2 regularisation
# by 5-fold cross-validated accuracy.
# NOTE(review): 'l1' is only accepted by some solvers -- confirm the default
# solver of the installed scikit-learn supports it.
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=100000,
    random_state=3,
)
params = dict(penalty=['l1', 'l2'])
gs = GridSearchCV(lr, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'penalty': 'l2'} 0.776684338214


In [44]:
# Logistic regression with the L2 penalty fixed; cross-validate over a
# handful of seeds.
lr = LogisticRegression(
    penalty='l2',
    class_weight='balanced',
    max_iter=100000,
    random_state=3,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(lr, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.776684338214


In [35]:
# Single-hidden-layer MLP (20 units). The lbfgs solver is used because the
# data set is small. The alpha given here is only a starting value; the grid
# below supplies the L2 penalty for every candidate.
mlp = MLPClassifier(hidden_layer_sizes=(20,), solver='lbfgs', alpha=0.001, random_state=3,
                    max_iter=10000)
params = {
    # The original grid listed 0.0005 twice, which only repeated an identical
    # 5-fold fit; the duplicate is removed.
    # NOTE(review): the repeated entry may have been meant as 0.00005 -- confirm.
    'alpha': [0.01, 0.005, 0.001, 0.0005, 0.0001],
}
gs =  GridSearchCV(estimator=mlp, param_grid = params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'alpha': 0.005} 0.801366344337


In [45]:
# MLP with alpha=0.005 fixed (lbfgs solver, chosen for the small data set);
# cross-validate over a handful of seeds.
mlp = MLPClassifier(
    hidden_layer_sizes=(20,),
    solver='lbfgs',
    alpha=0.005,
    max_iter=10000,
    random_state=3,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(mlp, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 2} 0.804774864201


In [42]:
# Class-balanced random forest with 1200 trees. The three split/leaf
# constraints set here are starting values; the grid below supplies all
# three for every candidate (8 x 6 x 6 combinations, 5 folds each).
rf = RandomForestClassifier(
    n_estimators=1200,
    class_weight='balanced',
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    random_state=3,
    n_jobs=2,
)
params = dict(
    min_samples_split=[0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5],
    min_weight_fraction_leaf=[0.0, 0.1, 0.2, 0.3, 0.4, 0.49],
    min_impurity_decrease=[0.2, 0.1, 0.05, 0.01, 0.005, 0.001],
)
gs = GridSearchCV(rf, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'min_samples_split': 0.01, 'min_weight_fraction_leaf': 0.0, 'min_impurity_decrease': 0.001} 0.827221878816


In [46]:
# Random forest with min_samples_split=0.01 and min_impurity_decrease=0.001
# fixed; cross-validate over a handful of seeds.
rf = RandomForestClassifier(
    n_estimators=1200,
    class_weight='balanced',
    min_samples_split=0.01,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.001,
    random_state=3,
    n_jobs=2,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(rf, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.830586388263


In [73]:
# Gaussian naive Bayes; compare three candidate class-prior vectors by
# 5-fold cross-validated accuracy.
# NOTE(review): scikit-learn documents `priors` as one entry per class; if
# entries follow sorted label order, [survived_weight, dead_weight] gives
# label 0 the smaller prior -- confirm this ordering is intended.
prior_candidates = [
    [survived_weight, dead_weight],
    [.5, .5],
    [dead_weight, survived_weight],
]
nb = GaussianNB(priors=prior_candidates[0])
params = {'priors': prior_candidates}
gs = GridSearchCV(nb, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'priors': [0.38383838383838387, 0.6161616161616161]} 0.790230751472


In [78]:
# Gradient-boosted trees (1200 rounds). First pass: search the learning rate
# together with the row and column subsampling rates; the remaining
# regularisation settings stay at the fixed values given here.
xg = xgb.XGBClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=1,
    colsample_bytree=1,
    min_child_weight=0.0,
    gamma=0.0,
    reg_alpha=0,
    reg_lambda=1,
    random_state=0,
    n_jobs=2,
)

params = dict(
    learning_rate=[0.001, 0.005, 0.01, 0.05],
    subsample=[1.0, 0.8, 0.6],
    colsample_bytree=[1.0, 0.8, 0.6],
)
gs = GridSearchCV(xg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'subsample': 0.8} 0.839550044028


In [None]:
# Second XGBoost pass: with learning_rate=0.01 and both subsampling rates at
# 0.8, search min_child_weight and gamma (7 x 7 combinations, 5 folds each).
xg = xgb.XGBClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=.8,
    colsample_bytree=.8,
    min_child_weight=0.0,
    gamma=0.0,
    reg_alpha=0,
    reg_lambda=1,
    random_state=0,
    n_jobs=2,
)

params = dict(
    min_child_weight=[0.0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    gamma=[0.0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05],
)
gs = GridSearchCV(xg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

In [None]:
# NOTE(review): this cell runs the same min_child_weight / gamma search, with
# the same estimator settings, as the cell before it. It was probably meant
# to tune a different pair of parameters -- confirm, or remove the cell.
xg = xgb.XGBClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=.8,
    colsample_bytree=.8,
    min_child_weight=0.0,
    gamma=0.0,
    reg_alpha=0,
    reg_lambda=1,
    random_state=0,
    n_jobs=2,
)

params = dict(
    min_child_weight=[0.0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2],
    gamma=[0.0, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05],
)
gs = GridSearchCV(xg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

In [74]:
# Final group-1 estimators, rebuilt with the settings chosen by the searches
# above. The figure beside each model is its best cross-validated accuracy.

# SGD logistic regression -- 0.786822089754
sgd = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=0.005,
    l1_ratio=1.0,
    learning_rate='optimal',
    max_iter=100000,
    class_weight='balanced',
    random_state=3,
    n_jobs=2,
)

# Logistic regression -- 0.776684338214
lr = LogisticRegression(
    penalty='l2',
    class_weight='balanced',
    max_iter=100000,
    random_state=1,
)

# MLP -- 0.804774864201
mlp = MLPClassifier(
    hidden_layer_sizes=(20,),
    solver='lbfgs',
    alpha=0.005,
    max_iter=10000,
    random_state=2,
)

# Random forest -- 0.830586388263
rf = RandomForestClassifier(
    n_estimators=1200,
    class_weight='balanced',
    min_samples_split=0.01,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.001,
    random_state=1,
    n_jobs=2,
)

# Gaussian naive Bayes -- 0.790230751472
nb = GaussianNB(priors=[survived_weight, dead_weight])


### Group 2: Classifiers without a `predict_proba` method

In [31]:
# Class-balanced ridge classifier; search the regularisation strength alpha
# by 5-fold cross-validated accuracy. The alpha set here is a starting value
# that the grid replaces for every candidate.
ridge = RidgeClassifier(
    alpha=1.0,
    normalize=True,
    class_weight='balanced',
    max_iter=10000,
    random_state=3,
)
params = dict(alpha=[1.0, 0.8, 0.6, 0.4, 0.2])
gs = GridSearchCV(ridge, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'alpha': 0.2} 0.777770129446


In [47]:
# Ridge classifier with alpha=0.2 fixed; cross-validate over a handful of seeds.
ridge = RidgeClassifier(
    alpha=0.2,
    normalize=True,
    class_weight='balanced',
    max_iter=10000,
    random_state=3,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(ridge, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.777770129446


In [53]:
# Final group-2 estimator, rebuilt with the settings chosen above.
# Best cross-validated accuracy: 0.777770129446
ridge = RidgeClassifier(
    alpha=0.2,
    normalize=True,
    class_weight='balanced',
    max_iter=10000,
    random_state=1,
)

## Use Group1 for voting classifier

In [54]:
from sklearn.ensemble import VotingClassifier

In [71]:
# Voting weights: each group-1 model's best cross-validated accuracy, listed
# in the same order as the estimators passed to the voting classifier.
weights = np.array([
    0.786822089754,  # sgd
    0.776684338214,  # lr
    0.804774864201,  # mlp
    0.830586388263,  # rf
    0.790230751472,  # nb
])

# Soft-voting ensemble over the five group-1 models, combining their
# predicted probabilities with the per-model weights.
vc = VotingClassifier(
    estimators=[
        ('sgd', sgd),
        ('lr', lr),
        ('mlp', mlp),
        ('rf', rf),
        ('nb', nb),
    ],
    voting='soft',
    weights=weights,
    n_jobs=2,
)

In [72]:
# Candidate weightings: the base weights raised to the powers 2 through 6,
# which progressively widens the gap between stronger and weaker models.
params = {'weights': [weights ** power for power in range(2, 7)]}

# 5-fold cross-validated accuracy of the voting ensemble for each weighting.
gs = GridSearchCV(vc, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

KeyboardInterrupt: 

## Use Group1 + 2 in the stacking averaged model

In [None]:
# Name -> estimator lookup covering the five group-1 models plus the ridge
# classifier from group 2.
supervised_dict = dict(
    sgd=sgd,
    lr=lr,
    mlp=mlp,
    rf=rf,
    nb=nb,
    ridge=ridge,
)