In [1]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame
import sklearn
import seaborn as sns
import scipy

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

# import seaborn as sns
# sns.set_style('whitegrid')
%matplotlib inline



In [2]:
titanic_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

# drop unnecessary columns, these columns won't be useful in analysis and prediction
titanic_df = titanic_df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
test_df    = test_df.drop(['Name','Ticket','Cabin'], axis=1)
titanic_df["Embarked"].fillna("S", inplace=True)
test_df["Embarked"].fillna("S", inplace=True)
test_fare_mean = test_df["Fare"].mean()
test_df["Fare"].fillna(test_fare_mean, inplace=True)

# get average, std, and number of NaN values in titanic_df
average_age_titanic   = titanic_df["Age"].mean()
std_age_titanic       = titanic_df["Age"].std()
count_nan_age_titanic = titanic_df["Age"].isnull().sum()

# get average, std, and number of NaN values in test_df
average_age_test   = test_df["Age"].mean()
std_age_test       = test_df["Age"].std()
count_nan_age_test = test_df["Age"].isnull().sum()

# generate random numbers between (mean - std) & (mean + std)
rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)

# fill NaN values in Age column with random values generated
titanic_df["Age"][np.isnan(titanic_df["Age"])] = rand_1
test_df["Age"][np.isnan(test_df["Age"])] = rand_2

#transform Sex
titanic_df.replace(to_replace=["male", "female"], value=[1, 0], inplace=True)
titanic_df.replace(to_replace=["S", "C", "Q"], value=[1, 2, 3], inplace=True)

test_df.replace(to_replace=["male", "female"], value=[1, 0], inplace=True)
test_df.replace(to_replace=["S", "C", "Q"], value=[1, 2, 3], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
from sklearn import preprocessing
# define training and testing sets

X = titanic_df.drop("Survived",axis=1).values
Y = titanic_df["Survived"].values
X_predict  = test_df.drop("PassengerId",axis=1).copy().values


In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=0)

clf = make_pipeline(preprocessing.StandardScaler(), MLPClassifier(solver='lbfgs', activation='logistic', alpha=3e-4, hidden_layer_sizes=(2, 400), random_state=1))
cross_val_score(clf, X, Y, cv=5)

array([ 0.75977654,  0.77094972,  0.8258427 ,  0.79775281,  0.85875706])

In [130]:
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=None)

clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.01, max_depth=3, random_state=None).fit(X_train, Y_train)
print("train score :", clf.score(X_train, Y_train) , '\ntest score:', clf.score(X_test, Y_test))

train score : 0.842696629213 
test score: 0.826815642458


In [236]:
X_pred_scaled = scaler.transform(X_predict)
Y_pred = clf.predict(X_pred_scaled)
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv("titanic.csv", index=False)

In [6]:
#paramaters select and ensemb
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier

In [12]:
from scipy.spatial import distance
def func_noncorrcoef_weight(prediction_mat):
    cc = np.corrcoef(prediction_mat)
    cc_scale = 1- (cc + 1)/2
    a = np.sum(cc_scale, axis=0) / (cc_scale.shape[0] - 1)
    return a

def func_ensemble_weight(accuracy_lst, prediction_mat, ac_weight):
    aw = np.tan(np.pi * accuracy_lst /2)
    cw = func_dissim_weight(prediction_mat)
    ew = np.multiply(cw, aw)
    return ew / np.sum(ew), ew.mean()

def func_ensemble_weight2(accuracy_lst, prediction_mat, ac_weight):
    aw = np.tan(np.pi * accuracy_lst /2)
    cw = func_dissim_weight(prediction_mat)
    cw = np.tan(np.pi * cw / 2)
    ew = cw + ac_weight * aw
    return ew / np.sum(ew), ew.mean()

#caculate the dissimilarty of the predcitons
def func_dissim_weight(prediction_mat):
    num_prediction = prediction_mat.shape[1]
    dissim_mat = distance.squareform(distance.pdist(prediction_mat, 'sqeuclidean')) / num_prediction
    return np.sum(dissim_mat, axis=0) / (dissim_mat.shape[0] - 1)

X_ensemble, Y_ensemble = X_test, Y_test
num_ensemble = Y_ensemble.shape[0]
max_loop = 100
stop_accuracy = 0.9
num_clf = 10
activation = ['identity', 'logistic', 'tanh', 'relu']
no_activation = np.random.randint(0, 4, 10)
param_dict = dict(solver='lbfgs', alpha=3e-6, random_state=None)
accuracy_lst = np.zeros(num_clf)
prediction_mat = np.zeros((num_clf, num_ensemble))
clf_lst = []
for i in range(num_clf):
    nn_size = (np.random.randint(100, 200)
               , np.random.randint(100, 200))
    nn_activation = activation[no_activation[i]]
    clf = MLPClassifier(hidden_layer_sizes = nn_size, activation = nn_activation, **param_dict)
    clf.fit(X_train, Y_train)
    prediction_mat[i, :] = clf.predict(X_ensemble)
    clf_lst.append(clf)
    accuracy_lst[i] = clf.score(X_ensemble, Y_ensemble)
    
dissim_lst = func_dissim_weight(prediction_mat)
print("no_activation: ", no_activation)
print("accuracy_lst: ", accuracy_lst)
print("dissim_lst: ", dissim_lst)
ensemble_weight, ew_rate = func_ensemble_weight2(accuracy_lst, prediction_mat, 1)
ensemble_pred = np.dot(ensemble_weight, prediction_mat)
ensemble_pred[np.argwhere(ensemble_pred > 0.5)] = 1
ensemble_pred[np.argwhere(ensemble_pred <= 0.5)] = 0
ensembled_accuracy = sklearn.metrics.accuracy_score(Y_ensemble, ensemble_pred)
print("ensemble_weight: ", ensemble_weight)
print("ensembled_aaccuracy :", ensembled_accuracy, "\nmax accuracy:", np.max(accuracy_lst), 
      "\new_rate: ", ew_rate)
# for i in range(max_loop):
#     if np.any(clf_lst > stop_accuracy):
#         print("clf no.%d has reach stop accuracy" % np.argwhere(clf_lst > stop_accuracy))
    


no_activation:  [0 2 2 1 1 2 0 0 0 2]
accuracy_lst:  [ 0.81564246  0.81564246  0.7877095   0.81564246  0.82681564  0.81564246
  0.81564246  0.81564246  0.81564246  0.83240223]
dissim_lst:  [ 0.07821229  0.08193669  0.09683426  0.09310987  0.08814401  0.09683426
  0.07821229  0.07821229  0.07821229  0.08690255]
ensemble_weight:  [ 0.09933531  0.09950499  0.08678939  0.10001564  0.10631815  0.10018642
  0.09933531  0.09933531  0.09933531  0.10984419]
ensembled_aaccuracy : 0.815642458101 
max accuracy: 0.832402234637 
ew_rate:  3.50286739903


In [16]:
def func_ensemble_weight3(accuracy_lst, prediction_mat, ac_weight):
    aw = np.tan(np.pi * accuracy_lst /2)
    aw_rate = aw / np.sum(aw)
    cw = func_dissim_weight(prediction_mat)
    cw = np.tan(np.pi * cw / 2)
    ew = cw * aw
    return ew / np.sum(ew), ew.mean()
ensemble_weight, ew_rate = func_ensemble_weight3(accuracy_lst, prediction_mat, 1e2)
ensemble_pred = np.dot(ensemble_weight, prediction_mat)
ensemble_pred[np.argwhere(ensemble_pred > 0.5)] = 1
ensemble_pred[np.argwhere(ensemble_pred <= 0.5)] = 0
ensembled_accuracy = sklearn.metrics.accuracy_score(Y_ensemble, ensemble_pred)
print("ensemble_weight: ", ensemble_weight)
print(" ensembled_aaccuracy : %f.\n" % ensembled_accuracy, "max accuracy: %f." % np.max(accuracy_lst))

ensemble_weight:  [ 0.09102714  0.09540891  0.09720413  0.10859532  0.10972793  0.11300555
  0.09102714  0.09102714  0.09102714  0.11194958]
 ensembled_aaccuracy : 0.804469.
 max accuracy: 0.832402.


In [182]:
clf = MLPClassifier(solver='lbfgs', activation='tanh', alpha=3e-5, hidden_layer_sizes=(200, 200), 
                    verbose=True, max_iter=500 )
clf.fit(X_train, Y_train)
print("train score:", clf.score(X_train, Y_train), "\ntest score:", clf.score(X_test, Y_test))

train score: 0.884831460674 
test score: 0.787709497207


In [281]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

scaler= preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=None)

GB_clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.01, max_depth=3, random_state=None)
parameters = {'n_estimators':[100, 300, 500], 'max_depth':[1, 3, 5], "learning_rate": [0.1, 0.01, 0.03, 0.001]}
clf = GridSearchCV(GB_clf, parameters)
clf.fit(X_train, Y_train)
clf = clf.best_estimator_
print("train score:", clf.score(X_train, Y_train), "\ntest score:", clf.score(X_test, Y_test))

train score: 0.900280898876 
test score: 0.793296089385


In [266]:
clf.best_estimator_.score(X_test, Y_test)

0.81564245810055869

In [291]:
from sklearn.ensemble import GradientBoostingClassifier
scaler= preprocessing.MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=None)

num_clf = 10
clf_lst = list() 
for i in range(num_clf):
    clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.01, max_depth=3, random_state=i).fit(X_train, Y_train)
    clf_lst.append(clf)
    print("dlf no", i, "train score :", clf.score(X_train, Y_train) , 'test score:', clf.score(X_test, Y_test))

dlf no 0 train score : 0.86797752809 test score: 0.837988826816
dlf no 1 train score : 0.86797752809 test score: 0.837988826816
dlf no 2 train score : 0.86797752809 test score: 0.837988826816
dlf no 3 train score : 0.86797752809 test score: 0.837988826816
dlf no 4 train score : 0.86797752809 test score: 0.837988826816
dlf no 5 train score : 0.86797752809 test score: 0.837988826816
dlf no 6 train score : 0.86797752809 test score: 0.837988826816
dlf no 7 train score : 0.86797752809 test score: 0.837988826816
dlf no 8 train score : 0.86797752809 test score: 0.837988826816
dlf no 9 train score : 0.86797752809 test score: 0.837988826816


In [249]:
clf.best_estimator_.score(X_train, Y_train)

0.8904494382022472

In [219]:
from sklearn.ensemble import GradientBoostingClassifier
scaler= preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=None)

clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.003, max_depth=3, random_state=None).fit(X_train, Y_train)
print("train score :", clf.score(X_train, Y_train) , '\ntest score:', clf.score(X_test, Y_test))

train score : 0.842696629213 
test score: 0.821229050279


In [294]:
scaler= preprocessing.MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=None)

num_clf = 10
clf_lst = list() 
for i in range(num_clf):
    clf = MLPClassifier(solver='lbfgs', activation='logistic', alpha=3e-6, hidden_layer_sizes=(200, 200), 
                        verbose=False, max_iter=200 ).fit(X_train, Y_train)
    clf_lst.append(clf)
    print("clf no.%d" % i, "train score:", clf.score(X_train, Y_train), "test score:", clf.score(X_test, Y_test))

clf no.0 train score: 0.818820224719 test score: 0.810055865922
clf no.1 train score: 0.820224719101 test score: 0.821229050279
clf no.2 train score: 0.813202247191 test score: 0.826815642458
clf no.3 train score: 0.831460674157 test score: 0.821229050279
clf no.4 train score: 0.818820224719 test score: 0.826815642458
clf no.5 train score: 0.817415730337 test score: 0.821229050279
clf no.6 train score: 0.831460674157 test score: 0.815642458101
clf no.7 train score: 0.824438202247 test score: 0.815642458101
clf no.8 train score: 0.823033707865 test score: 0.826815642458
clf no.9 train score: 0.821629213483 test score: 0.826815642458


In [295]:
X_pred_scaled = scaler.transform(X_predict)
Y_pred = clf.predict(X_pred_scaled)
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
submission.to_csv("titanic.csv", index=False)

In [274]:
X_scaled.mean(axis=0)

array([ 0.65432099,  0.64758698,  0.36891139,  0.06537598,  0.06359895,
        0.06285843,  0.18069585])