In [35]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as data
from pathlib import Path
import tensorboardX as tbx
import os
from sklearn import preprocessing
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


In [93]:

def import_data():
    train_data = pd.read_csv('titanic/train.csv')
    test_data = pd.read_csv('titanic/test.csv')
    return train_data, test_data


def predict_test_data(model, test_data):
    # model = use_gpu(model)
    model.eval()
    pred = np.round(model.pred(test_data.values))
    pred = np.clip(pred, 0, 1).astype(int)
    return pred


def load_data(enable_labels=None):
    train_data, test_data = import_data()
    if not enable_labels:
        labels = train_data.columns.value
        train_x = labels.delete('Survived')
        test_x = test_data[enable_labels]
    else:
        train_x = train_data[enable_labels]
        test_x = test_data[enable_labels]
    
    train_y = train_data['Survived']
    # test_y = test_data['Survived']
    
    return train_x, train_y


def get_null_index(data):
    null_index = data.isnull().any(axis=1)
    return null_index


def remove_nan_preprocess(train_x, train_y):
    # とりあえず一つでも欠損していればそのデータは有効にしないようにする
    # train_x = train_x.values
    null_index = get_null_index(train_x)
    train_x = train_x[~null_index]
    train_y = train_y[~null_index]

    MALE = .0
    FEMALE = 1.0
    Q = .0
    S = 1.0
    C = 2.0

    train_x = train_x.replace('male', MALE)
    train_x = train_x.replace('female', FEMALE)
    train_x = train_x.replace('Q', Q)
    train_x = train_x.replace('S', S)
    train_x = train_x.replace('C', C)

    return train_x, train_y


def preprocess_from_startup(train_x, train_y, mapping_order=False):
    """
    https://www.kaggle.com/startupsci/titanic-data-science-solutions
    で記述されているようなデータの前処理を行う．
    :param train_x:
    :param train_y:
    :return:
    """
    # カテゴリ変数のマッピングについては，survivedに対する相関係数順にするほうがいい気がする
    # 均等性を考慮して，survivedの平均にマッピングするようにした
    if mapping_order:
        # Mrs > Miss > Master > Rare > Mr
        gender_mapping = {'male': 0.188908, 'female': 0.742038}
        title_mapping = {"Mr": 0.156673, "Rare": 0.347826, "Master": 0.575000, 
                         "Miss": 0.702703, "Mrs": 0.793651}
        embarkation_mapping = {'S': 0.336957, 'C': 0.553571, 'Q': 0.389610}
    else:
        gender_mapping = {'male': 1, 'female': 0}
        title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
        embarkation_mapping = {'S': 0, 'C': 1, 'Q': 2}

    # Ticketsは，相関性が見込めないため入力データには適していない
    # Cabinは，欠損データが多く，入力データには適していない
    # PassengerIdは，survivedに対して相関がほぼ無いため，予測に適していない．
    # -->これらは無効にする
    train_x = train_x.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)

    # Title(敬称)から，新たに特徴量を生成にする
    train_x['Title'] = train_x.Name.str.extract('([A-Za-z]+)\.', expand=False)
    # 少数の敬称は全てRareで統一する
    train_x['Title'] = train_x['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                           'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    # 変形しているだけで同じ意味のものを変換しておく
    train_x['Title'] = train_x['Title'].replace('Mlle', 'Miss')
    train_x['Title'] = train_x['Title'].replace('Ms', 'Miss')
    train_x['Title'] = train_x['Title'].replace('Mme', 'Mrs')
    train_x['Title'] = train_x['Title'].map(title_mapping)
    train_x['Title'] = train_x['Title'].fillna(0)

    # 性別を数値データにマッピング
    train_x['Sex'] = train_x['Sex'].map(gender_mapping)

    # Nameは使用しないためdrop
    train_x = train_x.drop(['Name'], axis=1)

    # Ageの欠損値を，genderとPclass別の中央値で補完する
    for i in gender_mapping.values():
        for j in range(0, 3):
            current_df = train_x[np.logical_and(train_x['Sex'] == i, train_x['Pclass'] == j + 1)]
            guess_df = current_df['Age'].dropna()
            med = np.round(np.median(guess_df))
            # print(current_df['Age'].isnull().va)
            # train_x.loc[train_x[np.logical_and(train_x['Sex'] == i, train_x['Pclass'] == j + 1)]
            #             ['Age'].isnull(), 'Age'] = med
            train_x.loc[(train_x.Age.isnull()) & (train_x.Sex == i) & (train_x.Pclass == j + 1), \
                        'Age'] = med

    # 年齢層を示す特徴量Agebandを定義
    # train_x['AgeBand'] = pd.cut(train_x['Age'], 5)
    # 特徴量FareBandを定義する
    # train_x['FareBand'] = pd.cut(train_x['Fare'], 4)

    # 家族の人数を示す特徴量FamilySizeを定義する
    # Parchは正の相関，SibSpは負の相関をsurvivedに対して持つため，parch + sibspはどうなんだろ
    # FamilySizeにしてしまったことで，相関係数の絶対値が小さくなるけど...
    train_x['FamilySize'] = train_x['Parch'] + train_x['SibSp'] + 1

    # 新たな特徴量IsAlone，Age*Classを定義
    train_x['IsAlone'] = 0
    train_x.loc[train_x['FamilySize'] == 1, 'IsAlone'] = 1
    train_x['Age*Class'] = train_x.Age * train_x.Pclass
    train_x = train_x.drop(['Age', 'Pclass'], axis=1)

    # Embarkedの欠損値を，最頻値で補完する
    train_x['Embarked'] = train_x['Embarked'].map(embarkation_mapping)
    embarkation_mode = train_x['Embarked'].dropna().mode()[0]
    train_x['Embarked'] =train_x['Embarked'].fillna(embarkation_mode)

    # Fareの欠損値を，中央値で補完にする
    train_x['Fare'] = train_x['Fare'].fillna(train_x['Fare'].median())
    return train_x, train_y



In [92]:
def training_randomforest(data_x, data_y, test_name, depth, n_forest):
    train_x, test_x, train_y, test_y = train_test_split(data_x, data_y,
                                                test_size=0.2,
                                                shuffle=True,
                                                random_state=47,
                                                stratify=data_y)

    random_forest = RandomForestClassifier(max_depth=depth, n_estimators=n_forest, random_state=42)
    random_forest.fit(train_x, train_y)
    
    trainaccuracy_random_forest = random_forest.score(train_x, train_y)
    print('TrainAccuracy: {}'.format(trainaccuracy_random_forest))
    
    y_pred = random_forest.predict(test_x)
    acc = accuracy_score(test_y, y_pred)
    print(f'Test Accuracy: {acc}')
    
    eval = pd.DataFrame([[trainaccuracy_random_forest, acc]], columns=['train Accuracy', 'Test Accuracy'])
    Path(f'random_forest/{test_name}').mkdir(parents=True, exist_ok=True)
    
    eval.to_csv(f'random_forest/{test_name}/score.csv', index=False)
    
    return random_forest


def training_xgboost(data_x, data_y, test_name):
    train_x, test_x, train_y, test_y = train_test_split(data_x, data_y,
                                                test_size=0.2,
                                                shuffle=True,
                                                random_state=47,
                                                stratify=data_y)

    model = xgb.XGBRegressor()
    model.fit(train_x, train_y)
    
    y_train_pred = model.predict(train_x)
    y_test_pred = model.predict(test_x)
    y_train_pred = np.clip(np.round(y_train_pred), 0, 1)
    y_test_pred = np.clip(np.round(y_test_pred), 0, 1)
    
    train_acc = accuracy_score(y_train_pred, train_y)
    print('TrainAccuracy: {}'.format(train_acc))
    test_acc = accuracy_score(y_test_pred, test_y)
    print(f'Test Accuracy: {test_acc}')
    
    eval = pd.DataFrame([[train_acc, test_acc]], columns=['train Accuracy', 'Test Accuracy'])
    Path(f'XGBoost/{test_name}').mkdir(parents=True, exist_ok=True)
    
    eval.to_csv(f'XGBoost/{test_name}/score.csv', index=False)
    
    return model


def predict_forest(forest, test_x, passenger_id, test_name):
    test_input = test_x.values
    y_pred = forest.predict(test_input)
    y_pred = np.clip(np.round(y_pred), 0, 1).astype(int)
    
    Path(f'random_forest/{test_name}').mkdir(parents=True, exist_ok=True)
    pred = pd.DataFrame(y_pred, columns=['Survived'])
    submission_pred = pd.concat([passenger_id, pred], axis=1)
    submission_pred.to_csv(f'random_forest/{test_name}/submission.csv', index=False)
    
    
def predict_xgboost(model, test_x, passenger_id, test_name):
    test_input = test_x.values
    y_pred = model.predict(test_input)
    
    y_pred = np.clip(np.round(y_pred), 0, 1).astype(int)
    
    Path(f'XGBoost/{test_name}').mkdir(parents=True, exist_ok=True)
    pred = pd.DataFrame(y_pred, columns=['Survived'])
    submission_pred = pd.concat([passenger_id, pred], axis=1)
    submission_pred.to_csv(f'XGBoost/{test_name}/submission.csv', index=False)


def trainging_normal_randomforest():
    train, test = import_data()
    train_x = train.drop(['Survived'], axis=1)
    train_y = train['Survived']
    train_x, train_y = preprocess_from_startup(train_x, train_y)
    forest = training_randomforest(train_x.values, train_y.values, 'normal', 10, 10)
    
    test_x, _ = preprocess_from_startup(test, None)
    predict_forest(forest, test_x, test['PassengerId'], 'normal')


def trainging_normal_xgboost():
    train, test = import_data()
    train_x = train.drop(['Survived'], axis=1)
    train_y = train['Survived']
    train_x, train_y = preprocess_from_startup(train_x, train_y)
    forest = training_xgboost(train_x.values, train_y.values, 'normal')
    
    test_x, _ = preprocess_from_startup(test, None)
    predict_xgboost(forest, test_x, test['PassengerId'], 'normal')



In [1]:
class LNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LNN, self).__init__()
        DEPTH = 8
        UNITS = 128
        self.filename = 'LNN.pth'
        self.fc_input = nn.Linear(input_dim, UNITS)
        self.fc_array = nn.ModuleList([nn.Linear(UNITS, UNITS) for _ in range(DEPTH - 2)])
        self.fc_output = nn.Linear(UNITS, output_dim)

    def forward(self, x):
        x = F.dropout(x, training=self.training)
        y = F.relu(self.fc_input(x))
        for layer in self.fc_array:
            y = F.dropout(y, training=self.training)
            y = F.relu(layer(y))
        y = F.dropout(y, training=self.training)
        y = self.fc_output(y)
        return y

    def pred(self, x):
        x = torch.from_numpy(x).float()
        pred = self.forward(x).detach().numpy()
        pred = np.clip(np.round(pred), 0, 1)
        return pred

    def save(self, save_path):
        Path(save_path).mkdir(parents=True, exist_ok=True)
        torch.save(self.state_dict(), f'{save_path}/{self.filename}')

    def load(self, load_path):
        if os.path.isfile(f'{load_path}/{self.filename}'):
            self.load_state_dict(torch.load(f'{load_path}/{self.filename}'))
            return True
        else:
            return False


class ResLNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ResLNN, self).__init__()
        DEPTH = 8
        UNITS = 128
        self.filename = 'ResLNN.pth'
        self.fc_input = nn.Linear(input_dim, UNITS)
        self.fc_array = nn.ModuleList([nn.Linear(UNITS, UNITS) for _ in range(DEPTH - 2)])
        self.fc_output = nn.Linear(UNITS, output_dim)

    def forward(self, x):
        x = F.dropout(x, training=self.training)
        y = F.relu(self.fc_input(x))
        res = y
        for n, layer in enumerate(self.fc_array):
            y = F.dropout(y, training=self.training)
            y = F.relu(layer(y))
            if n % 2 == 2 and n != 0:
                y += res
                res = y
        y = F.dropout(y, training=self.training)
        y = self.fc_output(y)
        return y

    def pred(self, x):
        x = torch.from_numpy(x).float()
        pred = self.forward(x).detach().numpy()
        pred = np.clip(np.round(pred), 0, 1)
        return pred

    def save(self, save_path):
        Path(save_path).mkdir(parents=True, exist_ok=True)
        torch.save(self.state_dict(), f'{save_path}/{self.filename}')

    def load(self, load_path):
        if os.path.isfile(f'{load_path}/{self.filename}'):
            self.load_state_dict(torch.load(f'{load_path}/{self.filename}'))
            return True
        else:
            return False


class BNLNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(BNLNN, self).__init__()
        DEPTH = 16
        UNITS = 256
        self.filename = 'BnLNN.pth'
        self.fc_input = nn.Linear(input_dim, UNITS)
        self.fc_array = nn.ModuleList([nn.Linear(UNITS, UNITS) for _ in range(DEPTH - 2)])
        self.bn_array = nn.ModuleList([nn.BatchNorm1d(UNITS) for _ in range(DEPTH - 2)])
        self.fc_output = nn.Linear(UNITS, output_dim)

    def forward(self, x):
        y = F.relu(self.fc_input(x))
        for layer, bn in zip(self.fc_array, self.bn_array):
            y = F.relu(bn(layer(y)))
        y = self.fc_output(y)
        return y

    def pred(self, x):
        x = torch.from_numpy(x).float()
        pred = self.forward(x).detach().numpy()
        pred = np.clip(np.round(pred), 0, 1)
        return pred

    def save(self, save_path):
        Path(save_path).mkdir(parents=True, exist_ok=True)
        torch.save(self.state_dict(), f'{save_path}/{self.filename}')

    def load(self, load_path):
        if os.path.isfile(f'{load_path}/{self.filename}'):
            self.load_state_dict(torch.load(f'{load_path}/{self.filename}'))
            return True
        else:
            return False


class ResBNLNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(ResBNLNN, self).__init__()
        DEPTH = 16
        UNITS = 256
        self.filename = 'ResBnLNN.pth'
        self.fc_input = nn.Linear(input_dim, UNITS)
        self.fc_array = nn.ModuleList([nn.Linear(UNITS, UNITS) for _ in range(DEPTH - 2)])
        self.bn_array = nn.ModuleList([nn.BatchNorm1d(UNITS) for _ in range(DEPTH - 2)])
        self.fc_output = nn.Linear(UNITS, output_dim)

    def forward(self, x):
        y = F.relu(self.fc_input(x))
        res = y
        n = 0
        for layer, bn in zip(self.fc_array, self.bn_array):
            y = F.relu(bn(layer(y)))

            if n % 2 == 2 and n != 0:
                y += res
                res = y
            n += 1

        y = self.fc_output(y)
        return y

    def pred(self, x):
        x = torch.from_numpy(x).float()
        pred = self.forward(x).detach().numpy()
        pred = np.clip(np.round(pred), 0, 1)
        return pred

    def save(self, save_path):
        Path(save_path).mkdir(parents=True, exist_ok=True)
        torch.save(self.state_dict(), f'{save_path}/{self.filename}')

    def load(self, load_path):
        if os.path.isfile(f'{load_path}/{self.filename}'):
            self.load_state_dict(torch.load(f'{load_path}/{self.filename}'))
            return True
        else:
            return False

def use_gpu(e):
    if torch.cuda.is_available():
        return e.cuda()
    return e


def train(model, loss_func, optimizer, trX, trY):
    x = Variable(trX, requires_grad=False)
    y = Variable(trY, requires_grad=False)
    optimizer.zero_grad()
    y_pred = model.forward(x)
    loss = loss_func(y_pred, y)
    loss.backward()
    optimizer.step()
    return loss.data


def training(model, data_x, data_y, epochs, batch_size, model_name, eval_num=20, visualize_num=10):
    train_x, test_x, train_y, test_y = train_test_split(data_x, data_y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=47,
                                                    stratify=data_y)
    train_ = data.TensorDataset(torch.from_numpy(train_x).float(),
                                torch.from_numpy(train_y).float())
    train_iter = torch.utils.data.DataLoader(train_, batch_size=batch_size, shuffle=True)
    
    # Path(f'{save_path}/total_loss/').mkdir(exist_ok=True, parents=True)
    # Path(f'{save_path}/metrics/').mkdir(exist_ok=True, parents=True)
    
    if model.load(f'./models/{model_name}'):
        print(f'loaded existing model file ./models/{model_name}')
    else:
        print(f'not found existing model file ./models/{model_name}')

    # SummaryWriterのインスタンス作成[ポイント2]
    writer = tbx.SummaryWriter(f'./logs/{model_name}/')

    optimizer = optim.AdamW(model.parameters())
    criterion = nn.MSELoss()

    torch.manual_seed(1)
    for epoch in range(1, epochs + 1):
        model = use_gpu(model)
        model.train()
        loss = 0
        for i, train_data in enumerate(train_iter):
            inputs, labels = train_data
            inputs = use_gpu(inputs)
            labels = use_gpu(labels)
            loss += train(model, criterion, optimizer, inputs, labels)    

        if epoch % visualize_num == 0:
            print(f'epoch {epoch}: loss {loss / batch_size}')

        if epoch % eval_num == 0:
            model.cpu()
            model.eval()
            pred = model.pred(test_x)
            # y = np.reshape(test_y, pred.shape)
            acc = accuracy_score(test_y, pred)
            print(f'{epoch} Accuracy Score:{acc}')
            writer.add_scalar('Accuracy for training set', acc, int(epoch / eval_num))

            model.save(f'./models/{model_name}')
            print(f'save model at ./models/{model_name}')
            
            writer.add_scalar('Total Loss', loss / batch_size, epoch)
    writer.export_scalars_to_json(f"./logs/{model_name}/all_scalars.json")
    writer.close()


def training_LNN():
    train_x, train_y = load_data(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
    train_x, train_y = remove_nan_preprocess(train_x, train_y)
    model = LNN(7, 1)
    training(model, train_x.values, train_y.values, 30000, 64, 'models/lnn/', eval_num=5)


def training_BNLNN():
    train_x, train_y = load_data(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
    train_x, train_y = remove_nan_preprocess(train_x, train_y)
    model = BNLNN(7, 1)
    training(model, train_x.values, train_y.values, 30000, 64, 'models/bnlnn/', eval_num=50)


def training_LNN_startup():
    train, test = import_data()
    train_y = train[['Survived']]
    train_x = train.drop(['Survived'], axis=1)
    train_x, train_y = preprocess_from_startup(train_x, train_y)
    model = LNN(9, 1)
    training(model, train_x.values, train_y.values, 30000, 64, 'models/lnn_startup/', eval_num=50)


def training_BNLNN_startup():
    train, test = import_data()
    train_y = train[['Survived']]
    train_x = train.drop(['Survived'], axis=1)
    train_x, train_y = preprocess_from_startup(train_x, train_y)
    model = BNLNN(9, 1)
    training(model, train_x.values, train_y.values, 30000, 64, 'models/bnlnn_startup/', eval_num=50)


def training_ResLNN_startup():
    train, test = import_data()
    train_y = train[['Survived']]
    train_x = train.drop(['Survived'], axis=1)
    train_x, train_y = preprocess_from_startup(train_x, train_y)
    model = ResLNN(9, 1)
    training(model, train_x.values, train_y.values, 30000, 64, 'models/Reslnn_startup/', eval_num=50)


def training_ResBNLNN_startup():
    train, test = import_data()
    train_y = train[['Survived']]
    train_x = train.drop(['Survived'], axis=1)
    train_x, train_y = preprocess_from_startup(train_x, train_y)
    model = ResBNLNN(9, 1)
    training(model, train_x.values, train_y.values, 30000, 64, 'models/Resbnlnn_startup/', eval_num=200)


def training_ResBNLNN_order():
    train, test = import_data()
    train_y = train[['Survived']]
    train_x = train.drop(['Survived'], axis=1)
    train_x, train_y = preprocess_from_startup(train_x, train_y, mapping_order=True)
    model = ResBNLNN(9, 1)
    training(model, train_x.values, train_y.values, 30000, 64, 'models/Resbnlnn_order/', eval_num=200)
    

def testing_ResBNLNN_startup():
    train, test = import_data()
    processd_test, __ = preprocess_from_startup(test, None, mapping_order=True)
    model = ResBNLNN(9, 1)
    model.load('models/Resbnlnn_startup/')
    pred = predict_test_data(model, processd_test)
    submission_pred = test['PassengerId']
    pred = pd.DataFrame(pred, columns=['Survived'])
    submission_pred = pd.concat([submission_pred, pred], axis=1)
    # submission_pred['Survived'] = pred
    submission_pred.to_csv(f'models/Resbnlnn_startup/gender_submission.csv', index=False)
    print(submission_pred)    

    
def testing_ResBNLNN_order():
    train, test = import_data()
    processd_test, __ = preprocess_from_startup(test, None, mapping_order=True)
    model = ResBNLNN(9, 1)
    model.load('models/Resbnlnn_order/')
    pred = predict_test_data(model, processd_test)
    submission_pred = test['PassengerId']
    pred = pd.DataFrame(pred, columns=['Survived'])
    submission_pred = pd.concat([submission_pred, pred], axis=1)
    # submission_pred['Survived'] = pred
    submission_pred.to_csv(f'models/Resbnlnn_order/gender_submission.csv', index=False)
    print(submission_pred)
    


NameError: name 'nn' is not defined

In [41]:
train_data, test = import_data()
train_y = train_data[['Survived']]
train_x = train_data.drop(['Survived'], axis=1)
train_x, train_y = preprocess_from_startup(train_x, train_y, mapping_order=True)

train_x.isnull().sum()

# concat_train = pd.concat([train_x, train_y], axis=1)
# concat_train.corr()


Sex           0
SibSp         0
Parch         0
Fare          0
Embarked      0
Title         0
FamilySize    0
IsAlone       0
Age*Class     0
dtype: int64

In [42]:

# print('-' * 60)

train_x, train_y = load_data(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'])
train_x, train_y = remove_nan_preprocess(train_x, train_y)
concat_train = pd.concat([train_x, train_y], axis=1)
concat_train.corr()


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
Pclass,1.0,-0.150826,-0.365902,0.065187,0.023666,-0.552893,-0.297517,-0.356462
Sex,-0.150826,1.0,-0.099037,0.106296,0.249543,0.182457,0.077391,0.536762
Age,-0.365902,-0.099037,1.0,-0.307351,-0.187896,0.093143,0.04234,-0.082446
SibSp,0.065187,0.106296,-0.307351,1.0,0.383338,0.13986,-0.062028,-0.015523
Parch,0.023666,0.249543,-0.187896,0.383338,1.0,0.206624,-0.00412,0.095265
Fare,-0.552893,0.182457,0.093143,0.13986,0.206624,1.0,0.286416,0.2661
Embarked,-0.297517,0.077391,0.04234,-0.062028,-0.00412,0.286416,1.0,0.189657
Survived,-0.356462,0.536762,-0.082446,-0.015523,0.095265,0.2661,0.189657,1.0


In [None]:
training_ResBNLNN_order()
# training_LNN_startup()
# training_BNLNN_startup()
# training_ResLNN_startup()
# training_ResBNLNN_startup()
# training_LNN()
# training_BNLNN()



loaded existing model file ./models/models/Resbnlnn_order/
epoch 10: loss 0.025699926540255547
epoch 20: loss 0.026811711490154266


In [44]:
train, test = import_data()

train['Title'] = train.Name.str.extract('([A-Za-z]+)\.', expand=False)
train['Title'] = train['Title'].replace(['Lady', 'Countess','Capt', 'Col', \
'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')

# train[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
# train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

# C > Q > S
# Female > male
# Mrs > Miss > Master > Rare > Mr

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


In [95]:
testing_ResBNLNN_startup()
testing_ResBNLNN_order()


     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         1
4            896         1
..           ...       ...
413         1305         1
414         1306         0
415         1307         0
416         1308         1
417         1309         0

[418 rows x 2 columns]
     PassengerId  Survived
0            892         0
1            893         0
2            894         1
3            895         1
4            896         0
..           ...       ...
413         1305         0
414         1306         0
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [94]:
trainging_normal_xgboost()
trainging_normal_randomforest()


TrainAccuracy: 0.8806179775280899
Test Accuracy: 0.8100558659217877
TrainAccuracy: 0.9466292134831461
Test Accuracy: 0.8044692737430168
