In [111]:
import pandas as pd
import numpy as np

In [113]:
train = pd.read_csv('../../dataset/train.csv')
test = pd.read_csv('../../dataset/test.csv')
# 前処理を一度にやるためにtrainとtestをconcatする
test['Transported'] = np.nan
train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)

# split on `/` to cols (deck/num/side)
def split_cabin(df):
    cabin = df['Cabin'].str.split('/', expand=True).rename(columns={0: 'CabinDeck', 1: 'CabinNum', 2: 'CabinSide'})
    cabin['CabinNum'] = cabin['CabinNum'].astype(float)
    return pd.concat([df, cabin], axis=1)

train_test = split_cabin(train_test)

  train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)


### 使う特徴量を選ぶ

In [114]:
train_test = train_test[['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'CabinNum', 'CabinSide', 'Transported']]

採用する特徴量
- HomePlanet→最頻値
- CryoSleep→最頻値
- Destination→最頻値
- Age→中央値で埋める
- VIP→VIPなしで埋める
- CabinNum→最頻値で埋める
- CabinSide→CabinNum==82となっているCabinSideの最頻値で埋める？→Pで埋める

In [115]:
# 欠損値埋め
def fillna_cols(df):
    df['HomePlanet'] = df['HomePlanet'].fillna(df['HomePlanet'].mode()[0])
    df['CryoSleep'] = df['CryoSleep'].fillna(df['CryoSleep'].mode()[0])
    df['Destination'] = df['Destination'].fillna(df['Destination'].mode()[0])
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['VIP'] = df['VIP'].fillna(False)
    df['CabinNum'] = df['CabinNum'].fillna(df['CabinNum'].mode()[0])
    df['CabinSide'] = df['CabinSide'].fillna('P')
    return df

train_test = fillna_cols(train_test)

In [116]:
train_test.isna().sum(axis=0)

HomePlanet        0
CryoSleep         0
Destination       0
Age               0
VIP               0
CabinNum          0
CabinSide         0
Transported    4277
dtype: int64

### Encoding

In [117]:
# HomePlanet, Destination, CabinSideはOne-hot化
train_test = pd.get_dummies(train_test, columns=['HomePlanet', 'Destination', 'CabinSide'], drop_first=True)

In [118]:
# boolをintへ
def bool2int(df):
    for col in df.columns:
        if df[col].dtype == bool:
            df[col] = df[col].astype(int)
    return df

train_test = bool2int(train_test)

In [119]:
train_test

Unnamed: 0,CryoSleep,Age,VIP,CabinNum,Transported,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinSide_S
0,0,39.0,0,0.0,0.0,1,0,0,1,0
1,0,24.0,0,0.0,1.0,0,0,0,1,1
2,0,58.0,1,0.0,0.0,1,0,0,1,1
3,0,33.0,0,0.0,0.0,1,0,0,1,1
4,0,16.0,0,1.0,1.0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
12965,1,34.0,0,1496.0,,0,0,0,1,1
12966,0,42.0,0,82.0,,0,0,0,1,0
12967,1,27.0,0,296.0,,0,1,0,0,0
12968,0,27.0,0,297.0,,1,0,0,1,0


### scaling

In [120]:
from sklearn import preprocessing

In [121]:
# Age, CabinNumを最小0最大1へ正規化
scaler = preprocessing.MinMaxScaler() 
train_test[['Age', 'CabinNum']] = scaler.fit_transform(train_test[['Age', 'CabinNum']])

In [304]:
train_test

Unnamed: 0,CryoSleep,Age,VIP,CabinNum,Transported,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,CabinSide_S
0,0,0.493671,0,0.000000,0.0,1,0,0,1,0
1,0,0.303797,0,0.000000,1.0,0,0,0,1,1
2,0,0.734177,1,0.000000,0.0,1,0,0,1,1
3,0,0.417722,0,0.000000,0.0,1,0,0,1,1
4,0,0.202532,0,0.000528,1.0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
12965,1,0.430380,0,0.789863,,0,0,0,1,1
12966,0,0.531646,0,0.043295,,0,0,0,1,0
12967,1,0.341772,0,0.156283,,0,1,0,0,0
12968,0,0.341772,0,0.156811,,1,0,0,1,0


### モデリング

In [164]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl


In [293]:
class MLP(nn.Module):
    def __init__(self, 
                 in_feats, 
                 out_feats, 
                 hid_feats=300, 
                 lr=0.01,
                 ):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(in_feats, hid_feats),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hid_feats, hid_feats),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hid_feats, out_feats),
            nn.Dropout(0.5),
        )
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = torch.optim.Adam(self.classifier.parameters(), lr=lr)
        
    def forward(self, x):
        x = self.classifier(x)
        return x
    
    def training_step(self, batch):
        self.train()
        x, y = batch
        self.optimizer.zero_grad()
        pr = self.forward(x)
        loss = self.criterion(pr, y)
        loss.backward()
        self.optimizer.step()
        return {'loss': loss, 'pred': F.sigmoid(pr)}
    
    def validation_step(self, batch):
        self.eval()
        x, y = batch
        with torch.no_grad():
            pr = self(x)
            loss = self.criterion(pr, y)
        return {'loss': loss, 'pred': F.sigmoid(pr)}
    
    def predict(self, x):
        self.eval()
        with torch.no_grad():
            pr = self(x)
        return F.sigmoid(pr)

### 学習

In [190]:
import os
import random
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import Subset

In [219]:
def set_seed(seed=3407):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 3407
set_seed(SEED)

In [255]:
# DataFrameをnp.ndarrayに変換
train = train_test[~train_test['Transported'].isna()]
test = train_test[train_test['Transported'].isna()]
# inputとlabelに分離
x_train = train.drop('Transported', axis=1).values
y_train = train.Transported.values[:, np.newaxis]
x_test = test.drop('Transported', axis=1).values

# Tensorにする
x_train = torch.tensor(x_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32)
x_test = torch.tensor(x_test, dtype=torch.float32)

# Datasetにまとめる
train_set = torch.utils.data.TensorDataset(x_train, y_train)

In [256]:
x_train.shape, x_test.shape, y_train.shape

(torch.Size([8693, 9]), torch.Size([4277, 9]), torch.Size([8693, 1]))

In [320]:
# hyperparameter
batch_size = 128
lr = 0.001
epochs = 1000
num_hidden = 0

In [321]:
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

logs = {
    f'cv{i}': {
        'loss': [],
        'val_loss': [],
        'acc': [],
        'val_acc': [],
    }
    for i in range(1, 6)
}
logs['cv_acc'] = 0
logs['cv_loss'] = 0
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for _fold, (train_index, valid_index) in enumerate(kf.split(train_set), 1):
    print(f'---------- cv{_fold} ------------')
    model = MLP(x_train.shape[1], 1, num_hidden, lr=lr).to(device)
    
    train_dataset = Subset(train_set, train_index)
    train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
    valid_dataset = Subset(train_set, valid_index)
    valid_loader = DataLoader(valid_dataset, batch_size, shuffle=False)

    for epoch in range(epochs):
        train_loss = 0
        train_acc = 0
        val_loss = 0
        val_acc = 0

        # train
        for i, (X, y) in enumerate(train_loader):
            X, y = X.to(device), y.to(device)
            outputs = model.training_step((X, y))
            train_loss += outputs['loss'].item()
            train_acc += torch.sum((outputs['pred'] >= 0.5) * (y == 1), axis=0).item()
        train_loss = train_loss / i  # batchごとにlossがmeanされてるため、iteration数で割ることで全batchの平均lossになる
        train_acc = train_acc / len(train_loader.dataset)

        # val
        for X, y in valid_loader:
            X, y = X.to(device), y.to(device)
            outputs = model.validation_step((X, y))
            val_loss += outputs['loss'].item()
            val_acc += torch.sum((outputs['pred'] >= 0.5) * (y == 1), axis=0).item()
        val_loss = val_loss / i
        val_acc = val_acc / len(valid_loader.dataset)

        print (f'Epoch [{epoch+1}/{epochs}], loss: {train_loss:.4f}, val_loss: {val_loss:.4f},  acc: {train_acc:.4f}, val_acc: {val_acc:.4f}')
        logs[f'cv{_fold}']['loss'].append(train_loss)
        logs[f'cv{_fold}']['acc'].append(train_acc)
        logs[f'cv{_fold}']['val_loss'].append(val_loss)
        logs[f'cv{_fold}']['val_acc'].append(val_acc)

    logs[f'cv{_fold}']['model'] = model
    logs['cv_loss'] += val_loss / kf.n_splits
    logs['cv_acc'] += val_acc / kf.n_splits

print('CV loss', logs['cv_loss'], 'CV acc', logs['cv_acc'])

---------- cv1 ------------
Epoch [1/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.2552, val_acc: 0.0000
Epoch [2/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.3408, val_acc: 0.5060




Epoch [3/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [4/1000], loss: 0.7058, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [5/1000], loss: 0.7058, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [6/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [7/1000], loss: 0.7061, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [8/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [9/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [10/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [11/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [12/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [13/1000], loss: 0.7059, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [14/1000], loss: 0.7061, val_loss: 0.1797,  acc: 0.5030, val_acc: 0.5060
Epoch [15/1000], loss: 0.7060, val_loss: 0.1797,  acc: 0.33