In [1]:
# code/notebooks/titanic-mlp.ipynb
import sys
sys.path.append('../utils')
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from load_data import load_data
from train import train


In [2]:
# Identifier strings used throughout the notebook
file_name = "titanic-logistic"  # base name of the saved model and submission files
title = "titanic"               # dataset name: passed to load_data and used in the raw-data path
label = "Survived"              # label column name; also the prediction column of the submission
id = "PassengerId"              # identifier column name (note: shadows the built-in id())

# Load the data
(X_train, y_train, X_val, y_val, X_test) = load_data(
    title=title,
    label=label,
    id=id,
    drop_features=["Name", "Ticket", "Cabin"],
    fill_na=["Age", "Embarked", "Fare"],
    one_hot=["Sex", "Embarked"],
)

# Print the first 5 rows
print(X_train[:5])

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [3]:
# Logistic regression model
class Logistic(nn.Module):
    """A single linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit per sample.
        self.linear = nn.Linear(input_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the linear layer to ``x`` and pass the result through the sigmoid."""
        logits = self.linear(x)
        return self.sigmoid(logits)
    
# One input unit per column of X_train
model = Logistic(input_dim=X_train.shape[1])

# Train the model
# (the recorded run below shows one log line every 100 epochs, matching stage=100)
train(
    model,
    X_train,
    y_train,
    X_val,
    y_val,
    epochs=1000,
    loss_fn=nn.BCELoss(),
    optimizer=torch.optim.Adam(model.parameters(), lr=0.01),
    stage=100,
    draw=False,
    log=True,
)

Epoch: 100, Loss: 0.5546, Train Acc: 0.7293, Val Acc: 0.7809
Epoch: 200, Loss: 0.5170, Train Acc: 0.7896, Val Acc: 0.8258
Epoch: 300, Loss: 0.4990, Train Acc: 0.7854, Val Acc: 0.8202
Epoch: 400, Loss: 0.4888, Train Acc: 0.7854, Val Acc: 0.8258
Epoch: 500, Loss: 0.4819, Train Acc: 0.7896, Val Acc: 0.8258
Epoch: 600, Loss: 0.4766, Train Acc: 0.7868, Val Acc: 0.8258
Epoch: 700, Loss: 0.4720, Train Acc: 0.7868, Val Acc: 0.8315
Epoch: 800, Loss: 0.4682, Train Acc: 0.7896, Val Acc: 0.8315
Epoch: 900, Loss: 0.4650, Train Acc: 0.7896, Val Acc: 0.8315
Epoch: 1000, Loss: 0.4623, Train Acc: 0.7910, Val Acc: 0.8315


In [4]:
# Save the model
# NOTE(review): torch.save(model, ...) pickles the whole module object, so loading the
# file later requires the Logistic class to be importable. The PyTorch docs recommend
# saving model.state_dict() instead; kept as-is so the existing .pt format is unchanged.
model_dir = "../../models/"
os.makedirs(model_dir, exist_ok=True)  # a missing output directory would fail only after training
torch.save(model, model_dir + file_name + ".pt")

# Store the predictions
id_col = pd.read_csv("../../data/raw/" + title + "/test.csv")[id]

# Inference only: switch to eval mode and skip building the autograd graph.
model.eval()
with torch.no_grad():
    y_prob = model(X_test)
# numpy's round() rounds half to even, so an output of exactly 0.5 maps to class 0.
y_pred = y_prob.numpy().round().astype(int).flatten()

submission = pd.DataFrame({
    id: id_col,
    label: y_pred
})
submission_dir = "../../data/submission/"
os.makedirs(submission_dir, exist_ok=True)
submission.to_csv(submission_dir + file_name + ".csv", index=False)