In [1]:
# code/notebooks/titanic-mlp.ipynb
import sys
sys.path.append('../utils')
import os
import pandas as pd
import numpy as np
import torch
from torch import nn
from load_data import load_data
from train import train


In [2]:
# Experiment identifiers.
# The feature lists passed to load_data below (Name, Ticket, Cabin, Age,
# Embarked, Fare, Sex) are Titanic columns and the labels are binary, so these
# identifiers must point at the Titanic dataset, not the house-prices one they
# were copied from.
# NOTE(review): assumes the raw data lives under data/raw/titanic/ and uses the
# standard Kaggle column names "PassengerId" / "Survived" — confirm.
file_name="titanic-mlp"   # stem used for the saved model and the submission file
title="titanic"           # dataset directory name (also used to locate test.csv)
label="Survived"          # target column
id="PassengerId"          # row-id column; shadows builtin id(), name kept because later cells use it

# Load data
X_train, y_train, X_val, y_val, X_test = load_data(title=title, label=label,
                                     id=id,
                                     drop_features=["Name", "Ticket", "Cabin"],
                                     fill_na=["Age", "Embarked", "Fare"],
                                     one_hot=["Sex", "Embarked"])

# Train and test must share the same feature columns, otherwise the model
# built on X_train cannot be applied to X_test.
assert X_train.shape[1] == X_test.shape[1], "train/test feature width mismatch"

# Print the first 5 rows
print(X_train[:5])
print(y_train[:5])
print(X_test[:5])

tensor([[ 3.0000, 22.0000,  1.0000,  0.0000,  7.2500,  1.0000,  0.0000,  1.0000],
        [ 1.0000, 38.0000,  1.0000,  0.0000, 71.2833,  0.0000,  0.0000,  0.0000],
        [ 3.0000, 26.0000,  0.0000,  0.0000,  7.9250,  0.0000,  0.0000,  1.0000],
        [ 1.0000, 35.0000,  1.0000,  0.0000, 53.1000,  0.0000,  0.0000,  1.0000],
        [ 3.0000, 35.0000,  0.0000,  0.0000,  8.0500,  1.0000,  0.0000,  1.0000]])
tensor([0., 1., 1., 1., 0.])
tensor([[ 3.0000, 34.5000,  0.0000,  0.0000,  7.8292,  1.0000,  1.0000,  0.0000],
        [ 3.0000, 47.0000,  1.0000,  0.0000,  7.0000,  0.0000,  0.0000,  1.0000],
        [ 2.0000, 62.0000,  0.0000,  0.0000,  9.6875,  1.0000,  1.0000,  0.0000],
        [ 3.0000, 27.0000,  0.0000,  0.0000,  8.6625,  1.0000,  0.0000,  1.0000],
        [ 3.0000, 22.0000,  1.0000,  1.0000, 12.2875,  0.0000,  0.0000,  1.0000]])


In [3]:
# Model: a small MLP for binary classification
# (BatchNorm -> Linear -> ReLU -> BatchNorm -> Linear -> Sigmoid).
# Seed torch first so weight initialisation is identical on every
# Restart & Run All; without it each run starts from different weights.
torch.manual_seed(42)

# Derive the input width from the data rather than hard-coding it; the number
# of columns depends on the one-hot encoding performed by load_data.
n_features = X_train.shape[1]

model = nn.Sequential(
    nn.BatchNorm1d(n_features),
    nn.Linear(n_features, 32),
    nn.ReLU(),
    nn.BatchNorm1d(32),
    nn.Linear(32, 1),
    nn.Sigmoid()  # outputs a probability, paired with BCELoss below
)

# Train the model.
# NOTE(review): stage=100 presumably sets the logging interval (the recorded
# log prints every 100 epochs) — confirm against utils/train.py.
train(model, X_train, y_train, X_val, y_val,
        epochs=1000, loss_fn=nn.BCELoss(), optimizer=torch.optim.Adam(model.parameters(), lr = 0.01),
        stage=100, draw=False, log=True)

Epoch: 100, Loss: 0.3527, Train Acc: 0.8583, Val Acc: 0.8652
Epoch: 200, Loss: 0.3042, Train Acc: 0.8808, Val Acc: 0.8483
Epoch: 300, Loss: 0.2808, Train Acc: 0.8850, Val Acc: 0.8315
Epoch: 400, Loss: 0.2684, Train Acc: 0.8920, Val Acc: 0.8371
Epoch: 500, Loss: 0.2651, Train Acc: 0.8934, Val Acc: 0.8371
Epoch: 600, Loss: 0.2506, Train Acc: 0.8962, Val Acc: 0.8371
Epoch: 700, Loss: 0.2456, Train Acc: 0.9032, Val Acc: 0.8315
Epoch: 800, Loss: 0.2470, Train Acc: 0.8990, Val Acc: 0.8258
Epoch: 900, Loss: 0.2355, Train Acc: 0.9060, Val Acc: 0.8371
Epoch: 1000, Loss: 0.2303, Train Acc: 0.9074, Val Acc: 0.8315


In [4]:
# Save the model.
# torch.save(model, ...) pickles the whole module, so loading it later needs
# the same class definitions to be importable.
model_path = "../../models/" + file_name + ".pt"
os.makedirs(os.path.dirname(model_path), exist_ok=True)  # don't fail on a fresh checkout
torch.save(model, model_path)

# Store the prediction results.
id_col = pd.read_csv("../../data/raw/" + title + "/test.csv")[id]

# Switch to eval mode before inference: the model contains BatchNorm1d layers,
# which in training mode normalise with the statistics of the current batch
# (here: the test set) and update their running averages.
model.eval()
with torch.no_grad():  # no autograd graph needed for prediction
    # Sigmoid output is a probability; round() thresholds it into a 0/1 label.
    y_pred = model(X_test).numpy().round().astype(int).flatten()

# Every test row must receive exactly one prediction.
assert len(id_col) == len(y_pred), "number of ids and predictions differ"

submission = pd.DataFrame({
    id: id_col,
    label: y_pred
})
submission_path = "../../data/submission/" + file_name + ".csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False)