In [28]:
"""
估计 Titanic 乘客生还率
https://www.kaggle.com/competitions/titanic/overview

score: 0.76794
"""

import numpy as np
import pandas as pd

# Load the training split of the Titanic data set. This frame carries both the
# feature columns and the 'Survived' label column that is extracted further down.
X = pd.read_csv('./data/train.csv')

# Exploratory checks, left commented out:
# print( X['Pclass'].unique(), X['Fare'].unique())

# Group by Pclass and compute the mean Fare
# print(X.groupby('Pclass').Fare.mean())
# print(X[X['Pclass'] == 1]['Fare'].describe())
# print(X[X['Pclass'] == 2]['Fare'].describe())
# print(X[X['Pclass'] == 3]['Fare'].describe())

"""
- PassengerId 乘客ID
- Ticket 票号
- Name 姓名
- Cabin 船舱号(类似于高铁的座位号)
"""

"""
类别特征:
- Pclass: 1, 2, 3 (分别表示一等座, 二等座, 三等座)
- Sex: male, female
- Embarked: 登船港口, C = Cherbourg, Q = Queenstown, S = Southampton
"""

def clear_data(X):
    """Turn a raw Titanic frame into a purely numeric feature matrix.

    Keeps the columns Pclass, Sex, Age, SibSp, Parch, Fare and Embarked,
    one-hot encodes the three categorical ones and replaces every remaining
    missing value with 0.

    The category levels are pinned (Pclass 1/2/3, Sex female/male,
    Embarked C/Q/S) so the returned frame always has the same twelve columns
    in the same order, regardless of which values happen to occur in ``X``.
    Without this, ``pd.get_dummies`` derives the columns from the data, and a
    train and a test frame with different value sets would yield misaligned
    features.

    Args:
        X: DataFrame containing at least the seven columns listed above.
            It is not modified.

    Returns:
        A new DataFrame with columns
        ``Age, SibSp, Parch, Fare, Pclass_1, Pclass_2, Pclass_3,
        Sex_female, Sex_male, Embarked_C, Embarked_Q, Embarked_S``.
        Dummy columns are ``int8``. A missing or unknown category value is
        encoded as all zeros within its group.

    Raises:
        KeyError: if one of the required columns is absent from ``X``.
    """
    features = X[['Pclass', 'Sex', 'Age', 'SibSp',
                  'Parch', 'Fare', 'Embarked']]

    # Fixed levels -> fixed dummy columns. astype returns a new frame, so the
    # caller's data is left untouched.
    features = features.astype({
        'Pclass': pd.CategoricalDtype([1, 2, 3]),
        'Sex': pd.CategoricalDtype(['female', 'male']),
        'Embarked': pd.CategoricalDtype(['C', 'Q', 'S']),
    })

    encoded = pd.get_dummies(
        features, columns=['Pclass', 'Sex', 'Embarked'],
        drop_first=False, dtype=np.int8)

    # NOTE(review): missing numeric values (e.g. Age) are imputed with 0, as
    # in the original; a median fill may model better but would change the
    # trained result, so it is left as is.
    return encoded.fillna(0)


# Derive the model inputs from the raw training frame first, then pull out
# the 'Survived' column as the label vector. Neither step modifies X.
X_train = clear_data(X)
y = X['Survived']

In [29]:
import torch
import torch.nn as nn

class LogisticRegression(nn.Module):
    """Logistic regression: a single linear layer followed by a sigmoid.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit: the pre-sigmoid score of the positive class.
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Map a batch of feature rows to probabilities in (0, 1).

        The trailing input dimension must equal ``input_dim``; the output has
        a trailing dimension of 1.
        """
        logits = self.linear(x)
        return torch.sigmoid(logits)

# --- Training configuration --------------------------------------------------
SEED = 42              # fixes the initial weights so a re-run reproduces the log
LEARNING_RATE = 0.01
N_ITERATIONS = 10000   # full-batch gradient steps
LOG_EVERY = 1000       # print the loss every this many steps

# Seed before the model is constructed: nn.Linear draws its initial weights
# from torch's global random number generator.
torch.manual_seed(SEED)

n_features = X_train.shape[1]
model = LogisticRegression(n_features)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criteria = nn.BCELoss()  # Binary Cross Entropy Loss on the sigmoid outputs

X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
# view(-1, 1) turns y into a column vector so it matches the model's output
y_train_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

# Make sure the module is in training mode even when this cell is re-run
# after the inference cell has called model.eval().
model.train()
for i in range(N_ITERATIONS):
    optimizer.zero_grad()
    y_pred = model(X_train_tensor)
    loss = criteria(y_pred, y_train_tensor)
    loss.backward()
    optimizer.step()

    if i % LOG_EVERY == 0:
        print(f'Iteration {i}, loss: {loss.item()}')

Iteration 0, loss: 2.424731492996216
Iteration 1000, loss: 0.44992098212242126
Iteration 2000, loss: 0.4498960077762604
Iteration 3000, loss: 0.4498603343963623
Iteration 4000, loss: 0.44981759786605835
Iteration 5000, loss: 0.44980356097221375
Iteration 6000, loss: 0.4497412443161011
Iteration 7000, loss: 0.44972464442253113
Iteration 8000, loss: 0.44992953538894653
Iteration 9000, loss: 0.4496934115886688


In [43]:
import pandas as pd

# Load the test split and run it through the same preprocessing as the
# training data.
X_test_origin = pd.read_csv('./data/test.csv')
X_test = clear_data(X_test_origin)

# The linear layer has no notion of column names: a test matrix whose columns
# differ in set or order from the training matrix would be scored against the
# wrong weights. Fail loudly instead.
if list(X_test.columns) != list(X_train.columns):
    raise ValueError(
        'test features do not match training features: '
        f'{list(X_test.columns)} != {list(X_train.columns)}')

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)

model.eval()
# Inference only: no autograd graph is needed.
with torch.no_grad():
    y_prob = model(X_test_tensor).numpy()
# Threshold the predicted probabilities at 0.5 to obtain 0/1 labels.
y_pred = np.where(y_prob > 0.5, 1, 0)

# Pair each PassengerId from X_test_origin with its predicted label and
# write the submission file.
result = pd.DataFrame({
    'PassengerId': X_test_origin['PassengerId'].to_numpy(),
    'Survived': y_pred.ravel(),
})
result.to_csv('./data/result.csv', index=False)