In [98]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [99]:
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')
print(train_df.shape)
print(train_df.head)

test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
print(test_df.shape)
print(test_df.columns)

(891, 12)
<bound method NDFrame.head of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                           

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [100]:
import torch
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

class TitanicDataset(Dataset):
    def __init__(self,filepath):
        df = pd.read_csv(filepath)
        self.y_data = torch.tensor(df['Survived'].values,dtype=torch.float32).view(-1,1)
        # feature engineering
        df = df.drop(columns=['Survived','Name','Ticket','Cabin','PassengerId'])
        df["Sex"] = df["Sex"].map({"female": 0, "male": 1}).astype(float)
        df = pd.get_dummies(df, columns=["Embarked"], drop_first=False)
        df["Age"] = df["Age"].fillna(df["Age"].median())
        print(df.head)
        df.info()
        self.x_data = torch.tensor(df.values.astype('float32'), dtype=torch.float32)
        self.len = df.shape[0]
        print(self.x_data.shape, self.y_data.shape, self.len)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        return self.x_data[idx], self.y_data[idx]

dataset = TitanicDataset('/kaggle/input/titanic/train.csv')
train_loader = DataLoader(dataset=dataset,batch_size=64,shuffle=True,num_workers=4)

<bound method NDFrame.head of      Pclass  Sex   Age  SibSp  Parch     Fare  Embarked_C  Embarked_Q  \
0         3  1.0  22.0      1      0   7.2500       False       False   
1         1  0.0  38.0      1      0  71.2833        True       False   
2         3  0.0  26.0      0      0   7.9250       False       False   
3         1  0.0  35.0      1      0  53.1000       False       False   
4         3  1.0  35.0      0      0   8.0500       False       False   
..      ...  ...   ...    ...    ...      ...         ...         ...   
886       2  1.0  27.0      0      0  13.0000       False       False   
887       1  0.0  19.0      0      0  30.0000       False       False   
888       3  0.0  28.0      1      2  23.4500       False       False   
889       1  1.0  26.0      0      0  30.0000        True       False   
890       3  1.0  32.0      0      0   7.7500       False        True   

     Embarked_S  
0          True  
1         False  
2          True  
3          True  
4  

In [101]:
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(9,4)
        self.linear2 = torch.nn.Linear(4,1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.sigmoid(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        return x

model = Model()

In [102]:
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [103]:
for epoch in range(1000):
    for i, data in enumerate(train_loader,0):
        inputs, labels = data
        y_pred = model(inputs)
        loss = criterion(y_pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch == 999:
        print('Loss:', loss.item())

Loss: 0.4892825186252594


In [104]:
# 假设您的测试集路径
TEST_FILEPATH = '/kaggle/input/titanic/test.csv'

# --- 1. 创建测试集数据集类（或直接处理） ---
class TitanicTestDataset(Dataset):
    def __init__(self, filepath):
        df = pd.read_csv(filepath)
        
        # 保存 PassengerId 以便提交
        self.passenger_ids = df['PassengerId'].values
        
        # 特征工程 (必须与训练集保持一致!)
        df = df.drop(columns=['PassengerId','Name', 'Ticket', 'Cabin'])
        
        # 映射 'Sex'
        df["Sex"] = df["Sex"].map({"female": 0, "male": 1}).astype(float)
        
        # 独热编码 'Embarked'
        df = pd.get_dummies(df, columns=["Embarked"], drop_first=False)
        
        # 填充 'Age'
        df["Age"] = df["Age"].fillna(df["Age"].median())
        
        # 填充 'Fare' 
        df["Fare"] = df["Fare"].fillna(df["Fare"].median()) 

        df.info()
        
        # 关键步骤：强制转换为 float32，避免 object 类型错误
        self.x_data = torch.tensor(df.values.astype('float32'), dtype=torch.float32)
        self.len = df.shape[0]

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        return self.x_data[idx] # 注意：测试集没有 y_data

# --- 2. 加载数据 ---
test_dataset = TitanicTestDataset(TEST_FILEPATH)
test_loader = DataLoader(
    dataset=test_dataset, 
    batch_size=64, 
    shuffle=False, # 预测时不需要打乱
    num_workers=4
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Pclass      418 non-null    int64  
 1   Sex         418 non-null    float64
 2   Age         418 non-null    float64
 3   SibSp       418 non-null    int64  
 4   Parch       418 non-null    int64  
 5   Fare        418 non-null    float64
 6   Embarked_C  418 non-null    bool   
 7   Embarked_Q  418 non-null    bool   
 8   Embarked_S  418 non-null    bool   
dtypes: bool(3), float64(3), int64(3)
memory usage: 20.9 KB


In [105]:
# --- 3. 设置模型为评估模式 ---
# 这一步至关重要，它会关闭如 Dropout 和 BatchNorm 等只在训练中使用的层。
model.eval() 

# 存储所有预测结果
all_predictions = []

# --- 4. 禁用梯度计算进行预测 ---
with torch.no_grad():
    for inputs in test_loader:
        # 模型前向传播
        outputs = model(inputs)
        
        # 假设您的模型输出是经过 Sigmoid 的概率值（0到1之间）
        probabilities = outputs 
        
        # 将概率转换为二分类预测结果 (Survived: 0 或 1)
        # 阈值通常设为 0.5
        predicted_classes = (probabilities >= 0.5).int()
        
        # 收集结果
        all_predictions.append(predicted_classes)

# 将所有批次的结果连接成一个张量或 NumPy 数组
final_predictions_tensor = torch.cat(all_predictions, dim=0)
final_predictions_np = final_predictions_tensor.squeeze().numpy()

In [106]:
# --- 5. 创建提交文件 ---

# 获取 PassengerId
passenger_ids = test_dataset.passenger_ids

# 检查长度是否匹配
if len(passenger_ids) != len(final_predictions_np):
    print("错误：PassengerId 数量与预测结果数量不匹配！")
else:
    # 创建 DataFrame
    submission_df = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': final_predictions_np
    })
    
    # 确保 'Survived' 列是整数类型
    submission_df['Survived'] = submission_df['Survived'].astype(int)
    
    # 保存为 CSV 文件
    submission_df.to_csv('/kaggle/working/submission.csv', index=False)
    
    print("✅ 预测完成，'submission.csv' 文件已生成。")

✅ 预测完成，'submission.csv' 文件已生成。
