https://www.kaggle.com/code/gotutiyan/titanic-tutorial-pytorch-japanese/notebook

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.utils.data import DataLoader
from torchvision import datasets
import torchvision.transforms as transforms
import os
import time
import sys
import torch.quantization
sys.path.append(os.pardir)

In [20]:
path_train = os.path.join('..', 'train.csv')
path_test = os.path.join('..', 'test.csv')
train_df = pd.read_csv(path_train)
test_df = pd.read_csv(path_test)

In [21]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [23]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


# 前処理

In [35]:
def process_df(df):
    df = df.drop(["PassengerId", "Name", "Ticket", "Cabin", "Embarked"], axis=1)
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
    df = df.replace("male", 0)
    df = df.replace("female", 1)
    return df


In [36]:
train_process_df = process_df(train_df)
test_process_df = process_df(test_df)
train_process_df.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


In [37]:
test_process_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0,34.5,0,0,7.8292
1,3,1,47.0,1,0,7.0
2,2,0,62.0,0,0,9.6875
3,3,0,27.0,0,0,8.6625
4,3,1,22.0,1,1,12.2875


# データセット

In [38]:
class Dataset:
    def __init__(self, df):
        self.df = df
        self.X = self.df.drop(["Survived"], axis=1)
        self.Y = self.df["Survived"]
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
#         print(type(self.X.iloc[idx,:]))
#         print(type(self.Y.iloc[idx]))
        return self.X.iloc[idx,:].values, self.Y.iloc[idx]

train_dataset = Dataset(train_process_df)
len(train_dataset)


891

In [39]:
class TestDataset:
    def __init__(self, df):
        self.df = df
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        return self.X.iloc[idx,:].values, 

test_dataset = TestDataset(test_process_df)
len(test_dataset)

418

In [47]:
BATCH_SIZE = 50
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

# xに入力，yにラベルが入る
for x,y in train_dataloader:
    print(x,y)
    print("len",len(y))
    break


tensor([[  2.0000,   0.0000,  24.0000,   0.0000,   0.0000,  10.5000],
        [  2.0000,   0.0000,  36.0000,   0.0000,   0.0000,  10.5000],
        [  3.0000,   0.0000,  29.0000,   0.0000,   0.0000,   9.5000],
        [  2.0000,   1.0000,  19.0000,   1.0000,   0.0000,  26.0000],
        [  1.0000,   0.0000,  65.0000,   0.0000,   1.0000,  61.9792],
        [  1.0000,   0.0000,  55.0000,   0.0000,   0.0000,  30.5000],
        [  2.0000,   1.0000,  17.0000,   0.0000,   0.0000,  10.5000],
        [  3.0000,   0.0000,  29.6991,   0.0000,   0.0000,   7.7292],
        [  3.0000,   0.0000,  29.6991,   0.0000,   0.0000,   7.7375],
        [  3.0000,   0.0000,  21.0000,   0.0000,   0.0000,  16.1000],
        [  2.0000,   0.0000,   3.0000,   1.0000,   1.0000,  26.0000],
        [  3.0000,   1.0000,  25.0000,   1.0000,   0.0000,   7.9250],
        [  1.0000,   1.0000,  40.0000,   1.0000,   1.0000, 134.5000],
        [  3.0000,   0.0000,  20.5000,   0.0000,   0.0000,   7.2500],
        [  3.0000,  

TypeError: 'Tensor' object is not callable

# モデル定義

In [41]:
class Net(nn.Module):
    def __init__(self, input_sz, hidden_sz, out_sz):
        super(Net, self).__init__()
        self.f1 = nn.Linear(input_sz, hidden_sz)
        self.bn1 = nn.BatchNorm1d(hidden_sz) #バッチ正規化
        self.f2 = nn.Linear(hidden_sz, out_sz)
        
    def forward(self, x):
        h1 = F.relu(self.f1(x))
        h2 = self.bn1(h1) #バッチ正規化を行う
        y = self.f2(h2)
        
        return y

input_sz = 6
hidden_sz = 3
out_sz = 2
net = Net(input_sz, hidden_sz, out_sz)


# 訓練

In [48]:
learning_rate = 0.01
loss_func = nn.MSELoss(reduction="sum")
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
epoch = 32
def train():
    for e in range(epoch):
        for X, labels in train_dataloader:
            T = convert_label_to_onehot(labels)
            y = F.softmax(net(X.float()), dim=1)
            loss = loss_func(y, torch.FloatTensor(T))
            loss.backward()
            optimizer.step()
            
def convert_label_to_onehot(labels):
    onehot = np.zeros((len(labels), 2))
    idx = [(i, t.item()) for i, t in enumerate(labels)]
    for i in idx:
        onehot[i] = 1
    return onehot

train()


# テスト

In [49]:
# torch.max()の簡単な説明
prob = torch.tensor([[0.1, 0.9],
                    [0.2, 0.8],
                    [0.6, 0.4]])
max, argmax = torch.max(prob, dim=1)
print("max\t",max)
print("argmax\t",argmax)

max	 tensor([0.9000, 0.8000, 0.6000])
argmax	 tensor([1, 1, 0])


In [51]:
def test():
    test_X = torch.tensor(test_process_df.iloc[:,:].values)
    test_Y = net(test_X.float())
    survived = torch.max(test_Y, dim=1)[1]
    sub_df = pd.DataFrame({
                    "PassengerId":test_df['PassengerId'],
                    "Survived":survived
    })
    print(sub_df)
    return sub_df
    
sub_df = test()
sub_df.to_csv("./submission.csv", index=False)


     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         0
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
