In [1]:
import pandas as pd
import numpy as np
import sklearn
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

In [2]:
class TitanicData(Dataset):
    def __init__(self, train_path='./train.csv', test_path='./test.csv', sub_path='./gender_submission.csv'):
        self.df_train = pd.read_csv(train_path, encoding='utf8')
        self.df_test = pd.read_csv(test_path, encoding='utf8')
        self.df_sub = pd.read_csv(sub_path, encoding='utf8')
        self.df_test = pd.concat([self.df_test, self.df_sub['Survived']], axis=1)
        self.df_train = self.process_df(self.df_train)
        self.df_test = self.process_df(self.df_test)
        self.all_x = pd.concat([self.df_train.iloc[:, 1:-2], self.df_test.iloc[:, 1:-2]], axis=0)
        self.all_y = pd.concat([self.df_train.iloc[:, -2:], self.df_test.iloc[:, -2:]], axis=0)

        # _scaler = StandardScaler()
        # np_all_x = _scaler.fit_transform(self.all_x)
        # self.all_x = pd.DataFrame(data=np_all_x, columns=self.all_x.columns)

    def process_df(self, df):
        df.sort_values(by=['Ticket'], ascending=[True], inplace=True)
        df.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
        df['Age'].interpolate(inplace=True)
        df['Fare'].interpolate(inplace=True)
        df['Embarked'].fillna(value='S', inplace=True)
        df.sort_values(by=['PassengerId'], ascending=[True], inplace=True)
        df = pd.get_dummies(data=df, columns=['Sex', 'Pclass', 'Embarked', 'Survived'])
        return df

    # def _init_train(self):
    #     self.df_train.sort_values(by=['Ticket'], ascending=[True], inplace=True)
    #     self.df_train.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
    #     self.df_train['Age'].interpolate(inplace=True)
    #     self.df_train['Fare'].interpolate(inplace=True)
    #     self.df_train['Embarked'].fillna(value='S', inplace=True)
    #     self.df_train.sort_values(by=['PassengerId'], ascending=[True], inplace=True)
    #     self.df_train = pd.get_dummies(data=self.df_train, columns=['Sex', 'Pclass', 'Embarked', 'Survived'])
    #
    # def _init_test(self):
    #     self.df_test.sort_values(by=['Ticket'], ascending=[True], inplace=True)
    #     self.df_test.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)
    #     self.df_test['Age'].interpolate(inplace=True)
    #     self.df_test['Fare'].interpolate(inplace=True)
    #     self.df_test['Embarked'].fillna(value='S', inplace=True)
    #     self.df_test.sort_values(by=['PassengerId'], ascending=[True], inplace=True)
    #     self.df_test = pd.get_dummies(data=self.df_test, columns=['Sex', 'Pclass', 'Embarked'])

    def get_x(self, idx: int):
        return torch.Tensor(self.all_x.iloc[idx].values)

    def get_y(self, idx: int):
        return torch.Tensor(self.all_y.iloc[idx].values)

    def __getitem__(self, idx):
        return self.get_x(idx), self.get_y(idx)

    def __len__(self):
        return len(self.df_train)


titanic = TitanicData()
titanic.df_train

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Survived_0,Survived_1
0,1,22.000000,1,0,7.2500,0,1,0,0,1,0,0,1,1,0
1,2,38.000000,1,0,71.2833,1,0,1,0,0,1,0,0,0,1
2,3,26.000000,0,0,7.9250,1,0,0,0,1,0,0,1,0,1
3,4,35.000000,1,0,53.1000,1,0,1,0,0,0,0,1,0,1
4,5,35.000000,0,0,8.0500,0,1,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,27.000000,0,0,13.0000,0,1,0,1,0,0,0,1,1,0
887,888,19.000000,0,0,30.0000,1,0,1,0,0,0,0,1,0,1
888,889,15.666667,1,2,23.4500,1,0,0,0,1,0,0,1,1,0
889,890,26.000000,0,0,30.0000,0,1,1,0,0,1,0,0,0,1


In [3]:
all_x = titanic.all_x
all_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         1309 non-null   float64
 1   SibSp       1309 non-null   int64  
 2   Parch       1309 non-null   int64  
 3   Fare        1309 non-null   float64
 4   Sex_female  1309 non-null   uint8  
 5   Sex_male    1309 non-null   uint8  
 6   Pclass_1    1309 non-null   uint8  
 7   Pclass_2    1309 non-null   uint8  
 8   Pclass_3    1309 non-null   uint8  
 9   Embarked_C  1309 non-null   uint8  
 10  Embarked_Q  1309 non-null   uint8  
 11  Embarked_S  1309 non-null   uint8  
dtypes: float64(2), int64(2), uint8(8)
memory usage: 61.4 KB


In [4]:
# scaler = StandardScaler()
std_x = StandardScaler().fit_transform(all_x.values[:, 1:])

In [5]:
std_x

array([[ 0.48128777, -0.4449995 , -0.50421749, ..., -0.50976981,
        -0.32204029,  0.65501092],
       [ 0.48128777, -0.4449995 ,  0.73326033, ...,  1.96166973,
        -0.32204029, -1.52669211],
       [-0.47908676, -0.4449995 , -0.49117276, ..., -0.50976981,
        -0.32204029,  0.65501092],
       ...,
       [-0.47908676, -0.4449995 , -0.50421749, ..., -0.50976981,
        -0.32204029,  0.65501092],
       [-0.47908676, -0.4449995 , -0.48875706, ..., -0.50976981,
        -0.32204029,  0.65501092],
       [ 0.48128777,  0.71076309, -0.2122415 , ...,  1.96166973,
        -0.32204029, -1.52669211]])

In [6]:
titanic.df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Age          891 non-null    float64
 2   SibSp        891 non-null    int64  
 3   Parch        891 non-null    int64  
 4   Fare         891 non-null    float64
 5   Sex_female   891 non-null    uint8  
 6   Sex_male     891 non-null    uint8  
 7   Pclass_1     891 non-null    uint8  
 8   Pclass_2     891 non-null    uint8  
 9   Pclass_3     891 non-null    uint8  
 10  Embarked_C   891 non-null    uint8  
 11  Embarked_Q   891 non-null    uint8  
 12  Embarked_S   891 non-null    uint8  
 13  Survived_0   891 non-null    uint8  
 14  Survived_1   891 non-null    uint8  
dtypes: float64(2), int64(3), uint8(10)
memory usage: 50.5 KB


In [7]:
titanic.all_x

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,22.000000,1,0,7.2500,0,1,0,0,1,0,0,1
1,38.000000,1,0,71.2833,1,0,1,0,0,1,0,0
2,26.000000,0,0,7.9250,1,0,0,0,1,0,0,1
3,35.000000,1,0,53.1000,1,0,1,0,0,0,0,1
4,35.000000,0,0,8.0500,0,1,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,25.500000,0,0,8.0500,0,1,0,0,1,0,0,1
414,39.000000,0,0,108.9000,1,0,1,0,0,1,0,0
415,38.500000,0,0,7.2500,0,1,0,0,1,0,0,1
416,38.000000,0,0,8.0500,0,1,0,0,1,0,0,1


In [10]:
x, y = titanic[123]
x, y

(tensor([32.5000,  0.0000,  0.0000, 13.0000,  1.0000,  0.0000,  0.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  1.0000]),
 tensor([0., 1.]))

In [11]:
def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.normal_(m.weight, std=0.01)

In [12]:
def parse_output(output):
    x0 = output[0].item()
    x1 = output[1].item()
    if x0 > x1:
        # return np.array([1, 0])
        return 0
    else:
        # return np.array([0, 1])
        return 1

In [43]:
model = nn.Sequential(
    nn.Linear(12, 4),
    nn.ReLU(),
    nn.Linear(4, 2),
    nn.Sigmoid(),
    nn.Softmax(dim=0)
    )
# model.apply(init_weights)
loss = nn.CrossEntropyLoss()
loss = nn.MSELoss()
opt = torch.optim.Adam(model.parameters(), lr=0.001)

In [44]:
loader = DataLoader(titanic, batch_size=32, shuffle=True)
# loader = load_array([titanic.all_x.values[:891, 1:], titanic.all_y.values[:, 1:]], 32)

In [45]:
num_epoch = 100
looper = tqdm(range(num_epoch))
for epoch in looper:
    model.train()
    for batch, (X, y) in enumerate(loader):
        pred = model(X)
        l = loss(pred, y)

        opt.zero_grad()
        l.backward()
        opt.step()

        looper.set_description(f'loss = {round(l.item(), 5)}')

loss = 0.45833: 100%|██████████| 100/100 [00:18<00:00,  5.30it/s]


In [46]:
true_cnt, false_cnt = 0, 0
for idx in range(891, 1309):
    output = model(titanic[idx][0])
    if parse_output(output) == parse_output(titanic[idx][1]):
        true_cnt += 1
    else:
        false_cnt += 1

true_cnt, false_cnt, true_cnt/(true_cnt+false_cnt)

(387, 31, 0.9258373205741627)

In [236]:
df1 = pd.read_csv('./test.csv')
df2 = pd.read_csv('./gender_submission.csv')
test = pd.concat([df1, df2['Survived']], axis=1)
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [234]:
df2['Survived']

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64