In [3]:
import csv
import numpy as np
import pandas as pd

# scikit-learn
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as O
from torch.autograd import Variable


In [4]:
# List the files available in the Titanic input directory.
! ls ../input/titanic

gender_submission.csv  test.csv  train.csv


In [6]:
# Load the training data, the test data, and the sample submission file.
data, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        '../input/titanic/gender_submission.csv',
    )
)

In [18]:
# Display the test DataFrame (the notebook renders the cell's last expression).
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


PassengerId : 乗客のID
Survived : 生存結果（0：死亡、1：生存）
Pclass : チケットのクラス（1=1st, 2=2nd, 3=3rd）
Name : 乗客の名前
Sex : 乗客の性別
Age : 乗客の年齢
SibSp : タイタニック号に同乗している兄弟姉妹と配偶者の数
Parch : タイタニック号に乗船している両親と子供の数
Ticket : チケット番号
Fare : 運賃
Cabin : 部屋番号
Embarked : 乗船した港

In [7]:
# Encode the Sex field numerically (male -> 1, female -> 0).
# An explicit column-level mapping is used instead of a frame-wide
# `replace`: the latter relied on pandas silently downcasting the object
# column to integers, which is deprecated since pandas 2.2 and would
# otherwise leave `data.values` as an object array.
sex_codes = {"male": 1, "female": 0}
data = (
    data
    .assign(Sex=lambda frame: frame["Sex"].map(sex_codes))
    # Drop the fields that are not used as model inputs.
    .drop(columns=["Name", "Ticket", "Embarked", "Cabin", "Fare"])
    # Drop every row that still contains a missing value.
    .dropna()
)

In [8]:
# Split the dataset into a training part and a hold-out test part.
# Column 1 is used as the label and columns 2 onward as the features
# (column 0, presumably PassengerId, is skipped).
X = data.values[:, 2:]
Y = data.values[:, 1].astype(dtype=np.int64)
# Hold out 10% of the rows. A fixed random_state keeps the split, and
# therefore the accuracies reported by the cells below, reproducible across
# kernel restarts.
train_x, test_x, train_t, test_t = train_test_split(
    X, Y, test_size=0.1, random_state=0
)

In [9]:
# Naive Bayes: fit a Gaussian NB classifier on the training split and
# report its accuracy on the hold-out split.
gnb = GaussianNB()
gnb.fit(train_x, train_t)

result = gnb.predict(test_x)
# Count the predictions that match the true labels.
num_right = (result == test_t).sum()
print("Accuracy {:.2f}".format(num_right / test_t.shape[0]))

Accuracy 0.76


In [10]:
# SVM: fit a support vector classifier with default settings and report its
# accuracy on the hold-out split.
clf = SVC()
clf.fit(train_x, train_t)  # training step

result = clf.predict(test_x)  # inference on the hold-out split
# Count the predictions that match the true labels.
num_right = (result == test_t).sum()
print("Accuracy {:.2f}".format(num_right / test_t.shape[0]))

Accuracy 0.81




In [11]:
# Neural network
class NeuralNetwork(nn.Module):
    """Fully connected classifier with a single sigmoid hidden layer.

    Args:
        n_in: Number of input features.
        n_hidden: Number of units in the hidden layer.
        n_out: Number of output classes.
    """

    def __init__(self, n_in, n_hidden, n_out):
        super(NeuralNetwork, self).__init__()
        self.input = nn.Linear(n_in, n_hidden)
        self.output = nn.Linear(n_hidden, n_out)

    def forward(self, x):
        """Return unnormalized class scores (logits) for ``x``.

        No activation is applied to the output layer: the scores are meant
        to be passed to ``nn.CrossEntropyLoss``, which applies log-softmax
        itself. Squashing them through a sigmoid first would confine the
        logits to (0, 1) and weaken the training signal. Taking the argmax
        over the last dimension yields the predicted class.

        Args:
            x: Float tensor whose last dimension has size ``n_in``.

        Returns:
            Tensor whose last dimension has size ``n_out``.
        """
        # torch.sigmoid replaces the deprecated F.sigmoid.
        h = torch.sigmoid(self.input(x))
        return self.output(h)
    

# Training hyperparameters.
epochs = 4000
batchsize = 50
learning_rate = 0.01

# Layer sizes: one input unit per feature column, three hidden units,
# two output classes.
n_in = train_x.shape[1]
n_hidden = 3
n_out = 2

# Number of full mini-batches per epoch (integer division, so a trailing
# partial batch is not counted).
n_batch = train_x.shape[0] // batchsize

# Model, optimizer and loss.
network = NeuralNetwork(n_in, n_hidden, n_out)
optimizer = O.Adam(network.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    # Report progress every 100 epochs.
    if epoch % 100 == 0:
        print("Epoch {}".format(epoch))
    # Reshuffle features and labels together so that each epoch is split
    # into different mini-batches.
    train_x, train_t = shuffle(train_x, train_t)
    # Mini-batch learning
    for i in range(n_batch):
        start = i * batchsize
        end = start + batchsize
        # Build plain tensors with explicit dtypes; the `Variable` wrapper
        # has been a deprecated no-op since PyTorch 0.4.
        x_batch = torch.tensor(train_x[start:end], dtype=torch.float32)
        t_batch = torch.tensor(train_t[start:end], dtype=torch.long)
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        y_batch = network(x_batch)
        loss = criterion(y_batch, t_batch)
        loss.backward()
        optimizer.step()

# Test the model on the hold-out split.
# `volatile=True` was removed in PyTorch 0.4 and is ignored with a warning;
# `torch.no_grad()` is its replacement for gradient-free inference.
with torch.no_grad():
    test_var = torch.tensor(test_x, dtype=torch.float32)
    result = network(test_var)
# torch.max over dim 1 returns (max values, argmax indices); the indices are
# the predicted class labels.
values, labels = torch.max(result, 1)
num_right = np.sum(test_t == labels.numpy())
print("Accuracy {:.2f}".format(num_right / len(test_t)))

Epoch 0




Epoch 100
Epoch 200
Epoch 300
Epoch 400
Epoch 500
Epoch 600
Epoch 700
Epoch 800
Epoch 900
Epoch 1000
Epoch 1100
Epoch 1200
Epoch 1300
Epoch 1400
Epoch 1500
Epoch 1600
Epoch 1700
Epoch 1800
Epoch 1900
Epoch 2000
Epoch 2100
Epoch 2200
Epoch 2300
Epoch 2400
Epoch 2500
Epoch 2600
Epoch 2700
Epoch 2800
Epoch 2900
Epoch 3000
Epoch 3100
Epoch 3200
Epoch 3300
Epoch 3400
Epoch 3500
Epoch 3600
Epoch 3700
Epoch 3800
Epoch 3900
Accuracy 0.79




In [26]:
# Predict on the competition test set
####################
test = pd.read_csv("../input/titanic/test.csv")

# Change the Sex field from strings to numbers (male -> 1, female -> 0).
# An explicit mapping avoids the deprecated implicit downcasting that a
# frame-wide `replace` depended on.
test["Sex"] = test["Sex"].map({"male": 1, "female": 0})
# Drop the fields that are not used as model inputs.
test = test.drop(["Name", "Ticket", "Embarked", "Cabin", "Fare"], axis=1)
# Fill missing ages with the median age. The result is assigned back:
# `test["Age"].fillna(..., inplace=True)` works on an intermediate object and
# is deprecated (it has no effect under pandas copy-on-write).
test["Age"] = test["Age"].fillna(test["Age"].median())
# Column 0 is PassengerId; the remaining columns are the model inputs.
test_set = test.values[:, 1:]
# `volatile=True` is no longer supported; torch.no_grad() disables gradient
# tracking during inference.
with torch.no_grad():
    test_var = torch.tensor(test_set, dtype=torch.float32)
    result = network(test_var)
# The argmax over dim 1 is the predicted class for each passenger.
values, labels = torch.max(result, 1)

  if sys.path[0] == '':


In [29]:
# Write the predictions as a submission file with one
# (PassengerId, Survived) row per passenger.
# newline="" hands line-ending control to the csv writer, as the csv module
# documentation requires; without it "\n" is translated to "\r\n" on Windows.
with open("result.csv", "w", newline="") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["PassengerId", "Survived"])
    # Column 0 of `test` holds the passenger ids; `labels` holds the
    # predicted classes in the same row order.
    writer.writerows(zip(test.values[:, 0].astype(int), labels.data.numpy()))