In [1]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: train.csv               
  inflating: test.csv                
  inflating: gender_submission.csv   


In [2]:
import subprocess
import sys
import datetime
import os
import random as rn

import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import collections
import math

import tensorflow as tf
import keras
from keras import regularizers
from keras import backend as K
from keras.callbacks import LearningRateScheduler
from keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout
from keras.models import Model, load_model

Using TensorFlow backend.


In [3]:
# Load the data.
train = pd.read_csv("train.csv")      # (891, 12)
test = pd.read_csv("test.csv")        # (418, 11)

# Stack train and test so the feature engineering below is applied to both.
# sort=True pins the alphabetical column order that pandas currently produces
# here by default and silences the FutureWarning about that default changing.
# Row labels are not reset at this point, so they repeat across the two parts.
full_df = pd.concat([train, test], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [0]:
def missing_table(df):
    """Summarise the missing values in each column of ``df``.

    Returns a DataFrame indexed by the column names of ``df`` with two
    columns: "欠損数" (the number of null entries) and "%" (that number as a
    percentage of ``len(df)``).
    """
    null_counts = df.isnull().sum()
    null_share = null_counts * 100 / len(df)
    # `keys` names the two concatenated Series directly.
    return pd.concat([null_counts, null_share], axis=1, keys=["欠損数", "%"])

In [5]:
# Per-column null counts and percentages for the combined train + test frame.
missing_table(full_df)

Unnamed: 0,欠損数,%
Age,263,20.091673
Cabin,1014,77.463713
Embarked,2,0.152788
Fare,1,0.076394
Name,0,0.0
Parch,0,0.0
PassengerId,0,0.0
Pclass,0,0.0
Sex,0,0.0
SibSp,0,0.0


In [6]:
# Preview the first rows of the combined frame.
full_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450


In [0]:
# Preprocessing
# Name: extract the title from the passenger name. The regex captures the run
# of letters that sits between a space and a period (e.g. "Mr" in
# "Braund, Mr. Owen Harris").
# A raw string is used so that "\." is not parsed as an invalid string escape
# (deprecated by Python); the pattern itself is unchanged.
full_df['Title'] = full_df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

In [8]:
# Count how often each extracted title occurs.
vc = full_df['Title'].value_counts()
print(vc)

Mr          757
Miss        260
Mrs         197
Master       61
Dr            8
Rev           8
Col           4
Mlle          2
Major         2
Ms            2
Jonkheer      1
Capt          1
Mme           1
Lady          1
Countess      1
Sir           1
Don           1
Dona          1
Name: Title, dtype: int64


In [0]:
# Normalise the titles in one pass: every title in `rare_titles` becomes
# 'Rare', 'Mlle' and 'Ms' become 'Miss', and 'Mme' becomes 'Mrs'.
rare_titles = ['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_aliases = {rare: 'Rare' for rare in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
full_df['Title'] = full_df['Title'].replace(title_aliases)

In [10]:
# Mean of Survived for each Title value.
full_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [0]:
# Encode each title as an integer code, numbered 1..5 in the order listed.
title_order = ["Mr", "Miss", "Mrs", "Master", "Rare"]
title_mapping = {title: code for code, title in enumerate(title_order, start=1)}
full_df['Title'] = full_df['Title'].map(title_mapping)

In [12]:
# Inspect the integer-encoded Title column.
full_df['Title']

0      1
1      3
2      2
3      3
4      1
5      1
6      1
7      4
8      3
9      3
10     2
11     2
12     1
13     1
14     2
15     3
16     4
17     1
18     3
19     3
20     1
21     1
22     2
23     1
24     2
25     3
26     1
27     1
28     2
29     1
      ..
388    1
389    4
390    1
391    3
392    4
393    1
394    1
395    3
396    1
397    3
398    1
399    1
400    2
401    1
402    2
403    1
404    1
405    1
406    1
407    1
408    2
409    2
410    2
411    3
412    2
413    1
414    5
415    1
416    1
417    4
Name: Title, Length: 1309, dtype: int64

In [0]:
# Sex: encode as a binary integer (female -> 1, male -> 0).
sex_codes = {'female': 1, 'male': 0}
encoded_sex = full_df['Sex'].map(sex_codes)
full_df['Sex'] = encoded_sex.astype(int)

In [0]:
# Age: fill missing ages with the median age of the passengers that share the
# same Sex and Pclass (rounded to the nearest 0.5), then truncate Age to int.
sex_values = range(0, 2)
class_offsets = range(0, 3)

# guess_ages[s, p] is the fill value for Sex == s and Pclass == p + 1.
guess_ages = np.zeros((2, 3))
for s in sex_values:
    for p in class_offsets:
        in_group = (full_df['Sex'] == s) & (full_df['Pclass'] == p + 1)
        known_ages = full_df[in_group]['Age'].dropna()
        median_age = known_ages.median()
        guess_ages[s, p] = int(median_age / 0.5 + 0.5) * 0.5

# Second pass: write the per-group guess into the rows whose Age is missing.
for s in sex_values:
    for p in class_offsets:
        needs_fill = (full_df.Age.isnull()) & (full_df.Sex == s) & (full_df.Pclass == p + 1)
        full_df.loc[needs_fill, 'Age'] = guess_ages[s, p]

full_df['Age'] = full_df['Age'].astype(int)

In [15]:
# FamilySize: Parch + SibSp + 1.
full_df['FamilySize'] = full_df['Parch'] + full_df['SibSp'] + 1
# Mean of Survived per FamilySize, highest first.
survival_by_family_size = full_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean()
survival_by_family_size.sort_values(by='Survived', ascending=False)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [16]:
# IsAlone: 1 where FamilySize == 1, otherwise 0.
travels_alone = full_df['FamilySize'] == 1
full_df['IsAlone'] = travels_alone.astype('int64')
# Mean of Survived for each IsAlone value.
full_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


In [17]:
# Embarked: fill the missing values with the most frequent value.
freq_port = full_df['Embarked'].dropna().mode().iloc[0]  # S
full_df['Embarked'] = full_df['Embarked'].fillna(freq_port)
# Mean of Survived per Embarked value, highest first.
survival_by_port = full_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
survival_by_port.sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,C,0.553571
1,Q,0.38961
2,S,0.339009


In [0]:
# Encode Embarked as an integer code.
port_codes = {'S':0, 'C':1, 'Q':2}
encoded_port = full_df['Embarked'].map(port_codes)
full_df['Embarked'] = encoded_port.astype(int)

In [0]:
# Fare: fill missing fares with the median fare.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection. The in-place form is a chained assignment, which newer
# pandas warns about and which does not update full_df under Copy-on-Write.
# Series.median() skips NaN by default, so no explicit dropna() is needed.
full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].median())

In [0]:
# Fare per person
# TicketCount: how many rows of the combined frame carry the same Ticket.
ticket_list = full_df["Ticket"].tolist()
c = collections.Counter(ticket_list)
# One dictionary lookup per row, instead of rescanning the whole ticket list
# once for every distinct ticket (which was quadratic in the number of rows).
ticket_count = [c[_t] for _t in ticket_list]

# Give the frame a fresh 0..n-1 index before attaching the new columns; row
# labels may repeat after train and test were concatenated.
full_df = full_df.reset_index(drop=True)
full_df["TicketCount"] = ticket_count
# FareAdj: the fare divided by the number of rows sharing that ticket.
full_df['FareAdj'] = full_df['Fare'] / full_df['TicketCount']

In [0]:
# FamilySurvivedCount
# Family_S_C: for each row, the number of OTHER rows with the same Ticket
# whose Survived value is 1. Rows with a missing Survived value add nothing.
#
# Computed with one grouped sum instead of a DataFrame.query() per row, and
# without assuming that PassengerId always equals the row label + 1.
survived_known = full_df['Survived'].fillna(0)
ticket_survivors = survived_known.groupby(full_df['Ticket']).transform('sum')
# Subtract the row's own outcome so only the other ticket holders are counted.
# fillna(0) covers rows with a missing Ticket, which belong to no group.
full_df['Family_S_C'] = (ticket_survivors - survived_known).fillna(0).astype(int)


In [0]:
# Simple_S_C: 1 where Family_S_C is positive, otherwise 0.
has_positive_count = full_df['Family_S_C'] > 0
full_df['Simple_S_C'] = has_positive_count.astype('int64')

In [23]:
# Show the count next to the binary flag derived from it.
full_df[['Family_S_C', 'Simple_S_C']]

Unnamed: 0,Family_S_C,Simple_S_C
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,2,1
9,0,0


In [24]:
# List the columns present at this point.
full_df.columns

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket', 'Title', 'FamilySize',
       'IsAlone', 'TicketCount', 'FareAdj', 'Family_S_C', 'Simple_S_C'],
      dtype='object')

In [0]:
# Remove Ticket, Fare and TicketCount from the frame; FareAdj, which was
# derived from Fare and TicketCount, stays.
obsolete_columns = ["Ticket", "Fare","TicketCount"]
full_df = full_df.drop(columns=obsolete_columns)

In [26]:
# Inspect the distinct raw Cabin values.
full_df['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [0]:
def _cabin_prefix(cabin):
    """Return the first character of a string Cabin value, else 'noinfo'."""
    if type(cabin) == str:
        return cabin[0]
    return 'noinfo'

# CabinInfo: the leading character of Cabin, or 'noinfo' for non-string values.
full_df['CabinInfo'] = full_df['Cabin'].apply(_cabin_prefix)

In [28]:
# Check which CabinInfo categories occur.
full_df['CabinInfo'].unique()

array(['noinfo', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [0]:
# Encode CabinInfo as an integer: each label maps to its position in the list
# below ('noinfo' -> 0, 'A' -> 1, ..., 'G' -> 7, 'T' -> 8).
cabin_labels = ["noinfo", "A", "B", "C", "D", "E", "F", "G", "T"]
cabin_codes = {label: code for code, label in enumerate(cabin_labels)}
full_df['CabinInfo'] = full_df['CabinInfo'].map(cabin_codes)

In [0]:
# Split the data back into train and test rows by PassengerId.
is_test_row = full_df['PassengerId'] >= 892
is_train_row = full_df['PassengerId'] <= 891
# The test part does not keep the Survived column.
test_df = full_df[is_test_row].drop('Survived', axis=1)
train_df = full_df[is_train_row]

In [31]:
# Build the input and output data.
non_feature_columns = ["PassengerId", "Cabin", 'Name']
# Survived is the target, so it is dropped from the training features too.
X_train = train_df.drop(non_feature_columns + ["Survived"], axis=1)
Y_train = train_df["Survived"]
X_test  = test_df.drop(non_feature_columns, axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

((891, 13), (891,), (418, 13))

In [32]:
# View the training Age values reshaped into a single-column 2-D array.
np.array(X_train.Age).reshape((len(X_train), 1))

array([[22],
       [38],
       [26],
       [35],
       [35],
       [25],
       [54],
       [ 2],
       [27],
       [14],
       [ 4],
       [58],
       [20],
       [39],
       [14],
       [55],
       [ 2],
       [29],
       [31],
       [22],
       [35],
       [34],
       [15],
       [28],
       [ 8],
       [38],
       [25],
       [19],
       [22],
       [25],
       [40],
       [36],
       [22],
       [66],
       [28],
       [42],
       [25],
       [21],
       [18],
       [14],
       [40],
       [27],
       [25],
       [ 3],
       [19],
       [25],
       [25],
       [22],
       [25],
       [18],
       [ 7],
       [21],
       [49],
       [29],
       [65],
       [42],
       [21],
       [28],
       [ 5],
       [11],
       [22],
       [38],
       [45],
       [ 4],
       [42],
       [25],
       [29],
       [19],
       [17],
       [26],
       [32],
       [16],
       [21],
       [26],
       [32],
       [25],
       [25],

In [33]:
# Standardisation
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only; the same fitted scaler is then
# applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)


def _scaled_frame(features):
    """Return ``features`` transformed by ``scaler`` as a DataFrame.

    The column names are kept; the result gets a default integer index rather
    than the row labels of ``features``.
    """
    return pd.DataFrame(scaler.transform(features), columns=features.columns)


X_train_std = _scaled_frame(X_train)
X_test_std = _scaled_frame(X_test)

  return self.partial_fit(X, y)
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [0]:
# Save the data.
pickle_file = "./titanic_std.pickle"
# The objects are written one after another into the same file; they must be
# read back in this same order.
saved_objects = (X_train, X_train_std, X_test, X_test_std, Y_train)
with open(pickle_file, 'wb') as f:
    for obj in saved_objects:
        pickle.dump(obj, f)
    

In [0]:
# Read back the five objects stored in pickle_file, in the order they were
# written: a = X_train, b = X_train_std, c = X_test, d = X_test_std,
# e = Y_train.
with open(pickle_file, 'rb') as f:
    a, b, c, d, e = [pickle.load(f) for _ in range(5)]