In [0]:
!unzip titanic.zip

Archive:  titanic.zip
  inflating: train.csv               
  inflating: test.csv                
  inflating: gender_submission.csv   


In [0]:
import numpy as np 
import pandas as pd 
# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout
# Reproducibility
from numpy.random import seed
seed(1002)
from tensorflow import set_random_seed
set_random_seed(1002)

Using TensorFlow backend.


In [0]:
# Load the data: read the training and test CSV files into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [0]:
def missing_table(df):
    """Summarise the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns: "欠損数" (number of
        null entries) and "%" (null entries as a percentage of ``len(df)``).
    """
    null_count = df.isnull().sum()
    null_percent = 100 * null_count / len(df)
    # `keys` names the two concatenated Series directly.
    return pd.concat([null_count, null_percent], axis=1, keys=["欠損数", "%"])

In [0]:
# Tag every row with its origin so the stacked frame can be split again later.
train['Type'] = 'train'
test['Type'] = 'test'
# Stack train on top of test.
# - pd.concat replaces the deprecated DataFrame.append.
# - ignore_index=True gives every row a unique 0..n-1 label; without it the
#   test rows reuse the train labels, which breaks later index-based
#   operations such as drop(label) and merges on the index.
# - sort=True keeps the alphabetical column order that append produced and
#   silences the "sort" FutureWarning.
data = pd.concat([train, test], ignore_index=True, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [0]:
# Data preprocessing
# Title <- Name: the honorific is the run of letters immediately before a
# period (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# str.extract is already vectorised over the whole column, so a single call
# is enough. The raw string avoids the invalid "\." escape sequence, and
# expand=False returns a Series for the single capture group.
data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

In [0]:
# Only the last expression of a cell is rendered automatically, so the counts
# are shown explicitly with display().
display(data['Title'].value_counts())
# Mean of 'Survived' per title (NaN where no labelled row carries the title).
data[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

Unnamed: 0,Title,Survived
0,Capt,0.0
1,Col,0.5
2,Countess,1.0
3,Don,0.0
4,Dona,
5,Dr,0.428571
6,Jonkheer,0.0
7,Lady,1.0
8,Major,0.5
9,Master,0.575


In [0]:
# Collapse rare honorifics into a handful of coarse groups.
title_groups = {
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
    'Other': ['Major', 'Col', 'Dr', 'Rev', 'Capt'],
    'Royal': ['Jonkheer', 'Sir', 'Lady', 'Don', 'Countess', 'Dona'],
}
# Flatten to {raw title: group}; titles not listed are left unchanged.
mapping = {raw_title: group
           for group, raw_titles in title_groups.items()
           for raw_title in raw_titles}
data['Title'] = data['Title'].replace(mapping)

In [0]:
# Fill missing ages with the median age of passengers sharing the same title.
# The medians are computed once, up front: filling gaps with the median does
# not change it, so recomputing the groupby per title was redundant.
median_age_by_title = data.groupby('Title')['Age'].median()
# Iterate over the titles actually present instead of a hard-coded list, so
# no title is skipped and a missing one cannot raise a KeyError.
titles = list(median_age_by_title.index)
for title in titles:
    age_to_impute = median_age_by_title[title]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute

In [0]:
# Only the last expression of a cell is rendered automatically, so the
# per-title median ages are shown explicitly with display().
display(data.groupby('Title')['Age'].median())
# Remaining missing values per column.
missing_table(data)

Unnamed: 0,欠損数,%
Age,0,0.0
Cabin,1014,77.463713
Embarked,2,0.152788
Fare,1,0.076394
Name,0,0.0
Parch,0,0.0
PassengerId,0,0.0
Pclass,0,0.0
Sex,0,0.0
SibSp,0,0.0


In [0]:
# Family_Size, FsizeD <- Parch, SibSp
# Family size counts the passenger plus parents/children and siblings/spouses.
family_size = data['Parch'] + data['SibSp'] + 1
data['Family_Size'] = family_size
# Bucket the size: more than 4 -> 'Big', 2 to 4 -> 'Small', otherwise 'Alone'.
# np.select takes the first matching condition, so 'Big' is tested first.
data['FsizeD'] = np.select(
    [family_size > 4, family_size > 1],
    ['Big', 'Small'],
    default='Alone',
)

In [0]:
# Fill missing fares with the median fare of third-class passengers.
fa = data[data["Pclass"] == 3]
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form acts on a temporary object and is not
# guaranteed to update `data` (it is a no-op under pandas copy-on-write).
data['Fare'] = data['Fare'].fillna(fa['Fare'].median())

In [0]:
# Child <- Age
# 1 unless the passenger is known to be 18 or older. Negating `>= 18` (rather
# than testing `< 18`) keeps rows with a missing age flagged as 1.
is_adult = data['Age'] >= 18
data['Child'] = (~is_adult).astype('int64')

In [0]:
# Family_Survival
# Neutral value kept for passengers whose companions' fate is unknown.
DEFAULT_SURVIVAL_VALUE = 0.5
# The surname is everything before the first comma of the name.
data['Last_Name'] = data['Name'].map(lambda full_name: full_name.split(",")[0])
data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE

In [0]:
def _companions_outcome(group, passenger_id):
    """Return the survival signal given by the *other* members of ``group``.

    Returns 1 if any other member is known to have survived, 0 if none is
    known to have survived but at least one is known to have died, and None
    when nothing is known about the others (all 'Survived' values are NaN).

    The passenger is excluded by PassengerId (assumed unique per passenger)
    rather than by index label, because index labels may repeat when train
    and test were stacked without resetting the index.
    """
    is_other = (group['PassengerId'] != passenger_id).values
    others = group.loc[is_other, 'Survived']
    if others.max() == 1.0:
        return 1
    if others.min() == 0.0:
        return 0
    return None


# Pass 1: passengers sharing surname and fare are treated as one family.
family_cols = ['Survived', 'Name', 'Last_Name', 'Fare', 'Ticket', 'PassengerId',
               'SibSp', 'Parch', 'Age', 'Cabin']
for _, grp_df in data[family_cols].groupby(['Last_Name', 'Fare']):
    if len(grp_df) == 1:
        continue  # a single passenger is not a group
    for _, row in grp_df.iterrows():
        passID = row['PassengerId']
        outcome = _companions_outcome(grp_df, passID)
        # No information about the others: keep the current (default) value.
        if outcome is not None:
            data.loc[data['PassengerId'] == passID, 'Family_Survival'] = outcome

# Pass 2: passengers sharing a ticket are treated as travelling together.
# Only rows not already marked 1 are revisited.
for _, grp_df in data.groupby('Ticket'):
    if len(grp_df) == 1:
        continue
    for _, row in grp_df.iterrows():
        if row['Family_Survival'] not in (0, 0.5):
            continue
        passID = row['PassengerId']
        outcome = _companions_outcome(grp_df, passID)
        if outcome is not None:
            data.loc[data['PassengerId'] == passID, 'Family_Survival'] = outcome

In [0]:
# Data encoding
# Drop the raw columns that are not used as model inputs.
unused_columns = ['Age', 'Cabin', 'Embarked', 'Name', 'Last_Name',
                  'Parch', 'SibSp', 'Ticket', 'Family_Size']
data = data.drop(unused_columns, axis=1)

In [0]:
target_col = ["Survived"]
id_dataset = ["Type"]
# Columns with fewer than 12 distinct non-null values are treated as categorical.
unique_counts = data.nunique()
cat_cols = unique_counts[unique_counts < 12].index.tolist()

In [0]:
# Numerical columns: everything that is not categorical, target or dataset tag.
non_numeric = set(cat_cols + target_col + id_dataset)
num_cols = [col for col in data.columns if col not in non_numeric]
# Binary columns: exactly two distinct non-null values.
bin_cols = [col for col, n_unique in data.nunique().items() if n_unique == 2]
# Multi-class columns: categorical columns with more than two values.
multi_cols = [col for col in cat_cols if col not in bin_cols]

In [0]:
# Label-encode the binary columns: each column in bin_cols is replaced by
# integer codes. LabelEncoder numbers classes in sorted order, so string
# values are coded alphabetically (e.g. 'test' -> 0, 'train' -> 1 for 'Type',
# which the later train/test split relies on).
# NOTE(review): bin_cols is built from nunique() == 2, which ignores NaN, so it
# presumably also contains 'Survived' (NaN on test rows) -- confirm that the
# installed scikit-learn encodes those NaNs as intended.
le = LabelEncoder()
for i in bin_cols :
    # The same encoder object is refitted for every column.
    data[i] = le.fit_transform(data[i])

In [0]:
# One-hot encode the multi-class columns (one indicator column per value).
data = pd.get_dummies(data, columns=multi_cols)

In [0]:
# Standardise the numerical columns and wrap the result in a DataFrame with
# the same column names (and a default 0..n-1 index).
std = StandardScaler()
scaled = pd.DataFrame(std.fit_transform(data[num_cols]), columns=num_cols)

In [0]:
# Replace the raw numerical columns with their standardised versions.
df_data_og = data.copy()  # unscaled snapshot
data = data.drop(columns=num_cols)
# `scaled` has a fresh 0..n-1 index, while `data` may still carry the repeated
# row labels left over from stacking train and test. Merging on the index
# would then pair test rows with the scaled values of the train rows that
# share their label. Both frames have the same row order, so the columns are
# attached positionally instead.
for col in num_cols:
    data[col] = scaled[col].values
# PassengerId is only an identifier, not a feature.
data = data.drop(columns=['PassengerId'])

In [0]:
# Move 'Survived' to the first position; the other columns keep their order.
cols = list(data.columns)
cols.remove('Survived')
cols = ['Survived'] + cols
data = data[cols]

In [0]:
# Split the combined frame back into train and test using the encoded 'Type'
# tag (1 selects train rows, 0 selects test rows), then drop the tag.
is_train = data['Type'] == 1
is_test = data['Type'] == 0
train = data.loc[is_train].drop('Type', axis=1)
test = data.loc[is_test].drop('Type', axis=1)

In [0]:
# Split train into inputs and target (first column) and convert to NumPy
# arrays. `.values` replaces `.as_matrix()`, which is deprecated and was
# removed in pandas 1.0; it returns the same array.
X_train = train.iloc[:, 1:].values
Y_train = train.iloc[:, 0].values
X_test = test.iloc[:, 1:].values

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
# Save the data: the three arrays are pickled back to back into one file and
# must be read in the same order (X_train, Y_train, X_test).
import pickle

pickle_file = "./titanic_data_forDNN.pickle"
with open(pickle_file, 'wb') as f:
    for array in (X_train, Y_train, X_test):
        pickle.dump(array, f)

In [0]:
# Read the three arrays back in the order they were written.
# Only unpickle files this notebook produced; pickle can execute arbitrary
# code when loading untrusted data.
with open(pickle_file, 'rb') as f:
    X_train, Y_train, X_test = [pickle.load(f) for _ in range(3)]