In [1]:
import glob
import sys,os
import json
import pprint
import time
import re
import datetime
import pickle
import string
import gc
import warnings
import yaml
# Silence every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")
# Put the parent directories on the import path so modules located above
# the notebook directory can be imported.
sys.path.append(os.pardir)
sys.path.append('../..')
sys.path.append('../../..')

import numpy as np
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
import japanize_matplotlib # Japanese text support for matplotlib
import seaborn as sns
# pandas display options
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_colwidth', 5000)
pd.options.display.float_format = '{:.3f}'.format
%matplotlib inline
# sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')

from joblib import Parallel, delayed # module for quick and easy parallel execution
from tqdm import tqdm, tqdm_notebook # progress bar display
from PIL import Image
# Register the pandas integration of tqdm.
tqdm.pandas()

# Automatically reload external modules
%load_ext autoreload
%autoreload 2

In [2]:
# Location of the project configuration file (relative path).
CONFIG_FILE = '../configs/config.yaml'

# Use yaml.safe_load instead of a bare yaml.load(file): calling yaml.load
# without a Loader is deprecated since PyYAML 5.1 and raises TypeError on
# PyYAML >= 6. safe_load only constructs plain Python objects, which is all
# a settings file needs.
with open(CONFIG_FILE) as file:
    yml = yaml.safe_load(file)

# Values read from the SETTING section of the configuration.
MODEL_DIR_NAME = yml['SETTING']['MODEL_DIR_NAME']
# Used as the path prefix when the feature pickles are loaded.
FEATURE_DIR_NAME = yml['SETTING']['FEATURE_PATH']

In [16]:
def load_datasets_train(feats):
    """Read the training pickle of every requested feature and join them.

    Args:
        feats: Iterable of feature names. For each name ``f`` the file
            ``FEATURE_DIR_NAME + f'{f}_train.pkl'`` is read with
            ``pd.read_pickle``.

    Returns:
        The loaded objects concatenated along ``axis=1`` in the order
        given by ``feats``.
    """
    frames = []
    for feat_name in feats:
        pickle_path = FEATURE_DIR_NAME + f'{feat_name}_train.pkl'
        frames.append(pd.read_pickle(pickle_path))
    return pd.concat(frames, axis=1)

def load_datasets_both(feats):
    """Read the train and test pickles of every requested feature.

    Args:
        feats: Iterable of feature names. For each name ``f`` the files
            ``FEATURE_DIR_NAME + f'{f}_train.pkl'`` and
            ``FEATURE_DIR_NAME + f'{f}_test.pkl'`` are read with
            ``pd.read_pickle``.

    Returns:
        A ``(X_train, X_test)`` tuple; each element is the column-wise
        (``axis=1``) concatenation of the corresponding pickles in the
        order given by ``feats``.
    """
    # Train split: all train pickles are read and joined first.
    train_frames = []
    for feat_name in feats:
        train_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_train.pkl'))
    X_train = pd.concat(train_frames, axis=1)

    # Test split: read and joined after the train split is complete.
    test_frames = []
    for feat_name in feats:
        test_frames.append(pd.read_pickle(FEATURE_DIR_NAME + f'{feat_name}_test.pkl'))
    X_test = pd.concat(test_frames, axis=1)

    return X_train, X_test

# Check for missing values
def missing_values_table(data):
    """Summarise missing values and dtypes for every column of ``data``.

    Args:
        data: DataFrame to inspect.

    Returns:
        A transposed summary frame with one column per column of ``data``
        and three rows: ``Total`` (number of nulls), ``Percent`` (nulls as
        a percentage of the row count) and ``Types`` (dtype as a string).
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    # count() on the boolean mask equals the number of rows per column.
    pct_missing = n_missing / null_mask.count() * 100

    summary = pd.concat(
        [n_missing, pct_missing],
        axis=1,
        keys=['Total', 'Percent'],
    )
    summary['Types'] = [str(data[col].dtype) for col in data.columns]
    return summary.T

In [4]:
! pwd

/Users/takapy/Documents/20_BlogCode/train_pred_pipeline/notebooks


In [10]:
# Feature names to load; each name is passed to the dataset loaders below.
features = [
    'age',
    'age_mis_val_median',
    'cabin',
    'embarked',
    'embarked_mis_val_s',
    'family_size',
    'fare',
    'fare_mis_val_median',
    'name',
    'parch',
    'passenger_id',
    'pclass',
    'sex',
    'sex_label_encoder',
    'sibsp',
    'survived',
    'ticket',
]

In [11]:
# Load the train and test feature frames for the selected feature names.
train, test = load_datasets_both(features)

In [12]:
# Show the (rows, columns) shape of the train and test frames.
display(train.shape, test.shape)

(891, 17)

(418, 16)

In [13]:
# Preview the first and last rows of the training frame, plus its shape.
display(train.head(), train.tail(), train.shape)

Unnamed: 0,Age,Age_mis_val_median,Cabin,Embarked,Embarked_mis_val_S,Family_Size,Fare,Fare_mis_val_median,Name,Parch,PassengerId,Pclass,Sex,sex_label_encoder,SibSp,Survived,Ticket
0,22.0,22.0,,S,S,1,7.25,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,1,0,A/5 21171
1,38.0,38.0,C85,C,C,1,71.283,71.283,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",0,2,1,female,0,1,1,PC 17599
2,26.0,26.0,,S,S,0,7.925,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,0,1,STON/O2. 3101282
3,35.0,35.0,C123,S,S,1,53.1,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,0,1,1,113803
4,35.0,35.0,,S,S,0,8.05,8.05,"Allen, Mr. William Henry",0,5,3,male,1,0,0,373450


Unnamed: 0,Age,Age_mis_val_median,Cabin,Embarked,Embarked_mis_val_S,Family_Size,Fare,Fare_mis_val_median,Name,Parch,PassengerId,Pclass,Sex,sex_label_encoder,SibSp,Survived,Ticket
886,27.0,27.0,,S,S,0,13.0,13.0,"Montvila, Rev. Juozas",0,887,2,male,1,0,0,211536
887,19.0,19.0,B42,S,S,0,30.0,30.0,"Graham, Miss. Margaret Edith",0,888,1,female,0,0,1,112053
888,,28.0,,S,S,3,23.45,23.45,"Johnston, Miss. Catherine Helen ""Carrie""",2,889,3,female,0,1,0,W./C. 6607
889,26.0,26.0,C148,C,C,0,30.0,30.0,"Behr, Mr. Karl Howell",0,890,1,male,1,0,1,111369
890,32.0,32.0,,Q,Q,0,7.75,7.75,"Dooley, Mr. Patrick",0,891,3,male,1,0,0,370376


(891, 17)

In [14]:
# Summary statistics of the train and test frames.
display(train.describe(), test.describe())

Unnamed: 0,Age,Age_mis_val_median,Family_Size,Fare,Fare_mis_val_median,Parch,PassengerId,Pclass,sex_label_encoder,SibSp,Survived
count,714.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,29.699,29.362,0.905,32.204,32.204,0.382,446.0,2.309,0.648,0.523,0.384
std,14.526,13.02,1.613,49.693,49.693,0.806,257.354,0.836,0.478,1.103,0.487
min,0.42,0.42,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,20.125,22.0,0.0,7.91,7.91,0.0,223.5,2.0,0.0,0.0,0.0
50%,28.0,28.0,0.0,14.454,14.454,0.0,446.0,3.0,1.0,0.0,0.0
75%,38.0,35.0,1.0,31.0,31.0,0.0,668.5,3.0,1.0,1.0,1.0
max,80.0,80.0,10.0,512.329,512.329,6.0,891.0,3.0,1.0,8.0,1.0


Unnamed: 0,Age,Age_mis_val_median,Family_Size,Fare,Fare_mis_val_median,Parch,PassengerId,Pclass,sex_label_encoder,SibSp
count,332.0,418.0,418.0,417.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,30.273,29.599,0.84,35.627,35.577,0.392,1100.5,2.266,0.636,0.447
std,14.181,12.704,1.519,55.908,55.85,0.981,120.81,0.842,0.482,0.897
min,0.17,0.17,0.0,0.0,0.0,0.0,892.0,1.0,0.0,0.0
25%,21.0,23.0,0.0,7.896,7.896,0.0,996.25,1.0,0.0,0.0
50%,27.0,27.0,0.0,14.454,14.454,0.0,1100.5,3.0,1.0,0.0
75%,39.0,35.75,1.0,31.5,31.472,0.0,1204.75,3.0,1.0,1.0
max,76.0,76.0,10.0,512.329,512.329,9.0,1309.0,3.0,1.0,8.0


In [17]:
# Check the missing values in each dataset
display(
    missing_values_table(train),
    missing_values_table(test)
)

Unnamed: 0,Age,Age_mis_val_median,Cabin,Embarked,Embarked_mis_val_S,Family_Size,Fare,Fare_mis_val_median,Name,Parch,PassengerId,Pclass,Sex,sex_label_encoder,SibSp,Survived,Ticket
Total,177,0,687,2,0,0,0,0,0,0,0,0,0,0,0,0,0
Percent,19.865,0.000,77.104,0.224,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Types,float64,float64,object,object,object,int64,float64,float64,object,int64,int64,int64,object,int64,int64,int64,object


Unnamed: 0,Age,Age_mis_val_median,Cabin,Embarked,Embarked_mis_val_S,Family_Size,Fare,Fare_mis_val_median,Name,Parch,PassengerId,Pclass,Sex,sex_label_encoder,SibSp,Ticket
Total,86,0,327,0,0,0,1,0,0,0,0,0,0,0,0,0
Percent,20.574,0.000,78.230,0.000,0.000,0.000,0.239,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Types,float64,float64,object,object,object,int64,float64,float64,object,int64,int64,int64,object,int64,int64,object
