In [67]:
import pandas as pd

In [68]:
# Read the labelled training set and the unlabelled test set.
train_raw = pd.read_csv('../data/train.csv')
test_raw = pd.read_csv('../data/test.csv')

# Report the (rows, columns) shape of each split.
print('The size of the train data:{}'.format(train_raw.shape))
print('The size of the test data:{}'.format(test_raw.shape))

The size of the train data:(891, 12)
The size of the test data:(418, 11)


In [69]:
# Tag every row with the split it came from so the combined frame can be
# separated again once the shared preprocessing is done.
train_mid = train_raw.assign(train_or_test='Train')
# The test split gets a placeholder Survived value (9) so that both frames
# carry the same set of columns.
test_mid = test_raw.assign(train_or_test='Test').assign(Survived=9)

# Stack train on top of test and renumber the rows from 0.
alldata = pd.concat([train_mid, test_mid], axis=0, sort=False, ignore_index=True)

In [70]:
# Sanity check of the combined frame: its (rows, columns) size, then the first rows.
print((len(alldata.index), len(alldata.columns)))
alldata.head(n=5)

(1309, 13)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train_or_test
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Train
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Train
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Train
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Train
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Train


In [71]:
def _honorific_from_name(full_name):
    """Return the text between the first ', ' and the following '. ' of a name."""
    after_surname = full_name.split(', ')[1]
    return after_surname.split('. ')[0]


def _first_char_of_cabin(cabin):
    """Return the first character of str(cabin); a missing (NaN) cabin gives 'n'."""
    return str(cabin)[0]


# Honorific (e.g. Mr, Mrs, Miss) parsed from the name of every row, train and test alike.
alldata['honorific'] = alldata['Name'].map(_honorific_from_name)
# Family size: siblings/spouses plus parents/children plus the passenger.
alldata['FamilySize'] = 1 + alldata['SibSp'] + alldata['Parch']
# First character of the cabin code.
alldata['Cabin_ini'] = alldata['Cabin'].map(_first_char_of_cabin)

In [72]:
# Column roles: row identifier, prediction target, and model inputs.
id_col, target_col = 'PassengerId', 'Survived'
feature_cols = [
    # columns taken directly from the CSV files
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
    # columns derived in the feature-engineering cell
    'honorific', 'FamilySize', 'Cabin_ini',
]

In [73]:
# Names of the feature columns whose dtype is 'object'; these are the ones
# label-encoded later on.
feature_dtypes = alldata[feature_cols].dtypes
object_cols = feature_dtypes[feature_dtypes == 'object'].index

In [74]:
# Count the missing values in each column before imputing them.
alldata.isna().sum()

PassengerId         0
Survived            0
Pclass              0
Name                0
Sex                 0
Age               263
SibSp               0
Parch               0
Ticket              0
Fare                1
Cabin            1014
Embarked            2
train_or_test       0
honorific           0
FamilySize          0
Cabin_ini           0
dtype: int64

In [75]:
# Embarkedには最頻値を代入
alldata.Embarked.fillna(alldata.Embarked.mode()[0], inplace=True)
# Ageには中央値を代入
alldata.Age.fillna(alldata.Age.median(), inplace=True)
# Fareには中央値を代入
alldata.Fare.fillna(alldata.Fare.median(), inplace=True)

In [76]:
# カテゴリ特徴量についてラベルエンコーディング
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in object_cols:
    alldata.loc[:, col] = le.fit_transform(alldata[col])

In [77]:
# Preview the first rows after imputation and label encoding.
alldata.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,train_or_test,honorific,FamilySize,Cabin_ini
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2,Train,12,2,8
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0,Train,13,2,2
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2,Train,9,1,8
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2,Train,13,2,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2,Train,12,1,8


In [78]:
# Separate the preprocessed frame back into its two splits using the flag
# column added before concatenation.
split_flag = alldata['train_or_test']
train = alldata[split_flag == 'Train']
test = alldata[split_flag == 'Test']

In [79]:
from sklearn.model_selection import train_test_split

In [81]:
# Model inputs and labels taken from the labelled rows.
# (`train_tagert` keeps its existing spelling so any cell that refers to it still works.)
train_feature, train_tagert = train[feature_cols], train[target_col]

# Hold out 20% of the labelled rows, stratified on the label, with a fixed seed.
# X_test / y_test are this hold-out part of the training data, not the `test` frame.
split_options = dict(test_size=0.2, stratify=train_tagert, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    train_feature,
    train_tagert,
    **split_options
)

In [82]:
# Display the training part of the hold-out split (feature columns only).
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,honorific,FamilySize,Cabin_ini
502,3,0,28.0,0,0,7.6292,1,9,1,8
464,3,1,28.0,0,0,8.0500,2,12,1,8
198,3,0,28.0,0,0,7.7500,1,9,1,8
765,1,0,51.0,1,0,77.9583,2,13,2,3
421,3,1,21.0,0,0,7.7333,1,12,1,8
...,...,...,...,...,...,...,...,...,...,...
131,3,1,20.0,0,0,7.0500,2,12,1,8
490,3,1,28.0,1,0,19.9667,2,12,2,8
528,3,1,39.0,0,0,7.9250,2,12,1,8
48,3,1,28.0,2,0,21.6792,0,12,3,8


In [8]:
import optuna

In [4]:
import xgboost