In [1]:
import pandas as pd
import numpy as np

In [94]:
# Load the raw train and test splits.
train = pd.read_csv('../../dataset/train.csv')
test = pd.read_csv('../../dataset/test.csv')
# Stack train and test so preprocessing is applied to both in a single pass.
# Test rows get a NaN target, which is how they are told apart again later.
test = test.assign(Transported=np.nan)
train_test = pd.concat([train, test], ignore_index=True, sort=False)

# split on `/` to cols (deck/num/side)
def split_cabin(df):
    """Return ``df`` with the ``Cabin`` string expanded into three extra columns.

    ``Cabin`` is split on ``/`` and the pieces are appended as ``CabinDeck``,
    ``CabinNum`` (cast to float) and ``CabinSide``. The input frame itself is
    not modified; a new concatenated frame is returned.
    """
    parts = df['Cabin'].str.split('/', expand=True)
    parts = parts.rename(columns={0: 'CabinDeck', 1: 'CabinNum', 2: 'CabinSide'})
    # Cast to float so rows with a missing cabin can stay NaN.
    parts = parts.assign(CabinNum=parts['CabinNum'].astype(float))
    return pd.concat([df, parts], axis=1)

# Append the CabinDeck / CabinNum / CabinSide columns to the combined frame.
train_test = split_cabin(train_test)

  train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)


### 使う特徴量を選ぶ

In [95]:
# Keep only the modelling features plus the target. The explicit .copy() makes
# this an independent frame, so the column assignments performed during
# encoding do not raise SettingWithCopyWarning on a slice of the wider frame.
train_test = train_test[['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'CabinSide', 'Transported']].copy()

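採用する特徴量と欠損値の補完方針（※以下の補完はこのノートブックでは未実装。欠損は HomePlanet / Destination / CabinSide では factorize により -1、それ以外の列では NaN のままモデルに渡している）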
- HomePlanet→最頻値
- CryoSleep→最頻値
- Destination→最頻値
- Age→中央値で埋める
- VIP→VIPなしで埋める
- CabinSide→CabinNum==82となっているCabinSideの最頻値で埋める？→Pで埋める

### Encoding

In [96]:
# Label-encode HomePlanet, Destination and CabinSide as integer codes.
# pd.factorize assigns -1 to missing values.
for col in ('HomePlanet', 'Destination', 'CabinSide'):
    codes, uniques = pd.factorize(train_test[col])
    train_test[col] = codes

In [97]:
# Convert boolean columns to numeric 0/1.
def bool2int(df):
    """Convert boolean-valued columns of ``df`` to numeric 0/1, in place.

    * Columns with ``bool`` dtype are cast to float (True -> 1.0, False -> 0.0).
    * ``object`` columns are converted only when every non-missing value is a
      boolean; True/False become 1/0 and missing values stay NaN.
    * Every other column is left untouched. This includes object columns that
      hold non-boolean data such as strings, which a blanket
      ``map({True: 1, False: 0})`` would silently turn into all-NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    for col in df.columns:
        series = df[col]
        if series.dtype == bool:
            df[col] = series.astype(float)
        elif series.dtype == 'object':
            non_null = series.dropna()
            # Object dtype is what a boolean column becomes once it holds NaN;
            # only such columns may be mapped safely.
            all_bool = non_null.map(lambda v: isinstance(v, (bool, np.bool_))).all()
            if all_bool:
                df[col] = series.map({True: 1, False: 0})
    return df

# Turn the True/False columns (including the target) into numeric 0/1.
train_test = bool2int(train_test)

In [106]:
train_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinSide,Transported
0,0,0.0,0,39.0,0.0,0,0.0
1,1,0.0,0,24.0,0.0,1,1.0
2,0,0.0,0,58.0,1.0,1,0.0
3,0,0.0,0,33.0,0.0,1,0.0
4,1,0.0,0,16.0,0.0,1,1.0
...,...,...,...,...,...,...,...
12965,1,1.0,0,34.0,0.0,1,
12966,1,0.0,0,42.0,0.0,-1,
12967,2,1.0,2,,0.0,0,
12968,0,0.0,-1,,0.0,0,


### モデリング

In [7]:
import lightgbm as lgbm

In [134]:
# Keyword arguments forwarded to lgbm.LGBMClassifier for every CV fold.
params = dict(
    objective='binary',
    num_leaves=64,
    min_child_samples=20,
    max_depth=7,
    n_estimators=100,  # upper bound on boosting rounds
    learning_rate=0.1,
    importance_type='gain',  # feature_importances_ reports gain
)

### 学習

In [109]:
import os
import random
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import accuracy_score

In [103]:
def set_seed(seed=3407):
    """Seed Python's ``random`` and NumPy's global RNG, and set PYTHONHASHSEED.

    Parameters
    ----------
    seed : int, default 3407
        Value passed to ``random.seed`` and ``np.random.seed`` and stored, as a
        string, in ``os.environ['PYTHONHASHSEED']``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by the global RNGs and the KFold shuffle.
SEED = 3407
set_seed(SEED)

In [115]:
# Split the combined frame back apart: rows with a target are train/validation
# data, rows whose target is NaN are the test set.
is_labeled = train_test['Transported'].notna()
trainval = train_test[is_labeled]
test = train_test[~is_labeled]
# Separate inputs from labels and convert the DataFrames to np.ndarray.
x_trainval = trainval.drop(columns='Transported').values
y_trainval = trainval['Transported'].values
x_test = test.drop(columns='Transported').values

In [116]:
x_trainval.shape, y_trainval.shape, x_test.shape

((8693, 6), (8693,), (4277, 6))

In [135]:
# 5-fold cross-validation: one LightGBM classifier is fitted per fold and kept
# in `models` so the fold models can be ensembled at inference time.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

val_scores, models = [], []
for fold, (fit_idx, holdout_idx) in enumerate(kf.split(x_trainval)):
    x_train, y_train = x_trainval[fit_idx], y_trainval[fit_idx]
    x_val, y_val = x_trainval[holdout_idx], y_trainval[holdout_idx]

    model = lgbm.LGBMClassifier(**params)
    # Early stopping watches logloss on the held-out fold. The accuracy below is
    # computed on that same fold, so it may be slightly optimistic.
    model.fit(
        x_train,
        y_train,
        eval_set=(x_val, y_val),
        eval_metric='logloss',
        callbacks=[lgbm.early_stopping(10)],
    )

    y_val_pred = model.predict(x_val)
    score = accuracy_score(y_val, y_val_pred)
    print(f'fold {fold}/acc: {score}')
    val_scores.append(score)
    models.append(model)

# Mean accuracy across the folds.
cv_score = np.mean(val_scores)
print(f'CV score: {cv_score}')

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[29]	valid_0's binary_logloss: 0.521327
fold 0/acc: 0.7406555491661875
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.529348
fold 1/acc: 0.7297297297297297
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[32]	valid_0's binary_logloss: 0.52962
fold 2/acc: 0.7360552041403106
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[38]	valid_0's binary_logloss: 0.506824
fold 3/acc: 0.7479861910241657
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[27]	valid_0's binary_logloss: 0.532549
fold 4/acc: 0.7295742232451093
CV score: 0.7368001794611005


In [137]:
# Feature-importance table: one column per fold model, one row per feature.
# Row labels come from the frame the models were actually trained on
# (`trainval` minus the target). The raw `train` CSV has a different column
# set, so using its columns fails with a length mismatch on a fresh run.
feature_names = trainval.columns.drop('Transported')

# Build the table in one shot instead of concatenating inside a loop, then
# order the rows by the first model's importance (largest first).
importance = pd.DataFrame(
    {f'model{i + 1}': fold_model.feature_importances_ for i, fold_model in enumerate(models)},
    index=feature_names,
).sort_values('model1', ascending=False)

importance

Unnamed: 0,model1,model2,model3,model4,model5
CryoSleep,8272.77794,8782.041676,8718.379874,8300.382968,8751.358323
HomePlanet,2808.461472,2749.416826,2729.368397,2805.426626,2712.392995
Age,2390.456423,2531.698564,2386.261402,2545.021514,2183.614813
CabinSide,397.43306,404.66977,474.556898,456.762055,381.101245
Destination,343.048017,317.61306,274.567159,355.458576,203.982948
VIP,62.632485,62.959859,62.558878,80.020328,29.483359


### submit用のcsv作成

cvごとの推論の単純平均

In [144]:
# Class-probability predictions on the test set, one array per fold model.
y_preds = [fold_model.predict_proba(x_test) for fold_model in models]

In [156]:
# Soft voting: average the fold models' class probabilities, then take the
# index of the most probable class for each test row.
mean_proba = np.mean(y_preds, axis=0)
ensemble = mean_proba.argmax(axis=-1)

In [158]:
test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinSide,Transported
8693,1,1.0,0,27.0,0.0,1,
8694,1,0.0,0,19.0,0.0,1,
8695,0,1.0,2,31.0,0.0,1,
8696,0,0.0,0,38.0,0.0,1,
8697,1,0.0,0,20.0,0.0,1,
...,...,...,...,...,...,...,...
12965,1,1.0,0,34.0,0.0,1,
12966,1,0.0,0,42.0,0.0,-1,
12967,2,1.0,2,,0.0,0,
12968,0,0.0,-1,,0.0,0,


In [159]:
# `test` was rebound earlier to the encoded feature frame, which no longer has
# a PassengerId column, so the ids are re-read from the raw CSV.
# NOTE(review): this assumes the raw file's row order matches x_test; the
# concat and the boolean-mask split above do not reorder rows.
test_ids = pd.read_csv('../../dataset/test.csv')['PassengerId']

In [160]:
test_ids

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

In [164]:
# Submission frame: PassengerId as the index, predicted class (0/1) as a bool.
df_submit = pd.DataFrame({'Transported': ensemble.astype(bool)}, index=test_ids)

In [165]:
df_submit

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,False


In [166]:
# Write the submission file; the PassengerId index is written as the first column.
df_submit.to_csv('lgbm_trial1.csv')