In [1]:
import pandas as pd
import numpy as np

In [16]:
train = pd.read_csv('../../dataset/train.csv')
test = pd.read_csv('../../dataset/test.csv')
# 前処理を一度にやるためにtrainとtestをconcatする
test['Transported'] = np.nan
train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)

# split on `/` to cols (deck/num/side)
def split_cabin(df):
    cabin = df['Cabin'].str.split('/', expand=True).rename(columns={0: 'CabinDeck', 1: 'CabinNum', 2: 'CabinSide'})
    cabin['CabinNum'] = cabin['CabinNum'].astype(float)
    return pd.concat([df, cabin], axis=1)

# group passenger or not
def make_group(df):
    df['GroupId'] = df['PassengerId'].apply(lambda x: x.split('_')[0])
    df['PeopleId'] = df['PassengerId'].apply(lambda x: x.split('_')[1])
    df['IsGroup'] = df['GroupId'].duplicated(keep=False)
    return df

# total room service, etc...
def total_bill(df):
    df['TotalBill'] = df[
        ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    return df

# binalize 3 classes
def binalize_bill(df, th1=5000, th2=20000): 
    df = total_bill(df)
    df['BillBins'] = df['TotalBill'].apply(
        lambda x: 0 if x < th1 else (2 if x > th1 and x < th2 else 3))
    return df


train_test = split_cabin(train_test)
train_test = make_group(train_test)
train_test = binalize_bill(train_test)

  train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)


### 使う特徴量を選ぶ

In [17]:
train_test = train_test[['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP', 'CabinDeck', 'CabinNum', 'CabinSide', 'IsGroup', 'BillBins', 'Transported']]
train_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNum,CabinSide,IsGroup,BillBins,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,B,0.0,P,False,0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,F,0.0,S,False,0,1.0
2,Europa,False,TRAPPIST-1e,58.0,True,A,0.0,S,True,2,0.0
3,Europa,False,TRAPPIST-1e,33.0,False,A,0.0,S,True,2,0.0
4,Earth,False,TRAPPIST-1e,16.0,False,F,1.0,S,False,0,1.0


### Encoding

In [18]:
# HomePlanet, Destination, CabinSideはonehot encoding
for col in ['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide']:
    train_test = pd.concat([train_test,  pd.get_dummies(train_test[col], prefix=col)], axis=1)
    train_test = train_test.drop(col, axis=1)

In [19]:
# boolをintへ
def bool2int(df):
    for col in df.columns:
        if df[col].dtype == bool:
            df[col] = df[col].astype(float)
        if df[col].dtype == 'object':
            df[col] = df[col].map({True: 1, False: 0})
    return df

train_test = bool2int(train_test)

In [20]:
train_test

Unnamed: 0,CryoSleep,Age,VIP,CabinNum,IsGroup,BillBins,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S
0,0.0,39.0,0.0,0.0,0.0,0,0.0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
1,0.0,24.0,0.0,0.0,0.0,0,1.0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0.0,58.0,1.0,0.0,1.0,2,0.0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,0.0,33.0,0.0,0.0,1.0,2,0.0,0,1,0,...,1,0,0,0,0,0,0,0,0,1
4,0.0,16.0,0.0,1.0,0.0,0,1.0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,1.0,34.0,0.0,1496.0,1.0,0,,1,0,0,...,0,0,0,0,0,0,1,0,0,1
12966,0.0,42.0,0.0,,0.0,0,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12967,1.0,,0.0,296.0,0.0,0,,0,0,1,...,0,0,0,1,0,0,0,0,1,0
12968,0.0,,0.0,297.0,0.0,0,,0,1,0,...,0,0,0,1,0,0,0,0,1,0


### モデリング

In [21]:
import lightgbm as lgbm

In [22]:
params = {
    'objective': 'binary',
    'num_leaves': 64,
    'min_child_samples': 20,
    'max_depth': 7,
    'n_estimators': 100,
    'learning_rate': 0.1,
    'importance_type': 'gain',
}

### 学習

In [23]:
import os
import random
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import accuracy_score

In [24]:
def set_seed(seed=3407):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


SEED = 3407
set_seed(SEED)

In [25]:
# DataFrameをnp.ndarrayに変換
trainval = train_test[~train_test['Transported'].isna()]
test = train_test[train_test['Transported'].isna()]
# inputとlabelに分離
x_trainval = trainval.drop('Transported', axis=1).values
y_trainval = trainval.Transported.values
x_test = test.drop('Transported', axis=1).values

In [26]:
x_trainval.shape, y_trainval.shape, x_test.shape

((8693, 22), (8693,), (4277, 22))

In [27]:
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

val_scores = []
models = []
for fold, (train_inds, val_inds) in enumerate(kf.split(x_trainval)):
    
    x_train, x_val = x_trainval[train_inds], x_trainval[val_inds]
    y_train, y_val = y_trainval[train_inds], y_trainval[val_inds]
    
    model = lgbm.LGBMClassifier(**params)
    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
        eval_metric='logloss',
        callbacks=[lgbm.early_stopping(10)],
    )

    y_val_pred = model.predict(x_val)
    score = accuracy_score(y_val, y_val_pred)
    print(f'fold {fold}/acc: {score}')
    val_scores.append(score)
    models.append(model)

cv_score = np.mean(val_scores)
print(f'CV score: {cv_score}')

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.489887
fold 0/acc: 0.7527314548591144
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[35]	valid_0's binary_logloss: 0.499508
fold 1/acc: 0.7441058079355952
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[39]	valid_0's binary_logloss: 0.498089
fold 2/acc: 0.7475560667050029
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[32]	valid_0's binary_logloss: 0.483766
fold 3/acc: 0.766398158803222
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[38]	valid_0's binary_logloss: 0.502343
fold 4/acc: 0.7410817031070196
CV score: 0.7503746382819908


In [28]:
importance = pd.DataFrame()

for i in range(len(models)):
    df = pd.DataFrame(models[i].feature_importances_,
                      index=trainval.columns[:-1], columns=[f'model{i+1}'])
    df = df.sort_values(f'model{i+1}', ascending=False)
    importance = pd.concat([importance, df], axis=1)

importance

Unnamed: 0,model1,model2,model3,model4,model5
CryoSleep,8412.495599,8899.359484,8856.830072,8385.080112,8996.196618
Age,2345.292921,2276.744349,2554.193004,2145.014718,2461.889377
CabinNum,2268.839091,2837.537975,2715.492438,2644.926068,2771.747863
Transported,1915.278554,1879.352546,2241.320545,2076.902952,1894.17546
CabinDeck_D,739.7105,562.110243,569.984928,642.95044,614.505524
CabinDeck_F,556.880036,495.24872,179.890788,335.758039,474.229147
CabinDeck_T,477.171785,423.152614,375.000526,369.371438,513.122849
HomePlanet_Europa,368.049267,349.503714,358.335293,343.522509,299.373465
CabinSide_P,221.772248,216.493854,380.926987,306.8273,190.709163
Destination_PSO J318.5-22,201.636141,244.375115,257.746109,260.002163,220.633233


### submit用のcsv作成

cvごとの推論の単純平均

In [29]:
y_preds = []

for i in range(len(models)):
    predictor = models[i]
    y_pred = predictor.predict_proba(x_test)
    y_preds.append(y_pred)

ensemble = np.argmax(np.mean(y_preds, axis=0), axis=-1)

In [31]:
test_ids = pd.read_csv('../../dataset/test.csv')['PassengerId']

df_submit = pd.DataFrame(ensemble, index=test_ids, columns=['Transported'])
df_submit.Transported = df_submit.Transported.astype(bool)

In [76]:
df_submit

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,False


In [33]:
df_submit.to_csv('submission/lgbm_trial3.csv')