In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw train and test splits.
train = pd.read_csv('../../dataset/train.csv')
test = pd.read_csv('../../dataset/test.csv')

# Give the test split an all-NaN `Transported` column so both splits share the same
# columns, then stack them row-wise so preprocessing runs once over both.
test['Transported'] = np.nan
train_test = pd.concat((train, test), ignore_index=True, sort=False)

# Break the `Cabin` string ("deck/num/side") into three separate columns.
def split_cabin(df):
    """Return `df` with `CabinDeck`, `CabinNum` and `CabinSide` columns appended.

    `Cabin` is split on '/'; the middle part is cast to float. The input frame
    itself is not modified: a new, column-wise concatenated frame is returned.
    """
    parts = df['Cabin'].str.split('/', expand=True)
    parts = parts.rename(columns={0: 'CabinDeck', 1: 'CabinNum', 2: 'CabinSide'})
    parts = parts.assign(CabinNum=parts['CabinNum'].astype(float))
    return pd.concat([df, parts], axis=1)

# Derive group membership features from `PassengerId` ("<group>_<member>").
def make_group(df):
    """Add `GroupId`, `PeopleId` and `IsGroup` columns to `df` in place and return it.

    `GroupId` / `PeopleId` are the parts of `PassengerId` before / after the
    first two '_'-separated fields; `IsGroup` is True for every row whose
    `GroupId` occurs more than once in the frame.
    """
    id_parts = [passenger_id.split('_') for passenger_id in df['PassengerId']]
    df['GroupId'] = [parts[0] for parts in id_parts]
    df['PeopleId'] = [parts[1] for parts in id_parts]
    # keep=False flags every member of a repeated group, not just the later ones.
    df['IsGroup'] = df.duplicated(subset='GroupId', keep=False)
    return df

# Sum the five spending columns into a single total.
def total_bill(df):
    """Add a `TotalBill` column (row-wise sum of the spending columns) to `df` in place.

    Missing values are skipped by the sum, so a row with no recorded spending
    gets a total of 0. Returns the same frame that was passed in.
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalBill'] = df.loc[:, spend_cols].sum(axis='columns')
    return df

# Bucket the total bill into 3 ordinal classes.
def binalize_bill(df, th1=5000, th2=20000):
    """Add `TotalBill` and a 3-level `BillBins` column to `df` and return it.

    Bins (labels are kept as 0 / 2 / 3 for compatibility with earlier runs):
        0: TotalBill <  th1
        2: th1 <= TotalBill < th2
        3: TotalBill >= th2 (also used when the comparison fails, e.g. NaN)

    A bill exactly equal to `th1` used to satisfy neither `x < th1` nor
    `x > th1` and so fell into the top bin; it now belongs to the middle bin.
    """
    def _to_bin(amount):
        if amount < th1:
            return 0
        if amount < th2:
            return 2
        return 3

    df = total_bill(df)
    df['BillBins'] = df['TotalBill'].apply(_to_bin)
    return df


# Apply the feature-engineering steps in order on the combined frame.
train_test = (
    train_test
    .pipe(split_cabin)
    .pipe(make_group)
    .pipe(binalize_bill)
)

train_test.head()

  train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,Transported,CabinDeck,CabinNum,CabinSide,GroupId,PeopleId,IsGroup,TotalBill,BillBins
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Maham Ofracculy,0.0,B,0.0,P,1,1,False,0.0,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Juanna Vines,1.0,F,0.0,S,2,1,False,736.0,0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Altark Susent,0.0,A,0.0,S,3,1,True,10383.0,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Solam Susent,0.0,A,0.0,S,3,2,True,5176.0,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Willy Santantines,1.0,F,1.0,S,4,1,False,1091.0,0


### 使う特徴量を選ぶ

In [3]:
# Columns fed to the model; the target `Transported` is kept last.
selected_columns = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'CabinDeck', 'CabinNum', 'CabinSide', 'IsGroup',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalBill', 'BillBins', 'Transported',
]
# Take an explicit copy: the following cells assign into this frame, and writing
# into a column subset of another frame triggers SettingWithCopyWarning.
train_test = train_test[selected_columns].copy()
train_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNum,CabinSide,IsGroup,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalBill,BillBins,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,B,0.0,P,False,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,F,0.0,S,False,109.0,9.0,25.0,549.0,44.0,736.0,0,1.0
2,Europa,False,TRAPPIST-1e,58.0,True,A,0.0,S,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,2,0.0
3,Europa,False,TRAPPIST-1e,33.0,False,A,0.0,S,True,0.0,1283.0,371.0,3329.0,193.0,5176.0,2,0.0
4,Earth,False,TRAPPIST-1e,16.0,False,F,1.0,S,False,303.0,70.0,151.0,565.0,2.0,1091.0,0,1.0


### Encoding

In [4]:
# Label-encode the categorical columns (HomePlanet, Destination, CabinDeck, CabinSide).
# pd.factorize numbers the categories in order of first appearance and uses -1 for missing values.
label_encoded_cols = ('HomePlanet', 'Destination', 'CabinDeck', 'CabinSide')
for col in label_encoded_cols:
    codes, _uniques = pd.factorize(train_test[col])
    train_test[col] = codes

In [5]:
# Convert boolean columns to numeric 0/1 values.
def bool2int(df):
    """Turn boolean-valued columns of `df` into numbers, in place, and return `df`.

    * Columns with `bool` dtype are cast to float (True -> 1.0, False -> 0.0).
    * `object` columns are mapped True -> 1 / False -> 0 (missing values stay
      NaN), but only when every non-missing value is a boolean. Other object
      columns (e.g. free text) are left untouched; mapping them through the
      boolean lookup would silently replace every value with NaN.
    * All remaining columns are left as they are.
    """
    for col in df.columns:
        column = df[col]
        if column.dtype == bool:
            df[col] = column.astype(float)
        elif column.dtype == 'object':
            # Guard against wiping out non-boolean object columns.
            if column.dropna().isin([True, False]).all():
                df[col] = column.map({True: 1, False: 0})
    return df

# Encode the boolean-valued columns of the combined frame as numbers.
train_test = train_test.pipe(bool2int)

In [6]:
# Display the encoded frame to check the result of the encoding steps.
train_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNum,CabinSide,IsGroup,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalBill,BillBins,Transported
0,0,0.0,0,39.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
1,1,0.0,0,24.0,0.0,1,0.0,1,0.0,109.0,9.0,25.0,549.0,44.0,736.0,0,1.0
2,0,0.0,0,58.0,1.0,2,0.0,1,1.0,43.0,3576.0,0.0,6715.0,49.0,10383.0,2,0.0
3,0,0.0,0,33.0,0.0,2,0.0,1,1.0,0.0,1283.0,371.0,3329.0,193.0,5176.0,2,0.0
4,1,0.0,0,16.0,0.0,1,1.0,1,0.0,303.0,70.0,151.0,565.0,2.0,1091.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,1,1.0,0,34.0,0.0,3,1496.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,
12966,1,0.0,0,42.0,0.0,-1,,-1,0.0,0.0,847.0,17.0,10.0,144.0,1018.0,0,
12967,2,1.0,2,,0.0,5,296.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,
12968,0,0.0,-1,,0.0,5,297.0,0,0.0,0.0,2680.0,0.0,0.0,523.0,3203.0,0,


### モデリング

In [7]:
import lightgbm as lgbm

In [8]:
# LightGBM hyper-parameters, passed unchanged to the classifier of every CV fold.
params = dict(
    objective='binary',
    num_leaves=64,
    min_child_samples=20,
    max_depth=7,
    n_estimators=100,
    learning_rate=0.1,
    importance_type='gain',
)

### 学習

In [9]:
import os
import random
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import accuracy_score

In [10]:
def set_seed(seed=3407):
    """Seed Python's `random` module and NumPy's global RNG, and export PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes - confirm if hash
    determinism of the running kernel matters.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Seed the RNGs once, up front; SEED is reused as the KFold random_state below.
SEED = 3407
set_seed(SEED)

In [11]:
# Separate the combined frame again: rows with a known target are the labelled
# train/validation data, rows whose target is NaN are the test data.
labeled_mask = train_test['Transported'].notna()
trainval = train_test[labeled_mask]
test = train_test[~labeled_mask]

# Split into model inputs and labels, converted from DataFrame to np.ndarray.
x_trainval = trainval.drop(columns='Transported').to_numpy()
y_trainval = trainval['Transported'].to_numpy()
x_test = test.drop(columns='Transported').to_numpy()

In [12]:
# Sanity-check the shapes of the train/validation inputs, their labels, and the test inputs.
x_trainval.shape, y_trainval.shape, x_test.shape

((8693, 16), (8693,), (4277, 16))

In [13]:
# 5-fold cross-validation with shuffled folds; random_state=SEED fixes the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Per-fold validation accuracy and the fitted model of each fold
# (the models are reused below for feature importances and test predictions).
val_scores = []
models = []
for fold, (train_inds, val_inds) in enumerate(kf.split(x_trainval)):
    
    # Slice this fold's training and validation rows out of the full arrays.
    x_train, x_val = x_trainval[train_inds], x_trainval[val_inds]
    y_train, y_val = y_trainval[train_inds], y_trainval[val_inds]
    
    model = lgbm.LGBMClassifier(**params)
    # Track logloss on the validation fold and stop after 10 rounds without improvement.
    # NOTE(review): the same fold drives early stopping and is scored below, so the
    # reported accuracy may be slightly optimistic.
    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
        eval_metric='logloss',
        callbacks=[lgbm.early_stopping(10)],
    )

    # Score the fold by accuracy of the predicted class labels.
    y_val_pred = model.predict(x_val)
    score = accuracy_score(y_val, y_val_pred)
    print(f'fold {fold}/acc: {score}')
    val_scores.append(score)
    models.append(model)

# Overall CV score: unweighted mean of the per-fold accuracies.
cv_score = np.mean(val_scores)
print(f'CV score: {cv_score}')

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.39323
fold 0/acc: 0.8073605520414031
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[74]	valid_0's binary_logloss: 0.386443
fold 1/acc: 0.8039102932719954
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.390511
fold 2/acc: 0.8188614146060954
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[82]	valid_0's binary_logloss: 0.36024
fold 3/acc: 0.8216340621403913
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[58]	valid_0's binary_logloss: 0.403059
fold 4/acc: 0.8032220943613348
CV score: 0.810997683284244


In [14]:
# Collect every fold model's feature importances side by side, one column per model.
# Feature names are obtained by dropping the target by name, so they match the
# columns of `x_trainval` regardless of where `Transported` sits in the frame.
feature_names = trainval.columns.drop('Transported')
importance = pd.DataFrame(
    {f'model{i}': fold_model.feature_importances_
     for i, fold_model in enumerate(models, start=1)},
    index=feature_names,
)
# Order the rows by the first model's importance, largest first.
if not importance.empty:
    importance = importance.sort_values(importance.columns[0], ascending=False)

importance

Unnamed: 0,model1,model2,model3,model4,model5
TotalBill,9360.58796,10478.166259,9561.23613,9750.274666,10052.534656
FoodCourt,2763.997299,2611.305867,2580.081395,2655.734526,2712.765811
CabinDeck,2730.686847,3015.125435,2591.282426,3091.934238,3097.59214
CabinNum,2301.829554,2882.668986,2832.117375,3034.916254,2480.619622
ShoppingMall,1981.005704,1904.279493,1938.316729,2172.126754,1909.639289
Spa,1980.201709,2235.143953,2174.694192,2188.989385,2178.768243
VRDeck,1977.364962,1974.525679,1972.561193,2160.147645,1966.745961
HomePlanet,1341.190551,1086.267734,1348.991078,1106.292429,757.072696
Age,1152.903322,1222.619514,1422.224274,1322.449219,1161.565697
RoomService,1144.70573,1281.989235,1293.018167,1386.533249,1252.09137


### submit用のcsv作成

cvごとの推論の単純平均

In [20]:
# Class probabilities for the test rows from every fold model.
y_preds = [fold_model.predict_proba(x_test) for fold_model in models]

# Average the probabilities across models, then pick the most probable class per row.
ensemble = np.mean(y_preds, axis=0).argmax(axis=-1)

In [21]:
# Re-read the passenger ids from the raw test file to use as the submission index.
test_ids = pd.read_csv('../../dataset/test.csv')['PassengerId']

# One boolean `Transported` prediction per test passenger.
df_submit = pd.DataFrame({'Transported': ensemble.astype(bool)}, index=test_ids)

In [22]:
# Preview the submission frame.
df_submit

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [19]:
# Create the output directory if needed; to_csv does not create missing folders.
os.makedirs('submission', exist_ok=True)
df_submit.to_csv('submission/lgbm_trial5.csv')