In [1]:
import pandas as pd
import numpy as np

In [49]:
# Read the raw train and test CSVs.
train = pd.read_csv('../../dataset/train.csv')
# The test file gets a placeholder, all-NaN `Transported` column so that both
# splits can be concatenated and preprocessed in a single pass.
test = pd.read_csv('../../dataset/test.csv').assign(Transported=np.nan)
# Stack train on top of test under a fresh 0..n-1 index; rows whose
# `Transported` is NaN are the test rows.
train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)

# split `Cabin` on `/` into deck / num / side columns
def split_cabin(df):
    """Return a new frame: `df` plus `CabinDeck`, `CabinNum` and `CabinSide`.

    The `Cabin` string is split on '/' into three pieces; the middle piece
    (`CabinNum`) is cast to float. `df` itself is not modified.
    """
    piece_names = {0: 'CabinDeck', 1: 'CabinNum', 2: 'CabinSide'}
    pieces = df['Cabin'].str.split('/', expand=True)
    pieces = pieces.rename(columns=piece_names).astype({'CabinNum': float})
    return pd.concat([df, pieces], axis=1)

# derive group membership from PassengerId
def make_group(df):
    """Add `GroupId`, `PeopleId` and `IsGroup` columns to `df` in place.

    `PassengerId` is split on '_': the first piece becomes `GroupId`, the
    second `PeopleId`. `IsGroup` is True for every row whose `GroupId` occurs
    more than once in `df`. The same (mutated) frame is returned.
    """
    passenger_ids = df['PassengerId']
    for position, column in enumerate(('GroupId', 'PeopleId')):
        # Bind `position` as a default so each lambda picks its own piece.
        df[column] = passenger_ids.map(lambda pid, k=position: pid.split('_')[k])
    df['IsGroup'] = df['GroupId'].duplicated(keep=False)
    return df

# total of the five spending columns
def total_bill(df):
    """Add a `TotalBill` column to `df` in place and return `df`.

    `TotalBill` is the row-wise sum of RoomService, FoodCourt, ShoppingMall,
    Spa and VRDeck. Missing amounts are skipped by the sum, so a row with no
    recorded spending gets 0.0.
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalBill'] = df.loc[:, spend_cols].sum(axis='columns')
    return df

# bin TotalBill into 3 classes
def binalize_bill(df, th1=5000, th2=20000):
    """Add `TotalBill` (via `total_bill`) and a 3-level `BillBins` column.

    Bin labels (kept as 0 / 2 / 3 for compatibility with earlier runs):
        0 -> TotalBill <  th1
        2 -> th1 <= TotalBill < th2
        3 -> TotalBill >= th2

    A total exactly equal to `th1` belongs to the middle bin. The previous
    strict `x > th1` test pushed that boundary value into the top bin.

    Args:
        df: frame holding the spending columns that `total_bill` sums.
        th1: lower threshold (inclusive lower edge of the middle bin).
        th2: upper threshold (exclusive upper edge of the middle bin).

    Returns:
        The frame returned by `total_bill`, with `BillBins` added.
    """
    df = total_bill(df)
    spend = df['TotalBill']
    # Conditions are checked in order, so the second one only applies to
    # rows with spend >= th1; everything else falls through to the default.
    df['BillBins'] = np.select([spend < th1, spend < th2], [0, 2], default=3)
    return df


# Apply the feature-engineering steps in order:
# cabin split -> group flags -> bill bins.
train_test = (
    train_test
    .pipe(split_cabin)
    .pipe(make_group)
    .pipe(binalize_bill)
)

  train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)


In [50]:
# Preview the first rows of the feature-engineered frame.
train_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Name,Transported,CabinDeck,CabinNum,CabinSide,GroupId,PeopleId,IsGroup,TotalBill,BillBins
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Maham Ofracculy,0.0,B,0.0,P,1,1,False,0.0,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Juanna Vines,1.0,F,0.0,S,2,1,False,736.0,0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Altark Susent,0.0,A,0.0,S,3,1,True,10383.0,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Solam Susent,0.0,A,0.0,S,3,2,True,5176.0,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Willy Santantines,1.0,F,1.0,S,4,1,False,1091.0,0


### 使う特徴量を選ぶ

In [52]:
# Keep only the model inputs plus the label (`Transported`, last column).
# `.copy()` turns the selection into an independent frame, so the column
# assignments in the encoding cells below write to it directly instead of
# hitting pandas' SettingWithCopyWarning / chained-assignment behaviour.
train_test = train_test[[
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'CabinDeck', 'CabinNum', 'CabinSide', 'IsGroup', 'BillBins',
    'Transported',
]].copy()
train_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNum,CabinSide,IsGroup,BillBins,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,B,0.0,P,False,0,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,F,0.0,S,False,0,1.0
2,Europa,False,TRAPPIST-1e,58.0,True,A,0.0,S,True,2,0.0
3,Europa,False,TRAPPIST-1e,33.0,False,A,0.0,S,True,2,0.0
4,Earth,False,TRAPPIST-1e,16.0,False,F,1.0,S,False,0,1.0


### Encoding

In [53]:
# Label-encode HomePlanet, Destination, CabinDeck and CabinSide.
# factorize numbers the categories in order of first appearance; missing
# values come out as -1 (visible in the frame displayed further down).
for col in ('HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'):
    train_test[col] = train_test[col].factorize()[0]

In [54]:
# convert boolean columns to numeric 0/1
def bool2int(df):
    """Convert boolean-valued columns of `df` to float 0.0 / 1.0 in place.

    Two kinds of column are converted:
      * columns with dtype `bool`;
      * `object` columns whose non-missing values are all booleans (what
        pandas produces for a True/False column that contains NaN).
    Missing values stay NaN, which is why the result is float, not int.

    Object columns that hold anything else (e.g. strings) are left untouched.
    Previously every object column was mapped through {True: 1, False: 0},
    which silently replaced non-boolean content with NaN.

    Args:
        df: frame to convert; it is modified in place.

    Returns:
        The same frame, for call chaining.
    """
    for col in df.columns:
        column = df[col]
        if column.dtype == bool:
            df[col] = column.astype(float)
        elif column.dtype == object:
            present = column.dropna()
            all_bool = present.map(lambda v: isinstance(v, (bool, np.bool_))).all()
            if all_bool:
                # astype(float) keeps the dtype the same whether or not the
                # column contains NaN.
                df[col] = column.map({True: 1.0, False: 0.0}).astype(float)
    return df

# Convert the boolean-like columns of the combined frame to numeric values.
train_test = bool2int(train_test)

In [55]:
# Display the encoded frame (train rows first, then test rows with NaN labels).
train_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNum,CabinSide,IsGroup,BillBins,Transported
0,0,0.0,0,39.0,0.0,0,0.0,0,0.0,0,0.0
1,1,0.0,0,24.0,0.0,1,0.0,1,0.0,0,1.0
2,0,0.0,0,58.0,1.0,2,0.0,1,1.0,2,0.0
3,0,0.0,0,33.0,0.0,2,0.0,1,1.0,2,0.0
4,1,0.0,0,16.0,0.0,1,1.0,1,0.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
12965,1,1.0,0,34.0,0.0,3,1496.0,1,1.0,0,
12966,1,0.0,0,42.0,0.0,-1,,-1,0.0,0,
12967,2,1.0,2,,0.0,5,296.0,0,0.0,0,
12968,0,0.0,-1,,0.0,5,297.0,0,0.0,0,


### モデリング

In [56]:
import lightgbm as lgbm

In [78]:
# Keyword arguments passed to lgbm.LGBMClassifier for every CV fold.
# NOTE(review): the training logs in this notebook report "Did not meet early
# stopping" with best iteration 100 on every fold, i.e. training is ended by
# the n_estimators cap rather than by early stopping. Consider a larger cap.
params = dict(
    objective='binary',
    num_leaves=64,
    min_child_samples=20,
    max_depth=7,
    n_estimators=100,
    learning_rate=0.1,
    importance_type='gain',
)

### 学習

In [79]:
import os
import random
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import accuracy_score

In [80]:
def set_seed(seed=3407):
    """Seed Python's `random` module and NumPy's global RNG with `seed`.

    Also exports PYTHONHASHSEED. Python reads that variable at interpreter
    start-up, so setting it here only affects processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by the global RNGs (set here) and the KFold splitter below.
SEED = 3407
set_seed(SEED)

In [81]:
# Split the combined frame back apart: rows with a label are train/validation
# data, rows whose label is NaN are the test set.
trainval = train_test[train_test['Transported'].notna()]
test = train_test[train_test['Transported'].isna()]
# Separate the inputs from the label and convert them to NumPy arrays.
x_trainval = trainval.drop(columns='Transported').to_numpy()
y_trainval = trainval['Transported'].to_numpy()
x_test = test.drop(columns='Transported').to_numpy()

In [82]:
# Sanity check: train/validation inputs and labels have the same number of
# rows, and the test inputs have the same number of feature columns.
x_trainval.shape, y_trainval.shape, x_test.shape

((8693, 10), (8693,), (4277, 10))

In [83]:
# 5-fold cross-validation with shuffled splits, seeded for repeatability.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

val_scores = []  # validation accuracy of each fold
models = []  # fitted classifier of each fold, reused later for the test-set ensemble
for fold, (train_inds, val_inds) in enumerate(kf.split(x_trainval)):
    
    # Index the feature matrix and label vector with this fold's row positions.
    x_train, x_val = x_trainval[train_inds], x_trainval[val_inds]
    y_train, y_val = y_trainval[train_inds], y_trainval[val_inds]
    
    # Fit a fresh model per fold; training stops early once the validation
    # logloss has not improved for 10 rounds.
    # NOTE(review): the fold that drives early stopping is also the fold that
    # is scored below, so the reported accuracy may be slightly optimistic.
    model = lgbm.LGBMClassifier(**params)
    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
        eval_metric='logloss',
        callbacks=[lgbm.early_stopping(10)],
    )

    # Score the held-out fold with plain accuracy.
    y_val_pred = model.predict(x_val)
    score = accuracy_score(y_val, y_val_pred)
    print(f'fold {fold}/acc: {score}')
    val_scores.append(score)
    models.append(model)

# Mean validation accuracy across the folds.
cv_score = np.mean(val_scores)
print(f'CV score: {cv_score}')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.532328
fold 0/acc: 0.753306497987349
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.544102
fold 1/acc: 0.7366302472685451
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.539985
fold 2/acc: 0.7429557216791259
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.527581
fold 3/acc: 0.7560414269275029
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.545369
fold 4/acc: 0.738204833141542
CV score: 0.745427745400813


In [68]:
# Display the raw training frame as loaded from train.csv (before feature engineering).
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [69]:
# Feature-importance table: one row per feature, one column per fold model,
# rows ordered by the first model's importance (highest first).
# The feature names come from the same selection `x_trainval` was built from
# (every column except the label), so the table does not depend on the label
# being the last column. The frame is built in one step instead of
# concatenating onto an empty DataFrame once per model.
feature_names = trainval.drop('Transported', axis=1).columns
importance = pd.DataFrame(
    {f'model{i + 1}': fitted.feature_importances_ for i, fitted in enumerate(models)},
    index=feature_names,
)
if models:
    importance = importance.sort_values('model1', ascending=False)

importance

Unnamed: 0,model1,model2,model3,model4,model5
CryoSleep,8430.44046,8883.079793,8834.195355,8412.231927,8911.470625
CabinNum,2608.090066,2922.019158,2557.348591,2742.70399,2544.62932
CabinDeck,2562.065828,2350.483142,1948.556273,2001.835092,2619.483338
Age,2386.539911,2520.346632,2189.766457,2215.003698,2096.417908
HomePlanet,1527.05267,1727.300131,1814.624478,1950.087278,1219.224805
CabinSide,695.487307,686.659612,692.257259,673.803086,675.609868
Destination,344.420637,343.670484,272.256892,343.916619,211.09323
BillBins,81.670128,168.384421,248.750187,241.970298,150.96364
IsGroup,60.208711,95.312137,70.446637,67.187202,40.696837
VIP,50.98609,40.27737,43.914153,35.0213,23.89718


### submit用のcsv作成

cvごとの推論の単純平均

In [70]:
# Class-probability predictions on the test inputs, one array per fold model.
y_preds = [fold_model.predict_proba(x_test) for fold_model in models]

In [71]:
# Average the fold models' probabilities, then take the most likely class
# column per row.
# NOTE(review): this treats column index 0/1 as the class label itself;
# presumably the models' classes_ are [0., 1.] -- confirm.
ensemble = np.stack(y_preds).mean(axis=0).argmax(axis=-1)

In [72]:
# Display the encoded test rows (here `test` is the NaN-label slice of
# `train_test`, not the raw test.csv frame).
test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNum,CabinSide,IsGroup,BillBins,Transported
8693,1,1.0,0,27.0,0.0,3,3.0,1,0.0,0,
8694,1,0.0,0,19.0,0.0,1,4.0,1,0.0,0,
8695,0,1.0,2,31.0,0.0,6,0.0,1,0.0,0,
8696,0,0.0,0,38.0,0.0,6,1.0,1,0.0,2,
8697,1,0.0,0,20.0,0.0,1,5.0,1,0.0,0,
...,...,...,...,...,...,...,...,...,...,...,...
12965,1,1.0,0,34.0,0.0,3,1496.0,1,1.0,0,
12966,1,0.0,0,42.0,0.0,-1,,-1,0.0,0,
12967,2,1.0,2,,0.0,5,296.0,0,0.0,0,
12968,0,0.0,-1,,0.0,5,297.0,0,0.0,0,


In [73]:
# Re-read the ids from disk: `test` was rebound to the encoded feature frame,
# which no longer has a PassengerId column.
test_ids = pd.read_csv('../../dataset/test.csv')['PassengerId']

In [74]:
# Display the test PassengerIds that will index the submission.
test_ids

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

In [75]:
# Build the submission frame: boolean `Transported` predictions indexed by
# PassengerId. Predictions are matched to ids by position, which relies on
# the test rows having kept the order of test.csv.
df_submit = pd.DataFrame(ensemble, index=test_ids, columns=['Transported'])
df_submit['Transported'] = df_submit['Transported'].astype(bool)

In [76]:
# Display the submission frame before writing it out.
df_submit

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,False


In [77]:
# to_csv does not create missing folders, so make sure the output directory
# exists first (a no-op when it is already there).
os.makedirs('submission', exist_ok=True)
# The PassengerId index is written as the first column of the file.
df_submit.to_csv('submission/lgbm_trial2.csv')