LGBMで、欠損値を埋めて学習する

In [1]:
import pandas as pd
import numpy as np

In [88]:
# Load the raw train / test CSV files.
train = pd.read_csv('../../dataset/train.csv')
# The test rows get an all-NaN 'Transported' column so that both frames share
# the same columns and the unlabelled rows can be recognised after stacking.
test = pd.read_csv('../../dataset/test.csv').assign(Transported=np.nan)
# Stack train and test so every preprocessing step is applied to both at once.
# NOTE(review): the recorded cell output shows a warning pointing at this
# concat call — presumably pandas' FutureWarning about all-NA columns; confirm.
train_test = pd.concat(
    (train, test),
    axis=0,
    ignore_index=True,
    sort=False,
)

def split_cabin(df):
    """Split the 'Cabin' string on '/' into deck / number / side columns.

    The three parts are named 'CabinDeck', 'CabinNum' (cast to float) and
    'CabinSide'. The input frame is not modified; a new DataFrame with the
    extra columns appended on the right is returned.
    """
    parts = df['Cabin'].str.split('/', expand=True)
    parts = parts.rename(columns={0: 'CabinDeck', 1: 'CabinNum', 2: 'CabinSide'})
    # The cabin number arrives as text; store it as a float so it is numeric.
    parts = parts.astype({'CabinNum': float})
    return pd.concat([df, parts], axis=1)

def make_group(df):
    """Derive group features from 'PassengerId' and add them to ``df`` in place.

    'PassengerId' is split on '_': the first part becomes 'GroupId' and the
    second part 'PeopleId'. 'IsGroup' is True for every row whose 'GroupId'
    occurs more than once in ``df``. Returns the same DataFrame object.
    """
    id_parts = [passenger_id.split('_') for passenger_id in df['PassengerId']]
    df['GroupId'] = [parts[0] for parts in id_parts]
    df['PeopleId'] = [parts[1] for parts in id_parts]
    group_ids = df['GroupId']
    # keep=False flags every member of a repeated group, not only the repeats.
    df['IsGroup'] = group_ids.duplicated(keep=False)
    return df

def total_bill(df):
    """Add a 'TotalBill' column: the row-wise sum of the five spending columns.

    Sums 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa' and 'VRDeck'.
    ``df`` is modified in place and returned.
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # DataFrame.sum skips NaN by default, so a missing amount counts as 0.
    df['TotalBill'] = df[spend_cols].sum(axis=1)
    return df

def binalize_bill(df, th1=5000, th2=20000):
    """Add 'TotalBill' and a three-level 'BillBins' column to ``df`` in place.

    Bins (half-open intervals, so every total belongs to exactly one bin):
        TotalBill <  th1          -> 0
        th1 <= TotalBill < th2    -> 2
        TotalBill >= th2          -> 3

    The labels 0 / 2 / 3 are kept as they were so previously produced
    features stay comparable. The only behavioural change is the boundary:
    a total exactly equal to ``th1`` used to fall through both comparisons
    and land in the top bin (3); it now lands in the middle bin.

    Returns the same DataFrame object.
    """
    df = total_bill(df)
    bill = df['TotalBill']
    # Conditions are evaluated in order; the first match wins.
    df['BillBins'] = np.select([bill < th1, bill < th2], [0, 2], default=3)
    return df

def is_adult(df):
    """Add a boolean 'IsAdult' column (Age of 18 or more) to ``df`` in place.

    Rows whose 'Age' is missing compare as False. Returns the same DataFrame.
    """
    ages = df['Age']
    df['IsAdult'] = ages.ge(18)
    return df

# Run the feature-engineering steps, in order, over the combined frame.
train_test = (
    train_test
    .pipe(split_cabin)
    .pipe(make_group)
    .pipe(binalize_bill)
    .pipe(is_adult)
)

# Preview the first rows with the new columns.
train_test.head()

  train_test = pd.concat([train, test], axis=0, ignore_index=True, sort=False)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Transported,CabinDeck,CabinNum,CabinSide,GroupId,PeopleId,IsGroup,TotalBill,BillBins,IsAdult
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,0.0,B,0.0,P,1,1,False,0.0,0,True
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,1.0,F,0.0,S,2,1,False,736.0,0,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,0.0,A,0.0,S,3,1,True,10383.0,2,True
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,0.0,A,0.0,S,3,2,True,5176.0,2,True
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,1.0,F,1.0,S,4,1,False,1091.0,0,False


### 使う特徴量を選ぶ

In [87]:
# Keep only the model inputs plus the label column ('Transported', last).
train_test = train_test[[
    # raw passenger attributes
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    # cabin / group features
    'CabinDeck', 'CabinNum', 'CabinSide', 'IsGroup',
    # spending columns and their aggregates
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'TotalBill', 'BillBins',
    'IsAdult',
    # label (NaN for test rows)
    'Transported',
]]
train_test.head()

KeyError: "['TotalBill', 'BillBins'] not in index"

### 欠損値を埋める

- HomePlanet→最頻値
- CryoSleep→最頻値
- Destination→最頻値
- Age→中央値で埋める
- VIP→VIPなしで埋める
- CabinNum→最頻値で埋める
- CabinSide→CabinNum==82となっているCabinSideの最頻値で埋める？→Pで埋める
- 上記以外の特徴量（CabinDeck、RoomServiceなどの各料金）→最頻値で埋める

In [None]:
# Impute every feature column. The label column is skipped: its NaN values
# are what identify the test rows.
for col in train_test.columns:
    if col == 'Transported':
        continue
    column = train_test[col]
    if col == 'CabinSide':
        fill_value = 'P'                 # fixed choice
    elif col == 'VIP':
        fill_value = False               # treat unknown as non-VIP
    elif col == 'Age':
        fill_value = column.median()
    else:
        fill_value = column.mode()[0]    # most frequent value
    train_test[col] = column.fillna(fill_value)

In [None]:
# Per column: does any missing value remain?
train_test.isna().any()

HomePlanet      False
CryoSleep       False
Destination     False
Age             False
VIP             False
CabinDeck       False
CabinNum        False
CabinSide       False
IsGroup         False
RoomService     False
FoodCourt       False
ShoppingMall    False
Spa             False
VRDeck          False
TotalBill       False
BillBins        False
IsAdult         False
Transported      True
dtype: bool

In [None]:
# Display the combined frame.
train_test

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,CabinDeck,CabinNum,CabinSide,IsGroup,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalBill,BillBins,IsAdult,Transported
0,Europa,False,TRAPPIST-1e,39.0,False,B,0.0,P,False,0.0,0.0,0.0,0.0,0.0,0.0,0,True,0.0
1,Earth,False,TRAPPIST-1e,24.0,False,F,0.0,S,False,109.0,9.0,25.0,549.0,44.0,736.0,0,True,1.0
2,Europa,False,TRAPPIST-1e,58.0,True,A,0.0,S,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,2,True,0.0
3,Europa,False,TRAPPIST-1e,33.0,False,A,0.0,S,True,0.0,1283.0,371.0,3329.0,193.0,5176.0,2,True,0.0
4,Earth,False,TRAPPIST-1e,16.0,False,F,1.0,S,False,303.0,70.0,151.0,565.0,2.0,1091.0,0,False,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,Earth,True,TRAPPIST-1e,34.0,False,G,1496.0,S,True,0.0,0.0,0.0,0.0,0.0,0.0,0,True,
12966,Earth,False,TRAPPIST-1e,42.0,False,F,82.0,P,False,0.0,847.0,17.0,10.0,144.0,1018.0,0,True,
12967,Mars,True,55 Cancri e,27.0,False,D,296.0,P,False,0.0,0.0,0.0,0.0,0.0,0.0,0,False,
12968,Europa,False,TRAPPIST-1e,27.0,False,D,297.0,P,False,0.0,2680.0,0.0,0.0,523.0,3203.0,0,False,


### Encoding

In [None]:
# One-hot encode HomePlanet, Destination, CabinDeck and CabinSide. Each source
# column is removed and its '<column>_<value>' indicator columns are appended
# at the right-hand end of the frame.
train_test = pd.get_dummies(
    train_test,
    columns=['HomePlanet', 'Destination', 'CabinDeck', 'CabinSide'],
)

In [None]:
# Convert boolean-valued columns to numbers.
def bool2int(df):
    """Convert boolean-valued columns of ``df`` to numeric 0/1 values in place.

    * Columns with ``bool`` dtype are cast to float (True -> 1.0, False -> 0.0).
    * Object columns are mapped True -> 1 and False -> 0, with missing values
      kept as NaN, but only when every non-missing value can be mapped.
      Any other object column (e.g. one holding strings) is left untouched;
      previously such a column was silently replaced by an all-NaN column.

    Returns the same DataFrame object.
    """
    mapping = {True: 1, False: 0}
    for col in df.columns:
        column = df[col]
        if column.dtype == bool:
            df[col] = column.astype(float)
        elif column.dtype == 'object':
            mapped = column.map(mapping)
            # Values missing from the mapping become NaN; convert only if the
            # mapping introduced no missing values that were not already there.
            if mapped.isna().equals(column.isna()):
                df[col] = mapped
    return df

# Apply the boolean-to-numeric conversion to the combined frame.
train_test = bool2int(train_test)

In [None]:
# Display the combined frame.
train_test

Unnamed: 0,CryoSleep,Age,VIP,CabinNum,IsGroup,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,...,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,CabinSide_P,CabinSide_S
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
1,0.0,24.0,0.0,0.0,0.0,109.0,9.0,25.0,549.0,44.0,...,0,0,0,0,0,1,0,0,0,1
2,0.0,58.0,1.0,0.0,1.0,43.0,3576.0,0.0,6715.0,49.0,...,1,0,0,0,0,0,0,0,0,1
3,0.0,33.0,0.0,0.0,1.0,0.0,1283.0,371.0,3329.0,193.0,...,1,0,0,0,0,0,0,0,0,1
4,0.0,16.0,0.0,1.0,0.0,303.0,70.0,151.0,565.0,2.0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,1.0,34.0,0.0,1496.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,1
12966,0.0,42.0,0.0,82.0,0.0,0.0,847.0,17.0,10.0,144.0,...,0,0,0,0,0,1,0,0,1,0
12967,1.0,27.0,0.0,296.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
12968,0.0,27.0,0.0,297.0,0.0,0.0,2680.0,0.0,0.0,523.0,...,0,0,0,1,0,0,0,0,1,0


### モデリング

In [None]:
import lightgbm as lgbm

In [79]:
# Keyword arguments that are unpacked into lgbm.LGBMClassifier(**params).
params = dict(
    objective='binary',
    num_leaves=64,
    min_child_samples=20,
    max_depth=7,
    n_estimators=100,
    learning_rate=0.1,
    importance_type='gain',
)

### 学習

In [80]:
import os
import random
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import accuracy_score

In [81]:
def set_seed(seed=3407):
    """Seed Python's ``random`` and NumPy's global RNG, and set PYTHONHASHSEED.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes — confirm.
    """
    os.environ['PYTHONHASHSEED'] = f'{seed}'
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Single seed shared by the RNG setup and the K-fold split.
SEED = 3407
set_seed(SEED)

In [82]:
# Separate labelled rows (train/validation) from unlabelled rows (test):
# test rows are the ones whose 'Transported' value is NaN.
is_labelled = train_test['Transported'].notna()
trainval = train_test[is_labelled]
test = train_test[~is_labelled]

# Split into model inputs and labels, converting to NumPy arrays.
x_trainval = trainval.drop(columns='Transported').values
y_trainval = trainval['Transported'].values
x_test = test.drop(columns='Transported').values

In [83]:
# Show the shapes of the train/validation inputs, their labels and the test inputs.
x_trainval.shape, y_trainval.shape, x_test.shape

((8693, 20), (8693,), (4277, 20))

In [84]:
# 5-fold cross-validation: fit one LightGBM classifier per fold and keep every
# fitted model for the test-time ensemble.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

models = []
val_scores = []
for fold, (train_inds, val_inds) in enumerate(kf.split(x_trainval)):
    x_train, y_train = x_trainval[train_inds], y_trainval[train_inds]
    x_val, y_val = x_trainval[val_inds], y_trainval[val_inds]

    # The held-out fold is used both for early stopping (logloss) and for the
    # accuracy reported below.
    model = lgbm.LGBMClassifier(**params)
    model.fit(
        x_train, y_train,
        eval_set=(x_val, y_val),
        eval_metric='logloss',
        callbacks=[lgbm.early_stopping(10)],
    )

    y_val_pred = model.predict(x_val)
    score = accuracy_score(y_val, y_val_pred)
    print(f'fold {fold}/acc: {score}')

    val_scores.append(score)
    models.append(model)

# Mean validation accuracy over the folds.
cv_score = np.mean(val_scores)
print(f'CV score: {cv_score}')

ValueError: could not convert string to float: 'Europa'

In [85]:
# Collect the per-fold feature importances into one table (one column per model).
#
# The models were fitted on trainval with 'Transported' dropped, so the row
# labels are derived the same way. Slicing off the last column instead
# (columns[:-1]) mislabels the features whenever 'Transported' is not the
# final column, which is the case once one-hot columns have been appended.
feature_names = trainval.drop('Transported', axis=1).columns

importance = pd.DataFrame()
for i, fitted in enumerate(models, start=1):
    df = pd.DataFrame(fitted.feature_importances_,
                      index=feature_names, columns=[f'model{i}'])
    df = df.sort_values(f'model{i}', ascending=False)
    importance = pd.concat([importance, df], axis=1)

importance

### submit用のcsv作成

cvごとの推論の単純平均

In [72]:
# Average the class probabilities predicted by every fold model, then take
# the most probable class for each test row.
y_preds = [predictor.predict_proba(x_test) for predictor in models]

mean_proba = np.mean(y_preds, axis=0)
ensemble = np.argmax(mean_proba, axis=-1)

In [74]:
# Re-read the passenger ids from the original test file; they become the index
# of the submission frame.
test_ids = pd.read_csv('../../dataset/test.csv')['PassengerId']

# Predicted class indices are stored as booleans in the 'Transported' column.
df_submit = pd.DataFrame(ensemble, index=test_ids, columns=['Transported'])
df_submit['Transported'] = df_submit['Transported'].astype(bool)

In [75]:
# Display the submission frame.
df_submit

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [76]:
# Write the submission file; the PassengerId index is written as the first column.
# NOTE(review): assumes the 'submission' directory already exists — confirm.
df_submit.to_csv('submission/lgbm_trial8.csv')