In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

### Load Data

In [2]:
# Download the Titanic dataset (version 1) from OpenML.
data = fetch_openml('titanic', version=1)

# Labels first, then the feature table without the 'boat' and 'body' columns.
# NOTE(review): presumably these two columns are dropped because they reveal
# the outcome - confirm against the dataset description.
y = data.target
X = data.data.drop(['boat', 'body'], axis=1)

In [3]:
# Cast the labels to integers and remove the 'name' column from the features.
y = y.astype('int')
X = X.drop('name', axis=1)

In [4]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Data Preprocessing

In [5]:
# Handle missing values.
# Every fill value is computed once, from the training split only, and then
# applied to both splits so that no statistic is derived from the test rows.
# 'fare' is imputed in the training split as well: the original cell only
# filled it in the test split, which leaves NaN in the training data whenever
# the split places a missing fare there.
fill_values = {
    'age': X_train['age'].mean(),
    'fare': X_train['fare'].mean(),
    'cabin': X_train['cabin'].mode()[0],
    'embarked': X_train['embarked'].mode()[0],
    'home.dest': '결측치',  # explicit placeholder category for a missing value
}

for column, fill_value in fill_values.items():
    X_train[column] = X_train[column].fillna(fill_value)
    X_test[column] = X_test[column].fillna(fill_value)

In [6]:
# Encode categorical variables as numbers.
def encode_with_train_vocab(train_col, test_col, vocab=None):
    """Integer-encode a train/test column pair with indices learned from training data.

    Parameters
    ----------
    train_col, test_col : pandas.Series
        The same feature taken from the training and the test split.
    vocab : iterable, optional
        Values to index, in order (position in ``vocab`` becomes the code).
        Defaults to ``train_col.unique()``.

    Returns
    -------
    tuple of pandas.Series
        ``(encoded_train, encoded_test)``. Test values that are not part of
        the vocabulary are encoded as -1.

    Raises
    ------
    KeyError
        If the training column holds a value that is not in ``vocab``.
    """
    if vocab is None:
        vocab = train_col.unique()
    value2idx = {value: idx for idx, value in enumerate(vocab)}
    encoded_train = train_col.map(lambda value: value2idx[value])
    encoded_test = test_col.map(lambda value: value2idx.get(value, -1))
    return encoded_train, encoded_test


# 'sex' keeps the explicit coding male -> 0, female -> 1. Going through the
# same mapping helper avoids Series.replace on a categorical column, which
# pandas 2.2 deprecates when the replacement changes the categories.
X_train['sex'], X_test['sex'] = encode_with_train_vocab(
    X_train['sex'], X_test['sex'], vocab=['male', 'female'])

# Free-text columns: the vocabulary is whatever appears in the training split.
for column in ['ticket', 'cabin']:
    X_train[column], X_test[column] = encode_with_train_vocab(
        X_train[column], X_test[column])

# 'embarked' is indexed by its declared categories rather than by order of
# appearance in the training split.
X_train['embarked'], X_test['embarked'] = encode_with_train_vocab(
    X_train['embarked'], X_test['embarked'],
    vocab=X_train['embarked'].cat.categories)

X_train['home.dest'], X_test['home.dest'] = encode_with_train_vocab(
    X_train['home.dest'], X_test['home.dest'])

# Cast the encoded 'sex' and 'embarked' columns to float so they are plain
# numeric columns (mapping a categorical column may keep a categorical dtype).
numeric_cast = ['sex', 'embarked']
X_train[numeric_cast] = X_train[numeric_cast].astype('float')
X_test[numeric_cast] = X_test[numeric_cast].astype('float')

### Modeling

#### AdaBoost
- `base_estimator`: 부스팅에 사용할 기본 모델 (default: DecisionTreeClassifier); scikit-learn 1.2부터는 `estimator`로 이름이 바뀜
- `n_estimators`: 사용할 예측기 수 (default: 50)
- `learning_rate`: 부스팅에 사용할 학습률 (default: 1.0)
- `random_state`: 시드값 고정에 사용 (default: None)

In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Three AdaBoost variants that differ only in the weak learner.
# The weak learner is passed positionally: its keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` from 1.2 on (the old keyword was
# removed in 1.4), while the first positional slot works with either name.
ada_clf1 = AdaBoostClassifier(LogisticRegression(), random_state=0)
ada_clf2 = AdaBoostClassifier(random_state=0)  # library-default weak learner
ada_clf3 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=5), random_state=0)

In [9]:
# Fit each AdaBoost variant on the preprocessed training split.
ada_clf1.fit(X_train, y_train)
ada_clf2.fit(X_train, y_train)
ada_clf3.fit(X_train, y_train)

In [10]:
# Predict on the test split with each AdaBoost variant.
pred1, pred2, pred3 = (
    model.predict(X_test) for model in (ada_clf1, ada_clf2, ada_clf3)
)

# Print the test F1 score of each variant, one line per model.
ada_reports = (
    ('AdaBoost(Logistic Regression)', pred1),
    ('AdaBoost(DecisionTree)', pred2),
    ('AdaBoost(DecisionTree with Parameter)', pred3),
)
for title, y_hat in ada_reports:
    print(title, f1_score(y_test, y_hat))

AdaBoost(Logistic Regression) 0.7054545454545454
AdaBoost(DecisionTree) 0.6449864498644987
AdaBoost(DecisionTree with Parameter) 0.6912751677852349


#### GBM
- `loss`: 학습에 사용할 loss (default: log_loss)
- `learning_rate`: 부스팅에 사용할 학습률 (default: 0.1)
- `n_estimators`: 사용할 예측기 수 (default: 100)
- `subsample`: 각 base learner에 사용될 sample의 비율 (default: 1.0)
- `random_state`: 시드값 고정에 사용 (default: None)
- `verbose`: 학습 과정 print 여부 (default: 0)
- DecisionTree 관련 파라미터
  - `criterion`
  - `min_samples_split`
  - `min_samples_leaf`
  - `min_weight_fraction_leaf`
  - `max_depth`
  - `min_impurity_decrease`
  - `init`
  - `max_features`
  - `max_leaf_nodes`
- 조기 종료 관련 파라미터
  - `validation_fraction`: 조기 종료 알고리즘에서 사용할 검증 데이터셋 비율
  - `n_iter_no_change`: 조기 종료 알고리즘 사용 여부 (default: None); int
  - `tol`: 조기 종료를 위한 tolerance (`n_iter_no_change`번 연속으로 loss가 최소 tolerance만큼 개선되지 않으면 학습 종료)

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

In [12]:
# Baseline gradient boosting model with library defaults.
gbm_clf1 = GradientBoostingClassifier(random_state=0)

# Variant with an explicit learning rate and number of boosting stages.
gbm_clf2 = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.5,
    random_state=0,
)

In [13]:
# Fit both gradient boosting models on the preprocessed training split.
gbm_clf1.fit(X_train, y_train)
gbm_clf2.fit(X_train, y_train)

In [14]:
# Predict on the test split and print the F1 score of each GBM model.
# The labels name the GBM configurations being scored; the original labels
# were copied from the AdaBoost section and misattributed these results.
pred1 = gbm_clf1.predict(X_test)
pred2 = gbm_clf2.predict(X_test)
print('GBM(default)', f1_score(y_test, pred1))
print('GBM(learning_rate=0.5, n_estimators=150)', f1_score(y_test, pred2))

AdaBoost(Logistic Regression) 0.7018867924528301
AdaBoost(DecisionTree) 0.7291666666666666


In [15]:
# Same learning rate and stage budget as gbm_clf2, plus early stopping:
# 30% of the training data is set aside for validation and training stops
# after 2 iterations without improvement.
gbm_clf3 = GradientBoostingClassifier(
    n_estimators=150,
    learning_rate=0.5,
    n_iter_no_change=2,
    validation_fraction=0.3,
    random_state=0,
)

In [16]:
# Fit the early-stopping GBM on the preprocessed training split.
gbm_clf3.fit(X_train, y_train)

In [17]:
# Score the early-stopping GBM on the test split.
# The label names the model being scored; the original label was copied from
# the AdaBoost section and misattributed this result.
pred3 = gbm_clf3.predict(X_test)
print('GBM(early stopping)', f1_score(y_test, pred3))

AdaBoost(DecisionTree) 0.7338129496402876


#### XGBoost
- `booster`: 부스팅에 사용할 기본 모델 (default: gbtree) ; gblinear, dart
- `learning_rate`: 학습률 (default: 0.3)
- `n_estimators`: 사용할 예측기 수 (default: 100)
- `early_stopping_rounds`: 조기 종료를 위한 반복 횟수 (default: None)
- `random_state`: 시드값 고정에 사용 (default: None)
- DecisionTree 관련 파라미터
    - `max_depth`
    - `min_child_weight`
    - `gamma`
    - `subsample`
    - `colsample_bytree`
    - `reg_lambda`
    - `reg_alpha`
    - `scale_pos_weight`
- 학습 테스크 파라미터
    - `objective`: 부스팅에 사용할 손실함수
    - `eval_metric`: 검증에 사용하는 함수 정의
    - `eval_set`: 검증 데이터셋

In [None]:
!pip install xgboost

In [18]:
from xgboost import XGBClassifier

In [19]:
# Carve a validation set (30%) out of the training split, with a fixed seed.
X_train1, X_val, y_train1, y_val = train_test_split(X_train, y_train, test_size=.3, random_state=0)

In [20]:
# Linear booster instead of the tree booster.
xgb_clf1 = XGBClassifier(
    random_state=0,
    booster='gblinear',
)

# Library-default booster and settings.
xgb_clf2 = XGBClassifier(random_state=0)

# Default booster with early stopping after 10 rounds without improvement.
xgb_clf3 = XGBClassifier(
    random_state=0,
    early_stopping_rounds=10,
)

In [21]:
# The first two models are fit on the full training split.
xgb_clf1.fit(X_train, y_train)
xgb_clf2.fit(X_train, y_train)
# The early-stopping model is fit on the reduced training split and is given
# the held-out validation set as its evaluation set.
xgb_clf3.fit(X_train1, y_train1, eval_set=[(X_val, y_val)])

[0]	validation_0-logloss:0.58347
[1]	validation_0-logloss:0.52048
[2]	validation_0-logloss:0.48587
[3]	validation_0-logloss:0.46578
[4]	validation_0-logloss:0.45537
[5]	validation_0-logloss:0.44177
[6]	validation_0-logloss:0.43734
[7]	validation_0-logloss:0.43816
[8]	validation_0-logloss:0.43846
[9]	validation_0-logloss:0.43940
[10]	validation_0-logloss:0.44287
[11]	validation_0-logloss:0.44022
[12]	validation_0-logloss:0.43691
[13]	validation_0-logloss:0.43900
[14]	validation_0-logloss:0.44325
[15]	validation_0-logloss:0.44414
[16]	validation_0-logloss:0.44414
[17]	validation_0-logloss:0.45129
[18]	validation_0-logloss:0.45338
[19]	validation_0-logloss:0.44934
[20]	validation_0-logloss:0.45125
[21]	validation_0-logloss:0.45390


In [22]:
# Predict on the test split with each XGBoost model.
pred1, pred2, pred3 = (
    model.predict(X_test) for model in (xgb_clf1, xgb_clf2, xgb_clf3)
)

# Print the test F1 score of each model, one line per model.
xgb_reports = (
    ('XGBoost(gblinear)', pred1),
    ('XGBoost(gbtree)', pred2),
    ('XGBoost(gbtree with early stopping)', pred3),
)
for title, y_hat in xgb_reports:
    print(title, f1_score(y_test, y_hat))

XGBoost(gblinear) 0.6911764705882353
XGBoost(gbtree) 0.6971830985915493
XGBoost(gbtree with early stopping) 0.7106227106227107


#### LightGBM
- `boosting_type`: 부스팅에 사용할 기본 모델 (default: gbdt)
  - `gbdt`: Gradient Boosting Decision Tree
  - `dart`: Dropouts meet Multiple Additive Regression Trees
  - `goss`: Gradient-based One-Side Sampling
  - `rf`: Random Forest
- `learning_rate`: 부스팅에 사용할 학습률 (default: 0.1)
- `n_estimators`: 사용할 예측기 수 (default: 100)
- Decision Tree 관련 파라미터
    - `max_depth`
    - `min_child_weight`
    - `num_leaves`
    - `subsample`
    - `colsample_bytree`
    - `reg_lambda`
    - `reg_alpha`
- 학습 테스크 파라미터
    - `objective`: 부스팅에 사용할 손실함수
    - `eval_metric`: 검증에 사용하는 함수 정의
    - `eval_set`: 검증 데이터셋

In [None]:
!pip install lightgbm

In [23]:
from lightgbm import LGBMClassifier

In [24]:
# Baseline LightGBM with library defaults.
lgbm_clf1 = LGBMClassifier(random_state=0)

# Variant with an explicit learning rate and tree budget, plus early stopping
# after 30 rounds without improvement.
lgbm_clf2 = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.3,
    early_stopping_rounds=30,
    random_state=0,
)

In [25]:
# The baseline is fit on the full training split.
lgbm_clf1.fit(X_train, y_train)
# The early-stopping model is fit on the reduced training split and is given
# the held-out validation set as its evaluation set.
lgbm_clf2.fit(X_train1, y_train1, eval_set=[(X_val, y_val)])

[1]	valid_0's binary_logloss: 0.572011
[2]	valid_0's binary_logloss: 0.510866
[3]	valid_0's binary_logloss: 0.481122
[4]	valid_0's binary_logloss: 0.463911
[5]	valid_0's binary_logloss: 0.457126
[6]	valid_0's binary_logloss: 0.453414
[7]	valid_0's binary_logloss: 0.451973
[8]	valid_0's binary_logloss: 0.44606
[9]	valid_0's binary_logloss: 0.455186
[10]	valid_0's binary_logloss: 0.452561
[11]	valid_0's binary_logloss: 0.459888
[12]	valid_0's binary_logloss: 0.460494
[13]	valid_0's binary_logloss: 0.461906
[14]	valid_0's binary_logloss: 0.468642
[15]	valid_0's binary_logloss: 0.475631
[16]	valid_0's binary_logloss: 0.486172
[17]	valid_0's binary_logloss: 0.488036
[18]	valid_0's binary_logloss: 0.489016
[19]	valid_0's binary_logloss: 0.492068
[20]	valid_0's binary_logloss: 0.496865
[21]	valid_0's binary_logloss: 0.502909
[22]	valid_0's binary_logloss: 0.505333
[23]	valid_0's binary_logloss: 0.508246
[24]	valid_0's binary_logloss: 0.510548
[25]	valid_0's binary_logloss: 0.511735
[26]	valid

In [26]:
# Predict on the test split with each LightGBM model.
pred1, pred2 = (model.predict(X_test) for model in (lgbm_clf1, lgbm_clf2))

# Print the test F1 score of each model, one line per model.
lgbm_reports = (
    ('LGBM', pred1),
    ('LGBM(tuning)', pred2),
)
for title, y_hat in lgbm_reports:
    print(title, f1_score(y_test, y_hat))

LGBM 0.6875000000000001
LGBM(tuning) 0.6821705426356589


#### CatBoost

In [None]:
!pip install catboost

In [27]:
from catboost import CatBoostClassifier

In [28]:
# CatBoost on the numerically encoded features; verbose=0 silences the
# per-iteration training log.
cat_clf = CatBoostClassifier(random_state=0)
cat_clf.fit(X_train, y_train, verbose=0)

# Score the model on the test split.
pred1 = cat_clf.predict(X_test)
catboost_f1 = f1_score(y_test, pred1)
print('CatBoost', catboost_f1)

CatBoost 0.7089552238805971


범주형 변수 입력

In [29]:
# Split X and y again with the same test size and seed as the first split,
# replacing the encoded X_train / X_test produced by the earlier cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)

# Show the first three training rows.
X_train.head(3)

Unnamed: 0,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home.dest
501,2.0,female,13.0,0.0,1.0,250644,19.5,,S,"England / Bennington, VT"
588,2.0,female,4.0,1.0,1.0,29103,23.0,,S,"Cornwall / Akron, OH"
402,2.0,female,30.0,1.0,0.0,SC/PARIS 2148,13.8583,,C,"Barcelona, Spain / Havana, Cuba"


In [30]:
# Handle missing values.
# Every fill value is computed once, from the training split only, and then
# applied to both splits so that no statistic is derived from the test rows.
# 'fare' is imputed in the training split as well: the original cell only
# filled it in the test split, which leaves NaN in the training data whenever
# the split places a missing fare there.
fill_values = {
    'age': X_train['age'].mean(),
    'fare': X_train['fare'].mean(),
    'cabin': X_train['cabin'].mode()[0],
    'embarked': X_train['embarked'].mode()[0],
    'home.dest': '결측치',  # explicit placeholder category for a missing value
}

for column, fill_value in fill_values.items():
    X_train[column] = X_train[column].fillna(fill_value)
    X_test[column] = X_test[column].fillna(fill_value)

In [31]:
# Print the column dtypes and non-null counts of the imputed training split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 916 entries, 501 to 684
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     916 non-null    float64 
 1   sex        916 non-null    category
 2   age        916 non-null    float64 
 3   sibsp      916 non-null    float64 
 4   parch      916 non-null    float64 
 5   ticket     916 non-null    object  
 6   fare       916 non-null    float64 
 7   cabin      916 non-null    object  
 8   embarked   916 non-null    category
 9   home.dest  916 non-null    object  
dtypes: category(2), float64(5), object(3)
memory usage: 66.4+ KB


In [32]:
# Columns handed to CatBoost as categorical features.
categorical_cols = ['sex', 'ticket', 'cabin', 'embarked', 'home.dest']

# verbose=0 silences the per-iteration training log.
cat_clf = CatBoostClassifier(random_state=0, cat_features=categorical_cols)
cat_clf.fit(X_train, y_train, verbose=0)

# Score the model on the test split.
pred1 = cat_clf.predict(X_test)
catboost_f1 = f1_score(y_test, pred1)
print('CatBoost', catboost_f1)

CatBoost 0.739622641509434
