## 모듈

In [54]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

## 전처리

In [15]:
def check_fillna( df ):
    df.loc[:, "Age"].fillna( df.loc[:,"Age"].mean(), inplace=True)
    df.loc[:,"Cabin"].fillna("N", inplace=True)
    df.loc[:,"Embarked"].fillna("N", inplace=True)
    df.loc[:,"Fare"].fillna( 0, inplace=True) # 실제 누락은 없음.
    return df


def drop_feature( df):
    # 생존여부와 상관이 없는 이름, id, Ticket
    # ===> pandas에서 지울 떄 : drop + axis=0/1
    df.drop( ["PassengerId", "Name", "Ticket"],axis=1, inplace=True )
    return df

def encode_features( df ):
    df.loc[:,"Cabin"] = df.loc[:,"Cabin"].apply(lambda x: str(x)[:1])
    cols = ["Cabin","Sex","Embarked"]
    for col in cols:
        print(col)
        le = LabelEncoder()
        le.fit( df.loc[:, col])
        df.isetitem( df.columns.get_loc(col), le.transform( df.loc[:, col]))
    return df

def titanic_preprocessing( df ):
    df = check_fillna( df )
    df = drop_feature( df )
    df = encode_features( df )
    #.......
    return df

In [16]:
# 다시 데이터를 불러와서 적용을 하려고 함!!~
path = "train.csv"
data = pd.read_csv(path)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
y_titanic = data.loc[:,"Survived"]
X_titanic = data.drop("Survived", axis=1)

X_titanic = titanic_preprocessing( X_titanic)

Cabin
Sex
Embarked


In [21]:
X_train, X_val, y_train, y_val = train_test_split(X_titanic, 
                                                  y_titanic, 
                                                  test_size = 0.2,
                                                  random_state= 1234,
                                                  stratify=y_titanic)

## xgboost

In [27]:
kfold = KFold(n_splits= 5, random_state=1234, shuffle = True)

In [28]:
# base-line
xgbc = XGBClassifier(n_jobs=-1, random_state=1234)
scores = cross_val_score( xgbc,
                         X_train, y_train,
                          cv = kfold, scoring="accuracy")
print(scores)
print(scores.mean())
print(scores.std())

[0.81118881 0.81818182 0.75352113 0.8028169  0.80985915]
0.7991135624938441
0.023312068454051538


In [29]:
xgbc = XGBClassifier(n_jobs=-1, random_state=1234)

parameters = {
    "n_estimators": [ 10, 30, 50, 100,300,500],
    "learning_rate":[0.01, 0.1, 0.2, 0.3], # 0.001~0.01~0.2~3

    "max_depth":[2,3,4,5,6,10], # 트리 깊이    
    "gamma":[0.1, 0.2, 0.3], # 지표를 중심으로 더 분화할지 말지...

    # GBM  기본적으로 overfit 이 심한 모델임으로 샘플링을 통해서 overfit을 조절

    "subsample": [0.3, 0.4,0.5,0.6, 0.9],# 가로에 대한 비율
    "colsample_bytree":[0.3, 0.4,0.5,0.6, 0.9],# 세로에 대한 비율

    # reg_alpha ,reg_lambda 에서 규약의 정도를 통해서...컨트롤...
}
n_iter = 10 # 시간 관계상 10개만 하는 거임.

xgbc_kf_rgs = RandomizedSearchCV(
    xgbc,
    param_distributions = parameters,
    cv = kfold,
    scoring="accuracy",
    random_state=1234,
    n_iter = n_iter,
    n_jobs = -1
)
xgbc_kf_rgs.fit(X_train, y_train)

In [30]:
xgbc_kf_rgs.cv_results_

{'mean_fit_time': array([0.06742001, 0.39884477, 0.23937578, 0.06790218, 0.03199649,
        0.08500242, 0.33471708, 0.04701667, 0.42770596, 0.02477679]),
 'std_fit_time': array([0.01580351, 0.03221492, 0.01718961, 0.02056718, 0.01926287,
        0.01747779, 0.02387073, 0.00617823, 0.03989204, 0.00515724]),
 'mean_score_time': array([0.00942707, 0.01515927, 0.00832033, 0.01114583, 0.00678115,
        0.00782509, 0.00997086, 0.00911317, 0.00564423, 0.00600228]),
 'std_score_time': array([0.00388891, 0.00434205, 0.00236425, 0.00885076, 0.00222099,
        0.00227761, 0.00413282, 0.0037391 , 0.00125496, 0.00128426]),
 'param_subsample': masked_array(data=[0.3, 0.6, 0.4, 0.3, 0.9, 0.5, 0.6, 0.4, 0.9, 0.5],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[30, 500, 300, 50, 10, 100, 300, 50, 500, 10],
              mask=[False, False, Fal

In [31]:
xgbc_kf_rgs.best_params_

{'subsample': 0.3,
 'n_estimators': 30,
 'max_depth': 5,
 'learning_rate': 0.1,
 'gamma': 0.1,
 'colsample_bytree': 0.9}

In [32]:
# cv를 한 결과의 평균의 최대값임을 유의!!!!!!!
xgbc_kf_rgs.best_score_

0.8272136314389836

In [33]:
# 중간 점검 val
xgb_kfold_rgs_best = xgbc_kf_rgs.best_estimator_
xgb_kfold_rgs_ypred = xgb_kfold_rgs_best.predict(X_val)
xgb_kfold_rgs_acc = accuracy_score(y_val, xgb_kfold_rgs_ypred)
xgb_kfold_rgs_acc

0.8156424581005587

In [36]:
parameters ={
    "subsample":[0.4, 0.3,0.5],
    "n_estimators":[300,250, 350, 500],
    "max_depth":[2,3],
    "gamma":[0.2, 0.3],
    "colsample_bytree":[0.6, 0.5],
    "learning_rate":[0.01, 0.005, 0.02]

}

xgbc = XGBClassifier(n_jobs=-1, random_state=1234,verbosity=3 )
xgb_kf_gs = GridSearchCV(
    xgbc,
    param_grid = parameters,
    cv = kfold,
    n_jobs=-1,
    verbose = 2
)
xgb_kf_gs.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
[10:58:48] AllReduce: 0.000183s, 1 calls @ 183us

[10:58:48] MakeCuts: 0.000286s, 1 calls @ 286us

[10:58:48] DEBUG: C:\b\abs_0fh_d4x2ng\croot\xgboost-split_1713973188995\work\cpp_src\src\gbm\gbtree.cc:130: Using tree method: 0
[10:58:48] Configure: 0.000639s, 1 calls @ 639us

[10:58:48] EvalOneIter: 0.002581s, 250 calls @ 2581us

[10:58:48] GetGradient: 0.023502s, 250 calls @ 23502us

[10:58:48] PredictRaw: 0.000361s, 250 calls @ 361us

[10:58:48] UpdateOneIter: 0.233201s, 250 calls @ 233201us

[10:58:48] BoostNewTrees: 0.206776s, 250 calls @ 206776us

[10:58:48] CommitModel: 0.000184s, 250 calls @ 184us

[10:58:48] BuildHistogram: 0.044895s, 500 calls @ 44895us

[10:58:48] EvaluateSplits: 0.028796s, 750 calls @ 28796us

[10:58:48] InitData: 0.009673s, 250 calls @ 9673us

[10:58:48] InitRoot: 0.018624s, 250 calls @ 18624us

[10:58:48] LeafPartition: 5e-05s, 250 calls @ 50us

[10:58:48] UpdatePosition: 0.060878s, 750 calls

*** 모델링이나 데이터 분석시 숫자에 매몰되지 말 것.  
왜 그런 숫자가 나오는지, 다각적 시도, 관찰 등이 필요함!


In [37]:
xgb_kf_gs.best_score_

0.8398502905545158

In [38]:
xgb_kf_gs.best_params_

{'colsample_bytree': 0.6,
 'gamma': 0.3,
 'learning_rate': 0.01,
 'max_depth': 3,
 'n_estimators': 250,
 'subsample': 0.5}

In [39]:
xgb_kf_gs_best = xgb_kf_gs.best_estimator_
xgb_kf_gs_ypred = xgb_kf_gs_best.predict(X_val)
xgb_kf_gs_acc  = accuracy_score(y_val, xgb_kf_gs_ypred)
xgb_kf_gs_acc

[10:58:48] DEBUG: C:\b\abs_0fh_d4x2ng\croot\xgboost-split_1713973188995\work\cpp_src\src\gbm\gbtree.cc:130: Using tree method: 0


0.8100558659217877

In [None]:
# train 성능 0.83
# test 성능  0.81

# train 이 잘 되었다고 할 수 있지만, 어떻게 보면 overfit 됐을 수도 있음

## LGB

In [42]:
# baseline을 체크
lgmc = LGBMClassifier(n_jobs=-1, random_state=1234)
scores = cross_val_score( lgmc,
                         X_train, y_train,
                          cv = kfold, scoring="accuracy")
scores.mean()

[LightGBM] [Info] Number of positive: 222, number of negative: 347
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 187
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.390158 -> initscore=-0.446647
[LightGBM] [Info] Start training from score -0.446647
[LightGBM] [Info] Number of positive: 213, number of negative: 356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 190
[LightGBM] [Info] Number of data points in the train set: 569, number of used features: 8
[LightGBM] [Info] [binary:BoostFro

0.8272136314389836

In [50]:
# RGS
lgbc = LGBMClassifier(n_jobs=-1, random_state=1234)
parameters={
    "n_estimators" : [10,30, 50, 100,300,500], #몇개 모델을 사용할지
    # 2) 어느 정도 비율을 반영할지
    "learning_rate" : [0.01,0.03, 0.05,0.1,0.3],

    "max_depth": [2,3,4], # 개별 모델은 : 일반적으로 트리 --> 오버핏을 컨트롤!!!
    "min_split_gain":[0.1,0.2] #xgboost 의 gamma 같은 거
}

n_iter = 10

lgbc_kf_rgs  = RandomizedSearchCV(
    lgbc,
    param_distributions = parameters,
    cv = kfold,
    scoring = "accuracy",
    n_jobs = -1,
    n_iter = n_iter,
    verbose = 2,
)
lgbc_kf_rgs.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383427 -> initscore=-0.475028
[LightGBM] [Info] Start training from score -0.475028


In [51]:
lgbc_kf_rgs.best_score_

0.8342361863488623

In [52]:
lgbc_kf_rgs.best_params_

{'n_estimators': 300,
 'min_split_gain': 0.2,
 'max_depth': 4,
 'learning_rate': 0.1}

In [53]:
# --> 평가
lgbc_kf_rgs_best = lgbc_kf_rgs.best_estimator_
lgbc_kf_rgs_ypred = lgbc_kf_rgs_best.predict(X_val)
lgbc_kf_rgs_acc = accuracy_score(y_val, lgbc_kf_rgs_ypred)
lgbc_kf_rgs_acc

0.8379888268156425

## voting

In [56]:
# 방법 1) 1인 1표제 : Hard Voting
hard_clf = VotingClassifier(
    # 어떤 모델들을 가지고 다수결 투표를 할지
    estimators = [
                  ("XGB", xgb_kf_gs_best),
                   ("LGBM",lgbc_kf_rgs_best)],
    # 어떤 방식으로 의견을 종합을 할지
    voting= "hard",
    n_jobs = -1
)
hard_clf.fit(X_train, y_train)
hard_clf_ypred = hard_clf.predict(X_val)
hard_clf_acc = accuracy_score(y_val, hard_clf_ypred)
hard_clf_acc

[11:47:00] DEBUG: C:\b\abs_0fh_d4x2ng\croot\xgboost-split_1713973188995\work\cpp_src\src\gbm\gbtree.cc:130: Using tree method: 0


0.8044692737430168

In [None]:
# 방법2-2) soft voting을 통해서 가중치를 부여해서...
# 지분율 : lightGBM 2, RF 1
# ===> Try의 영역...코드짜서 돌림...
soft_clf = VotingClassifier(
    estimators = [ ("RF", rf_kf_rgs_best),
                   ("LGBM",lgbc_kf_rgs_best)],
    # 어떤 방식으로 의견을 종합을 할지
    voting= "soft",
    # +++ 지분에 대한 설정!!!!
    weights=[1,2],
    n_jobs = -1
)
soft_clf.fit(X_train, y_train)
accuracy_score(y_val, soft_clf.predict(X_val))