In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 데이터 가져오기
titanic_df = pd.read_csv("../../data/titanic_train.csv")
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:

# 전처리
mean_data = titanic_df["Age"].mean()
titanic_df["Age"] = titanic_df["Age"].fillna(titanic_df["Age"].mean())
titanic_df["Cabin"] = titanic_df["Cabin"].fillna("N")
titanic_df["Embarked"] = titanic_df["Embarked"].fillna("N")

le_Sex = LabelEncoder()
le_Sex.fit(titanic_df["Sex"])
tmp = le_Sex.transform(titanic_df["Sex"])
titanic_df["Sex"] = tmp

le_Embarked = LabelEncoder()
le_Embarked.fit(titanic_df["Embarked"])
tmp = le_Embarked.transform(titanic_df["Embarked"])
titanic_df["Embarked"] = tmp

drop_feature = ["PassengerId", "Name", "Ticket", "Cabin"]
titanic_df = titanic_df.drop(drop_feature, axis=1)

y = titanic_df["Survived"]
X = titanic_df.drop(["Survived"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [19]:
import optuna
import lightgbm as lgb

# 하이퍼 파라미터 설정
def objective(trial):
    param = {
        'objective': 'binary',
        'metric': 'f1',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10, log=True),  # L1 정규화 - Rasso
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100)
    }
    
    model = lgb.LGBMClassifier(**param)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_pred, y_test)
    f1 = f1_score(y_pred, y_test)

    print(accuracy, f1)
    return f1

In [None]:
# optuna 사용해보기
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)


In [21]:
study.best_params

{'boosting_type': 'gbdt',
 'n_estimators': 112,
 'reg_alpha': 1.8119443263585686e-06,
 'min_child_samples': 47}

In [22]:
study.best_value

0.8205128205128205

In [23]:
study.best_trial

FrozenTrial(number=45, state=1, values=[0.8205128205128205], datetime_start=datetime.datetime(2025, 1, 9, 12, 21, 28, 417765), datetime_complete=datetime.datetime(2025, 1, 9, 12, 21, 28, 459489), params={'boosting_type': 'gbdt', 'n_estimators': 112, 'reg_alpha': 1.8119443263585686e-06, 'min_child_samples': 47}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'boosting_type': CategoricalDistribution(choices=('gbdt', 'dart')), 'n_estimators': IntDistribution(high=1000, log=False, low=50, step=1), 'reg_alpha': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'min_child_samples': IntDistribution(high=100, log=False, low=5, step=1)}, trial_id=45, value=None)

In [28]:
divmod(8, 10)

(0, 8)