<a href="https://colab.research.google.com/github/tlsgptj/2024-Samsung-AI-Challenge-Black-box-Optimization/blob/main/samsung_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-optimize

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from skopt import BayesSearchCV

In [None]:
# Read the competition CSVs, then separate features, target and row identifiers.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Training side: 'y' is the target; every column except 'ID' and 'y' is a feature.
y_train = train_df['y']
X_train = train_df.drop(columns=['ID', 'y'])

# Test side: keep the IDs for the submission and use the remaining columns as features.
test_ids = test_df['ID']
X_test = test_df.drop(columns=['ID'])

In [None]:
# Base LightGBM regressor handed to the hyperparameter search; seed fixed at 42.
lgbm_model = LGBMRegressor(
    random_state=42,
)

In [None]:
# Search space for the Bayesian optimiser: each entry is a (low, high) range,
# optionally followed by the name of the sampling prior.
param_space_lgbm = dict(
    n_estimators=(100, 1000),
    max_depth=(3, 15),
    learning_rate=(0.01, 0.3, 'log-uniform'),
    num_leaves=(20, 50),
    min_child_samples=(5, 30),
    subsample=(0.5, 1.0),
    colsample_bytree=(0.5, 1.0),
)

In [None]:
# Bayesian hyperparameter search over param_space_lgbm:
# 50 sampled configurations, 3-fold cross-validation, all available cores, fixed seed.
opt_lgbm = BayesSearchCV(
    estimator=lgbm_model,
    search_spaces=param_space_lgbm,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Run the search on the full training set, then report the winning configuration.
opt_lgbm.fit(X_train, y_train)
print(
    "Best hyperparameters for LightGBM:",
    opt_lgbm.best_params_,
)

In [None]:
# Predict the target for every test row with the fitted search object.
y_test_pred_lgbm = opt_lgbm.predict(
    X_test
)

In [None]:
# Hold-out check of the selected configuration.
#
# A clone of the best estimator is fitted on 80% of the training data and scored
# on the remaining 20%. Calling `opt_lgbm.fit` here instead would re-run the whole
# Bayesian search and overwrite the fitted search object, so `opt_lgbm` would no
# longer be the model that produced `y_test_pred_lgbm`.
#
# Caveat: the hyperparameters were selected with cross-validation on the full
# training set, which includes the validation rows, so these scores are
# somewhat optimistic.
from sklearn.base import clone  # local import so this cell works on its own

X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Unfitted copy with the tuned hyperparameters; leaves opt_lgbm untouched.
valid_model_lgbm = clone(opt_lgbm.best_estimator_)
valid_model_lgbm.fit(X_train_part, y_train_part)

y_valid_pred_lgbm = valid_model_lgbm.predict(X_valid)
mse_lgbm = mean_squared_error(y_valid, y_valid_pred_lgbm)
r2_lgbm = r2_score(y_valid, y_valid_pred_lgbm)
print(f"Validation Mean Squared Error (LightGBM): {mse_lgbm:.4f}")
print(f"Validation R-squared (LightGBM): {r2_lgbm:.4f}")

In [None]:
# The 67th percentile of the test predictions is the cut-off for the top 33%;
# the mask flags every prediction at or above that cut-off.
threshold_lgbm = np.percentile(a=y_test_pred_lgbm, q=67)
top_33_percent_mask_lgbm = threshold_lgbm <= y_test_pred_lgbm

In [None]:
# Fill the sample submission's 'y' column with the test predictions and save it.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv - confirm.
submission_df_lgbm = pd.read_csv('sample_submission.csv').assign(y=y_test_pred_lgbm)
submission_df_lgbm.to_csv('lgbm_updated_submission.csv', index=False)

# Summary of the top-33% cut-off and how many test rows reach it.
print(f"Top 33% threshold (LightGBM): {threshold_lgbm:.4f}")
print(f"Number of samples in top 33% (LightGBM): {top_33_percent_mask_lgbm.sum()}")
print("Submission file 'lgbm_updated_submission.csv' created successfully.")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from lightgbm import LGBMRegressor
from skopt import BayesSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Reload the raw train and test tables for the preprocessing-pipeline variant.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# List the training columns before deciding which ones are features.
print(
    "Train DataFrame columns:",
    train_df.columns,
)

In [None]:
# Split the training columns by dtype: int64/float64 are numerical, object is categorical.
# 'ID' and 'y' are removed from the numerical list and 'ID' from the categorical list.
numerical_features = [
    col
    for col in train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if col not in ['ID', 'y']
]
categorical_features = [
    col
    for col in train_df.select_dtypes(include=['object']).columns.tolist()
    if col != 'ID'
]

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# Missing-value handling:
#   categorical -> fill with the most frequent value, then one-hot encode
#                  (categories unseen during fit are ignored at transform time);
#   numerical   -> fill with the column mean.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)
numerical_transformer = SimpleImputer(strategy='mean')

In [None]:
# Route each column group through its own transformer; columns in neither
# list are passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

In [None]:
# LightGBM regressor that becomes the final step of the pipeline; seed fixed at 42.
model = LGBMRegressor(
    random_state=42,
)

In [None]:
# Full pipeline: impute/encode -> standardise -> LightGBM.
# NOTE(review): if the one-hot branch makes the preprocessor emit a sparse matrix,
# StandardScaler with its default with_mean=True will refuse it - confirm against
# the actual data (no issue when there are no categorical columns).
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),
        ('model', model),
    ]
)

In [None]:
# Hyperparameter search space. Every key carries the 'model__' prefix so the
# search routes the value to the pipeline step named 'model'.
param_space_lgbm = {
    f"model__{param_name}": bounds
    for param_name, bounds in (
        ('n_estimators', (100, 1000)),
        ('max_depth', (3, 15)),
        ('learning_rate', (0.01, 0.3, 'log-uniform')),
        ('subsample', (0.5, 1.0)),
        ('colsample_bytree', (0.5, 1.0)),
        ('min_child_samples', (1, 30)),
    )
}

In [None]:
# Bayesian search over the whole pipeline:
# 50 sampled configurations, 3-fold cross-validation, all available cores, fixed seed.
opt_lgbm = BayesSearchCV(
    estimator=pipeline,
    search_spaces=param_space_lgbm,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Prepare the training data: 'y' is the target, everything except 'ID' and 'y' is a feature.
y_train = train_df['y']
X_train = train_df.drop(columns=['ID', 'y'])

In [None]:
# Handle missing targets: keep only the rows whose 'y' is present.
# X_train is filtered first, while y_train still contains the missing entries,
# so both end up with the same rows.
X_train = X_train.loc[y_train.notna()]
y_train = y_train.loc[y_train.notna()]

In [None]:
# Train the pipeline while searching for the best hyperparameters, then report them.
opt_lgbm.fit(X_train, y_train)
print(
    "Best hyperparameters for LightGBM:",
    opt_lgbm.best_params_,
)

In [None]:
# Predict on the test set: every test column except 'ID' is fed to the fitted search object.
X_test = test_df.drop(columns=['ID'])
y_test_pred_lgbm = opt_lgbm.predict(
    X_test
)

In [None]:
# Performance check on a hold-out slice of the training data.
#
# A clone of the best pipeline is fitted on 80% of the training data and scored
# on the remaining 20%. Calling `opt_lgbm.fit` here instead would re-run the whole
# Bayesian search and overwrite the fitted search object, so `opt_lgbm` would no
# longer be the model that produced `y_test_pred_lgbm`.
#
# Caveat: the hyperparameters were selected with cross-validation on the full
# training set, which includes the validation rows, so these scores are
# somewhat optimistic.
from sklearn.base import clone  # local import so this cell works on its own

X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Unfitted copy of the tuned pipeline; leaves opt_lgbm untouched.
valid_model_lgbm = clone(opt_lgbm.best_estimator_)
valid_model_lgbm.fit(X_train_part, y_train_part)

y_valid_pred_lgbm = valid_model_lgbm.predict(X_valid)
mse_lgbm = mean_squared_error(y_valid, y_valid_pred_lgbm)
r2_lgbm = r2_score(y_valid, y_valid_pred_lgbm)
print(f"Validation Mean Squared Error (LightGBM): {mse_lgbm:.4f}")
print(f"Validation R-squared (LightGBM): {r2_lgbm:.4f}")

In [None]:
# Top-33% cut-off: the 67th percentile of the test predictions;
# the mask flags every prediction at or above that cut-off.
threshold_lgbm = np.percentile(a=y_test_pred_lgbm, q=67)
top_33_percent_mask_lgbm = threshold_lgbm <= y_test_pred_lgbm

In [None]:
# Create the submission file for the preprocessing-pipeline model.
# It is written under its own name: the previous code saved to
# 'lgbm_updated_submission.csv' (the file name of the first model's submission)
# while reporting 'lgbm_Feature_submission.csv'.
submission_path_lgbm = 'lgbm_Feature_submission.csv'

# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv - confirm.
submission_df_lgbm = pd.read_csv('sample_submission.csv')
submission_df_lgbm['y'] = y_test_pred_lgbm
submission_df_lgbm.to_csv(submission_path_lgbm, index=False)

# Summary of the top-33% cut-off and how many test rows reach it.
print(f"Top 33% threshold (LightGBM): {threshold_lgbm:.4f}")
print(f"Number of samples in top 33% (LightGBM): {sum(top_33_percent_mask_lgbm)}")
print(f"Submission file '{submission_path_lgbm}' created successfully.")

Top 33% threshold (LightGBM): 84.4885
Number of samples in top 33% (LightGBM): 1646
Submission file 'lgbm_Feature_submission.csv' created successfully.