# FD004 Branch 4: XGBoost vs LightGBM ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÏµúÏ†ÅÌôî Î∞è ÎπÑÍµê

## üìã Ïã§Ìóò Í∞úÏöî

- **Dataset**: FD004
- **Models**: XGBoost vs LightGBM
- **Preprocessing**: RUL Clipping (RUL ‚â§ 125)
- **Scaler**: RobustScaler (Branch 3 ÏµúÏ†Å Í≤∞Í≥º)
- **Ïã§Ìóò Î™©Ï†Å**:
  1. XGBoost ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌÉêÏÉâ
  2. LightGBM ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌÉêÏÉâ
  3. Îëê Î™®Îç∏ ÏÑ±Îä• ÎπÑÍµê

---

## üéØ ÌäúÎãù Ï†ÑÎûµ

1. **XGBoost Random Search** ‚Üí Grid Search
2. **LightGBM Random Search** ‚Üí Grid Search
3. **ÏµúÏ¢Ö ÎπÑÍµê**: ÏµúÏ†Å XGBoost vs ÏµúÏ†Å LightGBM

---

## üîß ÌäúÎãù ÎåÄÏÉÅ ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞

### **Í≥µÌÜµ:**
- n_estimators, max_depth, learning_rate
- subsample, colsample_bytree

### **XGBoost Ï†ÑÏö©:**
- min_child_weight, gamma

### **LightGBM Ï†ÑÏö©:**
- num_leaves, min_child_samples

## 1. ÌôòÍ≤Ω ÏÑ§Ï†ï Î∞è ÎùºÏù¥Î∏åÎü¨Î¶¨ Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import time

# ÏãúÎìú Í≥†Ï†ï
np.random.seed(42)

## 2. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Ï†ÑÏ≤òÎ¶¨

In [2]:
# Google Drive ÎßàÏö¥Ìä∏
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Í≤ΩÎ°ú ÏÑ§Ï†ï
DATA_PATH = '/content/drive/MyDrive/ML project'

# FD004 Îç∞Ïù¥ÌÑ∞ Î°úÎìú
train_fd004 = pd.read_csv(f'{DATA_PATH}/FD004_train_df.csv')
test_fd004 = pd.read_csv(f'{DATA_PATH}/FD004_test_df.csv')

In [4]:
# ÏÑºÏÑú Ïª¨Îüº Ï†ïÏùò
ALL_SENSORS = [f's_{i}' for i in range(1, 22)]

# RUL Clipping
train_preprocessed = train_fd004.copy()
train_preprocessed['RUL'] = train_preprocessed['RUL'].clip(upper=125)

test_preprocessed = test_fd004.copy()
test_preprocessed['RUL'] = test_preprocessed['RUL'].clip(upper=125)

# RobustScaler Ï†ÅÏö©
scaler = RobustScaler()

train_scaled = train_preprocessed.copy()
train_scaled[ALL_SENSORS] = scaler.fit_transform(train_preprocessed[ALL_SENSORS])

test_scaled = test_preprocessed.copy()
test_scaled[ALL_SENSORS] = scaler.transform(test_preprocessed[ALL_SENSORS])

# X, y Î∂ÑÎ¶¨
X_train = train_scaled[ALL_SENSORS].values
y_train = train_scaled['RUL'].values
X_test = test_scaled[ALL_SENSORS].values
y_test = test_scaled['RUL'].values

## 3. ÌèâÍ∞Ä Ìï®Ïàò

In [5]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    return {
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_r2': test_r2
    }

## 4. XGBoost ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù

### 4.1 XGBoost Random Search

In [6]:
# XGBoost ÌååÎùºÎØ∏ÌÑ∞ Î≤îÏúÑ
xgb_param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2]
}

xgb_random_results = []
n_xgb_trials = 15

for trial in range(n_xgb_trials):
    params = {
        'n_estimators': np.random.choice(xgb_param_distributions['n_estimators']),
        'max_depth': np.random.choice(xgb_param_distributions['max_depth']),
        'learning_rate': np.random.choice(xgb_param_distributions['learning_rate']),
        'subsample': np.random.choice(xgb_param_distributions['subsample']),
        'colsample_bytree': np.random.choice(xgb_param_distributions['colsample_bytree']),
        'min_child_weight': np.random.choice(xgb_param_distributions['min_child_weight']),
        'gamma': np.random.choice(xgb_param_distributions['gamma']),
        'random_state': 42,
        'n_jobs': -1
    }

    start_time = time.time()
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    training_time = time.time() - start_time

    results = evaluate_model(model, X_train, y_train, X_test, y_test)
    results['params'] = params
    results['training_time'] = training_time

    xgb_random_results.append(results)

In [7]:
# XGBoost Random Search Í≤∞Í≥º
df_xgb_random = pd.DataFrame([
    {
        'n_estimators': r['params']['n_estimators'],
        'max_depth': r['params']['max_depth'],
        'learning_rate': r['params']['learning_rate'],
        'test_rmse': r['test_rmse'],
        'test_r2': r['test_r2']
    }
    for r in xgb_random_results
]).sort_values('test_rmse')

df_xgb_random

Unnamed: 0,n_estimators,max_depth,learning_rate,test_rmse,test_r2
4,500,5,0.05,16.121246,0.599183
12,200,9,0.05,16.132259,0.598636
5,300,7,0.05,16.135323,0.598483
8,500,5,0.05,16.140186,0.598241
1,300,5,0.1,16.221968,0.594159
3,500,9,0.05,16.292433,0.590626
11,200,5,0.2,16.357778,0.587336
14,500,3,0.2,16.386004,0.58591
6,200,7,0.2,16.445319,0.582907
0,300,9,0.01,16.463586,0.58198


### 4.2 XGBoost Grid Search

In [8]:
# Best Random Í∏∞Î∞ò Grid Search
best_xgb_random = xgb_random_results[df_xgb_random.index[0]]

# Grid ÌååÎùºÎØ∏ÌÑ∞ (Ï¢ÅÏùÄ Î≤îÏúÑ)
xgb_grid_params = {
    'n_estimators': [best_xgb_random['params']['n_estimators']],
    'max_depth': [best_xgb_random['params']['max_depth'] - 1,
                  best_xgb_random['params']['max_depth'],
                  best_xgb_random['params']['max_depth'] + 1],
    'learning_rate': [best_xgb_random['params']['learning_rate'] * 0.5,
                     best_xgb_random['params']['learning_rate'],
                     best_xgb_random['params']['learning_rate'] * 1.5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

xgb_grid_results = []

for n_est in xgb_grid_params['n_estimators']:
    for depth in xgb_grid_params['max_depth']:
        for lr in xgb_grid_params['learning_rate']:
            for ss in xgb_grid_params['subsample']:
                for cs in xgb_grid_params['colsample_bytree']:
                    params = {
                        'n_estimators': n_est,
                        'max_depth': max(3, depth),  # min 3
                        'learning_rate': min(0.3, max(0.01, lr)),  # Î≤îÏúÑ Ï†úÌïú
                        'subsample': ss,
                        'colsample_bytree': cs,
                        'min_child_weight': best_xgb_random['params']['min_child_weight'],
                        'gamma': best_xgb_random['params']['gamma'],
                        'random_state': 42,
                        'n_jobs': -1
                    }

                    model = xgb.XGBRegressor(**params)
                    model.fit(X_train, y_train, verbose=False)

                    results = evaluate_model(model, X_train, y_train, X_test, y_test)
                    results['params'] = params
                    xgb_grid_results.append(results)

print(f"‚úÖ XGBoost Grid Search: {len(xgb_grid_results)}Í∞ú Ï°∞Ìï©")

‚úÖ XGBoost Grid Search: 81Í∞ú Ï°∞Ìï©


In [9]:
# XGBoost ÏµúÏ†Å Î™®Îç∏
all_xgb_results = xgb_random_results + xgb_grid_results
best_xgb = min(all_xgb_results, key=lambda x: x['test_rmse'])

lines = [
    "",
    "=" * 70,
    "XGBoost ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞",
    "=" * 70
]

lines.extend([
    f"{key}: {val}"
    for key, val in best_xgb['params'].items()
    if key not in ['random_state', 'n_jobs']
])

lines.extend([
    "",
    "XGBoost ÏµúÍ≥† ÏÑ±Îä•",
    f"Train RMSE: {best_xgb['train_rmse']:.2f}",
    f"Test RMSE: {best_xgb['test_rmse']:.2f}",
    f"Test MAE: {best_xgb['test_mae']:.2f}",
    f"Test R¬≤: {best_xgb['test_r2']:.4f}"
])

print("\n".join(lines))


XGBoost ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞
n_estimators: 500
max_depth: 6
learning_rate: 0.025
subsample: 0.7
colsample_bytree: 0.7
min_child_weight: 1
gamma: 0.2

XGBoost ÏµúÍ≥† ÏÑ±Îä•
Train RMSE: 16.17
Test RMSE: 16.09
Test MAE: 10.50
Test R¬≤: 0.6007


## 5. LightGBM ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞ ÌäúÎãù

### 5.1 LightGBM Random Search

In [10]:
# LightGBM ÌååÎùºÎØ∏ÌÑ∞ Î≤îÏúÑ
lgb_param_distributions = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 7, 9, -1],  # -1: no limit
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': [15, 31, 63, 127],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_samples': [5, 10, 20]
}

lgb_random_results = []
n_lgb_trials = 15

for trial in range(n_lgb_trials):
    params = {
        'n_estimators': np.random.choice(lgb_param_distributions['n_estimators']),
        'max_depth': np.random.choice(lgb_param_distributions['max_depth']),
        'learning_rate': np.random.choice(lgb_param_distributions['learning_rate']),
        'num_leaves': np.random.choice(lgb_param_distributions['num_leaves']),
        'subsample': np.random.choice(lgb_param_distributions['subsample']),
        'colsample_bytree': np.random.choice(lgb_param_distributions['colsample_bytree']),
        'min_child_samples': np.random.choice(lgb_param_distributions['min_child_samples']),
        'random_state': 42,
        'n_jobs': -1,
        'verbose': -1
    }

    start_time = time.time()
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    results = evaluate_model(model, X_train, y_train, X_test, y_test)
    results['params'] = params
    results['training_time'] = training_time

    lgb_random_results.append(results)



In [11]:
# LightGBM Random Search Í≤∞Í≥º
df_lgb_random = pd.DataFrame([
    {
        'n_estimators': r['params']['n_estimators'],
        'max_depth': r['params']['max_depth'],
        'learning_rate': r['params']['learning_rate'],
        'num_leaves': r['params']['num_leaves'],
        'test_rmse': r['test_rmse'],
        'test_r2': r['test_r2'],
        'time': r['training_time']
    }
    for r in lgb_random_results
]).sort_values('test_rmse')

df_lgb_random

Unnamed: 0,n_estimators,max_depth,learning_rate,num_leaves,test_rmse,test_r2,time
3,300,-1,0.05,63,16.054363,0.602502,0.80687
11,300,7,0.05,127,16.152181,0.597644,1.040451
10,300,-1,0.1,63,16.193366,0.595589,0.937726
2,500,7,0.1,15,16.224536,0.594031,0.688672
13,500,5,0.1,127,16.258366,0.592336,0.831205
9,500,3,0.2,15,16.307815,0.589853,0.542881
7,100,5,0.1,63,16.411917,0.584599,0.241094
5,500,-1,0.1,127,16.562216,0.576956,4.804213
0,500,5,0.01,127,16.722003,0.568754,0.985726
8,100,5,0.05,127,16.87885,0.560626,0.24655


### 5.2 LightGBM Grid Search

In [12]:
# Best Random Í∏∞Î∞ò Grid Search
best_lgb_random = lgb_random_results[df_lgb_random.index[0]]

# Grid ÌååÎùºÎØ∏ÌÑ∞
lgb_grid_params = {
    'n_estimators': [best_lgb_random['params']['n_estimators']],
    'num_leaves': [best_lgb_random['params']['num_leaves'] // 2,
                   best_lgb_random['params']['num_leaves'],
                   best_lgb_random['params']['num_leaves'] * 2],
    'learning_rate': [best_lgb_random['params']['learning_rate'] * 0.5,
                     best_lgb_random['params']['learning_rate'],
                     best_lgb_random['params']['learning_rate'] * 1.5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

lgb_grid_results = []

for n_est in lgb_grid_params['n_estimators']:
    for leaves in lgb_grid_params['num_leaves']:
        for lr in lgb_grid_params['learning_rate']:
            for ss in lgb_grid_params['subsample']:
                for cs in lgb_grid_params['colsample_bytree']:
                    params = {
                        'n_estimators': n_est,
                        'max_depth': best_lgb_random['params']['max_depth'],
                        'learning_rate': min(0.3, max(0.01, lr)),
                        'num_leaves': max(15, min(255, int(leaves))),
                        'subsample': ss,
                        'colsample_bytree': cs,
                        'min_child_samples': best_lgb_random['params']['min_child_samples'],
                        'random_state': 42,
                        'n_jobs': -1,
                        'verbose': -1
                    }

                    model = lgb.LGBMRegressor(**params)
                    model.fit(X_train, y_train)

                    results = evaluate_model(model, X_train, y_train, X_test, y_test)
                    results['params'] = params
                    lgb_grid_results.append(results)

print(f"‚úÖ LightGBM Grid Search: {len(lgb_grid_results)}Í∞ú Ï°∞Ìï©")



‚úÖ LightGBM Grid Search: 81Í∞ú Ï°∞Ìï©


In [13]:
# LightGBM ÏµúÏ†Å Î™®Îç∏
all_lgb_results = lgb_random_results + lgb_grid_results
best_lgb = min(all_lgb_results, key=lambda x: x['test_rmse'])

lines = [
    "",
    "=" * 70,
    "LightGBM ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞",
    "=" * 70
]

exclude = {'random_state', 'n_jobs', 'verbose'}
lines.extend([
    f"{key}: {val}"
    for key, val in best_lgb['params'].items()
    if key not in exclude
])

# ÏÑ±Îä• ÏßÄÌëú
lines.extend([
    "",
    "LightGBM ÏµúÍ≥† ÏÑ±Îä•",
    f"Train RMSE: {best_lgb['train_rmse']:.2f}",
    f"Test RMSE: {best_lgb['test_rmse']:.2f}",
    f"Test MAE: {best_lgb['test_mae']:.2f}",
    f"Test R¬≤: {best_lgb['test_r2']:.4f}"
])

print("\n".join(lines))


LightGBM ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞
n_estimators: 300
max_depth: -1
learning_rate: 0.025
num_leaves: 126
subsample: 0.7
colsample_bytree: 0.7
min_child_samples: 20

LightGBM ÏµúÍ≥† ÏÑ±Îä•
Train RMSE: 15.12
Test RMSE: 16.03
Test MAE: 10.35
Test R¬≤: 0.6035


## 6. XGBoost vs LightGBM ÏµúÏ¢Ö ÎπÑÍµê

In [14]:
comparison = pd.DataFrame([
    {
        'Model': 'XGBoost',
        'Train_RMSE': best_xgb['train_rmse'],
        'Test_RMSE': best_xgb['test_rmse'],
        'Test_MAE': best_xgb['test_mae'],
        'Test_R2': best_xgb['test_r2']
    },
    {
        'Model': 'LightGBM',
        'Train_RMSE': best_lgb['train_rmse'],
        'Test_RMSE': best_lgb['test_rmse'],
        'Test_MAE': best_lgb['test_mae'],
        'Test_R2': best_lgb['test_r2']
    }
])

comparison

Unnamed: 0,Model,Train_RMSE,Test_RMSE,Test_MAE,Test_R2
0,XGBoost,16.170572,16.090619,10.497756,0.600705
1,LightGBM,15.118957,16.033566,10.354217,0.603532
