<a href="https://colab.research.google.com/github/veryHapppy/study_ai/blob/main/Kaggle/regression_crab_age.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna
!pip install catboost

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import numpy as np
import pandas as pd
import optuna
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from google.colab import drive
drive.mount('/content/drive')
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from sklearn.ensemble import VotingRegressor, StackingRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import PowerTransformer

Mounted at /content/drive


In [None]:
def feature_engineering(data):
  """Add volume/ratio features and one-hot encode ``Sex``.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame with the columns ``id``, ``Sex``, ``Length``, ``Diameter``,
      ``Height``, ``Weight``, ``Shucked Weight``, ``Shell Weight`` and
      ``Viscera Weight``. Any other columns are passed through unchanged.

  Returns
  -------
  pandas.DataFrame
      A new frame indexed by ``id`` (the ``id`` column itself is removed),
      with ``vol``, ``hl_ratio``, ``hd_ratio``, ``density``, ``Shucked_ratio``,
      ``Shell_ratio`` and ``Viscera_ratio`` appended, and ``Sex`` replaced by
      its dummy columns. The frame passed in is not modified.

  Notes
  -----
  Denominators are not checked here; rows where a denominator is zero are
  left for the caller to filter or repair.
  """
  indexed = data.set_index('id')

  # `assign` evaluates the keyword arguments in order, so `density` can use
  # the `vol` column created just before it.
  enriched = indexed.assign(
    vol=lambda d: d['Length'] * d['Diameter'] * d['Height'],
    hl_ratio=lambda d: d['Height'] / d['Length'],
    hd_ratio=lambda d: d['Height'] / d['Diameter'],
    density=lambda d: d['Weight'] / d['vol'],
    Shucked_ratio=lambda d: d['Shucked Weight'] / d['Weight'],
    Shell_ratio=lambda d: d['Shell Weight'] / d['Weight'],
    Viscera_ratio=lambda d: d['Viscera Weight'] / d['Weight'],
  )

  return pd.get_dummies(enriched, columns=['Sex'])

In [None]:
# Unpack the zipped training CSV from the mounted Drive into local Colab
# storage, then read the extracted file into a DataFrame.
zip_path = '/content/drive/MyDrive/Colab Notebooks/[Regression] crab age/train.csv.zip'
extract_path = '/content/dataset/train'

with zipfile.ZipFile(zip_path, mode='r') as archive:
  archive.extractall(path=extract_path)

train_set = pd.read_csv(f'{extract_path}/train.csv')

In [None]:
# Build the engineered training frame from the raw training CSV.
data = feature_engineering(data=train_set)

In [None]:
# Columns that are passed through the PowerTransformer.
transform_cols = ['Weight', 'Shucked Weight', 'Shell Weight', 'Viscera Weight']

# NOTE(review): the transformer is fit on every row of `data`, i.e. before the
# outlier rows are removed below and before any train/test split is made.
pt = PowerTransformer()
data_transformed = data.copy()
data_transformed[transform_cols] = pt.fit_transform(data[transform_cols])

# A row is flagged as an outlier when any component-weight ratio exceeds 1.0.
ratio_cols = ['Shucked_ratio', 'Shell_ratio', 'Viscera_ratio']
outliers_index = data.index[(data[ratio_cols] > 1.0).any(axis=1).to_numpy()]

# Drop the flagged rows, then keep only rows with a strictly positive volume.
# The index is renumbered after each step, so the original `id` index is gone.
data_transformed = (
  data_transformed
  .drop(index=outliers_index)
  .reset_index(drop=True)
  .loc[lambda frame: frame['vol'] > 0]
  .reset_index(drop=True)
)

In [None]:
# Disabled EDA: the box plot and correlation code below is wrapped in a string
# literal, so none of it runs. Because the string is the cell's last
# expression, the notebook simply echoes it back as the cell output.
'''
plt.figure(figsize=(10, 8))
sns.boxplot(data_transformed[['hl_ratio', 'hd_ratio', 'Shucked_ratio', 'Shell_ratio', 'Viscera_ratio']])
plt.show()
data_transformed.corr()
'''

"\nplt.figure(figsize=(10, 8))\nsns.boxplot(data_transformed[['hl_ratio', 'hd_ratio', 'Shucked_ratio', 'Shell_ratio', 'Viscera_ratio']])\nplt.show()\ndata_transformed.corr()\n"

In [None]:
# Separate the target (Age) from the feature columns.
y = data_transformed['Age']
X = data_transformed.drop(columns='Age')

# Hold out 20% of the rows; the seed keeps the partition fixed across runs.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

# log1p-transformed copy of the training target.
y_train_log = np.log1p(y_train)

In [None]:
def objective(trial):
  """Optuna objective: cross-validated MAE of a weighted three-booster vote.

  Draws hyperparameters for XGBoost, LightGBM and CatBoost plus one voting
  weight per model from ``trial``, combines the models in a VotingRegressor,
  and wraps that so the target is fit as ``log1p(y)`` and predictions are
  mapped back with ``expm1``. The wrapped model is scored with 5-fold
  cross-validation on the module-level ``X_train`` / ``y_train``.

  Parameters
  ----------
  trial : optuna trial object
      Source of the suggested hyperparameter values.

  Returns
  -------
  float
      Mean absolute error averaged over the folds (lower is better).
  """
  # The suggest_* calls stay in this order: XGBoost, LightGBM, CatBoost, weights.
  xgb_space = {
    'n_estimators': trial.suggest_int('xgb_n', 100, 500),
    'max_depth': trial.suggest_int('xgb_depth', 3, 7),
    'colsample_bytree': trial.suggest_float('xgb_colsample', 0.6, 0.8),
    'reg_alpha': trial.suggest_float('xgb_alpha', 1e-3, 10.0, log=True),
    'reg_lambda': trial.suggest_float('xgb_lambda', 1e-3, 10.0, log=True),
    'learning_rate': trial.suggest_float('xgb_lr', 0.01, 0.1),
  }
  lgbm_space = {
    'n_estimators': trial.suggest_int('lgbm_n', 100, 500),
    'num_leaves': trial.suggest_int('lgbm_leaves', 15, 50),
    'learning_rate': trial.suggest_float('lgbm_lr', 0.01, 0.1),
    'verbose': -1,
  }
  cat_space = {
    'iterations': trial.suggest_int('cat_iter', 100, 500),
    'depth': trial.suggest_int('cat_depth', 3, 7),
    'learning_rate': trial.suggest_float('cat_lr', 0.01, 0.1),
    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
    'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
    'logging_level': 'Silent',  # keep CatBoost quiet
  }
  vote_weights = [
    trial.suggest_float('w_xgb', 0.5, 2.0),
    trial.suggest_float('w_lgbm', 0.5, 2.0),
    trial.suggest_float('w_cat', 0.5, 2.0),
  ]

  # All three boosters are configured for GPU training with a fixed seed.
  members = [
    ('xgb', XGBRegressor(tree_method='hist', device='cuda', n_jobs=-1, random_state=42, **xgb_space)),
    ('lgbm', LGBMRegressor(device='gpu', n_jobs=-1, random_state=42, **lgbm_space)),
    ('cat', CatBoostRegressor(task_type='GPU', devices='0', random_state=42, **cat_space)),
  ]
  ensemble = VotingRegressor(estimators=members, weights=vote_weights)

  # Fit on log1p(target) and invert with expm1 when predicting.
  log_target_model = TransformedTargetRegressor(
    regressor=ensemble,  # the voting ensemble built above
    func=np.log1p,
    inverse_func=np.expm1,
    check_inverse=False,
  )

  # The scorer returns negated MAE per fold, so flip the sign of the mean.
  fold_scores = cross_val_score(
    log_target_model,
    X_train,
    y_train,
    cv=5,
    scoring='neg_mean_absolute_error',
  )
  return -fold_scores.mean()

# Run the hyperparameter search (80 trials, minimising the CV MAE returned by
# `objective`). The sampler is seeded explicitly so the search itself can be
# repeated; without a seed every run explores a different sequence of
# hyperparameters.
# NOTE(review): the boosters train on GPU, so trial values may still vary
# slightly between runs even with a seeded sampler - confirm if exact
# reproducibility is required.
study = optuna.create_study(
  direction='minimize',
  sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=80)
print(study.best_value)

[I 2026-01-24 11:59:14,336] A new study created in memory with name: no-name-6dcc9a67-0150-42d7-b491-3d31ff61a6f0
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2026-01-24 11:59:45,184] Trial 0 finished with value: 1.3893558208197356 and parameters: {'xgb_n': 103, 'xgb_depth': 5, 'xgb_colsample': 0.6995561946890829, 'xgb_alpha': 0.0015247543449305085, 'xgb_lambda': 2.0628674231017885, 'xgb_lr': 0.03417578981500723, 'lgbm_n': 152, 'lgbm_leaves': 17, 'lgbm_lr': 0.03243856429555121, 'cat_iter': 120, 'cat_depth': 4, 'cat_lr': 0.09945816488201908, 'l2_leaf_reg': 7.078041470980906, 'random_strength': 2.7498171966868377, 'w_xgb': 1.9912992117218329, 'w_lgbm': 1.386886547246154, 'w_cat': 0.6156477056524012}. Best is trial 0 with value: 1.3893558208197356.
[I 2026-01-24 12:00:17,005] Trial 1 finished with value: 1.3887192091175193 and parameters: {'xgb_n': 179, 

1.384864177017148


In [None]:
# Final fit: rebuild the three boosters with the best Optuna parameters, train a
# weighted voting ensemble and a stacking ensemble on log1p(Age), and compare
# both on the hold-out set.
#
# The train/hold-out split created earlier is reused on purpose. The Optuna
# study was cross-validated on that X_train, so drawing a new (differently
# partitioned) split here would move rows that influenced the hyperparameter
# choice into the hold-out set and make the reported error optimistic.

best = study.best_params

xgb_final = XGBRegressor(
    n_estimators=best['xgb_n'],
    max_depth=best['xgb_depth'],
    learning_rate=best['xgb_lr'],
    colsample_bytree=best['xgb_colsample'],
    reg_alpha=best['xgb_alpha'],
    reg_lambda=best['xgb_lambda'],
    random_state=42,
    tree_method='hist',
    device='cuda',
)

lgbm_final = LGBMRegressor(
    n_estimators=best['lgbm_n'],
    num_leaves=best['lgbm_leaves'],
    learning_rate=best['lgbm_lr'],
    random_state=42,
    verbose=-1,
    device='gpu',
)

cat_final = CatBoostRegressor(
    iterations=best['cat_iter'],
    depth=best['cat_depth'],
    learning_rate=best['cat_lr'],
    l2_leaf_reg=best['l2_leaf_reg'],
    random_strength=best['random_strength'],
    task_type='GPU',  # train on GPU
    devices='0',
    random_state=42,
    logging_level='Silent'
)

# Weighted average of the three boosters, using the weights found by the study.
voting_model = VotingRegressor(
    estimators=[('xgb', xgb_final), ('lgbm', lgbm_final), ('cat', cat_final)],
    weights=[best['w_xgb'], best['w_lgbm'], best['w_cat']]
)

# Same base models, combined by a RidgeCV meta-learner instead of fixed weights.
stacking_model = StackingRegressor(
    estimators=[('xgb', xgb_final), ('lgbm', lgbm_final), ('cat', cat_final)],
    final_estimator=RidgeCV(),
    cv=5
)

# Both ensembles are trained on the log1p-transformed target ...
voting_model.fit(X_train, np.log1p(y_train))
stacking_model.fit(X_train, np.log1p(y_train))

# ... so their raw predictions must be mapped back with expm1 before scoring.
voting_pred = np.expm1(voting_model.predict(X_test))
stacking_pred = np.expm1(stacking_model.predict(X_test))

print(f'voting 오차 : {mean_absolute_error(y_test, voting_pred)}')
print(f'stacking 오차 : {mean_absolute_error(y_test, stacking_pred)}')

# The persisted models predict log1p(Age); callers must apply expm1 themselves.
joblib.dump(voting_model, 'voting_best.pkl')
joblib.dump(stacking_model, 'stacking_best.pkl')

voting 오차 : 1.3575475877397605
stacking 오차 : 1.3573312342810349


['stacking_best.pkl']

In [None]:
# Inference: load the competition test set, apply the same feature pipeline as
# for training, predict with the saved stacking model and write the submission.
zip_path = '/content/drive/MyDrive/Colab Notebooks/[Regression] crab age/test.csv.zip'
extract_path = '/content/dataset/test'
with zipfile.ZipFile(zip_path, 'r') as z:
  z.extractall(extract_path)
test_set = pd.read_csv(f'{extract_path}/test.csv')

data_test = feature_engineering(test_set)

# Align the test features with the columns (and column order) the models were
# trained on. One-hot encoding the test set on its own can yield a different
# set or order of `Sex_*` columns; an absent dummy column is filled with 0.
# `reindex` also returns a new frame, so `data_test` itself is left untouched.
X = data_test.reindex(columns=X_train.columns, fill_value=0)

# Reuse the PowerTransformer fitted on the training data (no refit here).
X[transform_cols] = pt.transform(X[transform_cols])

# Ratio features become +/-inf when a denominator is zero. Such rows were
# dropped from the training data but must still receive a prediction here, so
# they are imputed with the training medians instead.
X = X.replace([np.inf, -np.inf], np.nan).fillna(X_train.median())

# This pickle was written earlier in this notebook; only load trusted files.
model = joblib.load('stacking_best.pkl')

# The model was trained on log1p(Age); expm1 maps predictions back to Age.
prediction = np.expm1(model.predict(X))

submission = pd.DataFrame({
    "id": X.index,
    "Age": prediction
})

submission.to_csv('submission.csv', index=False)
print("파일 생성 완료")

파일 생성 완료
