<a href="https://colab.research.google.com/github/veryHapppy/study_ai/blob/main/Kaggle/bike_sharing_demand.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install optuna
!pip install catboost



In [20]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import RidgeCV

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna
import joblib

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
def feature_engineering(data) :
  data = data.set_index(data['datetime'])
  data['datetime'] = pd.to_datetime(data['datetime'])
  data['month'] = data['datetime'].dt.month
  data['month_sin'] = np.sin(2 * np.pi * data['month'] / 12)
  data['month_cos'] = np.cos(2 * np.pi * data['month'] / 12)
  data['hour'] = data['datetime'].dt.hour
  data['hour_sin'] = np.sin(2 * np.pi * data['hour'] / 24)
  data['hour_cos'] = np.cos(2 * np.pi * data['hour'] / 24)

  rush_hour = (data['hour']==7) | (data['hour']==8) | (data['hour']==17) | (data['hour']==18)
  data['rush_hour'] = (data['workingday']==1) & rush_hour
  relax_hour = (data['hour']==12) | (data['hour']==13) | (data['hour']==14) | (data['hour']==15)
  data['relax_hour'] = (data['workingday']==0) & relax_hour

  data = pd.get_dummies(data, columns=['season'])
  data['weekend'] = ((data['workingday'] == 0) & (data['holiday'] == 0)).astype(int)
  data['temp_diff'] = data['atemp'] - data['temp']
  data['weather'] = data['weather'].replace(4, 3)

  transform_cols = ['humidity', 'windspeed']#, 'casual', 'registered']
  if 'count' in data.columns :
    #train data
    data[transform_cols] = pt.fit_transform(data[transform_cols])
    data = data.drop(data[(data['temp'] > 20) & (data['atemp'] < 20)].index)

  else :
    #test_data
    data[transform_cols] = pt.transform(data[transform_cols])

  #data = data.drop(['temp', 'datetime','casual', 'registered'],axis=1)
  data = data.drop(['temp', 'datetime'],axis=1)

  return data

In [22]:
data_set = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/[Regression]Bike Sharing Demand/train.csv')

In [23]:
pt = PowerTransformer()
data = feature_engineering(data_set)

X=data.drop('count',axis=1)
y=data['count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_log = np.log1p(y)

In [24]:
def objective(trial) :
  xgb_params = {
    'n_estimators': trial.suggest_int('xgb_n', 500, 2000),
    'max_depth': trial.suggest_int('xgb_depth', 3, 7),
    'colsample_bytree': trial.suggest_float('xgb_colsample', 0.6, 0.8),
    'reg_alpha': trial.suggest_float('xgb_alpha', 1e-3, 10.0, log=True),
    'reg_lambda': trial.suggest_float('xgb_lambda', 1e-3, 10.0, log=True),
    'learning_rate': trial.suggest_float('xgb_lr', 0.005, 0.1),
  }
  lgbm_params = {
    'n_estimators': trial.suggest_int('lgbm_n', 500, 2000),
    'num_leaves': trial.suggest_int('lgbm_leaves', 15, 50),
    'learning_rate': trial.suggest_float('lgbm_lr', 0.005, 0.1),
    'verbose': -1
  }
  cat_params = {
    'iterations': trial.suggest_int('cat_iter', 500, 2000),
    'depth': trial.suggest_int('cat_depth', 3, 7),
    'learning_rate': trial.suggest_float('cat_lr', 0.005, 0.1),
    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
    'random_strength': trial.suggest_float('random_strength', 0.1, 10.0),
    'logging_level': 'Silent' # CatBoost 조용히 시키기
  }
  w_xgb = trial.suggest_float('w_xgb', 0.5, 2.0)
  w_lgbm = trial.suggest_float('w_lgbm', 0.5, 2.0)
  w_cat = trial.suggest_float('w_cat', 0.5, 2.0)

  xgb = XGBRegressor(**xgb_params, tree_method='hist', device='cuda', n_jobs=-1, random_state=42)
  lgbm = LGBMRegressor(**lgbm_params, device='gpu', n_jobs=-1, random_state=42)
  cat = CatBoostRegressor(**cat_params, task_type='GPU',devices='0', random_state=42)

  voting_model = VotingRegressor(
    estimators=[('xgb', xgb), ('lgbm', lgbm), ('cat', cat)],
    weights=[w_xgb, w_lgbm, w_cat],
  )

  score = cross_val_score(voting_model, X, y_log, cv=5, scoring='neg_mean_squared_error')

  return score.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=80)
print(study.best_value)

[I 2026-01-26 06:51:27,652] A new study created in memory with name: no-name-23c82c39-46c5-4667-8f21-1a763ad41f46
[I 2026-01-26 06:52:47,345] Trial 0 finished with value: -0.34118964888214653 and parameters: {'xgb_n': 1690, 'xgb_depth': 6, 'xgb_colsample': 0.7064575030444149, 'xgb_alpha': 4.875171706045522, 'xgb_lambda': 0.03295654304912116, 'xgb_lr': 0.027049591179826256, 'lgbm_n': 1488, 'lgbm_leaves': 40, 'lgbm_lr': 0.09026450474079994, 'cat_iter': 663, 'cat_depth': 4, 'cat_lr': 0.06159344921838319, 'l2_leaf_reg': 1.8325067871076692, 'random_strength': 0.20582184158507583, 'w_xgb': 0.6142277352731444, 'w_lgbm': 1.8119203371090464, 'w_cat': 1.659588128409175}. Best is trial 0 with value: -0.34118964888214653.
[I 2026-01-26 06:54:17,905] Trial 1 finished with value: -0.33675636522559815 and parameters: {'xgb_n': 1634, 'xgb_depth': 6, 'xgb_colsample': 0.6133136758893618, 'xgb_alpha': 0.03442373053060117, 'xgb_lambda': 0.0039427364862369245, 'xgb_lr': 0.03916822972199923, 'lgbm_n': 820, 

-0.33219168863027637


In [25]:

best = study.best_params

xgb_final = XGBRegressor(
    n_estimators=best['xgb_n'],
    max_depth=best['xgb_depth'],
    learning_rate=best['xgb_lr'],
    colsample_bytree=best['xgb_colsample'],
    reg_alpha=best['xgb_alpha'],
    reg_lambda=best['xgb_lambda'],
    random_state=42,
    tree_method='hist',
    device='cuda',
)

lgbm_final = LGBMRegressor(
    n_estimators=best['lgbm_n'],
    num_leaves=best['lgbm_leaves'],
    learning_rate=best['lgbm_lr'],
    random_state=42,
    verbose=-1,
    device='gpu',
)

cat_final = CatBoostRegressor(
    iterations=best['cat_iter'],
    depth=best['cat_depth'],
    learning_rate=best['cat_lr'],
    l2_leaf_reg=best['l2_leaf_reg'],
    random_strength=best['random_strength'],
    task_type='GPU', # GPU 사용
    devices='0',
    random_state=42,
    logging_level='Silent'
)

voting_model = VotingRegressor(
    estimators=[('xgb', xgb_final), ('lgbm', lgbm_final), ('cat', cat_final)],
    weights=[best['w_xgb'], best['w_lgbm'], best['w_cat']]
)

stacking_model = StackingRegressor(
    estimators=[('xgb', xgb_final), ('lgbm', lgbm_final), ('cat', cat_final)],
    final_estimator=RidgeCV(),
    cv=5
)

voting_model.fit(X_train, np.log1p(y_train))
stacking_model.fit(X_train, np.log1p(y_train))

voting_pred = np.expm1(voting_model.predict(X_test))
stacking_pred = np.expm1(stacking_model.predict(X_test))

print(f'voting 오차 : {mean_squared_log_error(y_test, voting_pred)}')
print(f'stacking 오차 : {mean_squared_log_error(y_test, stacking_pred)}')

joblib.dump(voting_model, 'voting_best.pkl')
joblib.dump(stacking_model, 'stacking_best.pkl')

voting 오차 : 0.14565498751425587
stacking 오차 : 0.13978879582256382


['stacking_best.pkl']

In [33]:
pt.fit_transform(data_set[['humidity', 'windspeed']])

test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/[Regression]Bike Sharing Demand/test.csv')
X = feature_engineering(test_data)

model = joblib.load('stacking_best.pkl')
pred = np.expm1(model.predict(X))

submission = pd.DataFrame({
    'datetime': X.index,
    'count': pred
})

submission.to_csv('submission.csv', index=False)
print("파일 생성 완료")

파일 생성 완료


In [34]:
import os
save_path = '/content/drive/MyDrive/Colab Notebooks/[Regression]Bike Sharing Demand/'
submission.to_csv(save_path + 'submission.csv', index=False)