In [238]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,GridSearchCV, KFold,RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from xgboost import XGBRegressor
import xgboost as xgb
import optuna

import warnings

In [210]:
warnings.filterwarnings(action = 'ignore')

In [261]:
train = pd.read_csv('C:/Users/whileduck/Desktop/open/train.csv')
test = pd.read_csv('C:/Users/whileduck/Desktop/open/test.csv')

In [262]:
def min_max_scaling(x):
    return (x - np.min(x)) / (max(x) - min(x))

def scaler(df):
    
    cols = df.describe().columns
    
    for col in cols:
        
        if col != 'Calories_Burned':
            df[col] = min_max_scaling(df[col])
        
    return df

In [263]:
def preprocessing(df):
    
    df['height'] = df['Height(Feet)'] + df['Height(Remainder_Inches)'] * 0.12
        
    df = scaler(df)
    df = df.drop(['ID','Height(Feet)','Height(Remainder_Inches)'], axis = 1)
    
    df = pd.get_dummies(df)
    
    
    
    return df

In [264]:
df = preprocessing(train)
test = preprocessing(test)

In [265]:
X = df.drop('Calories_Burned',axis = 1)
Y = df['Calories_Burned']

In [271]:
def objective(trial):
    # 하이퍼파라미터 탐색 범위
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.5),
        'subsample': trial.suggest_float('subsample', 0.5, 1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1),
        'gamma': trial.suggest_float('gamma', 0, 3),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300)
    }
    
    model = xgb.XGBRegressor(**param)
    
    # 모델 정의
    
    
    valid_cv = KFold(n_splits = 5,
                shuffle = True)
    
    
    i = 0
    
    result_rmse = []
    
    while i !=5:
        
        x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)
        
        
        mse_list = []
        
        for train_idx, valid_idx in valid_cv.split(x_train):
            
            train_x , test_x = x_train.iloc[train_idx], y_train.iloc[train_idx]
            valid_x , valid_y = x_train.iloc[valid_idx], y_train.iloc[valid_idx]
            
            # 모델 학습
            model.fit(x_train, y_train)
            
            # 검증 데이터에 대한 예측값 계산
            y_pred = model.predict(valid_x)
            
            # 검증 데이터에 대한 예측값과 실제값 사이의 평균 제곱 오차 계산
            mse = mean_squared_error(valid_y, y_pred,
                                        squared = False)
            
            mse_list.append(mse)
            
        i += 1
        
        result_rmse.append(np.mean(mse_list))
                    
    # 목적 함수 반환값
    return np.mean(result_rmse)

# study 객체 생성 및 실행
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# 하이퍼파라미터 최적값 출력
print(study.best_params())

[32m[I 2023-04-15 00:56:36,519][0m A new study created in memory with name: no-name-5ebe5f63-8b19-432d-8bf4-074beef2d238[0m
[32m[I 2023-04-15 00:57:04,093][0m Trial 0 finished with value: 2.715522758868214 and parameters: {'n_estimators': 475, 'max_depth': 8, 'learning_rate': 0.40034516024978795, 'subsample': 0.8956143777320931, 'colsample_bytree': 0.9496366810595533, 'gamma': 0.5064639560553709, 'reg_alpha': 0.8807472180219483, 'reg_lambda': 0.12245564276894386, 'min_child_weight': 280}. Best is trial 0 with value: 2.715522758868214.[0m
[32m[I 2023-04-15 00:57:24,328][0m Trial 1 finished with value: 3.165553652306867 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.3861823686519149, 'subsample': 0.7430302972577711, 'colsample_bytree': 0.9234177720281542, 'gamma': 2.0478428943424234, 'reg_alpha': 0.8081801288539079, 'reg_lambda': 0.28678642523812176, 'min_child_weight': 265}. Best is trial 0 with value: 2.715522758868214.[0m
[32m[I 2023-04-15 00:57:39,

TypeError: 'dict' object is not callable

In [273]:
# best_parms = {'n_estimators': 753, 'max_depth': 10, 'learning_rate': 0.13308388990265863, 
#                 'subsample': 0.9059989448554313, 'colsample_bytree': 0.7974906616951953, 'gamma': 1.6896272035459874, 
#                 'reg_alpha': 0.8845432907591179, 'reg_lambda': 0.810881097627782, 'min_child_weight': 2}

# model = XGBRegressor(**best_parms)

In [319]:
best_parms = {'n_estimators': 800, 'max_depth': 10, 'learning_rate': 0.1, 
                'subsample': 0.90, 'colsample_bytree': 0.8, 'gamma': 1.5, 
                'reg_alpha': 0.9, 'reg_lambda': 0.8, 'min_child_weight': 2}

model = XGBRegressor(**best_parms)

In [320]:
x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size = 0.2)

In [321]:
model.fit(x_train,y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=1.5, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=2, missing=nan, monotone_constraints=None,
             n_estimators=800, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [322]:
mean_squared_error(y_test,model.predict(x_test),
                    squared = False)

2.1672865603837104

In [314]:
test

Unnamed: 0,Exercise_Duration,Body_Temperature(F),BPM,Weight(lb),Age,height,Weight_Status_Normal Weight,Weight_Status_Obese,Weight_Status_Overweight,Gender_F,Gender_M
0,0.862069,0.792208,0.714286,0.155242,0.423729,0.216216,1,0,0,1,0
1,0.965517,0.766234,0.785714,0.577621,0.016949,0.594595,0,0,1,0,1
2,0.344828,0.649351,0.410714,0.433468,0.644068,0.540541,1,0,0,0,1
3,0.793103,0.831169,0.732143,0.566532,0.254237,0.662162,0,0,1,0,1
4,0.965517,0.909091,0.875000,0.322077,0.796610,0.459459,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
7495,0.931034,0.857143,0.696429,0.477823,0.525424,0.662162,1,0,0,0,1
7496,0.103448,0.389610,0.303571,0.500000,0.084746,0.581081,0,0,1,0,1
7497,0.793103,0.792208,0.535714,0.233367,0.372881,0.256757,1,0,0,1,0
7498,0.241379,0.532468,0.285714,0.255544,0.474576,0.378378,1,0,0,1,0


In [323]:
model.fit(X,Y)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=1.5, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=2, missing=nan, monotone_constraints=None,
             n_estimators=800, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [324]:
model.predict(test)

array([184.03944 , 205.0208  ,  55.229362, ..., 140.1308  ,  35.691673,
       202.98158 ], dtype=float32)

In [325]:
sub = pd.read_csv('C:/Users/whileduck/Desktop/open/sample_submission.csv')

In [326]:
sub['Calories_Burned'] = model.predict(test)

sub = sub.set_index('ID')

In [327]:
sub.to_csv('optuna_2.csv')

In [12]:
model.fit(x_train,y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8164485085575031, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.0002874864017069455, gpu_id=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.09565910837368558, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=10, max_leaves=None,
             min_child_weight=225, missing=nan, monotone_constraints=None,
             n_estimators=391, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [14]:
model.predict(x_test)

array([0.37533927, 0.5715646 , 0.10647854, ..., 0.16881195, 0.07902926,
       0.47102004], dtype=float32)

In [15]:
y_test

2125    0.381271
2214    0.585284
6078    0.110368
5757    0.404682
2253    0.484950
          ...   
1649    0.688963
1950    0.505017
88      0.170569
1911    0.073579
2672    0.474916
Name: Calories_Burned, Length: 1500, dtype: float64