In [1]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [4]:
train_df

Unnamed: 0,ID,추석까지 남은 기간(주),쇼핑몰 구분,가격(원),프로모션 여부,도시 유형,지역 유형,쇼핑몰 유형,선물 유형,수요량
0,TRAIN_0000,1,쇼핑몰 15,212000,0,도시 6,지역 1,쇼핑몰 유형 2,명절혼합과일선물세트,28
1,TRAIN_0001,2,쇼핑몰 72,113000,0,도시 21,지역 1,쇼핑몰 유형 1,발효홍삼선물세트,27
2,TRAIN_0002,0,쇼핑몰 15,67000,0,도시 6,지역 1,쇼핑몰 유형 2,실속스팸선물세트,769
3,TRAIN_0003,1,쇼핑몰 13,206000,0,도시 12,지역 3,쇼핑몰 유형 1,자연산프리미엄버섯선물세트,27
4,TRAIN_0004,1,쇼핑몰 65,140000,0,도시 16,지역 2,쇼핑몰 유형 2,자연산새우선물세트,337
...,...,...,...,...,...,...,...,...,...,...
5867,TRAIN_5867,2,쇼핑몰 46,225000,0,도시 16,지역 3,쇼핑몰 유형 1,고급한과선물세트,27
5868,TRAIN_5868,2,쇼핑몰 42,62000,0,도시 16,지역 4,쇼핑몰 유형 3,특선스페셜스팸선물세트,40
5869,TRAIN_5869,2,쇼핑몰 43,131000,0,도시 5,지역 4,쇼핑몰 유형 1,명품맛김선물세트,55
5870,TRAIN_5870,2,쇼핑몰 7,85000,0,도시 7,지역 1,쇼핑몰 유형 1,실속형견과류선물세트,231


In [5]:
train_data = train_df.drop(['ID', '수요량'], axis=1)
train_label = train_df['수요량']

In [6]:
test_data = test_df.drop('ID', axis=1)

In [7]:
# Label Encoding
nominal_feature = ['쇼핑몰 구분', '도시 유형', '지역 유형', '쇼핑몰 유형', '선물 유형']

for feature in nominal_feature:
    le = LabelEncoder()
    le = le.fit(train_data[feature])
    train_data[feature] = le.transform(train_data[feature])

    for label in np.unique(test_data[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_data[feature] = le.transform(test_data[feature])

In [8]:
# Feature Scaling
scaler = MinMaxScaler()
train_data[['가격(원)', '추석까지 남은 기간(주)']] = scaler.fit_transform(train_data[['가격(원)', '추석까지 남은 기간(주)']])
test_data[['가격(원)', '추석까지 남은 기간(주)']] = scaler.transform(test_data[['가격(원)', '추석까지 남은 기간(주)']])

In [9]:
# 학습 및 검증 데이터로 분할
X_train, X_val, y_train, y_val = train_test_split(train_data, train_label, test_size=0.2)

# LightGBM 모델 정의
model = lgb.LGBMRegressor(objective='regression', metric='rmse')

In [10]:
# GridSearch를 위한 하이퍼파라미터 그리드 정의
param_grid = {
    'num_leaves': [31, 64, 128],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)

# 그리드 서치 수행
grid_search.fit(X_train, y_train)

# 최적의 하이퍼파라미터와 모델 출력
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(f'Best Hyperparameters: {best_params}')

# 검증 데이터에 대한 예측
val_predictions = best_model.predict(X_val)

# RMSE 계산
val_rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f'Validation RMSE: {val_rmse}')

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[CV] END learning_rate=0.1, max_depth=10, n_estimators=300, num_leaves=31; total time=   0.2s
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 3758, number of used features: 8
[LightGBM] [Info] Start training from score 261.707557
[CV] END learning_rate=0.1, max_depth=10, n_estimators=300, num_leaves=31; total time=   0.2s
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 324
[LightGBM] [Info] Number of data points in the train set: 3758, number of used features: 8
[LightGBM] [Info] Start training from score 262.319319
[CV] END learning_rate=0.1, max_depth=10, n_estimators=300, num_leaves=31; total time=   0.2s
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 322
[LightGBM] [Info] Number of data points

In [11]:
# 테스트 데이터 추론
result_df = pd.read_csv('sample_submission.csv')
result_df['수요량'] = best_model.predict(test_data)



In [12]:
result_df.to_csv('result_submission_lgbm4.csv', index=False)