In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool, cv
import catboost
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import log_loss

In [2]:
directory = 'C:/Users/jjy45/OneDrive/바탕 화면/open'
os.chdir(directory)

In [3]:
os.getcwd()

'C:\\Users\\jjy45\\OneDrive\\바탕 화면\\open'

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
trade = pd.read_csv('international_trade.csv')

In [5]:
train.head()   #날짜데이터 2019-01-01 ~ 2023-03-03 

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


In [6]:
# 'date'를 제외한 컬럼 저장하기
# 'timestamp' 컬럼을 datetime 타입으로 변경
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

train.head()

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0


In [7]:
# 날짜 관련 feature 추가 하기

train['year'] = train['timestamp'].dt.year
train['month'] = train['timestamp'].dt.month
train['week'] = train['timestamp'].dt.isocalendar().week.astype(np.int32)
train['weekday']  = train['timestamp'].dt.weekday

# 날짜 관련 피처를 저장해 둔다.
features_date = ['month', 'week', 'weekday']

In [8]:
train['item'].value_counts()

TG    15230
BC    13707
RD    12184
CR    10661
CB     7615
Name: item, dtype: int64

In [9]:
# 컬럼명 변경
train = train.rename(columns = {'price(원/kg)' : 'price', 'supply(kg)' : 'supply'})
train

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3


In [10]:
items = train['item'].value_counts().index.to_list()
items

['TG', 'BC', 'RD', 'CR', 'CB']

In [11]:
corp = train['corporation'].value_counts().index.to_list()
corp

['A', 'E', 'D', 'C', 'B', 'F']

In [12]:
loc = train['location'].value_counts().index.to_list()
loc

['J', 'S']

### supply=0인 경우 제외

In [13]:
train[train['supply']==0]

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2
5,TG_A_J_20190106,2019-01-06,TG,A,J,0.0,0.0,2019,1,1,6
12,TG_A_J_20190113,2019-01-13,TG,A,J,0.0,0.0,2019,1,2,6
19,TG_A_J_20190120,2019-01-20,TG,A,J,0.0,0.0,2019,1,3,6
...,...,...,...,...,...,...,...,...,...,...,...
59363,RD_F_J_20230129,2023-01-29,RD,F,J,0.0,0.0,2023,1,4,6
59370,RD_F_J_20230205,2023-02-05,RD,F,J,0.0,0.0,2023,2,5,6
59377,RD_F_J_20230212,2023-02-12,RD,F,J,0.0,0.0,2023,2,6,6
59384,RD_F_J_20230219,2023-02-19,RD,F,J,0.0,0.0,2023,2,7,6


In [14]:
not_zero = train[train['supply']!=0]
not_zero

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5
6,TG_A_J_20190107,2019-01-07,TG,A,J,44995.0,1474.0,2019,1,2,0
7,TG_A_J_20190108,2019-01-08,TG,A,J,26975.0,1326.0,2019,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3


In [15]:
not_zero['y-m']=not_zero['timestamp'].dt.strftime('%Y-%m')
not_zero['y-m']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_zero['y-m']=not_zero['timestamp'].dt.strftime('%Y-%m')


2        2019-01
3        2019-01
4        2019-01
6        2019-01
7        2019-01
          ...   
59392    2023-02
59393    2023-02
59394    2023-03
59395    2023-03
59396    2023-03
Name: y-m, Length: 23945, dtype: object

In [16]:
#ID 열 제거
not_zero = not_zero.drop('ID', axis = 1)
not_zero

Unnamed: 0,timestamp,item,corporation,location,supply,price,year,month,week,weekday,y-m
2,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,2019-01
3,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,2019-01
4,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,2019-01
6,2019-01-07,TG,A,J,44995.0,1474.0,2019,1,2,0,2019-01
7,2019-01-08,TG,A,J,26975.0,1326.0,2019,1,2,1,2019-01
...,...,...,...,...,...,...,...,...,...,...,...
59392,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0,2023-02
59393,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1,2023-02
59394,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2,2023-03
59395,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3,2023-03


### 휴일여부

In [17]:
from pytimekr import pytimekr

In [18]:
#주말 또는 공휴일이면 1반환 코드

year_2019 = pytimekr.holidays(year=2019)
year_2020 = pytimekr.holidays(year=2020)
year_2021 = pytimekr.holidays(year=2021)
year_2022 = pytimekr.holidays(year=2022)
year_2023 = pytimekr.holidays(year=2023)



def holidays(x):
    if x.weekday() in range(5,8):
        return 1
    if x.year == 2019  and x in year_2019 :
        return 1 
    elif x.year == 2020 and x in year_2020:
        return 1 
    elif x.year == 2021 and x in year_2021 :
        return 1 
    elif x.year == 2022 and x in year_2022 :
        return 1
    elif x.year == 2023 and x in year_2023:
        return 1
    else:
        return 0

In [19]:
import warnings
warnings.filterwarnings('ignore')
train['holiday'] = train['timestamp'].apply(holidays)
train

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0,0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1,0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2,1
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3,0


In [20]:
train['y-m']=train['timestamp'].dt.strftime('%Y-%m')
train['y-m']

0        2019-01
1        2019-01
2        2019-01
3        2019-01
4        2019-01
          ...   
59392    2023-02
59393    2023-02
59394    2023-03
59395    2023-03
59396    2023-03
Name: y-m, Length: 59397, dtype: object

In [21]:
train['holiday'].value_counts()

0    40872
1    18525
Name: holiday, dtype: int64

In [22]:
train

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday,y-m
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,2019-01
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0,2019-01
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0,2019-01
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0,2019-01
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1,2019-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0,2023,2,9,0,0,2023-02
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0,2023,2,9,1,0,2023-02
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0,2023,3,9,2,1,2023-03
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0,2023,3,9,3,0,2023-03


### trade 데이터

In [23]:
trade

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,2019-01,토마토(신선한 것이나 냉장한 것으로 한정한다),356571,990,0,0,990
1,2019-01,양파,821330,222,4003206,1118,-896
2,2019-01,쪽파,60,1,93405,128,-127
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38
...,...,...,...,...,...,...,...
1269,2023-02,포포(papaw)[파파야(papaya)],0,0,23830,71,-71
1270,2023-02,사과,135165,351,0,0,351
1271,2023-02,배,2206012,5411,1,0,5411
1272,2023-02,신 체리[프루너스 체라서스(Prunus cerasus)],5,0,0,0,0


In [24]:
df_test = trade[trade.품목명.str.contains('감귤|브로콜리|무|당근|양배추')]
df_test

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38
5,2019-01,양배추,184650,94,395802,90,4
8,2019-01,당근,23150,22,7466150,2955,-2934
12,2019-01,무화과,2627,23,94529,464,-441
...,...,...,...,...,...,...,...
1250,2023-02,양배추,13188,13,377456,104,-91
1253,2023-02,당근,22510,20,9260020,3758,-3737
1254,2023-02,순무,4000,4,2,0,4
1258,2023-02,무화과,1319,14,104566,454,-440


In [25]:
df_test.loc[df_test['품목명']=='꽃양배추와 브로콜리(broccoli)','품목명'] = '브로콜리'
df_test = df_test[df_test['품목명'] != '방울다다기 양배추']
df_test.loc[df_test['품목명']=='순무','품목명'] = '무'
df_test = df_test[df_test['품목명']!='무화과']
df_test

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,브로콜리,160,1,638913,563,-562
5,2019-01,양배추,184650,94,395802,90,4
8,2019-01,당근,23150,22,7466150,2955,-2934
17,2019-01,감귤,58368,172,0,0,172
28,2019-02,브로콜리,780,1,396870,399,-398
...,...,...,...,...,...,...,...
1248,2023-02,브로콜리,24,0,332640,352,-352
1250,2023-02,양배추,13188,13,377456,104,-91
1253,2023-02,당근,22510,20,9260020,3758,-3737
1254,2023-02,무,4000,4,2,0,4


In [26]:
df_test['품목명'].value_counts()

브로콜리    50
양배추     50
당근      50
감귤      50
무        6
Name: 품목명, dtype: int64

In [27]:
fruits_dict = {'감귤':'TG' ,'브로콜리':'BC' ,'무':'RD' ,'당근':'CR' ,'양배추':'CB'}
fruits_dict

{'감귤': 'TG', '브로콜리': 'BC', '무': 'RD', '당근': 'CR', '양배추': 'CB'}

In [28]:
df_test['품목명'] = df_test['품목명'].map(fruits_dict)

In [29]:
df_test.reset_index()

Unnamed: 0,index,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,3,2019-01,BC,160,1,638913,563,-562
1,5,2019-01,CB,184650,94,395802,90,4
2,8,2019-01,CR,23150,22,7466150,2955,-2934
3,17,2019-01,TG,58368,172,0,0,172
4,28,2019-02,BC,780,1,396870,399,-398
...,...,...,...,...,...,...,...,...
201,1248,2023-02,BC,24,0,332640,352,-352
202,1250,2023-02,CB,13188,13,377456,104,-91
203,1253,2023-02,CR,22510,20,9260020,3758,-3737
204,1254,2023-02,RD,4000,4,2,0,4


In [30]:
# 컬럼명 변경
trade = df_test.rename(columns = {'품목명' : 'item'})
trade

Unnamed: 0,기간,item,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,BC,160,1,638913,563,-562
5,2019-01,CB,184650,94,395802,90,4
8,2019-01,CR,23150,22,7466150,2955,-2934
17,2019-01,TG,58368,172,0,0,172
28,2019-02,BC,780,1,396870,399,-398
...,...,...,...,...,...,...,...
1248,2023-02,BC,24,0,332640,352,-352
1250,2023-02,CB,13188,13,377456,104,-91
1253,2023-02,CR,22510,20,9260020,3758,-3737
1254,2023-02,RD,4000,4,2,0,4


In [30]:
#정렬한 것

In [33]:
copied_df = trade.copy()
copied_df.head()

Unnamed: 0,기간,item,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
3,2019-01,BC,160,1,638913,563,-562
5,2019-01,CB,184650,94,395802,90,4
8,2019-01,CR,23150,22,7466150,2955,-2934
17,2019-01,TG,58368,172,0,0,172
28,2019-02,BC,780,1,396870,399,-398


In [34]:
item_order = ['TG', 'BC', 'RD', 'CR', 'CB']
copied_df['item'] = pd.Categorical(copied_df['item'], categories=item_order, ordered=True)

# 결과 출력 (item으로 먼저 정렬 후 기간으로 정렬)
trade_sort = copied_df.sort_values(['item', '기간'])

# 결과 출력
trade_sort

Unnamed: 0,기간,item,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
17,2019-01,TG,58368,172,0,0,172
41,2019-02,TG,8474,33,0,0,33
64,2019-03,TG,2061,10,4599,12,-2
88,2019-04,TG,328,9,13457,36,-27
114,2019-05,TG,2560,21,0,0,21
...,...,...,...,...,...,...,...
1150,2022-10,CB,3926,7,672530,240,-233
1175,2022-11,CB,13120,23,418180,121,-98
1199,2022-12,CB,133572,84,299384,87,-3
1223,2023-01,CB,216721,129,303960,85,44


In [35]:
trade_sort['item'].value_counts()

TG    50
BC    50
CR    50
CB    50
RD     6
Name: item, dtype: int64

#### 데이터 프레임 합치기

In [31]:
dr_merged = pd.merge(train, trade, left_on=['item', 'y-m'], right_on=['item', '기간'])
dr_merged

Unnamed: 0,ID,timestamp,item,corporation,location,supply,price,year,month,week,weekday,holiday,y-m,기간,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0,2019,1,1,1,1,2019-01,2019-01,58368,172,0,0,172
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0,2019,1,1,2,0,2019-01,2019-01,58368,172,0,0,172
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0,2019,1,1,3,0,2019-01,2019-01,58368,172,0,0,172
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0,2019,1,1,4,0,2019-01,2019-01,58368,172,0,0,172
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0,2019,1,1,5,1,2019-01,2019-01,58368,172,0,0,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48555,BC_E_S_20230224,2023-02-24,BC,E,S,2104.0,2025.0,2023,2,8,4,0,2023-02,2023-02,24,0,332640,352,-352
48556,BC_E_S_20230225,2023-02-25,BC,E,S,1032.0,2353.0,2023,2,8,5,1,2023-02,2023-02,24,0,332640,352,-352
48557,BC_E_S_20230226,2023-02-26,BC,E,S,0.0,0.0,2023,2,8,6,1,2023-02,2023-02,24,0,332640,352,-352
48558,BC_E_S_20230227,2023-02-27,BC,E,S,2200.0,2488.0,2023,2,9,0,0,2023-02,2023-02,24,0,332640,352,-352


In [32]:
#원-핫 인코딩
dr_merged = pd.get_dummies(dr_merged, columns=['item', 'corporation', 'location'], drop_first=True)

In [33]:
dr_merged

Unnamed: 0,ID,timestamp,supply,price,year,month,week,weekday,holiday,y-m,...,item_CB,item_CR,item_RD,item_TG,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_S
0,TG_A_J_20190101,2019-01-01,0.0,0.0,2019,1,1,1,1,2019-01,...,0,0,0,1,0,0,0,0,0,0
1,TG_A_J_20190102,2019-01-02,0.0,0.0,2019,1,1,2,0,2019-01,...,0,0,0,1,0,0,0,0,0,0
2,TG_A_J_20190103,2019-01-03,60601.0,1728.0,2019,1,1,3,0,2019-01,...,0,0,0,1,0,0,0,0,0,0
3,TG_A_J_20190104,2019-01-04,25000.0,1408.0,2019,1,1,4,0,2019-01,...,0,0,0,1,0,0,0,0,0,0
4,TG_A_J_20190105,2019-01-05,32352.0,1250.0,2019,1,1,5,1,2019-01,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48555,BC_E_S_20230224,2023-02-24,2104.0,2025.0,2023,2,8,4,0,2023-02,...,0,0,0,0,0,0,0,1,0,1
48556,BC_E_S_20230225,2023-02-25,1032.0,2353.0,2023,2,8,5,1,2023-02,...,0,0,0,0,0,0,0,1,0,1
48557,BC_E_S_20230226,2023-02-26,0.0,0.0,2023,2,8,6,1,2023-02,...,0,0,0,0,0,0,0,1,0,1
48558,BC_E_S_20230227,2023-02-27,2200.0,2488.0,2023,2,9,0,0,2023-02,...,0,0,0,0,0,0,0,1,0,1


In [34]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 데이터 전처리 및 특성 선택
# (데이터프레임 df에서 필요한 열만 선택하여 X로, 예측하고자 하는 'target' 열을 y로 설정)
#X = df[['supply', 'year','month', '수출 중량', '수입 중량', '수입 금액', '무역수지']]
#X= dr_merged[
#y = dr_merged['price']


# 데이터프레임에서 필요한 열만 선택
#selected_columns = ['supply','year', 'month', 'week' , 'weekday', 'holiday', 'price']
#df = dr_merged[selected_columns]

# 특성과 타겟 데이터 분리
columns_to_drop = ['price', 'ID', 'timestamp', '기간', '수출 중량', '수출 금액', '수입 중량', '수입 금액','y-m']
X = dr_merged.drop(columns=columns_to_drop)
#X = dr_merged['supply',
y = dr_merged['price']

# 학습 데이터와 테스트 데이터로 나누기
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM 데이터셋 생성
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# LightGBM 모델 설정 및 학습
params = {
    'objective': 'regression',  # 회귀 문제 설정
    'metric': 'rmse',  # 평가 지표 설정 (Root Mean Squared Error)
    'boosting_type': 'gbdt',  # 부스팅 알고리즘 설정
    'num_leaves': 31,  # 트리의 최대 잎 수
    'learning_rate': 0.05,  # 학습 속도
    'feature_fraction': 0.9,  # 각 트리마다 사용되는 특성의 비율
}

num_round = 100  # 학습 라운드 수 (트리의 개수)

# LightGBM 모델 학습
model = lgb.train(params, train_data, num_round, valid_sets=[train_data, test_data])

# 예측
y_pred = model.predict(X_test, num_iteration=model.best_iteration)

# 평가 지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 550
[LightGBM] [Info] Number of data points in the train set: 38848, number of used features: 17
[LightGBM] [Info] Start training from score 1339.535575
Root Mean Squared Error: 525.6031859056745


In [35]:
columns_to_drop = ['price', 'ID', 'timestamp', '기간', '수출 중량', '수출 금액', '수입 중량', '수입 금액','y-m']
X = dr_merged.drop(columns=columns_to_drop)
X

Unnamed: 0,supply,year,month,week,weekday,holiday,무역수지,item_CB,item_CR,item_RD,item_TG,corporation_B,corporation_C,corporation_D,corporation_E,corporation_F,location_S
0,0.0,2019,1,1,1,1,172,0,0,0,1,0,0,0,0,0,0
1,0.0,2019,1,1,2,0,172,0,0,0,1,0,0,0,0,0,0
2,60601.0,2019,1,1,3,0,172,0,0,0,1,0,0,0,0,0,0
3,25000.0,2019,1,1,4,0,172,0,0,0,1,0,0,0,0,0,0
4,32352.0,2019,1,1,5,1,172,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48555,2104.0,2023,2,8,4,0,-352,0,0,0,0,0,0,0,1,0,1
48556,1032.0,2023,2,8,5,1,-352,0,0,0,0,0,0,0,1,0,1
48557,0.0,2023,2,8,6,1,-352,0,0,0,0,0,0,0,1,0,1
48558,2200.0,2023,2,9,0,0,-352,0,0,0,0,0,0,0,1,0,1
