In [3]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore') 

In [4]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Note: Python reads ``PYTHONHASHSEED`` at interpreter startup, so writing
    it here only reaches processes started afterwards; it does not change the
    hash randomization of the kernel that is already running.
    """
    random.seed(seed)
    seed_text = str(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    np.random.seed(seed)


seed_everything(42)  # fix the seed

In [5]:
# Directory holding the competition CSV files. Override it by setting the
# JEJU_DATA_DIR environment variable before starting the kernel; the default
# is the original hard-coded location, so existing setups keep working.
DATA_DIR = os.environ.get(
    'JEJU_DATA_DIR',
    '/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open',
)

# Load the training and test sets.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [6]:
def _add_date_parts(df):
    """Add integer ``year``, ``month`` and ``day`` columns taken from ``timestamp``.

    The ``timestamp`` column is sliced at fixed character positions
    (``[0:4]``, ``[5:7]``, ``[8:10]``), so it is expected to hold
    ``YYYY-MM-DD`` strings. ``df`` is modified in place and also returned.
    """
    stamps = df['timestamp'].str  # vectorized string accessor; no per-row lambda
    df['year'] = stamps[0:4].astype('int64')
    df['month'] = stamps[5:7].astype('int64')
    df['day'] = stamps[8:10].astype('int64')
    return df


# To let the model see the time-series structure, split the timestamp into
# year, month and day features (same transformation for train and test).
_add_date_parts(train_df)
_add_date_parts(test_df)

In [7]:
# Preview the first rows of the training and test frames, one after the other.
print(train_df.head(), test_df.head(), sep='\n')

                ID   timestamp item corporation location  supply(kg)  \
0  TG_A_J_20190101  2019-01-01   TG           A        J         0.0   
1  TG_A_J_20190102  2019-01-02   TG           A        J         0.0   
2  TG_A_J_20190103  2019-01-03   TG           A        J     60601.0   
3  TG_A_J_20190104  2019-01-04   TG           A        J     25000.0   
4  TG_A_J_20190105  2019-01-05   TG           A        J     32352.0   

   price(원/kg)  year  month  day  
0          0.0  2019      1    1  
1          0.0  2019      1    2  
2       1728.0  2019      1    3  
3       1408.0  2019      1    4  
4       1250.0  2019      1    5  
                ID   timestamp item corporation location  year  month  day
0  TG_A_J_20230304  2023-03-04   TG           A        J  2023      3    4
1  TG_A_J_20230305  2023-03-05   TG           A        J  2023      3    5
2  TG_A_J_20230306  2023-03-06   TG           A        J  2023      3    6
3  TG_A_J_20230307  2023-03-07   TG           A        J 

In [8]:
# Separate the target from the model inputs and remove the columns that are
# not used for training. 'supply(kg)' and the price column are dropped only
# from the training frame; the printed test frame does not show them.
train_y = train_df['price(원/kg)']
train_x = train_df.drop(['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'], axis=1)

test_x = test_df.drop(['ID', 'timestamp'], axis=1)

In [9]:
# Convert the qualitative (categorical) columns to integer codes.
qual_col = ['item', 'corporation', 'location']

for i in qual_col:
    # Learn the label -> code mapping from the training data only; fitting on
    # the test data would be data leakage.
    le = LabelEncoder().fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    # NOTE(review): a test label never seen during fit is expected to make
    # transform() raise -- confirm the test set has no new categories.
    test_x[i] = le.transform(test_x[i])

print('Done.')

Done.


In [12]:
# Show the first rows of the encoded training features.
print(train_x.head())

   item  corporation  location  year  month  day
0     4            0         0  2019      1    1
1     4            0         0  2019      1    2
2     4            0         0  2019      1    3
3     4            0         0  2019      1    4
4     4            0         0  2019      1    5


In [13]:
# Show the first rows of the encoded test features.
print(test_x.head())

   item  corporation  location  year  month  day
0     4            0         0  2023      3    4
1     4            0         0  2023      3    5
2     4            0         0  2023      3    6
3     4            0         0  2023      3    7
4     4            0         0  2023      3    8


In [14]:
# Baseline model: a random forest with default hyper-parameters.
# random_state is set explicitly (same value as seed_everything(42)) so the
# fitted forest does not depend on how much of the global NumPy random stream
# earlier or re-executed cells have already consumed.
model = RandomForestRegressor(random_state=42)
model.fit(train_x, train_y)

In [15]:
# Predict the target for every row of the test features.
preds = model.predict(test_x)

In [17]:
# Load the sample submission file, which serves as the template that the
# predictions are written into, and display it.
submission = pd.read_csv(
    filepath_or_buffer='/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/sample_submission.csv'
)
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,0
1,TG_A_J_20230305,0
2,TG_A_J_20230306,0
3,TG_A_J_20230307,0
4,TG_A_J_20230308,0
...,...,...
1087,RD_F_J_20230327,0
1088,RD_F_J_20230328,0
1089,RD_F_J_20230329,0
1090,RD_F_J_20230330,0


In [18]:
# `preds` is positional: entry k is the prediction for row k of test_df.
# Verify that the submission template lists the same IDs in the same order
# before copying the values over; otherwise predictions would silently be
# attached to the wrong rows.
if submission['ID'].tolist() != test_df['ID'].tolist():
    raise ValueError('sample_submission IDs do not match test_df IDs row-for-row')

submission['answer'] = preds
submission

Unnamed: 0,ID,answer
0,TG_A_J_20230304,3636.64
1,TG_A_J_20230305,4109.28
2,TG_A_J_20230306,401.82
3,TG_A_J_20230307,3277.93
4,TG_A_J_20230308,3067.03
...,...,...
1087,RD_F_J_20230327,271.27
1088,RD_F_J_20230328,441.28
1089,RD_F_J_20230329,439.31
1090,RD_F_J_20230330,436.00


In [20]:
# Write the filled-in submission to disk; index=False keeps the DataFrame
# index out of the CSV file.
submission.to_csv('/mnt/c/Users/wschu/OneDrive/Documents/data/jeju_specialty/open/baseline_submission.csv', index=False)

In [21]:
# using darts library
!pip install pytimekr darts

Collecting pytimekr
  Downloading pytimekr-0.1.0.tar.gz (7.3 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting darts
  Downloading darts-0.28.0-py3-none-any.whl (846 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m846.9/846.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lunardate>=0.1.5
  Downloading lunardate-0.2.2-py3-none-any.whl (18 kB)
Collecting shap>=0.40.0
  Downloading shap-0.45.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (538 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m538.2/538.2 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting statsmodels>=0.14.0
  Downloading statsmodels-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting xarra