# 라이브러리

In [1]:
import warnings # 경고 메시지를 관리하는 라이브러리
warnings.filterwarnings('ignore') # 코드 실행 중 발생할 수 있는 경고 메시지를 무시하도록 설정
from datetime import datetime, timedelta # 파이썬의 기본 모듈로, 날짜와 시간을 다룰 때 사용

import torch # 파이토치(PyTorch)는 딥러닝 프레임워크로, 신경망 모델을 구축하고 학습시키는 데 사용
from torch import nn #  파이토치의 신경망 모델을 정의하는 데 사용되는 모듈 nn.Module 과 같은 클래스를 포함하고 있다.
from torch.nn import functional as F # 파이토치의 함수형 API로, 다양한 신경망 함수와 연산을 제공합니다. 활성화 함수, 손실 함수 등을 포함
from torch.utils.data import Dataset # 데이터셋을 정의하는데 사용되는 파이토치의 클래스, 주어진 데이터를 모델 학습에 맞게 불러오고 가공하는 역할
from torch.utils.data import DataLoader # 데이터셋을 배치 단위로 나누고 데이터 로딩을 관리하는 클래스, 모델 학습시 데이터를 효율적으로 처리

import numpy as np 
import pandas as pd 
import seaborn as sns
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler 
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# 파라미터 설정

In [2]:
random_seed = 42 # 난수 생성을 위한 seed 값을 설정 , 시드값을 고정하면 실험을 재현하거나 결과의 일관성을 유지하는데 도움
torch.manual_seed(random_seed) # PyTorch 에 딥 러닝 모뎅을 학습할 때 결과의 일관성을 유지하기 위해 필요한 작업 중 하나 
np.random.seed(random_seed) # NumPy는 데이터 처리와 관련된 다양한 작업에서 사용되며, 시드를 설정하여 결과를 일관성 있게 만든다.

pd.set_option('display.max_columns', None) # Pandas에서 최대 열 표시 옵션을 설정 None 으로 설정하면 모든 열을 화면에 표시
pd.set_option('display.max_rows', 50) # Pandas 최대 행 표시 옵션을 설정 현재는 50개의 행을 화면에  표시하도록 설정.

# 데이터 전처리

## 데이터 로드

In [3]:
df = pd.read_csv('./train.csv') 
df

Unnamed: 0,신고접수번호,접수경로,신고접수일시,시군구,접수분류,긴급구조종류
0,JGT2IBW4,이동전화,20130101_0001,,안내,
1,74ED11Z4,기타,20130101_0002,,안내,
2,B4I8RIBW,이동전화,20130101_0002,,안내,
3,482FI3AJ,이동전화,20130101_0003,북구,출동,구급
4,AR9N3QT4,이동전화,20130101_0004,,안내,
...,...,...,...,...,...,...
6415797,KAOUMKLU,기타,20220630_2354,,안내,
6415798,LAI1MQ2K,이동전화,20220630_2354,사상구,출동,구급
6415799,N5KF2ELA,이동전화,20220630_2356,동구,출동,구급
6415800,53Q98D6Z,의료지도연결,20220630_2358,,안내,


In [4]:
# df.isna().sum()
# df['접수경로'].value_counts()
# df['시군구'].value_counts()
# df['접수분류'].value_counts()
df['긴급구조종류'].value_counts()

긴급구조종류
구급    1582608
기타     403642
구조     321461
화재      92466
Name: count, dtype: int64

## 시간대 이동

In [5]:
df['dt'] = df['신고접수일시'].apply(lambda x: datetime.strptime(x, '%Y%m%d_%H%M'))
df['_dt'] = df['dt'] + timedelta(hours=6) # 6 hours shift (18~08 -> 00~13)

#  '_dt' 열에서 날짜 정보만 추출하여 '_date' 열에 저장. 그리고 이 값을 문자열로 변환합니다. 이로써 날짜 정보만 가지는 새로운 열이 생성
df['_date'] = df._dt.dt.date.astype(str)

# dt' 열에서 시간 정보만 추출하여 '_hour' 열에 저장. 이로써 시간 정보만 가지는 새로운 열이 생성
df['_hour'] = df._dt.dt.hour
df

Unnamed: 0,신고접수번호,접수경로,신고접수일시,시군구,접수분류,긴급구조종류,dt,_dt,_date,_hour
0,JGT2IBW4,이동전화,20130101_0001,,안내,,2013-01-01 00:01:00,2013-01-01 06:01:00,2013-01-01,6
1,74ED11Z4,기타,20130101_0002,,안내,,2013-01-01 00:02:00,2013-01-01 06:02:00,2013-01-01,6
2,B4I8RIBW,이동전화,20130101_0002,,안내,,2013-01-01 00:02:00,2013-01-01 06:02:00,2013-01-01,6
3,482FI3AJ,이동전화,20130101_0003,북구,출동,구급,2013-01-01 00:03:00,2013-01-01 06:03:00,2013-01-01,6
4,AR9N3QT4,이동전화,20130101_0004,,안내,,2013-01-01 00:04:00,2013-01-01 06:04:00,2013-01-01,6
...,...,...,...,...,...,...,...,...,...,...
6415797,KAOUMKLU,기타,20220630_2354,,안내,,2022-06-30 23:54:00,2022-07-01 05:54:00,2022-07-01,5
6415798,LAI1MQ2K,이동전화,20220630_2354,사상구,출동,구급,2022-06-30 23:54:00,2022-07-01 05:54:00,2022-07-01,5
6415799,N5KF2ELA,이동전화,20220630_2356,동구,출동,구급,2022-06-30 23:56:00,2022-07-01 05:56:00,2022-07-01,5
6415800,53Q98D6Z,의료지도연결,20220630_2358,,안내,,2022-06-30 23:58:00,2022-07-01 05:58:00,2022-07-01,5


## 시간별 신고량 집계

In [6]:
df = df[['_date', '_hour', '신고접수번호']].groupby(['_date', '_hour']).count().reset_index()

column_map = {'_date': 'date', '_hour': 'hour', '신고접수번호': 'y'}
df = df.rename(column_map, axis='columns')
df

Unnamed: 0,date,hour,y
0,2013-01-01,6,77
1,2013-01-01,7,81
2,2013-01-01,8,72
3,2013-01-01,9,45
4,2013-01-01,10,47
...,...,...,...
83227,2022-07-01,1,136
83228,2022-07-01,2,128
83229,2022-07-01,3,90
83230,2022-07-01,4,100


## 데이터 확인

In [7]:
# 일별로 24시간의 데이터가 모두 있는지 확인
check = df[['date', 'y']].groupby('date').count()
drop_dates = list(check[check['y'] != 24].index)

# column "date" -> index
df = df.set_index('date')
df = df.drop(index=drop_dates)
df

Unnamed: 0_level_0,hour,y
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,0,97
2013-01-02,1,103
2013-01-02,2,102
2013-01-02,3,102
2013-01-02,4,61
...,...,...
2022-06-30,19,112
2022-06-30,20,115
2022-06-30,21,105
2022-06-30,22,78


In [8]:
df.to_csv('p_train.csv', index=True)

## 데이터 분할 (train/valid)

In [9]:
dates = list(set(df.index)) # 중복 날짜 제거
dates = sorted(dates)

valid_ratio = 0.02 
train_idx = int(len(dates) * (1 - valid_ratio))

train_dates, valid_dates = dates[:train_idx], dates[train_idx:]
assert len(dates) == len(train_dates) + len(valid_dates)
print(f'{len(dates)} -> #train {len(train_dates)} #val {len(valid_dates)}')

3467 -> #train 3397 #val 70


# Dataset

## 스케일러 정의

In [10]:
scaler = StandardScaler()
scaler.fit(df.loc[train_dates, 'y'].to_numpy().reshape((-1, 1)))

StandardScaler()

## Dataset 정의

In [11]:
class CustomDataset(Dataset):
    def __init__(self, df: pd.DataFrame, input_dim: int, output_dim: int, scaler: StandardScaler):
        self.df = df.copy()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.scaler = scaler
        
    def __len__(self):
        return len(self.df) - self.input_dim - self.output_dim + 1 # window size 24(=15+9)
    
    def __getitem__(self, idx) -> (torch.Tensor, torch.Tensor):
        x_begin, x_end = idx, idx + self.input_dim
        X = self.df.iloc[x_begin:x_end]['y']

        y_begin, y_end = x_end, x_end + self.output_dim
        y = self.df.iloc[y_begin:y_end]['y']

        X = X.to_numpy().reshape((-1, 1))
        y = y.to_numpy().reshape((-1, 1))

        X = self.scaler.transform(X).reshape(-1).astype(np.float32)
        y = self.scaler.transform(y).reshape(-1).astype(np.float32)

        return X, y
    
    def get_dates(self) -> list:
        return self.dates

# Model 정의

In [12]:
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        # self.fc1 = nn.Linear(input_dim, 256)
        # self.fc2 = nn.Linear(256, 64)
        # self.fc3 = nn.Linear(64, output_dim)
        self.fc1 = nn.Linear(input_dim, 512) # 512 -> 256
        self.ln1 = nn.GroupNorm(1, 512, eps=1e-08) # 512 -> 256
        self.fc2 = nn.Linear(512, 128)
        self.ln2 = nn.GroupNorm(1, 128, eps=1e-08) # 1e-08 -> 1e-07
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, output_dim)

    def forward(self, inputs):
        # x = F.relu(self.fc1(inputs))
        # x = F.relu(self.fc2(x))
        # x = self.fc3(x)
        x = F.relu(self.ln1(self.fc1(inputs)))
        x = F.relu(self.ln2(self.fc2(x)))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return x


class LSTM(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_dim)

    def forward(self, x):
        x, _ = self.lstm(x)
        #x = x[:, -1, :] # last time step
        x = self.fc(x)
        return x

# 학습

## 학습 설정

In [13]:
n_epochs = 20
batch_size = 32 
learning_rate = 0.02 

input_dim = 15 # 18~8
output_dim = 9 # 9~17

train_df = df.loc[train_dates]
train_dataset = CustomDataset(train_df, input_dim=input_dim, output_dim=output_dim, scaler=scaler)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size)

valid_df = df.loc[valid_dates]
valid_dataset = CustomDataset(valid_df, input_dim=input_dim, output_dim=output_dim, scaler=scaler)
valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size)

# model = MLP(input_dim, output_dim) # MLP 에서 LSTM 사용
model = LSTM(input_dim, output_dim, hidden_size=128) #  hidden_size 50 -> 100

loss_function = nn.L1Loss() # MAE
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## 모델 학습

In [14]:
for epoch in range(n_epochs):
    print(f'epoch #{epoch+1}')
    train_loss, valid_loss = 0, 0
    y_trues, y_preds = [], []

    # train
    model.train()
    for X, y in tqdm(train_data_loader):
        pred = model(X)
        
        loss = loss_function(y, pred) # true, pred
        train_loss += loss.detach().numpy()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # valid
    model.eval()
    for X, y in tqdm(valid_data_loader):
        pred = model(X)

        loss = loss_function(y, pred) # true, pred
        valid_loss += loss.detach().numpy()

        y_true = y.detach().numpy()[0]
        y_pred = pred.detach().numpy()[0]
        
        y_trues.append(y_true)
        y_preds.append(y_pred)

    y_trues = scaler.inverse_transform(np.array(y_trues)).reshape(-1)
    y_preds = scaler.inverse_transform(np.array(y_preds)).reshape(-1)

    MAE = mean_absolute_error(y_trues, y_preds)
    MAPE = mean_absolute_percentage_error(y_trues, y_preds)
    print(f'train {train_loss:.6f} | valid {valid_loss:.6f} | MAE {MAE:.6f} | MAPE {MAPE:.6f}')

epoch #1


100%|██████████| 2548/2548 [00:18<00:00, 134.97it/s]
100%|██████████| 52/52 [00:00<00:00, 188.14it/s]


train 870.903123 | valid 16.668663 | MAE 18.155186 | MAPE 0.215831
epoch #2


100%|██████████| 2548/2548 [00:20<00:00, 126.06it/s]
100%|██████████| 52/52 [00:00<00:00, 258.98it/s]


train 836.049938 | valid 17.380390 | MAE 19.642368 | MAPE 0.236907
epoch #3


100%|██████████| 2548/2548 [00:17<00:00, 142.79it/s]
100%|██████████| 52/52 [00:00<00:00, 238.70it/s]


train 828.326013 | valid 16.543244 | MAE 18.540724 | MAPE 0.224000
epoch #4


100%|██████████| 2548/2548 [00:18<00:00, 139.63it/s]
100%|██████████| 52/52 [00:00<00:00, 292.09it/s]


train 834.857394 | valid 17.771691 | MAE 18.735659 | MAPE 0.225970
epoch #5


100%|██████████| 2548/2548 [00:18<00:00, 139.49it/s]
100%|██████████| 52/52 [00:00<00:00, 306.50it/s]


train 843.333473 | valid 17.491973 | MAE 18.029718 | MAPE 0.204966
epoch #6


100%|██████████| 2548/2548 [00:18<00:00, 141.27it/s]
100%|██████████| 52/52 [00:00<00:00, 308.65it/s]


train 829.803308 | valid 17.385953 | MAE 20.006348 | MAPE 0.250593
epoch #7


100%|██████████| 2548/2548 [00:17<00:00, 149.25it/s]
100%|██████████| 52/52 [00:00<00:00, 307.60it/s]


train 832.413612 | valid 16.872039 | MAE 16.481312 | MAPE 0.197112
epoch #8


100%|██████████| 2548/2548 [00:17<00:00, 145.31it/s]
100%|██████████| 52/52 [00:00<00:00, 256.00it/s]


train 823.841620 | valid 17.061435 | MAE 18.262878 | MAPE 0.223986
epoch #9


100%|██████████| 2548/2548 [00:17<00:00, 149.56it/s]
100%|██████████| 52/52 [00:00<00:00, 309.44it/s]


train 824.603656 | valid 16.887896 | MAE 18.519144 | MAPE 0.228847
epoch #10


100%|██████████| 2548/2548 [00:16<00:00, 150.22it/s]
100%|██████████| 52/52 [00:00<00:00, 281.84it/s]


train 813.653248 | valid 16.387081 | MAE 16.704529 | MAPE 0.209841
epoch #11


100%|██████████| 2548/2548 [00:18<00:00, 140.84it/s]
100%|██████████| 52/52 [00:00<00:00, 280.48it/s]


train 821.476526 | valid 16.995253 | MAE 19.277990 | MAPE 0.239523
epoch #12


100%|██████████| 2548/2548 [00:17<00:00, 142.61it/s]
100%|██████████| 52/52 [00:00<00:00, 245.36it/s]


train 821.564875 | valid 17.111197 | MAE 20.124432 | MAPE 0.263821
epoch #13


100%|██████████| 2548/2548 [00:16<00:00, 156.48it/s]
100%|██████████| 52/52 [00:00<00:00, 309.30it/s]


train 825.270508 | valid 16.955923 | MAE 18.469526 | MAPE 0.221760
epoch #14


100%|██████████| 2548/2548 [00:17<00:00, 146.13it/s]
100%|██████████| 52/52 [00:00<00:00, 294.44it/s]


train 831.976181 | valid 16.920597 | MAE 18.310093 | MAPE 0.225337
epoch #15


100%|██████████| 2548/2548 [00:16<00:00, 152.16it/s]
100%|██████████| 52/52 [00:00<00:00, 278.10it/s]


train 819.821617 | valid 17.270272 | MAE 17.891678 | MAPE 0.220532
epoch #16


100%|██████████| 2548/2548 [00:16<00:00, 151.08it/s]
100%|██████████| 52/52 [00:00<00:00, 294.07it/s]


train 821.711118 | valid 17.172910 | MAE 18.670559 | MAPE 0.231788
epoch #17


100%|██████████| 2548/2548 [00:17<00:00, 143.00it/s]
100%|██████████| 52/52 [00:00<00:00, 307.08it/s]


train 833.752362 | valid 17.343246 | MAE 20.143137 | MAPE 0.246338
epoch #18


100%|██████████| 2548/2548 [00:17<00:00, 149.32it/s]
100%|██████████| 52/52 [00:00<00:00, 243.26it/s]


train 835.722767 | valid 16.639602 | MAE 19.102539 | MAPE 0.237479
epoch #19


100%|██████████| 2548/2548 [00:17<00:00, 146.22it/s]
100%|██████████| 52/52 [00:00<00:00, 311.78it/s]


train 819.095166 | valid 16.929484 | MAE 19.278894 | MAPE 0.238633
epoch #20


100%|██████████| 2548/2548 [00:16<00:00, 153.98it/s]
100%|██████████| 52/52 [00:00<00:00, 308.41it/s]

train 822.501057 | valid 17.036939 | MAE 18.255220 | MAPE 0.223726





# 예측값 시각화

In [15]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# valid
y_trues, y_preds = [], []
MAE, MAPE = 0, 0

model.eval()
for X, y in tqdm(valid_data_loader):
    pred = model(X)

    y_true = y.detach().numpy()[0]
    y_pred = pred.detach().numpy()[0]
    
    y_trues.append(y_true)
    y_preds.append(y_pred)

y_trues = scaler.inverse_transform(np.array(y_trues)).reshape(-1)
y_preds = scaler.inverse_transform(np.array(y_preds)).reshape(-1)

MAE = mean_absolute_error(y_trues, y_preds)
MAPE = mean_absolute_percentage_error(y_trues, y_preds)

fig = make_subplots(rows=1, cols=1)
fig.update_layout(title_text=f'prediction on daily (MAE: {MAE:.4f}    MAPE: {MAPE*100:.2f}%)')

fig.add_trace(go.Scatter(x=list(range(len(y_trues))), y=y_trues, name='y_true'), row=1, col=1)
fig.add_trace(go.Scatter(x=list(range(len(y_trues))), y=y_preds, name='y_pred'), row=1, col=1)
fig.show()

100%|██████████| 52/52 [00:00<00:00, 276.07it/s]


# 제출

## 테스트용 Dataset 정의

In [16]:
class TestDataset(Dataset):
    def __init__(self, df: pd.DataFrame, dates: list, scaler: StandardScaler):
        self.df = df.copy()
        self.dates = dates
        self.scaler = scaler
        
    def __len__(self):
        return len(self.dates)

    def __getitem__(self, idx) -> (torch.Tensor, torch.Tensor):
        date = self.dates[idx]

        select_data = self.df.loc[date]
        X = select_data['y'].to_numpy().reshape((-1, 1))
        X = self.scaler.transform(X).reshape(-1).astype(np.float32)

        return torch.from_numpy(X)
    
    def get_input_dimension(self) -> int:
        return 15 # 0 ~ 14 (6h shifted)
    
    def get_output_dimension(self) -> int:
        return 9 # 14 ~ 23 (6h shifted)
    
    def get_dates(self) -> list:
        return self.dates

## 테스트 데이터 전처리

In [17]:
df = pd.read_csv('test.csv')
df['dt'] = df['신고접수일시'].apply(lambda x: datetime.strptime(x, '%Y%m%d_%H%M'))
df['_dt'] = df['dt'] + timedelta(hours=6) # 6 hours shift (21~08 -> 00~14)
df['date'] = df._dt.dt.date.astype(str)
df['hour'] = df._dt.dt.hour

df = df[['date', 'hour', '신고접수번호']].groupby(['date', 'hour']).count().reset_index()

column_map = {'_date': 'date', '_hour': 'hour', '신고접수번호': 'y'}
df = df.rename(column_map, axis='columns')
df

Unnamed: 0,date,hour,y
0,2022-07-01,0,150
1,2022-07-01,1,136
2,2022-07-01,2,128
3,2022-07-01,3,90
4,2022-07-01,4,100
...,...,...,...
1600,2022-10-15,10,31
1601,2022-10-15,11,36
1602,2022-10-15,12,21
1603,2022-10-15,13,47


In [18]:
df.to_csv('p_test.csv', index=True)

## 테스트 예측

In [19]:
df = df.set_index('date')
dates = df.index.unique().tolist()

test_dataset = TestDataset(df, dates=dates, scaler=scaler)
test_data_loader = DataLoader(test_dataset, batch_size=1)

# test
y_trues, y_preds = [], []
MAE, MAPE = 0, 0

model.eval()
for X in test_data_loader:
    pred = model(X)

    y_pred = pred.detach().numpy()[0]
    y_preds.append(y_pred)

y_preds = scaler.inverse_transform(np.array(y_preds))
y_preds = y_preds.reshape(-1)

## 제출 파일 생성

In [20]:
submission = pd.read_csv('sample_submission.csv')
submission['y'] = y_preds
submission.to_csv('submission.csv', index=False)

# 성능향상

In [21]:
df = pd.read_csv('p_train.csv') # train -> test 필요시 마다 변동
display(df)

dates = df['date'].unique().tolist()

df['date'] = pd.to_datetime(df['date']) # str -> datetime
df['doy'] = df['date'].dt.dayofyear
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.weekday

#datetype -> str
df['date'] = df['date'].dt.date.astype(str)
display(df.head())

Unnamed: 0,date,hour,y
0,2013-01-02,0,97
1,2013-01-02,1,103
2,2013-01-02,2,102
3,2013-01-02,3,102
4,2013-01-02,4,61
...,...,...,...
83203,2022-06-30,19,112
83204,2022-06-30,20,115
83205,2022-06-30,21,105
83206,2022-06-30,22,78


Unnamed: 0,date,hour,y,doy,year,month,day,weekday
0,2013-01-02,0,97,2,2013,1,2,2
1,2013-01-02,1,103,2,2013,1,2,2
2,2013-01-02,2,102,2,2013,1,2,2
3,2013-01-02,3,102,2,2013,1,2,2
4,2013-01-02,4,61,2,2013,1,2,2


In [22]:
# ML model (xgboost)
# xgboost classifier 를 많이 사용하지만  regressor 도 괜찮다고 함.

# hour -> daily
hour_data = []
for date in dates:
    # 2013-01-02
    data = df[df['date'] == date]['y'].tolist()
    hour_data.append(data)
    # print(data)

print(hour_data[:5])


[[97, 103, 102, 102, 61, 59, 41, 31, 28, 22, 17, 12, 27, 44, 39, 85, 74, 37, 63, 65, 59, 69, 52, 57], [112, 105, 111, 118, 85, 81, 65, 39, 32, 28, 19, 37, 40, 52, 89, 68, 81, 57, 53, 68, 56, 75, 44, 75], [107, 108, 102, 119, 103, 95, 36, 44, 27, 39, 34, 22, 39, 55, 75, 83, 108, 57, 77, 49, 66, 66, 56, 60], [59, 85, 78, 95, 65, 68, 52, 42, 42, 36, 24, 51, 29, 27, 38, 65, 77, 75, 69, 60, 90, 94, 126, 130], [111, 113, 84, 113, 101, 76, 38, 55, 35, 34, 33, 28, 26, 60, 59, 170, 190, 175, 161, 160, 164, 163, 151, 172]]


In [23]:
hour_df = pd.DataFrame(data=hour_data, columns=[f'{i}H' for i in range(24)]) # train 24, test 15
hour_df

Unnamed: 0,0H,1H,2H,3H,4H,5H,6H,7H,8H,9H,10H,11H,12H,13H,14H,15H,16H,17H,18H,19H,20H,21H,22H,23H
0,97,103,102,102,61,59,41,31,28,22,17,12,27,44,39,85,74,37,63,65,59,69,52,57
1,112,105,111,118,85,81,65,39,32,28,19,37,40,52,89,68,81,57,53,68,56,75,44,75
2,107,108,102,119,103,95,36,44,27,39,34,22,39,55,75,83,108,57,77,49,66,66,56,60
3,59,85,78,95,65,68,52,42,42,36,24,51,29,27,38,65,77,75,69,60,90,94,126,130
4,111,113,84,113,101,76,38,55,35,34,33,28,26,60,59,170,190,175,161,160,164,163,151,172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3462,151,161,154,123,91,96,96,84,51,40,26,44,70,90,79,136,111,109,111,110,127,106,113,106
3463,122,120,120,120,81,93,92,82,80,53,64,59,60,98,131,124,116,158,161,111,98,121,103,107
3464,104,95,80,91,85,90,53,54,39,58,36,37,55,57,96,96,94,85,106,123,112,92,97,79
3465,105,143,142,164,112,79,73,66,46,30,34,20,38,43,64,103,120,103,85,102,88,100,52,74


In [24]:
df = df.drop_duplicates('date', keep='first').reset_index(drop=True)
df = pd.concat([df,hour_df], axis='columns')

In [25]:
# train val

# random split
val_size = 0.2 # 0.1 -> 0.2
from sklearn.model_selection import train_test_split
train_test_split, val_dates = train_test_split(dates, train_size=1-val_size, shuffle=False)
print(val_dates)

# select date
# 22-07-01 ~ 22-10-15
begin_date, end_date = '2021-07-01', '21-10-15'
train = df[(df['date'] < begin_date) | (df['date'] > end_date)]
val = df[(df['date'] >= begin_date) & (df['date'] <= end_date)]

# X 15h -> y 9h
X_column = ['doy','year','month','day','weekday'] + [f'{i}H' for i in range(15)] #
y_column = [f'{i}H' for i in range(15, 24)]

X_train = train[X_column].to_numpy()
y_train = train[y_column].to_numpy()

X_val = val[X_column].to_numpy()
y_val = val[y_column].to_numpy()

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

['2020-08-06', '2020-08-07', '2020-08-08', '2020-08-09', '2020-08-10', '2020-08-11', '2020-08-12', '2020-08-13', '2020-08-14', '2020-08-15', '2020-08-16', '2020-08-17', '2020-08-18', '2020-08-19', '2020-08-20', '2020-08-21', '2020-08-22', '2020-08-23', '2020-08-24', '2020-08-25', '2020-08-26', '2020-08-27', '2020-08-28', '2020-08-29', '2020-08-30', '2020-08-31', '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-05', '2020-09-06', '2020-09-07', '2020-09-08', '2020-09-09', '2020-09-10', '2020-09-11', '2020-09-12', '2020-09-13', '2020-09-14', '2020-09-15', '2020-09-16', '2020-09-17', '2020-09-18', '2020-09-19', '2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27', '2020-09-28', '2020-09-29', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-04', '2020-10-05', '2020-10-06', '2020-10-07', '2020-10-08', '2020-10-09', '2020-10-10', '2020-10-11', '2020-10-12', '2020-10-13', '2020-10-14', '2020-10-15', '2020

In [26]:
xgb_params = {    
    'learning_rate' : 0.1,
    'n_estimators' : 50,
    'max_depth' : 7,
    'min_child_weight' : 0.1,
    'gamma' : 1,
    'subsample' : 0.9,
    'colsample_bytree' : 0.8,
    'n_jobs' : -1,
    'scale_pos_weight' : 0.1,
    'seed' : 42
}

model = XGBRegressor(**xgb_params)

eval_set = [(X_val, y_val)]

model.fit(X=X_train, 
          y=y_train,
          eval_set = eval_set,
          eval_metric = 'mape', # 'mape' 'mae'. 'mse' 
          early_stopping_rounds = 10,
          verbose = 1)

[0]	validation_0-mape:0.90293


[1]	validation_0-mape:0.82149
[2]	validation_0-mape:0.74601
[3]	validation_0-mape:0.67903
[4]	validation_0-mape:0.62048
[5]	validation_0-mape:0.56716
[6]	validation_0-mape:0.52033
[7]	validation_0-mape:0.47738
[8]	validation_0-mape:0.43856
[9]	validation_0-mape:0.40461
[10]	validation_0-mape:0.37398
[11]	validation_0-mape:0.34770
[12]	validation_0-mape:0.32406
[13]	validation_0-mape:0.30187
[14]	validation_0-mape:0.28227
[15]	validation_0-mape:0.26718
[16]	validation_0-mape:0.25385
[17]	validation_0-mape:0.24297
[18]	validation_0-mape:0.23372
[19]	validation_0-mape:0.22596
[20]	validation_0-mape:0.21907
[21]	validation_0-mape:0.21316
[22]	validation_0-mape:0.20827
[23]	validation_0-mape:0.20446
[24]	validation_0-mape:0.20152
[25]	validation_0-mape:0.19832
[26]	validation_0-mape:0.19598
[27]	validation_0-mape:0.19404
[28]	validation_0-mape:0.19251
[29]	validation_0-mape:0.19133
[30]	validation_0-mape:0.19044
[31]	validation_0-mape:0.18972
[32]	validation_0-mape:0.18907
[33]	validation_0

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=1, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.1, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=7, max_leaves=None,
             min_child_weight=0.1, missing=nan, monotone_constraints=None,
             n_estimators=50, n_jobs=-1, num_parallel_tree=None, predictor=None,
             random_state=None, ...)

In [27]:
preds = model.predict(X_val)

from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
mae = mean_absolute_error(y_val, preds)
mape = mean_absolute_percentage_error(y_val, preds)
print(mae, mape)

23.085978327131343 0.18441096578901764


# Test

In [28]:
df = pd.read_csv('p_test.csv')
dates = df['date'].unique().tolist()

df['date'] = pd.to_datetime(df['date']) # str -> datetime
df['doy'] = df['date'].dt.day_of_year
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.weekday
# datetime -> str
df['date'] = df['date'].dt.date.astype(str)
df
hour_data = []
date_data = []  # 날짜(date)를 저장할 리스트
years = []  # 연도(year)를 저장할 리스트
for date in dates:
    # 2013-01-02
    data = df[df['date'] == date]['y'].tolist()
    hour_data.append(data)
    date_data.append(date)
    
hour_df = pd.DataFrame(data=hour_data, columns= [f'{i}H' for i in range(15)])
date_series = pd.Series(date_data, name='Date')  # 날짜를 시리즈로 변환
hour_df.insert(loc=0, column='Date', value=date_series)  # 'Date' 컬럼 추가
# df , hour_df
df = df.drop_duplicates('date', keep = 'first').reset_index(drop=True)
df = pd.concat([df, hour_df], axis='columns')
df

Unnamed: 0.1,Unnamed: 0,date,hour,y,doy,year,month,day,weekday,Date,0H,1H,2H,3H,4H,5H,6H,7H,8H,9H,10H,11H,12H,13H,14H
0,0,2022-07-01,0,150,182,2022,7,1,4,2022-07-01,150,136,128,90,100,104,75,67,66,52,57,43,60,51,67
1,15,2022-07-02,0,130,183,2022,7,2,5,2022-07-02,130,113,147,114,119,103,97,77,50,67,58,37,76,65,74
2,30,2022-07-03,0,145,184,2022,7,3,6,2022-07-03,145,128,129,123,125,101,80,67,94,34,58,48,91,83,92
3,45,2022-07-04,0,149,185,2022,7,4,0,2022-07-04,149,168,130,85,86,90,65,39,54,26,39,40,53,54,110
4,60,2022-07-05,0,138,186,2022,7,5,1,2022-07-05,138,140,95,124,107,75,66,65,49,57,33,40,57,67,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,1530,2022-10-11,0,254,284,2022,10,11,1,2022-10-11,254,115,71,96,67,85,61,44,13,27,34,25,31,37,88
103,1545,2022-10-12,0,126,285,2022,10,12,2,2022-10-12,126,80,93,93,106,93,47,54,22,23,34,28,38,36,89
104,1560,2022-10-13,0,93,286,2022,10,13,3,2022-10-13,93,77,72,89,75,59,66,68,53,45,40,36,43,50,74
105,1575,2022-10-14,0,112,287,2022,10,14,4,2022-10-14,112,92,78,92,85,66,58,55,35,47,39,29,26,33,73


In [29]:
df.to_csv("pp_test.csv", index=False) 

In [30]:
test = pd.read_csv("pp_test.csv")
test

Unnamed: 0.1,Unnamed: 0,date,hour,y,doy,year,month,day,weekday,Date,0H,1H,2H,3H,4H,5H,6H,7H,8H,9H,10H,11H,12H,13H,14H
0,0,2022-07-01,0,150,182,2022,7,1,4,2022-07-01,150,136,128,90,100,104,75,67,66,52,57,43,60,51,67
1,15,2022-07-02,0,130,183,2022,7,2,5,2022-07-02,130,113,147,114,119,103,97,77,50,67,58,37,76,65,74
2,30,2022-07-03,0,145,184,2022,7,3,6,2022-07-03,145,128,129,123,125,101,80,67,94,34,58,48,91,83,92
3,45,2022-07-04,0,149,185,2022,7,4,0,2022-07-04,149,168,130,85,86,90,65,39,54,26,39,40,53,54,110
4,60,2022-07-05,0,138,186,2022,7,5,1,2022-07-05,138,140,95,124,107,75,66,65,49,57,33,40,57,67,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,1530,2022-10-11,0,254,284,2022,10,11,1,2022-10-11,254,115,71,96,67,85,61,44,13,27,34,25,31,37,88
103,1545,2022-10-12,0,126,285,2022,10,12,2,2022-10-12,126,80,93,93,106,93,47,54,22,23,34,28,38,36,89
104,1560,2022-10-13,0,93,286,2022,10,13,3,2022-10-13,93,77,72,89,75,59,66,68,53,45,40,36,43,50,74
105,1575,2022-10-14,0,112,287,2022,10,14,4,2022-10-14,112,92,78,92,85,66,58,55,35,47,39,29,26,33,73


In [31]:
# test 코드
# submission 파일 만들기
submission = pd.read_csv('sample_submission.csv')
test_preds = model.predict(test[X_column])

# test_preds
submission['y'] = test_preds.reshape(-1)
submission.to_csv('Final_submission.csv', index=False)