## Import & install


In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the CSV files read
# further down live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the forecasting dependencies into the notebook runtime.
# NOTE(review): neuralforecast is installed straight from the repository's default
# branch with no tag or commit pinned, so two runs may pick up different code —
# consider pinning a revision for reproducibility.
!pip install statsforecast s3fs fastparquet
!pip install git+https://github.com/Nixtla/neuralforecast.git

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import os
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from ray import tune

from neuralforecast import NeuralForecast
from neuralforecast.auto import AutoNHITS, AutoTFT
from neuralforecast.losses.pytorch import DistributionLoss
import pandas as pd
from statsforecast import StatsForecast as sf

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Hyperparameter Setting

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (a no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(42) # Fix the seed

# 전처리

In [None]:
# Load the competition tables from the mounted Drive folder.
# The 'ID' and '제품' columns are removed from the training table right after loading.
train_data = pd.read_csv('/content/drive/MyDrive/LG_AIMERS/train.csv')
train_data = train_data.drop(columns=['ID', '제품'])

keyword = pd.read_csv('/content/drive/MyDrive/LG_AIMERS/brand_keyword_cnt.csv')
product = pd.read_csv('/content/drive/MyDrive/LG_AIMERS/product_info.csv')
sales = pd.read_csv('/content/drive/MyDrive/LG_AIMERS/sales.csv')
submission = pd.read_csv('/content/drive/MyDrive/LG_AIMERS/sample_submission.csv')


## 최근 30일이 0인것 -> sub1

In [None]:
to_zero_index = train_data[train_data.iloc[:,-21:].mean(axis=1)==0].index

In [None]:
submission1 = submission.loc[to_zero_index]

## 마지막 7일 중 6일이 0 이고, 마지막 값만 존재하는경우 -> sub2

In [None]:
mean_30days_over_zero = list(set(submission.index) - set(submission1.index))

data1 = train_data.loc[mean_30days_over_zero]

condition = (data1.iloc[:, -7:-1].sum(axis=1) == 0) & (data1.iloc[:, -1] > 0)
filtered_data = data1[condition]

only_one_value_exist_index = filtered_data.index

real_value_last_one = train_data.loc[only_one_value_exist_index].iloc[:,-1]

In [None]:
submission2 = submission.loc[only_one_value_exist_index]
submission2.iloc[:,1:] = submission2.iloc[:,1:].apply(lambda _: real_value_last_one, axis=0)

# 최종 데이터

In [None]:
last_index = set(submission.index) - set(submission1.index) - set(submission2.index)

In [None]:
len(last_index)

In [None]:
data_2023 = train_data.loc[last_index]
ID_2023 = data_2023.index
data_2023.reset_index(drop=True,inplace=True)

In [None]:
ID_2023

In [None]:
data_2023

# 앞에서부터 0인 데이터를 제거하고, 이상치 제거와 rolling 을 통해서 데이터를 부드럽게 해주는 과정

In [None]:
# The first four columns are kept aside as per-series info; the rest are the values.
train_data_info = data_2023.iloc[:, :4]
train_data_values = data_2023.iloc[:, 4:]
# Transpose so every series becomes a column and every original column label a row.
values = train_data_values.transpose()

In [None]:
def hampel_filter(data, window_size, num_std_dev):
    """Replace outliers with the centred rolling median (Hampel-style filter).

    A point is treated as an outlier when its absolute deviation from the
    centred rolling median exceeds ``num_std_dev * 1.4826`` times the centred
    rolling median of those absolute deviations.

    Args:
        data: pandas object supporting ``rolling`` (rolling runs down the rows).
        window_size: Size of the centred rolling window.
        num_std_dev: Multiplier applied to the scaled median deviation.

    Returns:
        A copy of ``data`` with flagged points replaced by the rolling median;
        ``data`` itself is not modified.
    """
    def _centred_median(frame):
        return frame.rolling(window=window_size, center=True).median()

    rolling_median = _centred_median(data)
    abs_deviation = (data - rolling_median).abs()
    rolling_mad = _centred_median(abs_deviation)

    # Where the threshold is NaN the comparison is False, so those points are kept as-is.
    is_outlier = abs_deviation > (num_std_dev * 1.4826 * rolling_mad)

    cleaned = data.copy()
    cleaned[is_outlier] = rolling_median[is_outlier]
    return cleaned

In [None]:
window_size = 7
num_std_dev = 3
filtered_values = hampel_filter(values, window_size, num_std_dev)

In [None]:
filtered_values

In [None]:
df = filtered_values.T
df = df.T
df = df.rolling(window=7,min_periods=1).mean()

In [None]:
data = df

In [None]:
a = data.T

In [None]:
ID_2023

In [None]:
pd.DataFrame(ID_2023)

In [None]:
# Put the saved row labels next to the smoothed series, side by side (axis=1).
# NOTE(review): this relies on both frames sharing the same 0..n-1 index — confirm.
data1 = pd.concat([pd.DataFrame(ID_2023),a],axis=1)

In [None]:
# The label column arrives named 0; give it a readable name.
data1 = data1.rename(columns ={0 : 'ID'})

In [None]:
high_data = data1[data1.iloc[:,1:].mean(axis=1)>30]

In [None]:
low_data =data1.loc[set(data1.index)-set(high_data.index)]

In [None]:
low_data

In [None]:
high_data

# high_data

In [None]:
# Work on the "high" group in this section.
final_data1 = high_data

In [None]:
# Keep the IDs of this group; they are used to label the forecasts further down.
high_ID_list = final_data1['ID']

In [None]:
final_data1 = final_data1.set_index('ID')

In [None]:
final_data1

In [None]:
# Transpose so that each ID becomes a column.
data1 = final_data1.T

In [None]:
df = data1

# Convert to a long-format machine-learning dataset: one row per (row label, column)
# pair, ordered exactly as a nested row-then-column loop would produce them
# (every column for the first row label, then every column for the next one, ...).
n_dates, n_series = df.shape
ml_df = pd.DataFrame({
    'date': np.repeat(df.index.to_numpy(), n_series),
    'ID': np.tile(df.columns.to_numpy(), n_dates),
    'values': df.to_numpy().ravel(),
})

In [None]:
# Rename to the ds / unique_id / y column names that the frame carries into the fit call below.
ml_df.columns  = ['ds','unique_id','y']

In [None]:
ml_df

In [None]:
from neuralforecast.losses.pytorch import SMAPE
from neuralforecast.losses.pytorch import MAE


In [None]:
def to_dataset(df):
  """Convert a wide frame into a long frame with columns ds, unique_id and y.

  Each (row label, column label, cell value) triple of ``df`` becomes one output
  row. Rows are emitted in row-major order: every column for the first row
  label, then every column for the second, and so on.

  Args:
    df: Wide DataFrame; row labels become ``ds``, column labels ``unique_id``.

  Returns:
    DataFrame with columns ``['ds', 'unique_id', 'y']``; empty (but with those
    columns) when ``df`` has no cells.
  """
  n_rows, n_cols = df.shape
  # Vectorised equivalent of looping over rows and then over columns.
  ml_df = pd.DataFrame({
      'ds': np.repeat(df.index.to_numpy(), n_cols),
      'unique_id': np.tile(df.columns.to_numpy(), n_rows),
      'y': df.to_numpy().ravel(),
  })
  return ml_df

# Hyperparameter search space passed to AutoNHITS. Entries wrapped in tune.* are
# sampled per trial; plain values are passed through unchanged.
config_nhits = {
    # Length of the input window: one to five multiples of 21
    "input_size": tune.choice([21, 21 * 2, 21 * 3, 21 * 4, 21 * 5]),
    # Fixed five-element list of ones
    # NOTE(review): NHITS documents n_blocks as blocks per stack — confirm that
    # five entries match the number of stacks actually built.
    "n_blocks": [1] * 5,
    # Fixed: five repetitions of a [512, 512] hidden-layer spec
    "mlp_units": [[512, 512]] * 5,
    # MaxPooling kernel size
    "n_pool_kernel_size": tune.choice([
        [1] * 4, [2] * 4, [4] * 4,
        [1] * 8, [2] * 8, [4] * 8,
        [16, 8, 1],
        [1] * 16, [2] * 16, [4] * 16,
        [8, 4, 2, 1, 1],
    ]),
    # Interpolation expressivity ratios
    "n_freq_downsample": tune.choice([
        [8, 4, 2, 1, 1],
        [1, 1, 1, 1, 1],
    ]),
    # Initial learning rate, sampled log-uniformly
    "learning_rate": tune.loguniform(1e-4, 1e-2),
    # Scaler type: none (a tune.choice([MinMax]) alternative is left disabled)
    "scaler_type": None,
    # Max number of training iterations
    "max_steps": tune.choice([500]),
    # Number of series in batch
    "batch_size": tune.choice([32, 64, 256, 512, 1024]),
    # Number of windows in batch
    "windows_batch_size": tune.choice([32, 64, 96, 128, 256, 512]),
    # Random seed
    "random_seed": 42,
    # Compute validation every 50 steps
    "val_check_steps": 50,
}

In [None]:
# Quick check of whether PyTorch can see a GPU.
torch.cuda.is_available()


In [None]:
# Distinct series identifiers in the long-format frame.
ID_LIST = ml_df['unique_id'].unique()

In [None]:
# Parse the 'ds' column into datetimes.
ml_df['ds'] = pd.to_datetime(ml_df['ds'])

In [None]:
ml_df

In [None]:
# AutoNHITS tuner: horizon of 21, SMAPE loss, 100 sampled configurations
# drawn from config_nhits.
model = AutoNHITS(
    h=21,
    loss=SMAPE(),
    config=config_nhits,
    # search_alg=HyperOptSearch(),
    num_samples=100,
)

In [None]:
# Wrap the tuner in a NeuralForecast object with daily frequency and fit it on the
# long-format frame, passing val_size=21.
nf = NeuralForecast(models=[model], freq='D')
nf.fit(df=ml_df, val_size=21)

In [None]:
# Forecasts produced by the fitted object.
fcst_df = nf.predict()


In [None]:
fcst_df

In [None]:
# Configuration of the best tuning result for the first (only) model.
config1 = nf.models[0].results.get_best_result().config

In [None]:
config1

In [None]:
# Flatten the forecast frame and collect the series identifiers in the order they appear.
final_data = fcst_df.reset_index()
ids = final_data['unique_id'].unique()

In [None]:
# Build one single-row frame per series (its forecasts laid out under the submission's
# value columns) and concatenate them once, instead of re-copying a growing frame
# on every loop iteration.
forecast_columns = submission.columns[1:]
forecast_rows = []
for i in ids:
  k = final_data.loc[final_data['unique_id'] == i, ['AutoNHITS']].T
  k.columns = forecast_columns
  forecast_rows.append(k)
# pd.concat rejects an empty list, so fall back to an empty frame when there are no series.
p = pd.concat(forecast_rows, axis=0) if forecast_rows else pd.DataFrame()

In [None]:
p = p.reset_index(drop=True)

In [None]:
high_ID_list

In [None]:
p.index = high_ID_list

In [None]:
submission5 = p.reset_index()

# low_data

In [None]:
# Work on the "low" group in this section.
final_data1 = low_data

In [None]:
# NOTE: despite its name, high_ID_list holds the IDs of the low group from here on.
high_ID_list = final_data1['ID']

In [None]:
final_data1 = final_data1.set_index('ID')

In [None]:
final_data1

In [None]:
# Transpose so that each ID becomes a column.
data1 = final_data1.T

In [None]:
df = data1

# Convert to a long-format machine-learning dataset: one row per (row label, column)
# pair, ordered exactly as a nested row-then-column loop would produce them
# (every column for the first row label, then every column for the next one, ...).
n_dates, n_series = df.shape
ml_df = pd.DataFrame({
    'date': np.repeat(df.index.to_numpy(), n_series),
    'ID': np.tile(df.columns.to_numpy(), n_dates),
    'values': df.to_numpy().ravel(),
})

In [None]:
# Rename to the ds / unique_id / y column names that the frame carries into the fit call below.
ml_df.columns  = ['ds','unique_id','y']

In [None]:
ml_df

In [None]:
from neuralforecast.losses.pytorch import SMAPE
from neuralforecast.losses.pytorch import MAE


In [None]:
def to_dataset(df):
  """Convert a wide frame into a long frame with columns ds, unique_id and y.

  Each (row label, column label, cell value) triple of ``df`` becomes one output
  row. Rows are emitted in row-major order: every column for the first row
  label, then every column for the second, and so on.

  Args:
    df: Wide DataFrame; row labels become ``ds``, column labels ``unique_id``.

  Returns:
    DataFrame with columns ``['ds', 'unique_id', 'y']``; empty (but with those
    columns) when ``df`` has no cells.
  """
  n_rows, n_cols = df.shape
  # Vectorised equivalent of looping over rows and then over columns.
  ml_df = pd.DataFrame({
      'ds': np.repeat(df.index.to_numpy(), n_cols),
      'unique_id': np.tile(df.columns.to_numpy(), n_rows),
      'y': df.to_numpy().ravel(),
  })
  return ml_df

# Hyperparameter search space passed to AutoNHITS. Entries wrapped in tune.* are
# sampled per trial; plain values are passed through unchanged.
config_nhits = {
    # Length of the input window: one to five multiples of 21
    "input_size": tune.choice([21, 21 * 2, 21 * 3, 21 * 4, 21 * 5]),
    # Fixed five-element list of ones
    # NOTE(review): NHITS documents n_blocks as blocks per stack — confirm that
    # five entries match the number of stacks actually built.
    "n_blocks": [1] * 5,
    # Fixed: five repetitions of a [512, 512] hidden-layer spec
    "mlp_units": [[512, 512]] * 5,
    # MaxPooling kernel size
    "n_pool_kernel_size": tune.choice([
        [1] * 4, [2] * 4, [4] * 4,
        [1] * 8, [2] * 8, [4] * 8,
        [16, 8, 1],
        [1] * 16, [2] * 16, [4] * 16,
        [8, 4, 2, 1, 1],
    ]),
    # Interpolation expressivity ratios
    "n_freq_downsample": tune.choice([
        [8, 4, 2, 1, 1],
        [1, 1, 1, 1, 1],
    ]),
    # Initial learning rate, sampled log-uniformly
    "learning_rate": tune.loguniform(1e-4, 1e-2),
    # Scaler type: none (a tune.choice([MinMax]) alternative is left disabled)
    "scaler_type": None,
    # Max number of training iterations
    "max_steps": tune.choice([500]),
    # Number of series in batch
    "batch_size": tune.choice([32, 64, 256, 512, 1024]),
    # Number of windows in batch
    "windows_batch_size": tune.choice([32, 64, 96, 128, 256, 512]),
    # Random seed
    "random_seed": 42,
    # Compute validation every 50 steps
    "val_check_steps": 50,
}

In [None]:
# Quick check of whether PyTorch can see a GPU.
torch.cuda.is_available()


In [None]:
# Distinct series identifiers in the long-format frame.
ID_LIST = ml_df['unique_id'].unique()

In [None]:
# Parse the 'ds' column into datetimes.
ml_df['ds'] = pd.to_datetime(ml_df['ds'])

In [None]:
ml_df

In [None]:
# AutoNHITS tuner: horizon of 21, SMAPE loss, 100 sampled configurations
# drawn from config_nhits.
model = AutoNHITS(
    h=21,
    loss=SMAPE(),
    config=config_nhits,
    # search_alg=HyperOptSearch(),
    num_samples=100,
)

In [None]:
# Wrap the tuner in a NeuralForecast object with daily frequency and fit it on the
# long-format frame, passing val_size=21.
nf = NeuralForecast(models=[model], freq='D')
nf.fit(df=ml_df, val_size=21)

In [None]:
# Forecasts produced by the fitted object.
fcst_df = nf.predict()


In [None]:
# Configuration of the best tuning result for the first (only) model.
config2 = nf.models[0].results.get_best_result().config

In [None]:
config2

In [None]:
# Flatten the forecast frame and collect the series identifiers in the order they appear.
final_data = fcst_df.reset_index()
ids = final_data['unique_id'].unique()

In [None]:
# Build one single-row frame per series (its forecasts laid out under the submission's
# value columns) and concatenate them once, instead of re-copying a growing frame
# on every loop iteration.
forecast_columns = submission.columns[1:]
forecast_rows = []
for i in ids:
  k = final_data.loc[final_data['unique_id'] == i, ['AutoNHITS']].T
  k.columns = forecast_columns
  forecast_rows.append(k)
# pd.concat rejects an empty list, so fall back to an empty frame when there are no series.
p = pd.concat(forecast_rows, axis=0) if forecast_rows else pd.DataFrame()

In [None]:
p = p.reset_index(drop=True)

In [None]:
high_ID_list

In [None]:
p.index = high_ID_list

In [None]:
submission6 = p.reset_index()

In [None]:
# Stack the four partial submissions on top of each other.
final_data = pd.concat([submission1,submission2,submission5,submission6],axis=0)

In [None]:
# Replace exact zeros in every column after the first with 1.
# NOTE(review): this also turns the all-zero rows taken from submission1 into ones,
# and it runs before the rounding step further down — confirm both are intended.
final_data.iloc[:,1:] = final_data.iloc[:,1:].replace(0,1)

In [None]:
final_data

In [None]:
#final_data.iloc[:,1:] = np.round(final_data.iloc[:,1:],0).astype(int)

In [None]:
final_submission = final_data.sort_values(by='ID')

In [None]:
final_submission.iloc[:,1:] = np.round(final_submission.iloc[:,1:],0).astype(int)

In [None]:
final_submission = final_submission.reset_index(drop=True)

In [None]:
final_submission.to_csv('/content/drive/MyDrive/LG_AIMERS/nhits_단일.csv', index=False)