In [1]:
import torch, json
from math import floor
import pandas as pd
import numpy as np

from FileManager.dataManager import dataManager
from AnalyzeTools.models import autoregressive_integrated_moving_average, linear_regression, support_vector_regression, random_forest, gradient_boosting
from AnalyzeTools.prepare import data_split, model_eval, pathForSavingModels
from AnalyzeTools.superModels import DEEPAR, TFT, RNN
from AnalyzeTools.preprocess import preprocessData



  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 123


In [2]:
# Run configuration: data granularity, forecast horizon, where fitted models
# are stored, and the fraction of rows used for training.
period = 'Day'
future_step = 14
params_path = f'./Models/single/{period}_lead_{future_step}'
train_size = 0.8

# Read the experiment catalogue inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("./File information.json", "r", encoding='utf8') as info_file:
    product_object = json.load(info_file)

# One experiment per (product, raw file, product type, target) combination.
# The JSON nests product -> raw file name -> {'product_types': [...], 'targets': [...]}.
all_experiments = [
    [product, raw_file_name, product_type, target]
    for product, raw_files in product_object.items()
    for raw_file_name, file_info in raw_files.items()
    for product_type in file_info['product_types']
    for target in file_info['targets']
]

In [3]:
# Position in all_experiments of the experiment this run works on.
n = 0
# Unpack the chosen entry into its four fields while also keeping the whole entry.
product, raw_file_name, product_type, target = experiment = all_experiments[n]
print(f"Product: {product}\nRaw file name: {raw_file_name}\nProduct_type: {product_type}\ntarget: {target}")

Product: pork
Raw file name: (중)경략가격집계 - 소,돼지
Product_type: 돼지 온도체
target: MAX_COST_AMT


In [4]:
# Fetch the table for the selected experiment, together with the two extra
# values dataManager returns alongside it.
df, product_and_product_type, product_attribute = dataManager(
    raw_file_name, product, product_type, target
)

# Stop immediately when the loader produced no rows.
if not len(df):
    raise ValueError("No data!")


In [5]:
# Preprocess the loaded table for the chosen target. The recorded output of this
# cell shows the call printing feature scores, the top-K features and the final
# feature list.
# NOTE(review): presumably 'date' names the timestamp column and input_features
# is that final feature list -- confirm in AnalyzeTools.preprocess.
data, input_features = preprocessData(df, 'date', target)


-->Feature scores:
                 Features        Scores
6  DEFECT_MAX_COST_AMT  6.811574e+10
1         MIN_COST_AMT  1.222592e+04
5  DEFECT_MIN_COST_AMT  1.222443e+04
2         SUM_COST_AMT  4.538033e+02
7  DEFECT_SUM_COST_AMT  4.537454e+02
8    DEFECT_SUM_WEIGHT  2.279391e+01
3           SUM_WEIGHT  2.277102e+01
4           DEFECT_CNT  1.084715e+01
0                  CNT  1.083094e+01

-->TOP K features:
   ['MIN_COST_AMT', 'SUM_COST_AMT', 'DEFECT_MIN_COST_AMT', 'DEFECT_MAX_COST_AMT']

-->Final features:
  ['MIN_COST_AMT', 'SUM_COST_AMT']


In [12]:
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

def removeOutliers(dataframe, test_size, target_col, vis=True, **params):
    """Replace IsolationForest-flagged outliers in the training span of one column.

    The trailing ``test_size`` rows are held out: the detector is fitted only on
    the rows before them, and only those rows can be flagged. Flagged values of
    ``target_col`` are blanked and refilled by linear interpolation.

    Args:
        dataframe: pandas DataFrame whose rows are ordered training-first,
            hold-out last. It is not modified.
        test_size: A float is read as the fraction of rows to hold out (floored
            to a row count); any other value is read as a row count. ``0`` means
            every row belongs to the training span.
        target_col: Name of the column to inspect and repair.
        vis: When true, plot the training values split into normal / outlier.
        **params: Optional detector settings: ``n_estimators`` (default 100),
            ``contamination`` (default 0.03) and ``random_state`` (default
            ``None``, i.e. unseeded; pass an int for reproducible flags).

    Returns:
        A copy of ``dataframe`` in which flagged ``target_col`` values have been
        replaced by interpolated ones.

    Raises:
        ValueError: If ``test_size`` is negative or leaves no training rows.
    """
    data = dataframe.copy()

    # isinstance (not type(...) ==) so float subclasses such as np.float64 are
    # also treated as a fraction.
    if isinstance(test_size, float):
        test_size = floor(len(data) * test_size)
    if test_size < 0 or test_size >= len(data):
        raise ValueError(
            "test_size must leave at least one training row "
            "(got {} for {} rows)".format(test_size, len(data))
        )

    # Slice by explicit length: data[:-test_size] would be empty for test_size == 0.
    n_train = len(data) - test_size
    training_data = data.iloc[:n_train]

    # Falsy values (missing, None, 0) fall back to the defaults.
    n_estimators = params.get('n_estimators') or 100
    contamination = params.get('contamination') or 0.03
    random_state = params.get('random_state')

    iforest = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        max_samples='auto',
        random_state=random_state,
    )
    # fit_predict labels each training row: -1 for outlier, 1 for normal.
    prediction = np.asarray(iforest.fit_predict(training_data[[target_col]]))
    is_outlier = prediction == -1

    # Count the labels; summing the -1 labels would report a negative number.
    print("Number of outliers detected: {}".format(int(is_outlier.sum())))
    print("Number of normal samples detected: {}".format(int((prediction == 1).sum())))

    if vis:
        values = training_data[target_col].to_numpy(dtype=float)
        plt.figure(figsize=(12,7))
        # NaN gaps break each line wherever the point belongs to the other series.
        plt.plot(np.where(is_outlier, np.nan, values), label='normal')
        plt.plot(np.where(is_outlier, values, np.nan), label='outlier')
        plt.legend()
        plt.show()

    # Positional mask over the full frame: only training rows can be flagged, and
    # duplicate index labels cannot blank unrelated rows.
    outlier_rows = np.zeros(len(data), dtype=bool)
    outlier_rows[:n_train] = is_outlier

    # NOTE: interpolation runs over the whole column, so an outlier at the very
    # end of the training span is filled using the first hold-out value.
    data[target_col] = (
        data[target_col]
        .mask(outlier_rows)
        .interpolate(method='linear', limit_direction='both')
    )

    return data
