In [1]:
import torch, json
from math import floor
import pandas as pd
import numpy as np

from FileManager.dataManager import dataManager
from AnalyzeTools.models import autoregressive_integrated_moving_average, linear_regression, support_vector_regression, random_forest, gradient_boosting
from AnalyzeTools.prepare import data_split, model_eval, pathForSavingModels
from AnalyzeTools.superModels import DEEPAR, TFT, RNN



  from .autonotebook import tqdm as notebook_tqdm
Global seed set to 123


In [2]:
# --- Experiment configuration -------------------------------------------
period = 'Day'        # label used in the model path below
future_step = 14      # lead used in the model path below
params_path = f'./Models/single/{period}_lead_{future_step}'
train_size = 0.8      # fraction of rows used for training; the rest is the prediction window

# Load the experiment catalogue. The context manager guarantees the file
# handle is closed (the bare json.load(open(...)) form leaves it open).
with open("./File information.json", "r", encoding='utf8') as info_file:
    product_object = json.load(info_file)

# Flatten the catalogue into one [product, file, product_type, target] entry
# per combination of 'product_types' and 'targets' listed for each file.
all_experiments = []
for product, files in product_object.items():
    for raw_file_name, file_info in files.items():
        for product_type in file_info['product_types']:
            for target in file_info['targets']:
                all_experiments.append([product, raw_file_name, product_type, target])

all_experiments

[['pork', '(중)경략가격집계 - 소,돼지', '돼지 온도체', 'MAX_COST_AMT'],
 ['pork', '(중)경략가격집계 - 소,돼지', '돼지 온도체', 'MIN_COST_AMT'],
 ['pork', '(중)축산유통정보 - 소비자가격', 4304, 'DLPC'],
 ['pork', '(중)축산유통정보 - 소비자가격', 4402, 'DLPC'],
 ['pork', '축평원_돼지 삼겹살 소매가격', '삼겹살', '평균'],
 ['pork', '축평원_돼지 삼겹살 소매가격', '삼겹살', '최고'],
 ['pork', '축평원_돼지 삼겹살 소매가격', '삼겹살', '최저'],
 ['pork', '축평원_돼지수입 삼겹살 소매가격', '수입_돼지고기', '평균'],
 ['pork', '축평원_돼지수입 삼겹살 소매가격', '수입_돼지고기', '최고'],
 ['pork', '축평원_돼지수입 삼겹살 소매가격', '수입_돼지고기', '최저'],
 ['beef', '경략가격집계 - 소,돼지', '소', 'MAX_COST_AMT'],
 ['beef', '경략가격집계 - 소,돼지', '소', 'MIN_COST_AMT'],
 ['beef', '경략가격집계 - 소,돼지', '부분육(쇠고기)', 'MAX_COST_AMT'],
 ['beef', '경략가격집계 - 소,돼지', '부분육(쇠고기)', 'MIN_COST_AMT'],
 ['beef', '축산유통정보 - 소비자가격', 4301, 'DLPC'],
 ['beef', '축산유통정보 - 소비자가격', 4401, 'DLPC'],
 ['beef', '축평원_소 수입 소매가격', '미국산_갈비', '평균'],
 ['beef', '축평원_소 수입 소매가격', '미국산_갈비', '최고'],
 ['beef', '축평원_소 수입 소매가격', '미국산_갈비', '최저'],
 ['beef', '축평원_소 수입 소매가격', '호주산_갈비', '평균'],
 ['beef', '축평원_소 수입 소매가격', '호주산_갈비', '최고'],
 [

In [3]:
# Position of the experiment to run within all_experiments.
n = 0
# Unpack the chosen entry into its four components while also keeping the
# whole entry available as `experiment`.
product, raw_file_name, product_type, target = experiment = all_experiments[n]
print(f"Product: {product}\nRaw file name: {raw_file_name}\nProduct_type: {product_type}\ntarget: {target}")

Product: pork
Raw file name: (중)경략가격집계 - 소,돼지
Product_type: 돼지 온도체
target: MAX_COST_AMT


In [4]:
# Fetch the data for the selected experiment through the project's loader.
df, product_and_product_type, product_attribute = dataManager(
    raw_file_name, product, product_type, target
)

# Nothing can be modelled from an empty frame, so fail early.
if not len(df):
    raise ValueError("No data!")


In [5]:
# Display the loaded frame for a visual sanity check.
df

Unnamed: 0,date,JUDGE_KIND,CNT,MIN_COST_AMT,MAX_COST_AMT,SUM_COST_AMT,SUM_WEIGHT,STATUS,DEFECT_CNT,DEFECT_MIN_COST_AMT,DEFECT_MAX_COST_AMT,DEFECT_SUM_COST_AMT,DEFECT_SUM_WEIGHT,DIEOUT_CNT,DIEOUT_MIN_COST_AMT,DIEOUT_MAX_COST_AMT,DIEOUT_SUM_COST_AMT,DIEOUT_SUM_WEIGHT,ETC
0,2012-01-02,2.0,21.596386,4270.060241,5325.981928,9.613190e+06,1873.530120,,,,,,,,,,,,
1,2012-01-03,2.0,23.241573,4520.449438,5501.471910,1.057113e+07,2033.264045,,,,,,,,,,,,
2,2012-01-04,2.0,25.333333,4477.540984,5473.923497,1.126577e+07,2220.666667,,,,,,,,,,,,
3,2012-01-05,2.0,26.641618,4355.057803,5491.317919,1.196984e+07,2370.236994,,,,,,,,,,,,
4,2012-01-06,2.0,27.395604,4276.236264,5410.994505,1.185943e+07,2474.637363,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2968,2022-08-04,2.0,51.347826,3872.246377,6309.115942,2.499731e+07,4605.695652,,51.347826,3872.246377,6309.115942,2.499731e+07,4605.695652,,,,,,
2969,2022-08-05,2.0,38.772727,4239.212121,6330.590909,1.846640e+07,3458.590909,,38.772727,4239.212121,6330.590909,1.846640e+07,3458.590909,,,,,,
2970,2022-08-08,2.0,53.056338,4458.507042,6689.915493,2.865364e+07,4741.633803,,53.056338,4458.507042,6689.915493,2.865364e+07,4741.633803,,,,,,
2971,2022-08-09,2.0,54.619718,4071.070423,6714.225352,2.797866e+07,4787.901408,,54.619718,4071.070423,6714.225352,2.797866e+07,4787.901408,,,,,,


In [6]:
# Show MAX_COST_AMT (the target of the experiment selected with n = 0) next to
# the other cost columns, to spot columns that merely repeat it.
df[['MIN_COST_AMT', 'SUM_COST_AMT', 'DEFECT_MIN_COST_AMT', 'DEFECT_MAX_COST_AMT', 'MAX_COST_AMT']]

Unnamed: 0,MIN_COST_AMT,SUM_COST_AMT,DEFECT_MIN_COST_AMT,DEFECT_MAX_COST_AMT,MAX_COST_AMT
0,4270.060241,9.613190e+06,,,5325.981928
1,4520.449438,1.057113e+07,,,5501.471910
2,4477.540984,1.126577e+07,,,5473.923497
3,4355.057803,1.196984e+07,,,5491.317919
4,4276.236264,1.185943e+07,,,5410.994505
...,...,...,...,...,...
2968,3872.246377,2.499731e+07,3872.246377,6309.115942,6309.115942
2969,4239.212121,1.846640e+07,4239.212121,6330.590909,6330.590909
2970,4458.507042,2.865364e+07,4458.507042,6689.915493,6689.915493
2971,4071.070423,2.797866e+07,4071.070423,6714.225352,6714.225352


In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression
from termcolor import colored

def preprocessData(dataframe, time_col, target, prepared=False, fs=True, parse_date=True):
    """Clean a time-series frame and choose the input features for modelling.

    The input frame is never modified; all work happens on a copy.

    Steps:
      0. zeros in ``target`` are treated as missing and linearly interpolated
         in both directions;
      1. keep only the columns that have a defined correlation with the target;
      2. drop the leading rows in which any feature is missing;
      3. fill the remaining gaps in the features;
      4. keep the top-K features (``featureSelection``);
      5. drop features that mostly duplicate the target or one another;
      6. optionally add ``year`` / ``month`` / ``week`` / ``day`` columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``time_col`` (dates formatted as XXXX-XX-XX or
        XXXX/XX/XX) and ``target`` (e.g. a retail price).
    time_col : column label
        Name of the date column.
    target : column label
        Name of the column to predict.
    prepared : bool, default False
        When True, steps 1-3 and 5 are skipped and every column other than
        ``time_col`` and ``target`` is used as a feature.
    fs : bool, default True
        When False, step 4 (feature selection) is skipped.
    parse_date : bool, default True
        When True, step 6 is applied.

    Returns
    -------
    (pandas.DataFrame, list)
        The cleaned frame (time column, optional calendar columns, features,
        target - in that order) and the list of selected feature names.
    """
    df = dataframe.copy()

    # step 0: a zero target is treated as a missing observation
    df[target] = df[target].mask(df[target] == 0)
    df[target] = df[target].interpolate(method='linear', limit_direction='both')

    if prepared:
        # The frame is already cleaned: every remaining column is a feature.
        # (Previously `features` was left undefined on this path and the
        # function crashed with UnboundLocalError.)
        features = [col for col in df.columns if col != time_col and col != target]
    else:
        # step 1
        features = removeNoCorrCols(df, target)
        # Copy the column subset: the helpers below modify it in place, and
        # mutating a slice of another frame triggers SettingWithCopyWarning.
        df = df[[time_col] + features + [target]].copy()

        # step 2
        removeFirstNaRows(df, features)

        # step 3
        fillNa(df, features)

    # step 4
    if fs:
        features = featureSelection(df, features, target)

    # step 5
    if not prepared:
        features = filterSameCols(df, features, target)

    # Rows may have been dropped in step 2; renumber from 0.
    df = df.reset_index(drop=True)
    print(f"\n-->Final features:\n  {features}")

    # step 6: parse date
    if parse_date:
        dateParser(df, time_col)
        df = df[[time_col, 'year', 'month', 'week', 'day'] + features + [target]]
    else:
        df = df[[time_col] + features + [target]]

    return df, features

def removeNoCorrCols(dataframe, target):
    """Return the columns whose correlation with ``target`` is defined.

    Columns whose correlation with the target is NaN (for example constant
    or entirely empty columns) are left out, and so is the target itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing ``target`` as a numeric column.
    target : column label
        Column against which the correlations are computed.

    Returns
    -------
    list
        Feature column labels, in the frame's column order.

    Raises
    ------
    ValueError
        If no column at all (not even the target) has a defined correlation.
    """
    # Select numeric/boolean columns explicitly. Older pandas dropped other
    # columns (such as a string date column) silently inside corr(); pandas
    # >= 2.0 raises on them instead.
    numeric = dataframe.select_dtypes(include=['number', 'bool'])
    features = numeric.corr()[target].dropna().index.tolist()
    if not features:
        raise ValueError("No features correlated with the target!")
    features.remove(target)

    return features

def removeFirstNaRows(dataframe, features):
    """Drop, in place, the leading rows that have a missing feature value.

    Rows are scanned from the top; scanning stops at the first row in which
    every column of ``features`` is present. Gaps further down are kept.
    Returns None.
    """
    # One boolean per row: True when any feature is missing in that row.
    row_has_gap = dataframe[features].isnull().any(axis=1)

    leading_labels = []
    for label, has_gap in row_has_gap.items():
        if not has_gap:
            break
        leading_labels.append(label)

    dataframe.drop(leading_labels, axis=0, inplace=True)

def fillNa(dataframe, features):
    """Fill missing values in the ``features`` columns, in place.

    Gaps are filled by linear interpolation in the forward direction. Only
    the listed feature columns are touched; previously the whole frame was
    interpolated, which also reached non-numeric columns (such as the date
    column) and columns that are not features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify.
    features : list
        Labels of the columns to fill.

    Returns
    -------
    None
    """
    if dataframe[features].isnull().values.any():
        dataframe[features] = dataframe[features].interpolate(
            method='linear', limit_direction='forward'
        )

def featureSelection(dataframe, features, target, K=None):
    """Keep the K features with the highest univariate F-regression score.

    Uses scikit-learn's ``SelectKBest`` with ``f_regression`` and prints the
    score table and the selected features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``features`` and ``target`` columns.
    features : list
        Candidate feature column labels.
    target : column label
        Column the features are scored against.
    K : int, optional
        Number of features to keep. When omitted (or 0), half of the
        candidates are kept (rounded down) if there are more than two;
        with two or fewer candidates the input list is returned unchanged.

    Returns
    -------
    list
        Selected feature labels, in their original order. The labels are the
        same objects that were passed in: they are no longer routed through a
        numpy array, which produced ``np.str_`` items and turned mixed
        int/str labels into strings.
    """
    # use sklearn selectbest function
    if not K:
        if len(features) > 2:
            K = len(features) // 2
        else:
            print(colored("There are too few features in the data. The raw data features will be used.", 'yellow'))
            return features

    feature_selector = SelectKBest(score_func=f_regression, k=K)
    # Only the fitted scores and support mask are needed, not the reduced matrix.
    feature_selector.fit(dataframe[features].values, dataframe[target].values)

    score_table = pd.DataFrame(
        list(zip(features, feature_selector.scores_)),
        columns=['Features', 'Scores'],
    )
    print("\n-->Feature scores:\n  ", score_table.sort_values('Scores', ascending=False))

    k_best_features = [
        name for name, kept in zip(features, feature_selector.get_support()) if kept
    ]
    print("\n-->TOP K features:\n  ", k_best_features)

    return k_best_features

def filterSameCols(dataframe, features, target, shreshold=0.8):
    """Drop features that are largely identical to the target or to an earlier feature.

    Similarity between two columns is the fraction of rows in which their
    values compare equal. A feature is removed when its similarity to the
    target exceeds the threshold; otherwise every later feature whose
    similarity to it exceeds the threshold is removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``features`` and ``target`` columns.
    features : list
        Candidate feature column labels.
    target : column label
        Target column.
    shreshold : float, default 0.8
        Similarity above which a column counts as a duplicate. (The spelling
        is kept so existing keyword callers continue to work.)

    Returns
    -------
    list
        Surviving feature labels in their original order. With fewer than
        two features the input is returned unchanged. Labels keep their
        type: the result is no longer built through a numpy array, which
        turned mixed int/str labels into strings.
    """
    length = len(features)
    if length < 2:
        print("Too few features to filter!")
        return features

    n_rows = len(dataframe)
    keep = [True] * length
    for i, col1 in enumerate(features):
        output_sim = (dataframe[col1] == dataframe[target]).sum() / n_rows
        if output_sim > shreshold:
            # Near copy of the target: drop it and do not use it as a reference.
            keep[i] = False
            continue

        for j in range(i + 1, length):
            input_sim = (dataframe[col1] == dataframe[features[j]]).sum() / n_rows
            if input_sim > shreshold:
                keep[j] = False

    # Unwrap numpy scalar labels (e.g. np.str_) to plain Python values, as the
    # previous ndarray.tolist() based return did.
    return [
        col.item() if isinstance(col, np.generic) else col
        for col, kept in zip(features, keep)
        if kept
    ]

def dateParser(dataframe, time_col):
    """Convert ``time_col`` to datetimes and add calendar columns, in place.

    Adds ``year``, ``month``, ``week`` (ISO calendar week) and ``day``
    columns derived from ``time_col``. Returns None.
    """
    dataframe[time_col] = pd.to_datetime(dataframe[time_col])
    parts = dataframe[time_col].dt

    # Column order matters to callers: year, month, week, day.
    derived = (
        ('year', parts.year),
        ('month', parts.month),
        ('week', parts.isocalendar().week),
        ('day', parts.day),
    )
    for column_name, values in derived:
        dataframe[column_name] = values

In [20]:
# Run the preprocessing pipeline on the loaded frame for the current target.
data, input_features = preprocessData(df, 'date', target)
# Dates of the rows after the first `train_size` fraction; used as the x axis
# of the predictions.
predictions_x_axis = data['date'].iloc[floor(len(data) * train_size):].values


-->Feature scores:
                 Features        Scores
6  DEFECT_MAX_COST_AMT  6.811574e+10
1         MIN_COST_AMT  1.222592e+04
5  DEFECT_MIN_COST_AMT  1.222443e+04
2         SUM_COST_AMT  4.538033e+02
7  DEFECT_SUM_COST_AMT  4.537454e+02
8    DEFECT_SUM_WEIGHT  2.279391e+01
3           SUM_WEIGHT  2.277102e+01
4           DEFECT_CNT  1.084715e+01
0                  CNT  1.083094e+01

-->TOP K features:
   ['MIN_COST_AMT', 'SUM_COST_AMT', 'DEFECT_MIN_COST_AMT', 'DEFECT_MAX_COST_AMT']
######################## Test ############################
MAX_COST_AMT
[ True  True False False]
['MIN_COST_AMT' 'SUM_COST_AMT' 'DEFECT_MIN_COST_AMT' 'DEFECT_MAX_COST_AMT']
['MIN_COST_AMT', 'SUM_COST_AMT']

-->Final features:
  ['MIN_COST_AMT', 'SUM_COST_AMT']
