# Data Pipeline

## 0. Load Required Libraries

In [1]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import date
import joblib
import os
import yaml
import src.util as util
from sklearn.model_selection import TimeSeriesSplit


## 1. Load Configuration File and Collect Raw Data

In [2]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Download adjusted closing prices for every ticker listed in the ticker spreadsheet.

    Parameters
    ----------
    config : dict
        Configuration mapping. The keys read here are:
        ``raw_dataset_dir`` (spreadsheet passed to ``pd.read_excel``; it must
        contain a ``Kode`` column), ``ticker_ext`` (suffix appended to each
        ``Kode`` value to build the yfinance symbol), ``start_date`` and
        ``interval_date`` (forwarded to ``yf.download``).

    Returns
    -------
    pd.DataFrame
        One column per ticker holding its ``Adj Close`` series, indexed by the
        union of the dates returned for the individual tickers. Tickers whose
        download returned no rows end up as all-NaN columns.

    Raises
    ------
    KeyError
        If none of the downloaded frames contains an ``Adj Close`` column.
    """
    # Load the ticker list and build the symbol names expected by yfinance
    stock_list = pd.read_excel(config['raw_dataset_dir'])
    stock_list['ticker.jk'] = stock_list['Kode'] + config['ticker_ext']
    ticker_list = stock_list['ticker.jk'].tolist()

    # Define the date range parameters; the end date is always "today"
    start_date = config['start_date']
    end_date = date.today()
    interval = config['interval_date']

    # Download stock data from yfinance, one request per ticker
    stock_data = {}
    for ticker in tqdm(ticker_list, desc='Downloading stock data'):
        stock_data[ticker] = yf.download(ticker, start=start_date, end=end_date, interval=interval, progress=False)

    # Concatenating a dict side by side yields MultiIndex columns: (ticker, price field)
    dataset = pd.concat(stock_data, axis=1)

    # Keep only the adjusted closing price. Select it by label rather than by
    # position: a positional slice silently returns the wrong field as soon as
    # one ticker frame does not have exactly six columns in the expected order.
    # NOTE(review): whether yfinance returns an 'Adj Close' column may depend on
    # the installed version / its auto_adjust default - confirm.
    is_adj_close = dataset.columns.get_level_values(1) == 'Adj Close'
    if not is_adj_close.any():
        raise KeyError("No 'Adj Close' column found in the downloaded stock data")
    dataset = dataset.loc[:, is_adj_close]

    # Drop the price-field level so the columns are just the ticker symbols
    dataset.columns = dataset.columns.droplevel(1)

    # return raw dataset
    return dataset

In [3]:
# Load the project configuration through the project's util helper, then download
# the raw price table. NOTE: this performs one download per ticker; the run
# recorded below took close to five minutes for 853 tickers.
config_data = util.load_config()
raw_dataset_ori = read_raw_data(config_data)

Downloading stock data:   4%|▎         | 31/853 [00:07<03:10,  4.31it/s]


1 Failed download:
- TRIL.JK: No timezone found, symbol may be delisted


Downloading stock data:  28%|██▊       | 235/853 [01:05<02:27,  4.19it/s]


1 Failed download:
- HDTX.JK: No timezone found, symbol may be delisted


Downloading stock data:  44%|████▎     | 372/853 [01:38<01:54,  4.22it/s]


1 Failed download:
- NIPS.JK: No timezone found, symbol may be delisted


Downloading stock data:  55%|█████▌    | 470/853 [02:09<01:42,  3.74it/s]


1 Failed download:
- SUGI.JK: No timezone found, symbol may be delisted


Downloading stock data:  57%|█████▋    | 486/853 [02:13<01:32,  3.99it/s]


1 Failed download:
- TRIO.JK: No timezone found, symbol may be delisted


Downloading stock data: 100%|██████████| 853/853 [04:50<00:00,  2.94it/s]


In [4]:
# Work on a copy so the downloaded frame (raw_dataset_ori) stays untouched and
# the cells below can be re-run without downloading the data again
raw_dataset = raw_dataset_ori.copy()
raw_dataset

Unnamed: 0_level_0,PACK.JK,VAST.JK,CHIP.JK,HALO.JK,KING.JK,PGEO.JK,FUTR.JK,HILL.JK,BDKR.JK,PTMP.JK,...,CBPE.JK,SUNI.JK,CBRE.JK,WINE.JK,BMBL.JK,PEVE.JK,LAJU.JK,FWCT.JK,NAYZ.JK,IRSX.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-02-08,146.0,101.0,176.0,117.0,,,,,,,...,148.0,296.0,90.0,372.0,68.0,204.0,173.0,112.0,73.0,101.0
2023-02-09,132.0,96.0,193.0,118.0,,,,,,,...,149.0,298.0,84.0,368.0,64.0,200.0,206.0,128.0,75.0,100.0
2023-02-10,124.0,120.0,212.0,131.0,,,,,,,...,147.0,306.0,88.0,344.0,65.0,200.0,244.0,130.0,72.0,93.0
2023-02-13,112.0,112.0,232.0,122.0,,,,,,,...,152.0,300.0,85.0,324.0,65.0,199.0,228.0,144.0,76.0,94.0
2023-02-14,102.0,108.0,230.0,119.0,,,,,,,...,149.0,312.0,80.0,304.0,65.0,208.0,214.0,134.0,81.0,95.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-03-23,,,,,,,,,,,...,,,,,,,,,,
2022-05-03,,,,,,,,,,,...,,,,,,,,,,
2022-05-04,,,,,,,,,,,...,,,,,,,,,,
2022-05-05,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Make sure the index is a DatetimeIndex and put the rows in chronological order
# (the concatenated download shown above is not sorted by date)
raw_dataset.index = pd.to_datetime(raw_dataset.index)
raw_dataset = raw_dataset.sort_index(ascending=True)

# Minimum share of non-NaN values a row / column needs in order to be kept
MIN_VALID_FRACTION = 0.01

# 1) Drop dates on which fewer than 1% of the tickers have a price.
#    `thresh` is documented as an integer count, so round the fraction up;
#    for integer counts `count >= x` and `count >= ceil(x)` are equivalent.
min_prices_per_date = int(np.ceil(MIN_VALID_FRACTION * len(raw_dataset.columns)))
raw_dataset = raw_dataset.dropna(axis=0, thresh=min_prices_per_date)

# 2) Drop tickers with a price on fewer than 1% of the remaining dates
min_prices_per_ticker = int(np.ceil(MIN_VALID_FRACTION * len(raw_dataset.index)))
raw_dataset = raw_dataset.dropna(axis=1, thresh=min_prices_per_ticker)

# 3) Drop every ticker that still has at least one missing price, so the
#    resulting table contains no NaN at all
raw_dataset = raw_dataset.dropna(axis=1, how='any')

# Resulting dimensions: (number of dates, number of tickers)
raw_dataset.shape



(313, 759)

In [6]:
# Count the columns (tickers) that still contain at least one NaN; 0 means the table is complete
raw_dataset.isna().any().sum()

0

In [7]:
# Preview the cleaned price table (rows: dates, columns: tickers)
raw_dataset

Unnamed: 0_level_0,TCID.JK,TELE.JK,TFCO.JK,TGKA.JK,TIFA.JK,TINS.JK,TIRA.JK,TIRT.JK,BHAT.JK,CASH.JK,...,TAYS.JK,WMPP.JK,RMKE.JK,OBMD.JK,AVIA.JK,IPPE.JK,NASI.JK,BSML.JK,DRMA.JK,ADMR.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,5375.0,121.0,665.0,6751.204590,585.0,1424.660645,446.0,113.0,995.0,278.0,...,178.0,155.0,226.0,126.0,904.187500,208.0,236.0,181.796310,531.287964,135.0
2022-01-04,5325.0,121.0,650.0,6775.315918,635.0,1414.968994,500.0,110.0,1000.0,268.0,...,200.0,155.0,228.0,123.0,899.299988,206.0,256.0,192.783997,614.149414,182.0
2022-01-05,5300.0,121.0,690.0,6775.315918,640.0,1381.048584,466.0,118.0,995.0,260.0,...,200.0,152.0,224.0,123.0,889.524963,202.0,298.0,245.724670,643.394653,244.0
2022-01-06,5425.0,121.0,690.0,6775.315918,635.0,1371.356934,446.0,112.0,990.0,238.0,...,198.0,150.0,226.0,122.0,894.412476,199.0,312.0,237.733627,648.268799,304.0
2022-01-07,5425.0,121.0,670.0,6775.315918,625.0,1366.511230,436.0,118.0,1000.0,248.0,...,206.0,147.0,228.0,124.0,894.412476,198.0,356.0,277.688873,692.136658,380.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-03,6175.0,50.0,595.0,6875.000000,408.0,1025.000000,404.0,50.0,760.0,70.0,...,155.0,50.0,770.0,174.0,605.000000,50.0,61.0,240.000000,810.000000,1235.0
2023-04-04,6175.0,50.0,595.0,6600.000000,410.0,1010.000000,384.0,50.0,750.0,72.0,...,208.0,50.0,760.0,167.0,595.000000,50.0,60.0,230.000000,830.000000,1220.0
2023-04-05,6175.0,50.0,595.0,6600.000000,410.0,1020.000000,408.0,50.0,745.0,71.0,...,228.0,50.0,745.0,165.0,605.000000,50.0,60.0,214.000000,855.000000,1180.0
2023-04-06,6075.0,50.0,595.0,6675.000000,428.0,1000.000000,418.0,50.0,750.0,74.0,...,222.0,50.0,730.0,164.0,620.000000,50.0,76.0,200.000000,850.000000,1175.0


In [8]:
# Save the dataset to the path configured under 'raw_dataset_path'.
# Note: at this point the frame has already been date-sorted and had its
# NaN rows/columns dropped by the cells above.
util.pickle_dump(raw_dataset, config_data['raw_dataset_path'])

## 2. Data Definition

## 3. Data Validation

### 3.1 Data type

In [9]:
# Check the data type of each column
raw_dataset.dtypes

TCID.JK    float64
TELE.JK    float64
TFCO.JK    float64
TGKA.JK    float64
TIFA.JK    float64
            ...   
IPPE.JK    float64
NASI.JK    float64
BSML.JK    float64
DRMA.JK    float64
ADMR.JK    float64
Length: 759, dtype: object

In [10]:
# Since there are a lot of columns, summarise the dtypes instead of listing them one by one.
# `DataFrame.dtypes` covers the columns only (the datetime index is not included),
# so the summary below shows a single dtype: float64.
raw_dataset.dtypes.value_counts()

float64    759
dtype: int64

### 3.2 Data Range

In [11]:
# Summary statistics per ticker (count, mean, std, min, quartiles, max)
raw_dataset.describe()

Unnamed: 0,TCID.JK,TELE.JK,TFCO.JK,TGKA.JK,TIFA.JK,TINS.JK,TIRA.JK,TIRT.JK,BHAT.JK,CASH.JK,...,TAYS.JK,WMPP.JK,RMKE.JK,OBMD.JK,AVIA.JK,IPPE.JK,NASI.JK,BSML.JK,DRMA.JK,ADMR.JK
count,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0,...,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0,313.0
mean,5678.578275,73.41853,685.623003,7110.151275,500.773163,1413.651792,409.738019,71.402556,925.095847,161.980831,...,316.338658,121.591054,711.859425,179.325879,756.96693,221.118211,205.846645,698.457738,645.340709,1669.086262
std,604.2105,32.900889,51.514048,316.702471,59.867613,230.529563,19.752192,26.926138,85.344271,55.266252,...,171.485965,31.983853,229.738883,78.312342,90.639679,115.778644,122.997376,363.867557,75.7596,448.925464
min,4720.0,50.0,550.0,6500.0,388.0,955.0,328.0,50.0,745.0,59.0,...,150.0,50.0,224.0,94.0,570.0,50.0,56.0,181.79631,492.294373,135.0
25%,5175.0,50.0,650.0,6895.873535,478.0,1250.0,402.0,50.0,845.0,132.0,...,195.0,105.0,446.0,121.0,694.025024,158.0,103.0,390.0,585.0,1490.0
50%,5500.0,50.0,690.0,7100.0,500.0,1375.0,412.0,56.0,975.0,161.0,...,216.0,128.0,745.0,165.0,770.25,212.0,154.0,589.339661,625.0,1685.0
75%,6175.0,121.0,710.0,7250.0,540.0,1525.0,422.0,96.0,995.0,184.0,...,462.0,140.0,920.0,194.0,821.099976,290.0,314.0,963.919983,680.0,1810.0
max,7200.0,121.0,795.0,8390.783203,640.0,1986.771606,500.0,152.0,1080.0,300.0,...,745.0,197.0,1130.0,458.0,938.125,600.0,540.0,1802.979858,895.0,2990.0


In [12]:
# Check data statistics; since there are a lot of columns, the describe() output is condensed by the following function
def raw_dataset_describe(dataset: pd.DataFrame) -> pd.DataFrame:
    """Condense ``dataset.describe()`` into one row per statistic.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame whose index holds the dates and whose columns are described with
        ``DataFrame.describe()``.

    Returns
    -------
    pd.DataFrame
        Indexed by statistic name, with three columns: the statistics of the
        index itself (named after the index), ``Min`` (smallest value of that
        statistic across all columns) and ``Max`` (largest value of that
        statistic across all columns).
    """
    # Statistics of the date index. `datetime_is_numeric` only exists in
    # pandas 1.1-1.5; from pandas 2.0 it was removed (passing it raises
    # TypeError) and datetimes are always described numerically. The former
    # `include=` argument is ignored for a Series, so it is not passed.
    date_series = pd.Series(dataset.index)
    try:
        date_stats = date_series.describe(datetime_is_numeric=True)
    except TypeError:
        date_stats = date_series.describe()

    column_stats = dataset.describe()

    # For each statistic (row of describe()), the extreme values across all columns
    row_min_df = column_stats.min(axis=1).to_frame('Min')
    row_max_df = column_stats.max(axis=1).to_frame('Max')

    # Align the three pieces side by side on the statistic name
    return pd.concat([date_stats, row_min_df, row_max_df], axis=1)

In [13]:
# Show the condensed dataset statistics (index statistics plus the min/max of each describe() row)
raw_dataset_describe(raw_dataset)

Unnamed: 0,Date,Min,Max
count,313,313.0,313.0
mean,2022-08-23 17:47:20.894568704,32.57508,38689.936102
min,2022-01-03 00:00:00,19.612309,31975.0
25%,2022-04-27 00:00:00,28.0,36800.0
50%,2022-08-29 00:00:00,31.0,38000.0
75%,2022-12-15 00:00:00,37.0,40500.0
max,2023-04-10 00:00:00,46.0,167198.484375
std,,0.0,11266.191496


### 3.3 Data Dimension

In [14]:
# Dimensions of the dataset: (number of dates, number of tickers)
raw_dataset.shape

(313, 759)

## 4. Data Defense

In [15]:
def check_data(input_data, params, print_errors=True):
    """Validate the price table and return the columns that fail validation.

    Checks performed:
    * the index dtype equals ``params['datetime_index']``;
    * every column has dtype ``float64``;
    * every value of every column is ``>= 0`` (a NaN fails this check).

    Parameters
    ----------
    input_data : pd.DataFrame
        Table to validate, one column per ticker.
    params : dict
        Configuration mapping; only ``params['datetime_index']`` (the expected
        index dtype) is read.
    print_errors : bool, default True
        When True, print the collected error report.

    Returns
    -------
    list
        Names of the columns that failed at least one column check, in column
        order and without duplicates. An index dtype problem is reported in the
        printed output only; it does not add entries to this list.
    """
    report = []
    column_errors = []
    error_stock_tickers = []

    # Check the index data type. A failure is recorded but no longer stops the
    # column checks, so a bad index cannot make bad columns look clean.
    if input_data.index.dtype != params['datetime_index']:
        report.append('an error occurs in index format, should be datetime.')

    # Check the data type and value range of every column
    for column in input_data.columns:
        values = input_data[column]

        if values.dtype != 'float64':
            column_errors.append(f"Column ({column}) has a non-float data type")
            error_stock_tickers.append(column)

        try:
            all_non_negative = (values >= 0).sum() == len(input_data)
        except TypeError:
            # Non-numeric content cannot be compared with 0: treat as out of range
            all_non_negative = False

        if not all_non_negative:
            column_errors.append(f'an error occurs in {column} column')
            if column not in error_stock_tickers:
                error_stock_tickers.append(column)

    if column_errors:
        total_errors = len(column_errors)
        error_summary = f"\nTotal errors: {total_errors} errors out of {len(input_data.columns)}\n"
        report.append(error_summary + "\n".join(column_errors))

    if print_errors and report:
        print("\n".join(report))

    return error_stock_tickers



In [16]:
# List the tickers flagged by the validation (an empty list means no column was flagged)
check_data(raw_dataset, config_data)

[]

In [17]:
# Tickers flagged by the validation are treated as anomalies and removed.
# Example noted by the author: SCPI.JK was flagged and, after checking news and
# yfinance data, turned out to be delisted since 2013.
# NOTE(review): in the run recorded above the check returned an empty list, so
# the SCPI.JK remark presumably comes from an earlier run and nothing is dropped here.
error_stock_tickers = check_data(raw_dataset, config_data, print_errors=False)
raw_dataset.drop(error_stock_tickers, axis=1, inplace=True)

In [18]:
# Re-run the validation after the removal; an empty list means no column is flagged any more
check_data(raw_dataset, config_data)


[]

In [19]:
# Anomaly handling: overwrite known-bad prices with the actual value taken from another source.
# Each fix is written with a single .loc[row, column] assignment. The chained form
# raw_dataset[col].loc[date] = value assigns into an intermediate Series and is not
# guaranteed to update raw_dataset (SettingWithCopyWarning; no effect under Copy-on-Write).
price_corrections = [
    ('2023-03-30', 'BMRI.JK', 5112.0),
    ('2022-06-14', 'MYOR.JK', 1602.730957),
]

for anomaly_date, ticker, actual_price in price_corrections:
    # .loc would silently create a missing column, so fail loudly instead
    if ticker not in raw_dataset.columns:
        raise KeyError(ticker)

    anomaly_ts = pd.Timestamp(anomaly_date)
    # .loc would silently append a missing date as a new row, so skip it instead
    if anomaly_ts not in raw_dataset.index:
        print(f"Skipping correction for {ticker}: {anomaly_date} is not in the dataset")
        continue

    raw_dataset.loc[anomaly_ts, ticker] = actual_price


In [20]:
# Save the validated and corrected dataset to the path configured under 'clean_dataset_path'
util.pickle_dump(raw_dataset, config_data["clean_dataset_path"])


## 5. Data Splitting

### 5.1 Time Series Split

In [21]:
# Build the time-ordered splitter; only its final fold is used below
n_splits = 3
tscv = TimeSeriesSplit(n_splits = n_splits)

# Take the index arrays of the last fold: everything before the held-out tail
# becomes the training set, the tail is shared by validation and test
train_index, test_val_index = list(tscv.split(raw_dataset))[-1]

# Halve the held-out tail (integer division; an odd leftover row goes to the test set)
test_val_size = len(test_val_index)
test_size = val_size = test_val_size // 2

# Earlier half -> validation, later half -> test
val_index, test_index = test_val_index[:test_size], test_val_index[test_size:]

# Materialise the three subsets by position
train, val, test = (
    raw_dataset.iloc[positions]
    for positions in (train_index, val_index, test_index)
)





In [22]:
# Split feature and target columns for the train, validation and test sets.
# The target is the single ticker named in the config; every other ticker is a feature.
target_column = config_data['target']
feature_columns = raw_dataset.drop(columns=[target_column]).columns

X_train, y_train = train[feature_columns], train[target_column]
X_test, y_test = test[feature_columns], test[target_column]
X_val, y_val = val[feature_columns], val[target_column]

# Show every subset under its heading. Displaying from inside a loop keeps the
# DisplayHandle returned by display(..., display_id=...) from being echoed as
# the cell's own output.
split_views = {
    "TRAIN Set:": {'X_train': X_train, 'y_train': y_train},
    "TEST Set:": {'X_test': X_test, 'y_test': y_test},
    "VALIDATION Set:": {'X_val': X_val, 'y_val': y_val},
}
for heading, subset in split_views.items():
    print(heading)
    for view_id, view in subset.items():
        display(view, display_id=view_id)





TRAIN Set:


Unnamed: 0_level_0,TCID.JK,TELE.JK,TFCO.JK,TGKA.JK,TIFA.JK,TINS.JK,TIRA.JK,TIRT.JK,BHAT.JK,CASH.JK,...,TAYS.JK,WMPP.JK,RMKE.JK,OBMD.JK,AVIA.JK,IPPE.JK,NASI.JK,BSML.JK,DRMA.JK,ADMR.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-03,5375.0,121.0,665.0,6751.204590,585.0,1424.660645,446.0,113.0,995.0,278.0,...,178.0,155.0,226.0,126.0,904.187500,208.0,236.0,181.796310,531.287964,135.0
2022-01-04,5325.0,121.0,650.0,6775.315918,635.0,1414.968994,500.0,110.0,1000.0,268.0,...,200.0,155.0,228.0,123.0,899.299988,206.0,256.0,192.783997,614.149414,182.0
2022-01-05,5300.0,121.0,690.0,6775.315918,640.0,1381.048584,466.0,118.0,995.0,260.0,...,200.0,152.0,224.0,123.0,889.524963,202.0,298.0,245.724670,643.394653,244.0
2022-01-06,5425.0,121.0,690.0,6775.315918,635.0,1371.356934,446.0,112.0,990.0,238.0,...,198.0,150.0,226.0,122.0,894.412476,199.0,312.0,237.733627,648.268799,304.0
2022-01-07,5425.0,121.0,670.0,6775.315918,625.0,1366.511230,436.0,118.0,1000.0,248.0,...,206.0,147.0,228.0,124.0,894.412476,198.0,356.0,277.688873,692.136658,380.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-09,6100.0,50.0,695.0,7200.000000,488.0,1165.000000,410.0,54.0,790.0,95.0,...,675.0,98.0,920.0,197.0,720.000000,164.0,100.0,436.000000,580.000000,1670.0
2022-12-12,6025.0,50.0,695.0,7200.000000,488.0,1175.000000,422.0,53.0,800.0,100.0,...,690.0,101.0,920.0,199.0,725.000000,165.0,95.0,422.000000,625.000000,1675.0
2022-12-13,6000.0,50.0,695.0,7100.000000,486.0,1175.000000,398.0,53.0,810.0,95.0,...,690.0,96.0,960.0,208.0,735.000000,164.0,95.0,434.000000,610.000000,1675.0
2022-12-14,6000.0,50.0,695.0,7050.000000,484.0,1185.000000,398.0,52.0,820.0,101.0,...,645.0,90.0,945.0,204.0,715.000000,164.0,95.0,432.000000,595.000000,1695.0


Date
2022-01-03    3201.196533
2022-01-04    3257.955322
2022-01-05    3189.844727
2022-01-06    3189.844727
2022-01-07    3201.196533
                 ...     
2022-12-09    4721.505859
2022-12-12    4733.398926
2022-12-13    4709.613281
2022-12-14    4733.398926
2022-12-15    4709.613281
Name: BMRI.JK, Length: 235, dtype: float64

TEST Set:


Unnamed: 0_level_0,TCID.JK,TELE.JK,TFCO.JK,TGKA.JK,TIFA.JK,TINS.JK,TIRA.JK,TIRT.JK,BHAT.JK,CASH.JK,...,TAYS.JK,WMPP.JK,RMKE.JK,OBMD.JK,AVIA.JK,IPPE.JK,NASI.JK,BSML.JK,DRMA.JK,ADMR.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-02-10,6700.0,50.0,640.0,7000.0,426.0,1210.0,410.0,50.0,795.0,82.0,...,292.0,96.0,800.0,171.0,630.0,51.0,96.0,350.0,595.0,1455.0
2023-02-13,6850.0,50.0,600.0,7150.0,408.0,1210.0,400.0,50.0,790.0,78.0,...,294.0,93.0,830.0,170.0,645.0,50.0,94.0,328.0,595.0,1500.0
2023-02-14,6700.0,50.0,600.0,7100.0,408.0,1210.0,408.0,50.0,790.0,79.0,...,274.0,93.0,820.0,170.0,650.0,58.0,94.0,328.0,590.0,1500.0
2023-02-15,6600.0,50.0,600.0,7050.0,408.0,1210.0,418.0,50.0,790.0,81.0,...,264.0,87.0,795.0,169.0,635.0,57.0,94.0,320.0,615.0,1495.0
2023-02-16,6475.0,50.0,600.0,7050.0,410.0,1205.0,418.0,50.0,795.0,82.0,...,262.0,87.0,795.0,169.0,640.0,54.0,92.0,346.0,625.0,1500.0
2023-02-17,6600.0,50.0,600.0,6950.0,406.0,1200.0,420.0,50.0,795.0,81.0,...,264.0,84.0,770.0,168.0,640.0,53.0,90.0,408.0,610.0,1450.0
2023-02-20,6550.0,50.0,610.0,7000.0,406.0,1200.0,396.0,50.0,800.0,82.0,...,260.0,79.0,725.0,172.0,640.0,52.0,90.0,380.0,610.0,1440.0
2023-02-21,6550.0,50.0,610.0,6950.0,412.0,1190.0,408.0,50.0,805.0,84.0,...,286.0,78.0,745.0,176.0,640.0,51.0,90.0,354.0,640.0,1420.0
2023-02-22,6550.0,50.0,610.0,6900.0,424.0,1185.0,408.0,50.0,795.0,89.0,...,286.0,75.0,750.0,190.0,625.0,50.0,90.0,330.0,625.0,1395.0
2023-02-23,6325.0,50.0,610.0,6950.0,424.0,1190.0,408.0,50.0,820.0,89.0,...,278.0,78.0,780.0,185.0,625.0,51.0,89.0,344.0,720.0,1395.0


Date
2023-02-10    4899.900391
2023-02-13    4923.686523
2023-02-14    4935.579590
2023-02-15    4864.221680
2023-02-16    4840.435547
2023-02-17    4899.900391
2023-02-20    4888.007812
2023-02-21    4852.328613
2023-02-22    4757.185059
2023-02-23    4792.863770
2023-02-24    4816.649902
2023-02-27    4828.542969
2023-02-28    4757.185059
2023-03-01    4804.756836
2023-03-02    4864.221680
2023-03-03    4780.970703
2023-03-06    4816.649902
2023-03-07    4840.435547
2023-03-08    4888.007812
2023-03-09    4971.258301
2023-03-10    4935.579590
2023-03-13    4923.686523
2023-03-14    4721.505859
2023-03-15    4780.970703
2023-03-16    4685.827148
2023-03-17    4804.756836
2023-03-20    4757.185059
2023-03-21    4995.044434
2023-03-24    5185.331543
2023-03-27    5100.000000
2023-03-28    5062.500000
2023-03-29    5175.000000
2023-03-30    5112.000000
2023-03-31    5162.500000
2023-04-03    5262.500000
2023-04-04    5200.000000
2023-04-05    5225.000000
2023-04-06    5175.000000
2023-04

VALIDATION Set:


Unnamed: 0_level_0,TCID.JK,TELE.JK,TFCO.JK,TGKA.JK,TIFA.JK,TINS.JK,TIRA.JK,TIRT.JK,BHAT.JK,CASH.JK,...,TAYS.JK,WMPP.JK,RMKE.JK,OBMD.JK,AVIA.JK,IPPE.JK,NASI.JK,BSML.JK,DRMA.JK,ADMR.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-12-16,6100.0,50.0,695.0,7050.0,450.0,1195.0,404.0,52.0,840.0,122.0,...,640.0,90.0,960.0,198.0,665.0,166.0,96.0,426.0,600.0,1725.0
2022-12-19,6025.0,50.0,695.0,7050.0,430.0,1185.0,400.0,52.0,850.0,134.0,...,745.0,84.0,930.0,194.0,645.0,171.0,108.0,418.0,600.0,1700.0
2022-12-20,6025.0,50.0,655.0,7075.0,430.0,1160.0,408.0,52.0,850.0,147.0,...,695.0,79.0,920.0,191.0,635.0,166.0,102.0,410.0,600.0,1645.0
2022-12-21,6025.0,50.0,655.0,7075.0,450.0,1160.0,406.0,50.0,870.0,150.0,...,650.0,82.0,940.0,189.0,645.0,159.0,124.0,400.0,560.0,1670.0
2022-12-22,6050.0,50.0,655.0,7050.0,450.0,1165.0,384.0,50.0,880.0,158.0,...,605.0,82.0,940.0,193.0,640.0,157.0,116.0,390.0,580.0,1720.0
2022-12-23,6025.0,50.0,655.0,7050.0,420.0,1165.0,404.0,50.0,890.0,155.0,...,565.0,83.0,935.0,194.0,630.0,157.0,114.0,364.0,580.0,1700.0
2022-12-26,6025.0,50.0,655.0,7050.0,408.0,1165.0,404.0,51.0,880.0,167.0,...,610.0,87.0,935.0,194.0,630.0,157.0,110.0,342.0,585.0,1660.0
2022-12-27,6275.0,50.0,695.0,7100.0,410.0,1170.0,404.0,51.0,870.0,167.0,...,570.0,87.0,945.0,189.0,635.0,154.0,109.0,334.0,585.0,1660.0
2022-12-28,6500.0,50.0,650.0,7100.0,422.0,1165.0,416.0,50.0,860.0,151.0,...,535.0,84.0,940.0,188.0,650.0,144.0,105.0,336.0,575.0,1685.0
2022-12-29,6250.0,50.0,650.0,7100.0,414.0,1165.0,414.0,51.0,865.0,136.0,...,500.0,79.0,940.0,188.0,640.0,134.0,101.0,334.0,585.0,1695.0


Date
2022-12-16    4804.756836
2022-12-19    4757.185059
2022-12-20    4780.970703
2022-12-21    4757.185059
2022-12-22    4733.398926
2022-12-23    4721.505859
2022-12-26    4780.970703
2022-12-27    4733.398926
2022-12-28    4745.291992
2022-12-29    4745.291992
2022-12-30    4721.505859
2023-01-02    4697.720215
2023-01-03    4733.398926
2023-01-04    4769.078125
2023-01-05    4673.934082
2023-01-06    4662.041504
2023-01-09    4626.362305
2023-01-10    4412.289062
2023-01-11    4257.680664
2023-01-12    4376.610352
2023-01-13    4364.717285
2023-01-16    4424.182129
2023-01-17    4638.255371
2023-01-18    4543.111816
2023-01-19    4638.255371
2023-01-20    4745.291992
2023-01-24    4709.613281
2023-01-25    4614.469238
2023-01-26    4673.934082
2023-01-27    4769.078125
2023-01-30    4733.398926
2023-01-31    4733.398926
2023-02-01    4614.469238
2023-02-02    4626.362305
2023-02-03    4721.505859
2023-02-06    4697.720215
2023-02-07    4840.435547
2023-02-08    4852.328613
2023-02

<DisplayHandle display_id=y_val>

In [23]:
# Save every subset; each config entry is indexed with [0] for the feature
# path and [1] for the target path
for path_key, features, target in (
    ("train_set_path", X_train, y_train),
    ("valid_set_path", X_val, y_val),
    ("test_set_path", X_test, y_test),
):
    util.pickle_dump(features, config_data[path_key][0])
    util.pickle_dump(target, config_data[path_key][1])