# Data Pipeline

## 0. Load Required Libraries

In [2]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import date
import joblib
import os
import yaml
import src.util as util
from sklearn.model_selection import TimeSeriesSplit


## 1. Load Configuration and Raw Data

In [3]:
def read_raw_data(config: dict) -> pd.DataFrame:
    """Download the adjusted closing price history of every listed ticker.

    Parameters
    ----------
    config : dict
        Pipeline configuration. The keys read here are:
        'raw_dataset_dir' (path of the Excel sheet holding a 'Kode' column),
        'ticker_ext' (suffix appended to each code to form the yfinance
        symbol), 'start_date' and 'interval_date'.

    Returns
    -------
    pd.DataFrame
        One column per ticker symbol containing that ticker's 'Adj Close'
        series. The row index is whatever yfinance returned; it is not
        sorted here.

    Raises
    ------
    KeyError
        If none of the downloaded frames exposes an 'Adj Close' column.
    """
    price_field = 'Adj Close'

    # Build the yfinance symbols from the exchange codes listed in the sheet
    stock_list = pd.read_excel(config['raw_dataset_dir'])
    ticker_list = (stock_list['Kode'] + config['ticker_ext']).tolist()

    # Define the date range parameters (the download always ends today)
    start_date = config['start_date']
    end_date = date.today()
    interval = config['interval_date']

    # Download one frame per ticker. auto_adjust=False is passed explicitly so
    # the unadjusted columns plus 'Adj Close' are requested regardless of the
    # installed yfinance default.
    stock_data = {}
    for ticker in tqdm(ticker_list, desc='Downloading stock data'):
        stock_data[ticker] = yf.download(
            ticker,
            start=start_date,
            end=end_date,
            interval=interval,
            progress=False,
            auto_adjust=False,
        )

    # Combine into one frame whose outer column level is the ticker symbol
    dataset = pd.concat(stock_data, axis=1)

    # Select the price field by label rather than by position: a positional
    # slice such as columns[4::6] silently picks the wrong field as soon as one
    # ticker's frame does not have exactly six columns in the expected order.
    if price_field not in dataset.columns.get_level_values(1):
        raise KeyError(
            f"'{price_field}' column not found in the downloaded data; "
            "check the column layout returned by yfinance"
        )
    dataset = dataset.xs(price_field, axis=1, level=1)

    # Keep only the ticker symbol as the column label
    if isinstance(dataset.columns, pd.MultiIndex):
        dataset.columns = dataset.columns.get_level_values(0)

    return dataset

In [4]:
# Load the pipeline settings through the project helper
# (presumably reads the project's config file; verify in src/util)
config_data = util.load_config()
# Download the raw price table; this is network-bound and takes about two minutes for 853 tickers
raw_dataset_ori = read_raw_data(config_data)

Downloading stock data:   4%|▍         | 32/853 [00:03<01:41,  8.12it/s]


1 Failed download:
- TRIL.JK: No timezone found, symbol may be delisted


Downloading stock data:  28%|██▊       | 236/853 [00:33<01:18,  7.86it/s]


1 Failed download:
- HDTX.JK: No timezone found, symbol may be delisted


Downloading stock data:  44%|████▎     | 373/853 [00:53<00:57,  8.39it/s]


1 Failed download:
- NIPS.JK: No timezone found, symbol may be delisted


Downloading stock data:  55%|█████▌    | 471/853 [01:08<00:50,  7.55it/s]


1 Failed download:
- SUGI.JK: No timezone found, symbol may be delisted


Downloading stock data:  57%|█████▋    | 487/853 [01:11<00:54,  6.77it/s]


1 Failed download:
- TRIO.JK: No timezone found, symbol may be delisted


Downloading stock data: 100%|██████████| 853/853 [01:55<00:00,  7.36it/s]


In [5]:
# Work on a copy so the downloaded frame (raw_dataset_ori) stays intact
# and can be reused later without downloading again
raw_dataset = raw_dataset_ori.copy()
raw_dataset

Unnamed: 0_level_0,PACK.JK,VAST.JK,CHIP.JK,HALO.JK,KING.JK,PGEO.JK,FUTR.JK,HILL.JK,BDKR.JK,PTMP.JK,...,CBPE.JK,SUNI.JK,CBRE.JK,WINE.JK,BMBL.JK,PEVE.JK,LAJU.JK,FWCT.JK,NAYZ.JK,IRSX.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-02-08,146.0,101.0,176.0,117.0,,,,,,,...,148.0,296.0,90.0,372.0,68.0,204.0,173.0,112.0,73.0,101.0
2023-02-09,132.0,96.0,193.0,118.0,,,,,,,...,149.0,298.0,84.0,368.0,64.0,200.0,206.0,128.0,75.0,100.0
2023-02-10,124.0,120.0,212.0,131.0,,,,,,,...,147.0,306.0,88.0,344.0,65.0,200.0,244.0,130.0,72.0,93.0
2023-02-13,112.0,112.0,232.0,122.0,,,,,,,...,152.0,300.0,85.0,324.0,65.0,199.0,228.0,144.0,76.0,94.0
2023-02-14,102.0,108.0,230.0,119.0,,,,,,,...,149.0,312.0,80.0,304.0,65.0,208.0,214.0,134.0,81.0,95.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-20,,,,,,,,,,,...,,,,,,,,,,
2022-05-03,,,,,,,,,,,...,,,,,,,,,,
2022-05-04,,,,,,,,,,,...,,,,,,,,,,
2022-05-05,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Convert the index to datetime and order the rows chronologically
raw_dataset.index = pd.to_datetime(raw_dataset.index)
raw_dataset = raw_dataset.sort_index(ascending=True)

# Drop rows that have values in fewer than 1% of the columns
min_values_per_row = 0.01 * len(raw_dataset.columns)
raw_dataset = raw_dataset.dropna(axis=0, thresh=min_values_per_row)

# Drop columns that have values in fewer than 1% of the remaining rows
min_values_per_column = 0.01 * len(raw_dataset.index)
raw_dataset = raw_dataset.dropna(axis=1, thresh=min_values_per_column)

# Finally keep only the columns without any missing value
raw_dataset = raw_dataset.dropna(axis=1, how='any')

# Show the resulting dimensions (rows, columns)
raw_dataset.shape



(1575, 103)

In [7]:
# Count the columns that still contain at least one NaN value
raw_dataset.isna().any().sum()

0

In [8]:
# Display the dataset after the NaN clean-up
raw_dataset

Unnamed: 0_level_0,TGKA.JK,TPIA.JK,AALI.JK,ABDA.JK,ADMG.JK,AHAP.JK,AISA.JK,AKRA.JK,ALMI.JK,AMFG.JK,...,STTP.JK,TBIG.JK,TMAS.JK,TRIS.JK,TRUS.JK,TSPC.JK,UNVR.JK,VRNA.JK,WAPO.JK,ZBRA.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-02,2331.057373,965.105957,14254.732422,6203.368164,126.0,104.017502,1945.0,954.749207,183.0,6454.588379,...,3160.301758,854.648682,271.667847,315.879578,192.0,1583.534912,6642.215332,148.192505,53.0,50.0
2017-01-03,2331.057373,965.105957,14106.024414,6203.368164,127.0,103.484077,1975.0,926.902283,183.0,6430.503906,...,3160.301758,858.081055,248.382034,315.879578,192.0,1563.439453,6646.494629,148.192505,53.0,50.0
2017-01-04,2331.057373,969.779663,13872.340820,6203.368164,130.0,106.684624,1950.0,942.814819,183.0,6358.250977,...,3160.301758,858.081055,232.858154,315.879578,192.0,1543.343750,6881.882324,148.192505,53.0,50.0
2017-01-05,2331.057373,974.453125,13893.583984,6203.368164,131.0,112.018852,2000.0,930.880554,183.0,6358.250977,...,3160.301758,858.081055,233.634354,315.879578,192.0,1543.343750,6941.798340,100.452637,52.0,50.0
2017-01-06,2771.684082,979.126892,13914.829102,6203.368164,137.0,109.885162,1990.0,938.836731,183.0,6406.419434,...,3160.301758,875.242737,239.843918,315.879578,192.0,1559.420288,6950.358887,105.425537,52.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-03,6875.000000,2320.000000,8300.000000,6475.000000,128.0,54.000000,150.0,1555.000000,252.0,4900.000000,...,7525.000000,2150.000000,2890.000000,248.000000,414.0,1385.000000,4290.000000,91.000000,134.0,482.0
2023-04-04,6600.000000,2280.000000,8275.000000,6475.000000,128.0,53.000000,147.0,1565.000000,246.0,4740.000000,...,7575.000000,2100.000000,3040.000000,238.000000,416.0,1390.000000,4290.000000,92.000000,135.0,480.0
2023-04-05,6600.000000,2290.000000,8200.000000,6475.000000,128.0,53.000000,145.0,1550.000000,242.0,4820.000000,...,7450.000000,2120.000000,2970.000000,232.000000,400.0,1390.000000,4250.000000,92.000000,138.0,478.0
2023-04-06,6675.000000,2290.000000,8225.000000,6475.000000,127.0,54.000000,146.0,1575.000000,244.0,4820.000000,...,7450.000000,2090.000000,2960.000000,230.000000,436.0,1390.000000,4250.000000,92.000000,166.0,478.0


In [9]:
# Save the raw dataset to the path configured under 'raw_dataset_path'
util.pickle_dump(raw_dataset, config_data['raw_dataset_path'])

## 2. Data Definition

## 3. Data Validation

### 3.1 Data type

In [10]:
# Check the data type of each variable
raw_dataset.dtypes

TGKA.JK    float64
TPIA.JK    float64
AALI.JK    float64
ABDA.JK    float64
ADMG.JK    float64
            ...   
TSPC.JK    float64
UNVR.JK    float64
VRNA.JK    float64
WAPO.JK    float64
ZBRA.JK    float64
Length: 103, dtype: object

In [11]:
# Since there are many columns, summarise the dtypes by count instead.
# Every column is float64; the datetime values live in the index,
# which .dtypes does not report.
raw_dataset.dtypes.value_counts()

float64    103
dtype: int64

### 3.2 Data Range

In [12]:
# Per-column summary statistics (count, mean, std, min, quartiles, max)
raw_dataset.describe()

Unnamed: 0,TGKA.JK,TPIA.JK,AALI.JK,ABDA.JK,ADMG.JK,AHAP.JK,AISA.JK,AKRA.JK,ALMI.JK,AMFG.JK,...,STTP.JK,TBIG.JK,TMAS.JK,TRIS.JK,TRUS.JK,TSPC.JK,UNVR.JK,VRNA.JK,WAPO.JK,ZBRA.JK
count,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,...,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0,1575.0
mean,4714.468968,1804.714171,10364.46253,6319.512038,210.276825,74.226615,428.363175,837.974538,299.194286,4707.902089,...,5721.353324,1593.707297,563.718496,228.193014,308.126349,1348.751584,6804.412444,114.041758,101.739683,254.263661
std,2286.706742,508.393686,1867.083245,584.907119,74.500852,19.41569,535.176894,271.960536,108.011631,1257.474393,...,2124.810008,834.819487,828.444479,57.812007,86.509762,144.717446,1748.025306,21.683638,43.252305,307.731194
min,1418.266113,965.105957,3983.26709,3099.111328,72.0,50.0,136.0,266.488281,165.0,2222.727295,...,3071.139648,555.341492,43.307297,100.710892,127.0,832.039429,3177.114014,50.0,51.0,50.0
25%,2162.954834,1314.592468,9218.847168,6113.463379,156.0,59.0,168.0,628.631378,228.0,3655.151611,...,3715.088379,967.180054,113.505714,192.990379,220.0,1259.508179,4806.386719,99.0,75.0,50.0
50%,4353.03418,1783.68457,10306.355469,6456.48291,189.0,68.0,176.0,769.433472,262.0,4840.605957,...,4557.175293,1158.237915,174.804016,237.329269,332.0,1356.647949,7418.30127,110.398438,89.0,50.0
75%,7000.0,2320.0,11886.562012,6700.0,250.0,88.392838,384.0,1024.683594,336.0,5725.0,...,7525.0,2350.0,276.907654,282.385437,360.0,1408.362427,8154.360352,124.0,113.0,545.0
max,9437.601562,2793.200684,14594.637695,7717.808105,406.0,148.0,2360.0,1615.0,765.0,8199.393555,...,12903.740234,3356.285645,3340.0,315.879578,468.0,1842.556274,9740.731445,230.0,298.0,1270.0


In [13]:
# Check data statistics, since the column qty is a lot, then we summarize the describe feature in following function
def raw_dataset_describe(dataset):
    """Condense `dataset.describe()` into one compact table.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame with a datetime index and numeric columns.

    Returns
    -------
    pd.DataFrame
        Indexed by statistic name (count, mean, std, min, quartiles, max) with
        three columns: the statistic of the date index (labelled with the
        index name), 'Min' (smallest value of that statistic across all
        columns) and 'Max' (largest value of that statistic across all
        columns). Statistics that do not exist for the index show NaN there.
    """
    # Summarise the date index itself
    index_values = pd.Series(dataset.index)
    try:
        # pandas < 2.0 needs this flag to describe datetimes like numbers
        date_df = index_values.describe(datetime_is_numeric=True)
    except TypeError:
        # pandas >= 2.0 removed the flag and always treats datetimes that way
        date_df = index_values.describe()

    # Per-column statistics: one row per statistic, one column per variable
    stats = dataset.describe()

    # Smallest and largest value of each statistic across all columns
    row_min_df = stats.min(axis=1).to_frame('Min')
    row_max_df = stats.max(axis=1).to_frame('Max')

    # Place the three summaries side by side, aligned on the statistic name
    return pd.concat([date_df, row_min_df, row_max_df], axis=1)

In [14]:
# Show the condensed statistics of the dataset (date index plus min/max of each statistic across columns)
raw_dataset_describe(raw_dataset)

Unnamed: 0,Date,Min,Max
count,1575,1575.0,1575.0
mean,2020-02-07 12:19:39.428571648,50.0,2898571.0
min,2017-01-02 00:00:00,20.0,6500.0
25%,2018-07-14 12:00:00,46.821411,13775.0
50%,2020-01-22 00:00:00,50.0,5000000.0
75%,2021-09-07 12:00:00,50.0,5000000.0
max,2023-04-10 00:00:00,50.0,5000000.0
std,,0.0,2468648.0


### 3.3 Data Dimension

In [15]:
# Dimensions of the dataset as (rows, columns)
raw_dataset.shape

(1575, 103)

## 4. Data Defense

In [16]:
def check_data(input_data, params, print_errors=True):
    """Validate a price table and return the columns that fail validation.

    Checks performed:
      * the index dtype equals ``params['datetime_index']``;
      * every column has dtype float64;
      * every value of every column is >= 0 (a NaN fails this check).

    Parameters
    ----------
    input_data : pd.DataFrame
        Table to validate, one column per ticker.
    params : dict
        Configuration; only the 'datetime_index' key (expected index dtype)
        is read here.
    print_errors : bool, default True
        When True, print the problems that were found.

    Returns
    -------
    list
        Labels of the columns with at least one failed check, in column
        order. An index dtype mismatch is reported (printed) but does not add
        anything to this list.
    """
    error_messages = []
    error_stock_tickers = []

    # Index check. A mismatch is reported but must not stop the column checks,
    # otherwise bad columns would go unnoticed whenever the index is wrong.
    if not (input_data.index.dtype == params['datetime_index']):
        if print_errors:
            print('an error occurs in index format, should be datetime.')

    # Column checks: data type and value range
    for column in input_data.columns:
        series = input_data[column]
        column_failed = False

        if series.dtype != 'float64':
            error_messages.append(f"Column ({column}) has a non-float data type")
            column_failed = True

        try:
            in_range = (series >= 0).sum() == len(series)
        except TypeError:
            # Values that cannot be compared with a number are out of range
            in_range = False
        if not in_range:
            error_messages.append(f'an error occurs in {column} column')
            column_failed = True

        if column_failed:
            error_stock_tickers.append(column)

    if error_messages and print_errors:
        total_errors = len(error_messages)
        error_summary = f"\nTotal errors: {total_errors} errors out of {len(input_data.columns)}\n"
        print(error_summary + "\n".join(error_messages))

    return error_stock_tickers



In [17]:
# List the tickers that fail the validation checks (an empty list means none)
check_data(raw_dataset, config_data)

[]

In [18]:
# Remove every ticker reported by the validation. Author's note from an earlier
# run: SCPI.JK was flagged and, after checking news and yfinance data, found to
# be delisted since 2013, so it is treated as an anomaly and removed.
error_stock_tickers = check_data(raw_dataset, config_data, print_errors=False)
raw_dataset = raw_dataset.drop(columns=error_stock_tickers)

In [19]:
# Re-run the validation; an empty list means no issue remains
check_data(raw_dataset, config_data)


[]

In [20]:
# Anomaly handling: overwrite known-bad prices with the actual value taken
# from another source.
# Each value is written with a single DataFrame.loc call. The chained form
# raw_dataset[col].loc[row] = value writes to an intermediate Series and is
# not guaranteed to reach raw_dataset (it never does under Copy-on-Write).
price_corrections = [
    ('BMRI.JK', '2023-03-30', 5112.0),
    ('MYOR.JK', '2022-06-14', 1602.730957),
]
for _ticker, _day, _price in price_corrections:
    _day = pd.Timestamp(_day)
    # Fail loudly instead of silently adding a new row or column
    if _ticker not in raw_dataset.columns or _day not in raw_dataset.index:
        raise KeyError(f"Cannot correct {_ticker} on {_day.date()}: not present in raw_dataset")
    raw_dataset.loc[_day, _ticker] = _price


In [21]:
# Save the validated dataset to the path configured under 'clean_dataset_path'
util.pickle_dump(raw_dataset, config_data["clean_dataset_path"])


## 5. Data Splitting

### 5.1 Time Series Split

In [22]:
# Build the chronological splitter
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Only the last fold is used: it has the longest training window, and its
# held-out block is shared between validation and test
train_index, test_val_index = list(tscv.split(raw_dataset))[-1]

# Half of the held-out block (rounded down) goes to validation
test_val_size = len(test_val_index)
test_size = val_size = test_val_size // 2

# The earlier part is the validation set, the remainder is the test set
val_index, test_index = test_val_index[:test_size], test_val_index[test_size:]

# Materialise the three subsets by position
train, val, test = (
    raw_dataset.iloc[positions]
    for positions in (train_index, val_index, test_index)
)





In [23]:
# Split feature and target columns for train, test, and validation sets.
# Features are every column except the configured target column.
# NOTE(review): y is taken from the same rows as X (no shift is applied here),
# whereas section 5.2 shifts the target by two rows - confirm which is intended.
feature_columns = raw_dataset.drop([config_data['target']], axis=1).columns
target_column = config_data['target']
X_train, y_train = train[feature_columns], train[target_column]
X_test, y_test = test[feature_columns], test[target_column]
X_val, y_val = val[feature_columns], val[target_column]

# Show each subset; display_id gives every output a stable identifier
print("TRAIN Set:")
display(X_train, display_id='X_train')
display(y_train, display_id='y_train')

print("TEST Set:")
display(X_test, display_id='X_test')
display(y_test, display_id='y_test')

print("VALIDATION Set:")
display(X_val, display_id='X_val')
# As the last expression of the cell, the returned DisplayHandle is echoed too
display(y_val, display_id='y_val')





TRAIN Set:


Unnamed: 0_level_0,TGKA.JK,TPIA.JK,AALI.JK,ABDA.JK,ADMG.JK,AHAP.JK,AISA.JK,AKRA.JK,ALMI.JK,AMFG.JK,...,STTP.JK,TBIG.JK,TMAS.JK,TRIS.JK,TRUS.JK,TSPC.JK,UNVR.JK,VRNA.JK,WAPO.JK,ZBRA.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-02,2331.057373,965.105957,14254.732422,6203.368164,126.0,104.017502,1945.0,954.749207,183.0,6454.588379,...,3160.301758,854.648682,271.667847,315.879578,192.0,1583.534912,6642.215332,148.192505,53.0,50.0
2017-01-03,2331.057373,965.105957,14106.024414,6203.368164,127.0,103.484077,1975.0,926.902283,183.0,6430.503906,...,3160.301758,858.081055,248.382034,315.879578,192.0,1563.439453,6646.494629,148.192505,53.0,50.0
2017-01-04,2331.057373,969.779663,13872.340820,6203.368164,130.0,106.684624,1950.0,942.814819,183.0,6358.250977,...,3160.301758,858.081055,232.858154,315.879578,192.0,1543.343750,6881.882324,148.192505,53.0,50.0
2017-01-05,2331.057373,974.453125,13893.583984,6203.368164,131.0,112.018852,2000.0,930.880554,183.0,6358.250977,...,3160.301758,858.081055,233.634354,315.879578,192.0,1543.343750,6941.798340,100.452637,52.0,50.0
2017-01-06,2771.684082,979.126892,13914.829102,6203.368164,137.0,109.885162,1990.0,938.836731,183.0,6406.419434,...,3160.301758,875.242737,239.843918,315.879578,192.0,1559.420288,6950.358887,105.425537,52.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-09-02,6909.985352,1898.643799,8005.515625,6854.794434,218.0,60.000000,208.0,742.435852,236.0,3872.484863,...,7225.000000,2971.300049,264.787109,110.840233,324.0,1356.647949,3962.521973,127.000000,106.0,820.0
2021-09-03,6909.985352,1886.152710,8219.948242,6854.794434,216.0,60.000000,210.0,740.507385,246.0,3872.484863,...,7025.000000,3030.528564,264.787109,110.840233,312.0,1356.647949,4048.456543,128.000000,102.0,840.0
2021-09-06,6838.005859,1848.679565,8243.774414,6854.794434,216.0,60.000000,212.0,750.149475,250.0,3872.484863,...,7025.000000,3079.885742,264.787109,107.871300,314.0,1351.969849,4058.004639,129.000000,102.0,830.0
2021-09-07,6790.020020,1779.978638,8124.645508,6854.794434,216.0,60.000000,208.0,750.149475,240.0,3803.333496,...,6950.000000,3000.914307,264.787109,108.860947,316.0,1370.682251,4058.004639,127.000000,102.0,820.0


Date
2017-01-02    2120.893799
2017-01-03    2070.505371
2017-01-04    2079.666992
2017-01-05    2061.343750
2017-01-06    2084.248047
                 ...     
2021-09-02    2747.125977
2021-09-03    2781.181396
2021-09-06    2815.236572
2021-09-07    2837.940186
2021-09-08    2837.940186
Name: BMRI.JK, Length: 1182, dtype: float64

TEST Set:


Unnamed: 0_level_0,TGKA.JK,TPIA.JK,AALI.JK,ABDA.JK,ADMG.JK,AHAP.JK,AISA.JK,AKRA.JK,ALMI.JK,AMFG.JK,...,STTP.JK,TBIG.JK,TMAS.JK,TRIS.JK,TRUS.JK,TSPC.JK,UNVR.JK,VRNA.JK,WAPO.JK,ZBRA.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-01,7300.0,2187.50,9549.969727,5500.0,170.0,55.0,140.0,941.322937,290.0,5825.0,...,8400.0,2910.0,2463.699951,198.0,424.0,1375.000000,4759.091309,104.0,96.0,565.0
2022-07-04,7250.0,2187.50,9228.338867,5500.0,165.0,56.0,138.0,882.490295,298.0,5625.0,...,8250.0,2950.0,2346.844727,187.0,430.0,1365.178589,4631.000000,101.0,129.0,570.0
2022-07-05,7250.0,2162.50,9648.932617,5500.0,171.0,55.0,139.0,916.809326,304.0,5600.0,...,8250.0,2930.0,2502.651855,182.0,404.0,1360.267822,4690.119141,105.0,120.0,570.0
2022-07-06,7250.0,2143.75,9302.561523,5500.0,167.0,54.0,136.0,931.517517,292.0,5625.0,...,8250.0,2970.0,2502.651855,173.0,400.0,1355.357178,4709.825684,102.0,113.0,570.0
2022-07-07,7250.0,2175.00,9302.561523,6050.0,170.0,54.0,138.0,941.322937,292.0,5675.0,...,8250.0,2990.0,2541.603516,174.0,414.0,1355.357178,4926.595703,104.0,113.0,565.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-04-03,6875.0,2320.00,8300.000000,6475.0,128.0,54.0,150.0,1555.000000,252.0,4900.0,...,7525.0,2150.0,2890.000000,248.0,414.0,1385.000000,4290.000000,91.0,134.0,482.0
2023-04-04,6600.0,2280.00,8275.000000,6475.0,128.0,53.0,147.0,1565.000000,246.0,4740.0,...,7575.0,2100.0,3040.000000,238.0,416.0,1390.000000,4290.000000,92.0,135.0,480.0
2023-04-05,6600.0,2290.00,8200.000000,6475.0,128.0,53.0,145.0,1550.000000,242.0,4820.0,...,7450.0,2120.0,2970.000000,232.0,400.0,1390.000000,4250.000000,92.0,138.0,478.0
2023-04-06,6675.0,2290.00,8225.000000,6475.0,127.0,54.0,146.0,1575.000000,244.0,4820.0,...,7450.0,2090.0,2960.000000,230.0,436.0,1390.000000,4250.000000,92.0,166.0,478.0


Date
2022-07-01    3615.460693
2022-07-04    3520.316895
2022-07-05    3639.246582
2022-07-06    3579.781738
2022-07-07    3532.209961
                 ...     
2023-04-03    5262.500000
2023-04-04    5200.000000
2023-04-05    5225.000000
2023-04-06    5175.000000
2023-04-10    5100.000000
Name: BMRI.JK, Length: 197, dtype: float64

VALIDATION Set:


Unnamed: 0_level_0,TGKA.JK,TPIA.JK,AALI.JK,ABDA.JK,ADMG.JK,AHAP.JK,AISA.JK,AKRA.JK,ALMI.JK,AMFG.JK,...,STTP.JK,TBIG.JK,TMAS.JK,TRIS.JK,TRUS.JK,TSPC.JK,UNVR.JK,VRNA.JK,WAPO.JK,ZBRA.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-09-09,7005.957031,1711.277710,8172.296387,6854.794434,208.0,61.0,206.0,727.008606,250.0,3803.333496,...,6925.0,2961.428467,257.328308,108.860947,312.0,1356.647949,3914.781006,123.0,100.0,755.0
2021-09-10,6981.963867,1798.715210,8100.819824,6854.794434,218.0,62.0,206.0,721.223328,246.0,3803.333496,...,6925.0,2921.942871,257.328308,107.871300,318.0,1347.291748,3895.684326,124.0,101.0,770.0
2021-09-13,6909.985352,1761.241943,7957.863770,6854.794434,216.0,61.0,212.0,728.936951,246.0,3803.333496,...,6925.0,2872.585693,261.057709,107.871300,314.0,1342.613647,3857.491455,124.0,98.0,755.0
2021-09-14,6814.013184,1767.487549,7981.689453,6854.794434,216.0,60.0,212.0,752.077820,240.0,3872.484863,...,6925.0,2951.557129,268.516510,107.871300,320.0,1351.969849,3867.039551,130.0,99.0,740.0
2021-09-15,7053.943359,1748.750854,8100.819824,6854.794434,230.0,60.0,214.0,744.364197,240.0,3803.333496,...,6925.0,2951.557129,275.975311,107.871300,324.0,1356.647949,3857.491455,129.0,100.0,725.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-24,7300.000000,2362.500000,9698.415039,5900.000000,177.0,57.0,152.0,1039.377441,292.0,6520.000000,...,8400.0,2890.000000,2405.272217,200.000000,422.0,1375.000000,4916.742676,103.0,122.0,575.0
2022-06-27,7100.000000,2362.500000,9624.192383,5900.000000,176.0,56.0,151.0,1049.182861,292.0,6375.000000,...,8400.0,2920.000000,2580.555420,202.000000,430.0,1379.910767,4906.889160,107.0,123.0,580.0
2022-06-28,7100.000000,2362.500000,9549.969727,5900.000000,176.0,57.0,151.0,1034.474731,292.0,6125.000000,...,8250.0,2910.000000,2629.245117,204.000000,430.0,1379.910767,4975.861816,107.0,118.0,575.0
2022-06-29,7250.000000,2293.750000,9549.969727,5900.000000,175.0,57.0,150.0,1014.863831,306.0,6125.000000,...,8250.0,2870.000000,2531.865479,200.000000,424.0,1370.089355,4867.476562,106.0,110.0,575.0


Date
2021-09-09    2883.347168
2021-09-10    2815.236572
2021-09-13    2792.532959
2021-09-14    2769.829590
2021-09-15    2781.181396
                 ...     
2022-06-24    3948.463623
2022-06-27    3912.784668
2022-06-28    3865.212891
2022-06-29    3781.962158
2022-06-30    3770.069092
Name: BMRI.JK, Length: 196, dtype: float64

<DisplayHandle display_id=y_val>

In [24]:
# Save every (features, target) pair of the time-series split. Each config
# entry is indexed with [0] for the feature path and [1] for the target path.
for _path_key, _features, _target in (
    ("train_set_path", X_train, y_train),
    ("valid_set_path", X_val, y_val),
    ("test_set_path", X_test, y_test),
):
    util.pickle_dump(_features, config_data[_path_key][0])
    util.pickle_dump(_target, config_data[_path_key][1])

### 5.2 Random Split

In [25]:
from sklearn.model_selection import train_test_split

# The target is the configured column two rows ahead of the features, so the
# last two rows (which have no such future value) are left out of X and y.
# Note that X keeps every column, including the target column itself.
horizon = 2
X = raw_dataset.iloc[:-horizon, :]
y = raw_dataset[config_data['target']].shift(periods=-horizon).iloc[:-horizon]

# First cut: 70% train, 30% held out. Rows are shuffled, so chronological
# order is not preserved in this split.
X_train_ran, X_holdout_ran, y_train_ran, y_holdout_ran = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# Second cut: the held-out part is divided evenly into validation and test
X_val_ran, X_test_ran, y_val_ran, y_test_ran = train_test_split(
    X_holdout_ran, y_holdout_ran, test_size=0.5, random_state=123
)


In [26]:
# Save every (features, target) pair of the random split. Each config entry
# is indexed with [0] for the feature path and [1] for the target path.
for _path_key, _features, _target in (
    ("train_ran_set_path", X_train_ran, y_train_ran),
    ("valid_ran_set_path", X_val_ran, y_val_ran),
    ("test_ran_set_path", X_test_ran, y_test_ran),
):
    util.pickle_dump(_features, config_data[_path_key][0])
    util.pickle_dump(_target, config_data[_path_key][1])