## 0. Load Required Libraries

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import src.util as util

## 1. Import Configuration File

In [17]:
config_data = util.load_config()

## 2. Load Dataset

In [18]:
def load_dataset(config_data: dict) -> pd.DataFrame:
    
    # Load every set of data
    clean_data = util.pickle_load(config_data['clean_dataset_path'])

    x_train = util.pickle_load(config_data["train_set_path"][0])
    y_train = util.pickle_load(config_data["train_set_path"][1])

    x_valid = util.pickle_load(config_data["valid_set_path"][0])
    y_valid = util.pickle_load(config_data["valid_set_path"][1])

    x_test = util.pickle_load(config_data["test_set_path"][0])
    y_test = util.pickle_load(config_data["test_set_path"][1])

    # Concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # Return 3 set of data
    return clean_data, train_set, valid_set, test_set

In [19]:
clean_data, train_set, valid_set, test_set = load_dataset(config_data)

## 3. Feature Engineering

### 3.1 Stock Return Data Transform

In [20]:
# as the way to normalize all of data value, its relevant if we change them into return percentage.
# the advantage are: 
# 1. the data value will vary from -0.5 to +0.5. While its possible, its less likely stock change will be up/down more than 50% within 2 days. 
# 2. the stock return is something we want to know anyway therefore its a representative approach in this case

def transform_to_stock_return(dataset, params):
    # define the return for all stock based on the next day of its price change percentage 
    dataset = (dataset.shift(periods=1)-dataset)*100/dataset
    
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"
    
    # add additional column of our targeted stock return
    dataset[target_return_column_name] = dataset[params['target']].shift(periods=-2)

    # handling missing value of shifted targeted column & its reference column
    dataset.dropna(subset=params['target'], inplace=True)
    dataset.dropna(subset=target_return_column_name, inplace=True)

    # handling missing value of the remaining columns
    #dataset.fillna(0, inplace=True)

    return dataset



In [21]:
def remove_outliers(df,n_std):
    for col in df.columns:
        #print('Working on column: {}'.format(col))
        
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
        
    return df



In [22]:
train_set_feng = transform_to_stock_return(dataset=train_set, params=config_data)
#train_set_feng = remove_outliers(train_set_feng, 3)

val_set_feng = transform_to_stock_return(dataset=valid_set, params=config_data)
#val_set_feng = remove_outliers(val_set_feng, 3)

test_set_feng = transform_to_stock_return(dataset=test_set, params=config_data)
#test_set_feng = remove_outliers(test_set_feng, 3)

In [23]:
display(train_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=train_set_feng))
display(train_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,232,232.0,232.0
mean,2022-06-27 12:18:37.241379328,-0.901008,42.069239
min,2022-01-04 00:00:00,-98.994475,0.0
25%,2022-03-29 18:00:00,-3.267432,0.0
50%,2022-07-02 12:00:00,-0.320349,1.777813
75%,2022-09-22 06:00:00,0.0,7.057646
max,2022-12-13 00:00:00,0.0,9847.367509
std,,0.0,646.571208


(232, 760)

In [24]:
display(val_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=val_set_feng))
display(val_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,36,36.0,36.0
mean,2023-01-12 03:20:00,-2.049681,11.239143
min,2022-12-19 00:00:00,-25.862069,0.0
25%,2022-12-29 18:00:00,-8.746693,1.158666
50%,2023-01-11 12:00:00,-3.919022,6.959274
75%,2023-01-25 06:00:00,0.0,7.333442
max,2023-02-07 00:00:00,0.0,368.366183
std,,0.0,61.401242


(36, 760)

In [25]:
display(test_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=test_set_feng))
display(test_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,36,36.0,36.0
mean,2023-03-09 10:40:00,-2.209924,3.308173
min,2023-02-13 00:00:00,-25.903614,0.0
25%,2023-02-23 18:00:00,-7.449495,6.818182
50%,2023-03-08 12:00:00,-1.098934,7.284768
75%,2023-03-21 18:00:00,0.0,7.431793
max,2023-04-05 00:00:00,0.0,15.789474
std,,0.0,8.742476


(36, 760)

## 4. Feature Selection

### 4.1 Filter Correlated Feature

In [26]:
def keep_correlated_features(train_set, val_set, test_set, params):
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"

    # define the correlated features
    corr_stock = train_set.corrwith(train_set[target_return_column_name], axis=0).nlargest(10).sort_values(ascending=True)

    # keep correlated features
    train_set = train_set[corr_stock.index]
    val_set = val_set[corr_stock.index]
    test_set = test_set[corr_stock.index]

    return corr_stock, train_set, val_set, test_set

In [27]:
corr_stock, train_set_feng, val_set_feng, test_set_feng = keep_correlated_features(train_set= train_set_feng, val_set= val_set_feng, test_set= test_set_feng,params= config_data)
display(corr_stock, display_id='corr_stock_list')
display(train_set_feng, display_id='train set')
display(val_set_feng, display_id='val set')
display(test_set_feng, display_id='test set')

INTD.JK               0.169867
ULTJ.JK               0.174778
PDES.JK               0.175399
KICI.JK               0.190171
PGJO.JK               0.198470
IKBI.JK               0.200998
APII.JK               0.202692
TLKM.JK               0.203662
JKON.JK               0.236510
BMRI.JK Return D+2    1.000000
dtype: float64

Unnamed: 0_level_0,INTD.JK,ULTJ.JK,PDES.JK,KICI.JK,PGJO.JK,IKBI.JK,APII.JK,TLKM.JK,JKON.JK,BMRI.JK Return D+2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-01-04,0.775194,0.321537,0.000000,-1.388889,0.000000,3.333333,0.952381,0.239804,3.361345,0.000000
2022-01-05,0.781250,-0.320507,0.000000,-0.689655,0.000000,-2.439024,0.961538,2.962968,-0.833333,-0.354611
2022-01-06,3.225806,0.645157,0.000000,2.112676,0.000000,0.819672,6.122449,-1.699030,0.000000,0.000000
2022-01-07,-0.800000,0.000000,0.000000,-2.068966,-4.878049,0.000000,-1.507538,-1.199044,1.694915,0.000000
2022-01-10,1.626016,-0.321545,0.000000,2.836879,1.234568,-0.813008,-2.450980,1.707318,2.608696,-1.398600
...,...,...,...,...,...,...,...,...,...,...
2022-12-07,1.242236,-0.357143,0.653595,6.000000,4.166667,0.000000,0.502513,-2.173913,7.518797,1.259447
2022-12-08,-2.424242,1.083032,-0.649351,0.502513,0.000000,0.961538,-0.500000,-1.604278,3.906250,-0.251258
2022-12-09,5.769231,-1.071429,0.000000,4.188482,-1.369863,0.000000,-0.990099,2.465753,4.918033,0.505045
2022-12-12,-2.500000,-1.060071,-0.645161,-4.500000,0.000000,0.000000,1.000000,-1.351351,-1.612903,-0.502507


Unnamed: 0_level_0,INTD.JK,ULTJ.JK,PDES.JK,KICI.JK,PGJO.JK,IKBI.JK,APII.JK,TLKM.JK,JKON.JK,BMRI.JK Return D+2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-12-19,0.621118,0.344828,0.0,0.980392,2.531646,2.884615,2.020202,-1.075269,1.470588,0.499994
2022-12-20,0.0,-0.343643,0.653595,2.0,-1.25,-2.803738,1.020408,0.0,-2.857143,0.502517
2022-12-21,-4.166667,0.692042,0.0,3.626943,1.265823,0.0,0.0,-1.846966,6.060606,0.251891
2022-12-22,3.703704,0.696864,0.0,0.520833,0.0,2.884615,2.617801,1.066667,0.0,-1.243782
2022-12-23,-4.705882,-1.034483,0.0,-3.030303,0.0,-5.454545,-4.5,-0.793651,0.763359,1.005024
2022-12-26,2.409639,-2.356902,-0.649351,0.0,1.282051,0.917431,1.010101,0.8,3.149606,-0.250629
2022-12-27,3.10559,0.337838,0.0,-1.0,-1.265823,2.830189,-1.980198,-1.574803,0.0,0.0
2022-12-28,1.898734,0.338983,0.0,0.0,-1.25,2.912621,1.507538,2.144772,3.252033,0.503783
2022-12-29,-1.863354,0.0,-0.645161,-2.912621,2.564103,0.0,-0.5,-1.322751,0.0,0.506323
2022-12-30,-1.226994,0.0,1.30719,0.0,2.631579,-1.904762,1.522843,0.8,3.361345,-0.753765


Unnamed: 0_level_0,INTD.JK,ULTJ.JK,PDES.JK,KICI.JK,PGJO.JK,IKBI.JK,APII.JK,TLKM.JK,JKON.JK,BMRI.JK Return D+2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-02-13,1.980198,0.0,-0.662252,-0.518135,1.315789,2.857143,-0.505051,0.263852,1.526718,1.466995
2023-02-14,2.538071,0.0,0.0,1.578947,10.144928,-0.943396,0.507614,-0.263158,-4.379562,0.491405
2023-02-15,-1.5,1.010101,0.666667,0.0,-4.166667,-0.934579,0.0,0.263852,2.238806,-1.213593
2023-02-16,1.010101,-0.668896,0.0,0.0,4.347826,-0.925926,1.025641,0.530504,0.0,0.243301
2023-02-17,0.0,-0.333333,-0.662252,-1.041667,-2.816901,1.886792,-1.015228,0.265957,-2.189781,0.735301
2023-02-20,1.538462,-0.332226,0.666667,1.052632,-1.388889,1.923077,1.025641,-1.570681,-4.861111,1.999997
2023-02-21,-0.510204,0.668896,-0.662252,0.0,1.408451,-0.952381,0.0,-0.520833,2.12766,-0.744413
2023-02-22,-2.0,0.673401,0.0,-1.041667,-1.388889,-0.943396,0.0,-0.518135,5.223881,-0.493831
2023-02-23,2.564103,-1.655629,0.666667,2.673797,1.408451,0.952381,0.515464,-2.770781,1.515152,-0.246308
2023-02-24,0.515464,-0.983607,0.0,1.630435,0.0,0.961538,-1.020408,-1.732673,0.0,1.500003


<DisplayHandle display_id=test set>

## 5. Dump Dataset

In [28]:
X_train = train_set_feng.iloc[:,:-1]
y_train = train_set_feng.iloc[:,-1]

X_val = val_set_feng.iloc[:,:-1]
y_val = val_set_feng.iloc[:,-1]

X_test = test_set_feng.iloc[:,:-1]
y_test = test_set_feng.iloc[:,-1]



In [29]:
util.pickle_dump(X_train, config_data["train_feng_set_path"][0])
util.pickle_dump(y_train, config_data["train_feng_set_path"][1])

util.pickle_dump(X_val, config_data["valid_feng_set_path"][0])
util.pickle_dump(y_val, config_data["valid_feng_set_path"][1])

util.pickle_dump(X_test, config_data["test_feng_set_path"][0])
util.pickle_dump(y_test, config_data["test_feng_set_path"][1])

In [30]:
display(X_train)
display(y_train)
display(X_val)
display(y_val)
display(X_test)
display(y_test)

Unnamed: 0_level_0,INTD.JK,ULTJ.JK,PDES.JK,KICI.JK,PGJO.JK,IKBI.JK,APII.JK,TLKM.JK,JKON.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-04,0.775194,0.321537,0.000000,-1.388889,0.000000,3.333333,0.952381,0.239804,3.361345
2022-01-05,0.781250,-0.320507,0.000000,-0.689655,0.000000,-2.439024,0.961538,2.962968,-0.833333
2022-01-06,3.225806,0.645157,0.000000,2.112676,0.000000,0.819672,6.122449,-1.699030,0.000000
2022-01-07,-0.800000,0.000000,0.000000,-2.068966,-4.878049,0.000000,-1.507538,-1.199044,1.694915
2022-01-10,1.626016,-0.321545,0.000000,2.836879,1.234568,-0.813008,-2.450980,1.707318,2.608696
...,...,...,...,...,...,...,...,...,...
2022-12-07,1.242236,-0.357143,0.653595,6.000000,4.166667,0.000000,0.502513,-2.173913,7.518797
2022-12-08,-2.424242,1.083032,-0.649351,0.502513,0.000000,0.961538,-0.500000,-1.604278,3.906250
2022-12-09,5.769231,-1.071429,0.000000,4.188482,-1.369863,0.000000,-0.990099,2.465753,4.918033
2022-12-12,-2.500000,-1.060071,-0.645161,-4.500000,0.000000,0.000000,1.000000,-1.351351,-1.612903


Date
2022-01-04    0.000000
2022-01-05   -0.354611
2022-01-06    0.000000
2022-01-07    0.000000
2022-01-10   -1.398600
                ...   
2022-12-07    1.259447
2022-12-08   -0.251258
2022-12-09    0.505045
2022-12-12   -0.502507
2022-12-13    0.505045
Name: BMRI.JK Return D+2, Length: 232, dtype: float64

Unnamed: 0_level_0,INTD.JK,ULTJ.JK,PDES.JK,KICI.JK,PGJO.JK,IKBI.JK,APII.JK,TLKM.JK,JKON.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-12-19,0.621118,0.344828,0.0,0.980392,2.531646,2.884615,2.020202,-1.075269,1.470588
2022-12-20,0.0,-0.343643,0.653595,2.0,-1.25,-2.803738,1.020408,0.0,-2.857143
2022-12-21,-4.166667,0.692042,0.0,3.626943,1.265823,0.0,0.0,-1.846966,6.060606
2022-12-22,3.703704,0.696864,0.0,0.520833,0.0,2.884615,2.617801,1.066667,0.0
2022-12-23,-4.705882,-1.034483,0.0,-3.030303,0.0,-5.454545,-4.5,-0.793651,0.763359
2022-12-26,2.409639,-2.356902,-0.649351,0.0,1.282051,0.917431,1.010101,0.8,3.149606
2022-12-27,3.10559,0.337838,0.0,-1.0,-1.265823,2.830189,-1.980198,-1.574803,0.0
2022-12-28,1.898734,0.338983,0.0,0.0,-1.25,2.912621,1.507538,2.144772,3.252033
2022-12-29,-1.863354,0.0,-0.645161,-2.912621,2.564103,0.0,-0.5,-1.322751,0.0
2022-12-30,-1.226994,0.0,1.30719,0.0,2.631579,-1.904762,1.522843,0.8,3.361345


Date
2022-12-19    0.499994
2022-12-20    0.502517
2022-12-21    0.251891
2022-12-22   -1.243782
2022-12-23    1.005024
2022-12-26   -0.250629
2022-12-27    0.000000
2022-12-28    0.503783
2022-12-29    0.506323
2022-12-30   -0.753765
2023-01-02   -0.748136
2023-01-03    2.035631
2023-01-04    0.255094
2023-01-05    0.771215
2023-01-06    4.851750
2023-01-09    3.631282
2023-01-10   -2.717393
2023-01-11    0.272482
2023-01-12   -1.344087
2023-01-13   -4.615383
2023-01-16    2.094238
2023-01-17   -2.051279
2023-01-18   -2.255638
2023-01-19    0.757572
2023-01-20    2.061863
2023-01-24   -1.272265
2023-01-25   -1.995020
2023-01-26    0.753775
2023-01-27    0.000000
2023-01-30    2.577321
2023-01-31   -0.257072
2023-02-01   -2.015110
2023-02-02    0.506323
2023-02-03   -2.948399
2023-02-06   -0.245100
2023-02-07   -0.487809
Name: BMRI.JK Return D+2, dtype: float64

Unnamed: 0_level_0,INTD.JK,ULTJ.JK,PDES.JK,KICI.JK,PGJO.JK,IKBI.JK,APII.JK,TLKM.JK,JKON.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-02-13,1.980198,0.0,-0.662252,-0.518135,1.315789,2.857143,-0.505051,0.263852,1.526718
2023-02-14,2.538071,0.0,0.0,1.578947,10.144928,-0.943396,0.507614,-0.263158,-4.379562
2023-02-15,-1.5,1.010101,0.666667,0.0,-4.166667,-0.934579,0.0,0.263852,2.238806
2023-02-16,1.010101,-0.668896,0.0,0.0,4.347826,-0.925926,1.025641,0.530504,0.0
2023-02-17,0.0,-0.333333,-0.662252,-1.041667,-2.816901,1.886792,-1.015228,0.265957,-2.189781
2023-02-20,1.538462,-0.332226,0.666667,1.052632,-1.388889,1.923077,1.025641,-1.570681,-4.861111
2023-02-21,-0.510204,0.668896,-0.662252,0.0,1.408451,-0.952381,0.0,-0.520833,2.12766
2023-02-22,-2.0,0.673401,0.0,-1.041667,-1.388889,-0.943396,0.0,-0.518135,5.223881
2023-02-23,2.564103,-1.655629,0.666667,2.673797,1.408451,0.952381,0.515464,-2.770781,1.515152
2023-02-24,0.515464,-0.983607,0.0,1.630435,0.0,0.961538,-1.020408,-1.732673,0.0


Date
2023-02-13    1.466995
2023-02-14    0.491405
2023-02-15   -1.213593
2023-02-16    0.243301
2023-02-17    0.735301
2023-02-20    1.999997
2023-02-21   -0.744413
2023-02-22   -0.493831
2023-02-23   -0.246308
2023-02-24    1.500003
2023-02-27   -0.990098
2023-02-28   -1.222495
2023-03-01    1.741299
2023-03-02   -0.740747
2023-03-03   -0.491395
2023-03-06   -0.973244
2023-03-07   -1.674636
2023-03-08    0.722888
2023-03-09    0.241548
2023-03-10    4.282122
2023-03-13   -1.243782
2023-03-14    2.030454
2023-03-15   -2.475249
2023-03-16    0.999998
2023-03-17   -4.761907
2023-03-20   -3.669719
2023-03-21    1.673168
2023-03-24    0.740741
2023-03-27   -2.173913
2023-03-28    1.232394
2023-03-29   -0.978208
2023-03-30   -1.900238
2023-03-31    1.201923
2023-04-03   -0.478469
2023-04-04    0.966184
2023-04-05    1.470588
Name: BMRI.JK Return D+2, dtype: float64