## 0. Load Required Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import src.util as util

## 1. Import Configuration File

In [2]:
config_data = util.load_config()

## 2. Load Dataset

In [3]:
def load_dataset(config_data: dict) -> pd.DataFrame:
    
    # Load every set of data
    clean_data = util.pickle_load(config_data['clean_dataset_path'])

    x_train = util.pickle_load(config_data["train_set_path"][0])
    y_train = util.pickle_load(config_data["train_set_path"][1])

    x_valid = util.pickle_load(config_data["valid_set_path"][0])
    y_valid = util.pickle_load(config_data["valid_set_path"][1])

    x_test = util.pickle_load(config_data["test_set_path"][0])
    y_test = util.pickle_load(config_data["test_set_path"][1])

    # Concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # Return 3 set of data
    return clean_data, train_set, valid_set, test_set

In [4]:
clean_data, train_set, valid_set, test_set = load_dataset(config_data)

## 3. Feature Engineering

### 3.1 Stock Return Data Transform

In [5]:
# as the way to normalize all of data value, its relevant if we change them into return percentage.
# the advantage are: 
# 1. the data value will vary from -0.5 to +0.5. While its possible, its less likely stock change will be up/down more than 50% within 2 days. 
# 2. the stock return is something we want to know anyway therefore its a representative approach in this case

def transform_to_stock_return(dataset, params):
    # define the return for all stock based on the next day of its price change percentage 
    dataset = (dataset.shift(periods=1)-dataset)*100/dataset
    
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"
    
    # add additional column of our targeted stock return
    dataset[target_return_column_name] = dataset[params['target']].shift(periods=-2)

    # handling missing value of shifted targeted column & its reference column
    dataset.dropna(subset=params['target'], inplace=True)
    dataset.dropna(subset=target_return_column_name, inplace=True)

    # handling missing value of the remaining columns
    #dataset.fillna(0, inplace=True)

    return dataset



In [6]:
def remove_outliers(df,n_std):
    for col in df.columns:
        #print('Working on column: {}'.format(col))
        
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
        
    return df



In [7]:
train_set_feng = transform_to_stock_return(dataset=train_set, params=config_data)
train_set_feng = remove_outliers(train_set_feng, 3)

val_set_feng = transform_to_stock_return(dataset=valid_set, params=config_data)
val_set_feng = remove_outliers(val_set_feng, 3)

test_set_feng = transform_to_stock_return(dataset=test_set, params=config_data)
test_set_feng = remove_outliers(test_set_feng, 3)

In [8]:
display(train_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=train_set_feng))
display(train_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,24,24.0,24.0
mean,2022-09-11 03:00:00,-4.051934,2.385383
min,2022-07-05 00:00:00,-25.925926,0.0
25%,2022-08-02 00:00:00,-7.204922,0.0
50%,2022-09-12 12:00:00,-3.253968,3.738513
75%,2022-10-10 06:00:00,-0.161988,6.952519
max,2022-11-25 00:00:00,0.0,10.071942
std,,0.0,9.062522


(24, 760)

In [9]:
display(val_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=val_set_feng))
display(val_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,10,10.0,10.0
mean,2023-01-25 04:48:00,-3.703387,6.892291
min,2023-01-12 00:00:00,-25.700447,4.938272
25%,2023-01-21 00:00:00,-13.162202,7.168676
50%,2023-01-25 12:00:00,-4.083333,7.325424
75%,2023-01-29 06:00:00,-1.992017,9.318182
max,2023-02-06 00:00:00,0.0,11.111111
std,,0.0,13.046288


(10, 760)

In [10]:
display(test_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=test_set_feng))
display(test_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,10,10.0,10.0
mean,2023-03-06 16:48:00,-4.481199,7.346397
min,2023-02-15 00:00:00,-25.438596,7.051282
25%,2023-02-21 18:00:00,-11.341463,7.171001
50%,2023-03-04 12:00:00,-3.383872,7.445716
75%,2023-03-22 12:00:00,-1.860587,7.46912
max,2023-03-30 00:00:00,-0.617284,7.526882
std,,0.0,9.505263


(10, 760)

## 4. Feature Selection

### 4.1 Filter Correlated Feature

In [11]:
def keep_correlated_features(train_set, val_set, test_set, params):
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"

    # define the correlated features
    corr_stock = train_set.corrwith(train_set[target_return_column_name], axis=0).nlargest(10).sort_values(ascending=True)

    # keep correlated features
    train_set = train_set[corr_stock.index]
    val_set = val_set[corr_stock.index]
    test_set = test_set[corr_stock.index]

    return corr_stock, train_set, val_set, test_set

In [12]:
corr_stock, train_set_feng, val_set_feng, test_set_feng = keep_correlated_features(train_set= train_set_feng, val_set= val_set_feng, test_set= train_set_feng,params= config_data)
display(corr_stock, display_id='corr_stock_list')
display(train_set_feng, display_id='train set')
display(val_set_feng, display_id='val set')
display(test_set_feng, display_id='test set')

CPIN.JK               0.419408
IFSH.JK               0.421512
PURI.JK               0.423632
MPRO.JK               0.433791
JSPT.JK               0.455741
PUDP.JK               0.527219
LUCY.JK               0.564065
MMLP.JK               0.582978
CMNP.JK               0.584937
BMRI.JK Return D+2    1.000000
dtype: float64

Unnamed: 0_level_0,CPIN.JK,IFSH.JK,PURI.JK,MPRO.JK,JSPT.JK,PUDP.JK,LUCY.JK,MMLP.JK,CMNP.JK,BMRI.JK Return D+2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-07-05,3.587444,0.763359,2.24215,7.38255,0.0,1.796407,0.0,-3.414634,0.0,1.346799
2022-07-18,-2.608696,0.0,-0.446429,0.0,-1.840491,-4.519774,-8.72093,-10.454545,-6.829268,-6.012652
2022-07-21,-1.666667,1.612903,6.635071,6.622517,0.0,0.0,8.994709,0.952381,3.030303,0.964625
2022-07-22,1.265823,-2.362205,-11.715481,0.0,-4.216867,-2.777778,-7.352941,-3.225806,0.0,-2.812499
2022-07-26,2.136752,0.389105,-1.244813,3.921569,5.732484,-1.104972,3.517588,4.0,-1.25,0.621117
2022-07-27,-0.847458,0.784314,0.83682,0.0,0.0,-1.092896,-1.485149,-4.761905,-1.960784,-2.719032
2022-08-04,1.287554,0.0,-0.444444,0.649351,0.0,-1.129944,0.505051,1.0,0.980392,-1.197609
2022-08-08,-1.709402,-0.423729,6.666667,-0.689655,0.0,-2.173913,1.546392,1.814516,-2.392344,-1.453489
2022-08-23,1.276596,3.673469,-0.454545,1.388889,0.0,1.477833,1.0,-2.92887,0.0,0.583095
2022-09-08,1.287554,-0.900901,2.564103,3.816794,0.0,3.804348,-2.094241,0.0,1.269036,-1.089917


Unnamed: 0_level_0,CPIN.JK,IFSH.JK,PURI.JK,MPRO.JK,JSPT.JK,PUDP.JK,LUCY.JK,MMLP.JK,CMNP.JK,BMRI.JK Return D+2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-01-12,0.0,3.030303,3.703704,-3.216374,0.0,5.347594,0.529101,1.276596,3.013699,-1.344087
2023-01-18,0.0,1.99005,5.504587,0.0,0.0,0.0,-0.534759,-2.44898,3.125,-2.255638
2023-01-20,1.680672,0.0,0.0,2.639296,0.621118,0.0,0.0,0.414938,1.06383,2.061863
2023-01-24,-0.41841,-0.497512,-0.943396,-2.571429,0.0,4.787234,1.648352,-0.413223,-0.265252,-1.272265
2023-01-25,1.702128,1.515152,0.952381,0.0,0.0,-3.589744,0.0,2.109705,-0.26455,-1.99502
2023-01-26,-1.260504,-0.502513,0.961538,0.0,0.0,0.0,-5.699482,1.282051,0.531915,0.753775
2023-01-27,1.709402,-1.485149,-20.0,0.0,0.0,3.174603,-1.025641,-0.425532,-2.337662,0.0
2023-01-30,-0.425532,1.507538,-19.753086,0.0,0.0,0.0,0.515464,2.620087,0.78534,2.577321
2023-02-02,1.293103,-6.818182,-1.398601,0.0,0.0,0.0,-1.546392,0.0,0.802139,0.506323
2023-02-06,0.431034,0.0,-5.517241,1.714286,-1.840491,0.0,0.526316,-1.67364,0.540541,-0.2451


Unnamed: 0_level_0,CPIN.JK,IFSH.JK,PURI.JK,MPRO.JK,JSPT.JK,PUDP.JK,LUCY.JK,MMLP.JK,CMNP.JK,BMRI.JK Return D+2
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2022-07-05,3.587444,0.763359,2.24215,7.38255,0.0,1.796407,0.0,-3.414634,0.0,1.346799
2022-07-18,-2.608696,0.0,-0.446429,0.0,-1.840491,-4.519774,-8.72093,-10.454545,-6.829268,-6.012652
2022-07-21,-1.666667,1.612903,6.635071,6.622517,0.0,0.0,8.994709,0.952381,3.030303,0.964625
2022-07-22,1.265823,-2.362205,-11.715481,0.0,-4.216867,-2.777778,-7.352941,-3.225806,0.0,-2.812499
2022-07-26,2.136752,0.389105,-1.244813,3.921569,5.732484,-1.104972,3.517588,4.0,-1.25,0.621117
2022-07-27,-0.847458,0.784314,0.83682,0.0,0.0,-1.092896,-1.485149,-4.761905,-1.960784,-2.719032
2022-08-04,1.287554,0.0,-0.444444,0.649351,0.0,-1.129944,0.505051,1.0,0.980392,-1.197609
2022-08-08,-1.709402,-0.423729,6.666667,-0.689655,0.0,-2.173913,1.546392,1.814516,-2.392344,-1.453489
2022-08-23,1.276596,3.673469,-0.454545,1.388889,0.0,1.477833,1.0,-2.92887,0.0,0.583095
2022-09-08,1.287554,-0.900901,2.564103,3.816794,0.0,3.804348,-2.094241,0.0,1.269036,-1.089917


<DisplayHandle display_id=test set>

## 5. Dump Dataset

In [18]:
X_train = train_set_feng.iloc[:,:-1]
y_train = train_set_feng.iloc[:,-1]

X_val = val_set_feng.iloc[:,:-1]
y_val = val_set_feng.iloc[:,-1]

X_test = test_set_feng.iloc[:,:-1]
y_test = test_set_feng.iloc[:,-1]



In [19]:
util.pickle_dump(X_train, config_data["train_feng_set_path"][0])
util.pickle_dump(y_train, config_data["train_feng_set_path"][1])

util.pickle_dump(X_val, config_data["valid_feng_set_path"][0])
util.pickle_dump(y_val, config_data["valid_feng_set_path"][1])

util.pickle_dump(X_test, config_data["test_feng_set_path"][0])
util.pickle_dump(y_test, config_data["test_feng_set_path"][1])

In [20]:
display(X_train)
display(y_train)
display(X_val)
display(y_val)
display(X_test)
display(y_test)

Unnamed: 0_level_0,CPIN.JK,IFSH.JK,PURI.JK,MPRO.JK,JSPT.JK,PUDP.JK,LUCY.JK,MMLP.JK,CMNP.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-07-05,3.587444,0.763359,2.24215,7.38255,0.0,1.796407,0.0,-3.414634,0.0
2022-07-18,-2.608696,0.0,-0.446429,0.0,-1.840491,-4.519774,-8.72093,-10.454545,-6.829268
2022-07-21,-1.666667,1.612903,6.635071,6.622517,0.0,0.0,8.994709,0.952381,3.030303
2022-07-22,1.265823,-2.362205,-11.715481,0.0,-4.216867,-2.777778,-7.352941,-3.225806,0.0
2022-07-26,2.136752,0.389105,-1.244813,3.921569,5.732484,-1.104972,3.517588,4.0,-1.25
2022-07-27,-0.847458,0.784314,0.83682,0.0,0.0,-1.092896,-1.485149,-4.761905,-1.960784
2022-08-04,1.287554,0.0,-0.444444,0.649351,0.0,-1.129944,0.505051,1.0,0.980392
2022-08-08,-1.709402,-0.423729,6.666667,-0.689655,0.0,-2.173913,1.546392,1.814516,-2.392344
2022-08-23,1.276596,3.673469,-0.454545,1.388889,0.0,1.477833,1.0,-2.92887,0.0
2022-09-08,1.287554,-0.900901,2.564103,3.816794,0.0,3.804348,-2.094241,0.0,1.269036


Date
2022-07-05    1.346799
2022-07-18   -6.012652
2022-07-21    0.964625
2022-07-22   -2.812499
2022-07-26    0.621117
2022-07-27   -2.719032
2022-08-04   -1.197609
2022-08-08   -1.453489
2022-08-23    0.583095
2022-09-08   -1.089917
2022-09-09   -2.910056
2022-09-12    1.612906
2022-09-13   -0.534753
2022-09-15   -1.355003
2022-09-23   -1.069517
2022-09-27    0.540545
2022-10-04   -0.536198
2022-10-07    0.000000
2022-10-20    0.975608
2022-10-21    0.244501
2022-10-27   -3.317535
2022-11-11    0.000000
2022-11-23    0.491405
2022-11-25   -1.937043
Name: BMRI.JK Return D+2, dtype: float64

Unnamed: 0_level_0,CPIN.JK,IFSH.JK,PURI.JK,MPRO.JK,JSPT.JK,PUDP.JK,LUCY.JK,MMLP.JK,CMNP.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-01-12,0.0,3.030303,3.703704,-3.216374,0.0,5.347594,0.529101,1.276596,3.013699
2023-01-18,0.0,1.99005,5.504587,0.0,0.0,0.0,-0.534759,-2.44898,3.125
2023-01-20,1.680672,0.0,0.0,2.639296,0.621118,0.0,0.0,0.414938,1.06383
2023-01-24,-0.41841,-0.497512,-0.943396,-2.571429,0.0,4.787234,1.648352,-0.413223,-0.265252
2023-01-25,1.702128,1.515152,0.952381,0.0,0.0,-3.589744,0.0,2.109705,-0.26455
2023-01-26,-1.260504,-0.502513,0.961538,0.0,0.0,0.0,-5.699482,1.282051,0.531915
2023-01-27,1.709402,-1.485149,-20.0,0.0,0.0,3.174603,-1.025641,-0.425532,-2.337662
2023-01-30,-0.425532,1.507538,-19.753086,0.0,0.0,0.0,0.515464,2.620087,0.78534
2023-02-02,1.293103,-6.818182,-1.398601,0.0,0.0,0.0,-1.546392,0.0,0.802139
2023-02-06,0.431034,0.0,-5.517241,1.714286,-1.840491,0.0,0.526316,-1.67364,0.540541


Date
2023-01-12   -1.344087
2023-01-18   -2.255638
2023-01-20    2.061863
2023-01-24   -1.272265
2023-01-25   -1.995020
2023-01-26    0.753775
2023-01-27    0.000000
2023-01-30    2.577321
2023-02-02    0.506323
2023-02-06   -0.245100
Name: BMRI.JK Return D+2, dtype: float64

Unnamed: 0_level_0,CPIN.JK,IFSH.JK,PURI.JK,MPRO.JK,JSPT.JK,PUDP.JK,LUCY.JK,MMLP.JK,CMNP.JK
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-07-05,3.587444,0.763359,2.24215,7.38255,0.0,1.796407,0.0,-3.414634,0.0
2022-07-18,-2.608696,0.0,-0.446429,0.0,-1.840491,-4.519774,-8.72093,-10.454545,-6.829268
2022-07-21,-1.666667,1.612903,6.635071,6.622517,0.0,0.0,8.994709,0.952381,3.030303
2022-07-22,1.265823,-2.362205,-11.715481,0.0,-4.216867,-2.777778,-7.352941,-3.225806,0.0
2022-07-26,2.136752,0.389105,-1.244813,3.921569,5.732484,-1.104972,3.517588,4.0,-1.25
2022-07-27,-0.847458,0.784314,0.83682,0.0,0.0,-1.092896,-1.485149,-4.761905,-1.960784
2022-08-04,1.287554,0.0,-0.444444,0.649351,0.0,-1.129944,0.505051,1.0,0.980392
2022-08-08,-1.709402,-0.423729,6.666667,-0.689655,0.0,-2.173913,1.546392,1.814516,-2.392344
2022-08-23,1.276596,3.673469,-0.454545,1.388889,0.0,1.477833,1.0,-2.92887,0.0
2022-09-08,1.287554,-0.900901,2.564103,3.816794,0.0,3.804348,-2.094241,0.0,1.269036


Date
2022-07-05    1.346799
2022-07-18   -6.012652
2022-07-21    0.964625
2022-07-22   -2.812499
2022-07-26    0.621117
2022-07-27   -2.719032
2022-08-04   -1.197609
2022-08-08   -1.453489
2022-08-23    0.583095
2022-09-08   -1.089917
2022-09-09   -2.910056
2022-09-12    1.612906
2022-09-13   -0.534753
2022-09-15   -1.355003
2022-09-23   -1.069517
2022-09-27    0.540545
2022-10-04   -0.536198
2022-10-07    0.000000
2022-10-20    0.975608
2022-10-21    0.244501
2022-10-27   -3.317535
2022-11-11    0.000000
2022-11-23    0.491405
2022-11-25   -1.937043
Name: BMRI.JK Return D+2, dtype: float64