## 0. Load Required Libraries

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import src.util as util

## 1. Import Configuration File

In [22]:
config_data = util.load_config()

## 2. Load Dataset

In [23]:
def load_dataset(config_data: dict) -> pd.DataFrame:
    
    
    # Load every set of data
    clean_data = util.pickle_load(config_data['clean_dataset_path'])

    x_train = util.pickle_load(config_data["train_set_path"][0])
    y_train = util.pickle_load(config_data["train_set_path"][1])

    x_valid = util.pickle_load(config_data["valid_set_path"][0])
    y_valid = util.pickle_load(config_data["valid_set_path"][1])

    x_test = util.pickle_load(config_data["test_set_path"][0])
    y_test = util.pickle_load(config_data["test_set_path"][1])

    # Concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # Return 3 set of data
    return clean_data, train_set, valid_set, test_set

In [24]:
clean_data, train_set, valid_set, test_set = load_dataset(config_data)

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/clean_dataset.pkl'

In [29]:
def load_dataset_ran(config_data: dict) -> pd.DataFrame:
    
    # Load every set of data
    #clean_data = util.pickle_load(config_data['clean_dataset_path'])

    x_train_ran = util.pickle_load(config_data["train_ran_set_path"][0])
    y_train_ran = util.pickle_load(config_data["train_ran_set_path"][1])

    x_valid_ran = util.pickle_load(config_data["valid_ran_set_path"][0])
    y_valid_ran = util.pickle_load(config_data["valid_ran_set_path"][1])

    x_test_ran = util.pickle_load(config_data["test_ran_set_path"][0])
    y_test_ran = util.pickle_load(config_data["test_ran_set_path"][1])

    # Concatenate x and y each set
    train_set_ran = pd.concat([x_train_ran, y_train_ran], axis = 1)
    valid_set_ran = pd.concat([x_valid_ran, y_valid_ran], axis = 1)
    test_set_ran = pd.concat([x_test_ran, y_test_ran], axis = 1)

    # Return 3 set of data
    return train_set_ran, valid_set_ran, test_set_ran

In [30]:
train_set_ran, valid_set_ran, test_set_ran = load_dataset_ran(config_data)

## 3. Feature Engineering

### 3.1 Stock Return Data Transform

In [31]:
# as the way to normalize all of data value, its relevant if we change them into return percentage.
# the advantage are: 
# 1. the data value will vary from -0.5 to +0.5. While its possible, its less likely stock change will be up/down more than 50% within 2 days. 
# 2. the stock return is something we want to know anyway therefore its a representative approach in this case

def transform_to_stock_return(dataset, params):
    # define the return for all stock based on the next day of its price change percentage 
    dataset = (dataset.shift(periods=1)-dataset)*100/dataset
    
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"
    
    # add additional column of our targeted stock return
    dataset[target_return_column_name] = dataset[params['target']].shift(periods=-2)

    # handling missing value of shifted targeted column & its reference column
    dataset.dropna(subset=params['target'], inplace=True)
    dataset.dropna(subset=target_return_column_name, inplace=True)

    # handling missing value of the remaining columns
    #dataset.fillna(0, inplace=True)

    return dataset



In [32]:
def remove_outliers(df,n_std):
    for col in df.columns:
        #print('Working on column: {}'.format(col))
        
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
        
    return df



In [7]:
train_set_feng = transform_to_stock_return(dataset=train_set, params=config_data)
train_set_feng = remove_outliers(train_set_feng, 3)



In [8]:
display(train_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=train_set_feng))
display(train_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,572,572.0,572.0
mean,2019-08-23 17:37:20.559440640,-0.397805,0.115837
min,2017-01-11 00:00:00,-58.644068,0.0
25%,2018-06-14 18:00:00,-1.603226,0.0
50%,2019-09-05 12:00:00,0.0,0.0
75%,2020-11-04 06:00:00,0.0,1.762764
max,2021-08-27 00:00:00,0.0,52.884615
std,,0.0,5.517171


(572, 105)

In [9]:
test_set_feng = transform_to_stock_return(dataset=test_set, params=config_data)
test_set_feng = remove_outliers(test_set_feng, 3)

In [10]:
display(test_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=test_set_feng))
display(test_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,146,146.0,146.0
mean,2022-01-28 10:11:30.410958848,-0.955964,0.45966
min,2021-09-10 00:00:00,-25.925926,0.0
25%,2021-11-17 06:00:00,-3.832011,0.0
50%,2022-01-25 12:00:00,-0.471961,1.452997
75%,2022-03-31 18:00:00,0.0,5.263162
max,2022-06-27 00:00:00,0.0,7.526882
std,,0.0,6.528549


(146, 105)

In [11]:
val_set_feng = transform_to_stock_return(dataset=valid_set, params=config_data)
val_set_feng = remove_outliers(val_set_feng, 3)

In [12]:
display(val_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=val_set_feng))
display(val_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,133,133.0,133.0
mean,2022-11-17 22:22:33.383458560,-0.756308,0.41187
min,2022-07-05 00:00:00,-25.882353,0.0
25%,2022-09-20 00:00:00,-3.030303,0.0
50%,2022-11-21 00:00:00,-0.46729,0.961538
75%,2023-01-24 00:00:00,0.0,4.201681
max,2023-04-04 00:00:00,0.0,7.526882
std,,0.0,6.351772


(133, 105)

In [33]:
train_set_ran_feng = transform_to_stock_return(dataset=train_set_ran, params=config_data)
train_set_ran_feng = remove_outliers(train_set_ran_feng, 3)



ValueError: Cannot set a DataFrame with multiple columns to the single column BMRI.JK Return D+2

In [34]:
display(train_set_ran_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=train_set_ran_feng))
display(train_set_ran_feng.shape)

NameError: name 'train_set_ran_feng' is not defined

In [None]:
test_set_ran_feng = transform_to_stock_return(dataset=test_set_ran, params=config_data)
test_set_ran_feng = remove_outliers(test_set_ran_feng, 3)

In [None]:
display(test_set_ran_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=test_set_ran_feng))
display(test_set_ran_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,146,146.0,146.0
mean,2022-01-28 10:11:30.410958848,-0.955964,0.45966
min,2021-09-10 00:00:00,-25.925926,0.0
25%,2021-11-17 06:00:00,-3.832011,0.0
50%,2022-01-25 12:00:00,-0.471961,1.452997
75%,2022-03-31 18:00:00,0.0,5.263162
max,2022-06-27 00:00:00,0.0,7.526882
std,,0.0,6.528549


(146, 105)

In [None]:
val_set_ran_feng = transform_to_stock_return(dataset=valid_set_ran, params=config_data)
val_set_ran_feng = remove_outliers(val_set_ran_feng, 3)

In [None]:
display(val_set_ran_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=val_set_ran_feng))
display(val_set_ran_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,133,133.0,133.0
mean,2022-11-17 22:22:33.383458560,-0.756308,0.41187
min,2022-07-05 00:00:00,-25.882353,0.0
25%,2022-09-20 00:00:00,-3.030303,0.0
50%,2022-11-21 00:00:00,-0.46729,0.961538
75%,2023-01-24 00:00:00,0.0,4.201681
max,2023-04-04 00:00:00,0.0,7.526882
std,,0.0,6.351772


(133, 105)

## 4. Feature Selection

### 4.1 Filter Correlated Feature

In [13]:
def keep_correlated_features(dataset, params):
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"

    # define the correlated features
    corr_stock = dataset.corrwith(dataset[target_return_column_name], axis=0).nlargest(10)

    # keep correlated features
    dataset = dataset[corr_stock]

    return dataset

In [20]:
#keep_correlated_features(dataset=train_set_feng, params=config_data)
target_return_column_name = f"{config_data['target']} Return D+2"
#corr_stock = val_set_feng.corr()['BMRI.JK Return D+2'].nlargest(10)
corr_stock = test_set_feng.corrwith(test_set_feng[target_return_column_name], axis=0).nlargest(10)
corr_stock


BMRI.JK Return D+2    1.000000
JKON.JK               0.280301
PTIS.JK               0.242115
BBCA.JK               0.211299
JPFA.JK               0.177304
BNBR.JK               0.167255
TMAS.JK               0.155400
CASS.JK               0.146708
MLIA.JK               0.138005
AISA.JK               0.122294
dtype: float64

In [None]:
data = transform_to_stock_return(dataset=valid_set, params=config_data)
corr_stock = data.corrwith(data[target_return_column_name], axis=0, numeric_only=True).nlargest(10)


In [None]:
corr_stock

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

raw_dataset = util.pickle_load(config_data['raw_dataset_path'])
df_nan = raw_dataset.isna().sum()
plt.figure(figsize=(10,5))
sns.histplot(df_nan)
plt.xlabel('Nan Range per column')
plt.ylabel('Frequency')
plt.title('Distribution of BMRI Return')
plt.show()

df_nan

In [None]:
corr_stock = raw_dataset.corrwith(raw_dataset['BMRI.JK'], axis=0, numeric_only=True).nlargest(10)


In [None]:
corr_stock

In [None]:
raw_dataset_feng = transform_to_stock_return(dataset=raw_dataset, params=config_data)


In [None]:
raw_dataset_feng.corrwith(raw_dataset_feng['BMRI.JK Return D+2'], axis=0, numeric_only=True).nlargest(10)


In [None]:
valid_set_feng = transform_to_stock_return(dataset=valid_set, params=config_data)

In [None]:
test_set.corrwith(test_set['BMRI.JK'], axis=0, numeric_only=True).nlargest(10)


In [None]:
valid_set.describe()

In [None]:
train_set_trial = (train_set-train_set.shift(periods=1))*100/train_set.shift(periods=1)



In [None]:
plt.figure(figsize=(10,5))
sns.histplot(train_set_feng.isna().sum())
plt.xlabel('Nan Range per column')
plt.ylabel('Frequency')
plt.title('Distribution of BMRI Return')
plt.show()

In [None]:
plt.figure(figsize=(10,5))
sns.histplot(train_set.isna().sum())
plt.xlabel('Nan Range per column')
plt.ylabel('Frequency')
plt.title('Distribution of BMRI Return')
plt.show()

In [None]:
raw_dataset