## 0. Load Required Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import src.util as util

## 1. Import Configuration File

In [2]:
config_data = util.load_config()

## 2. Load Dataset

In [3]:
def load_dataset(config_data: dict) -> pd.DataFrame:
    
    
    # Load every set of data
    clean_data = util.pickle_load(config_data['clean_dataset_path'])

    x_train = util.pickle_load(config_data["train_set_path"][0])
    y_train = util.pickle_load(config_data["train_set_path"][1])

    x_valid = util.pickle_load(config_data["valid_set_path"][0])
    y_valid = util.pickle_load(config_data["valid_set_path"][1])

    x_test = util.pickle_load(config_data["test_set_path"][0])
    y_test = util.pickle_load(config_data["test_set_path"][1])

    # Concatenate x and y each set
    train_set = pd.concat([x_train, y_train], axis = 1)
    valid_set = pd.concat([x_valid, y_valid], axis = 1)
    test_set = pd.concat([x_test, y_test], axis = 1)

    # Return 3 set of data
    return clean_data, train_set, valid_set, test_set

In [4]:
clean_data, train_set, valid_set, test_set = load_dataset(config_data)

## 3. Feature Engineering

### 3.1 Stock Return Data Transform

In [5]:
# as the way to normalize all of data value, its relevant if we change them into return percentage.
# the advantage are: 
# 1. the data value will vary from -0.5 to +0.5. While its possible, its less likely stock change will be up/down more than 50% within 2 days. 
# 2. the stock return is something we want to know anyway therefore its a representative approach in this case

def transform_to_stock_return(dataset, params):
    # define the return for all stock based on the next day of its price change percentage 
    dataset = (dataset.shift(periods=1)-dataset)*100/dataset
    
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"
    
    # add additional column of our targeted stock return
    dataset[target_return_column_name] = dataset[params['target']].shift(periods=-2)

    # handling missing value of shifted targeted column & its reference column
    dataset.dropna(subset=params['target'], inplace=True)
    dataset.dropna(subset=target_return_column_name, inplace=True)

    # handling missing value of the remaining columns
    #dataset.fillna(0, inplace=True)

    return dataset



In [6]:
def remove_outliers(df,n_std):
    for col in df.columns:
        #print('Working on column: {}'.format(col))
        
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
        
    return df



In [7]:
train_set_feng = transform_to_stock_return(dataset=train_set, params=config_data)
train_set_feng = remove_outliers(train_set_feng, 3)

In [8]:
val_set_feng = transform_to_stock_return(dataset=valid_set, params=config_data)
val_set_feng = remove_outliers(val_set_feng, 3)

In [9]:
test_set_feng = transform_to_stock_return(dataset=test_set, params=config_data)
test_set_feng = remove_outliers(test_set_feng, 3)

In [10]:
display(train_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=train_set_feng))
display(train_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,24,24.0,24.0
mean,2022-09-11 03:00:00,-4.051934,2.385383
min,2022-07-05 00:00:00,-25.925926,0.0
25%,2022-08-02 00:00:00,-7.204922,0.0
50%,2022-09-12 12:00:00,-3.253968,3.738513
75%,2022-10-10 06:00:00,-0.161988,6.952519
max,2022-11-25 00:00:00,0.0,10.071942
std,,0.0,9.062522


(24, 760)

In [11]:
display(val_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=val_set_feng))
display(val_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,10,10.0,10.0
mean,2023-01-25 04:48:00,-3.703387,6.892291
min,2023-01-12 00:00:00,-25.700447,4.938272
25%,2023-01-21 00:00:00,-13.162202,7.168676
50%,2023-01-25 12:00:00,-4.083333,7.325424
75%,2023-01-29 06:00:00,-1.992017,9.318182
max,2023-02-06 00:00:00,0.0,11.111111
std,,0.0,13.046288


(10, 760)

In [12]:
display(test_set_feng.isna().any().sum())
display(util.summary_dataset_describe(dataset=test_set_feng))
display(test_set_feng.shape)

0

Unnamed: 0,Date,Min,Max
count,10,10.0,10.0
mean,2023-03-06 16:48:00,-4.481199,7.346397
min,2023-02-15 00:00:00,-25.438596,7.051282
25%,2023-02-21 18:00:00,-11.341463,7.171001
50%,2023-03-04 12:00:00,-3.383872,7.445716
75%,2023-03-22 12:00:00,-1.860587,7.46912
max,2023-03-30 00:00:00,-0.617284,7.526882
std,,0.0,9.505263


(10, 760)

## 4. Feature Selection

### 4.1 Filter Correlated Feature

In [13]:
def keep_correlated_features(dataset, params):
    #define the target return column name
    target_return_column_name = f"{params['target']} Return D+2"

    # define the correlated features
    corr_stock = dataset.corrwith(dataset[target_return_column_name], axis=0).nlargest(10)

    # keep correlated features
    dataset = dataset[corr_stock]

    return dataset

In [14]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

def feat_selection(dataset):
    X = dataset.iloc[:,:-1]
    y = dataset.iloc[:,-1]
    model = Lasso(alpha=0.1)
    model.fit(X,y)

    # Get feature coefficients from the Lasso model
    feature_coefficients = model.coef_

    # Create a DataFrame with feature names and their corresponding coefficients
    feature_importances = pd.DataFrame({"feature": X.columns, "coefficient": feature_coefficients})

    # Sort the DataFrame by the absolute value of the coefficients in descending order
    feature_importances = feature_importances.reindex(feature_importances["coefficient"].abs().sort_values(ascending=False).index)

    # Get the top 10 features
    top_10_features = feature_importances.head(10)["feature"].values

    # Print the top 10 features
    print("Top 10 features:", top_10_features)

In [15]:
feat_selection(train_set_feng)
feat_selection(val_set_feng)
feat_selection(test_set_feng)

Top 10 features: ['LUCY.JK' 'MMLP.JK' 'IBST.JK' 'SLIS.JK' 'FOOD.JK' 'HITS.JK' 'MPPA.JK'
 'MPRO.JK' 'PORT.JK' 'YPAS.JK']
Top 10 features: ['AIMS.JK' 'BPTR.JK' 'TRUK.JK' 'KONI.JK' 'GZCO.JK' 'PICO.JK' 'HITS.JK'
 'PSKT.JK' 'GOLD.JK' 'TGRA.JK']
Top 10 features: ['MTSM.JK' 'HDFA.JK' 'FIRE.JK' 'MARI.JK' 'ESTA.JK' 'ALKA.JK' 'MPPA.JK'
 'UANG.JK' 'CITY.JK' 'PORT.JK']


  model = cd_fast.enet_coordinate_descent(


In [16]:
clean_set_feng = transform_to_stock_return(dataset=clean_data, params=config_data)


In [17]:
def split_data(df):

    train_size = int(len(df)*0.7)
    val_size = int(len(df)*0.85)

    train = df.iloc[:train_size]
    validation = df.iloc[train_size:val_size]
    test = df.iloc[val_size:]

    return train, validation, test

In [18]:
train, val, test = split_data(clean_set_feng)

In [19]:
y_train = train.iloc[:,-1:]
X_train = train.drop(y_train.columns, axis=1)

In [20]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LassoCV

from sklearn.preprocessing import StandardScaler

# Normalize the data
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)


target_col_name = y_train.columns
# Perform feature selection using Lasso with TimeSeriesSplit cross-validation
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Initialize Lasso with cross-validation
alphas = np.logspace(0, 1, 1000)
lasso_cv = LassoCV(alphas= alphas, cv=tscv)
lasso_cv.fit(X_train, y_train)

# Get the selected features
selected_features = X_train.columns[lasso_cv.coef_ != 0]
display(selected_features)

# 4. Train the final model using the selected features
final_model = Lasso(alpha=lasso_cv.alpha_, random_state=42)
final_model.fit(X_train[selected_features], y_train)

# 5. Evaluate the model on the validation set
y_validation = val[target_col_name]
X_validation = val.drop(target_col_name, axis=1)

y_pred_validation = final_model.predict(X_validation[selected_features])
mse_validation = mean_squared_error(y_validation, y_pred_validation)
print('Mean Squared Error on validation set:', mse_validation)

# 6. Evaluate the final model on the test set
y_test = test[target_col_name]
X_test = test.drop(target_col_name, axis=1)

y_pred_test = final_model.predict(X_test[selected_features])
mse_test = mean_squared_error(y_test, y_pred_test)
print('Mean Squared Error on test set:', mse_test)



  y = column_or_1d(y, warn=True)


Index(['BJBR.JK', 'EXCL.JK', 'SIDO.JK', 'PANI.JK'], dtype='object')

Mean Squared Error on validation set: 3.6844094375838603
Mean Squared Error on test set: 2.726104534533604


In [23]:
# Function to create the features for the next few days
def create_features_for_next_days(last_day_data, num_days, selected_features):
    # last_day_data: the last row of your original dataset (as a pandas Series or DataFrame)
    # num_days: number of days you want to forecast
    # selected_features: the list of selected features
    
    # Create a DataFrame containing the features for the next few days
    next_days_data = pd.DataFrame(columns=selected_features, index=pd.date_range(last_day_data.index[-1] + pd.DateOffset(1), periods=num_days, closed='left'))

    # Fill in the feature values based on your feature engineering method
    # For example, if you have lagged features, you can use the last known values to create the new features

    # For this example, let's assume you have lagged features
    # We'll use the last known values to create the features for the next few days
    for feature in selected_features:
        
        lag = num_days  # Extract the lag value from the feature name
        next_days_data[feature] = last_day_data['BMRI.JK Return D+2'].iloc[-lag:].values

    return next_days_data

# Create the features for the next few days
num_days_to_forecast = 1
last_day_data = clean_set_feng.iloc[-1:]
next_days_data = create_features_for_next_days(last_day_data, num_days_to_forecast, selected_features)

# Predict the target value for the next few days
forecast = final_model.predict(next_days_data)

# Print the forecast
forecast_dates = next_days_data.index
for date, value in zip(forecast_dates, forecast):
    print(f"Forecast for {date.strftime('%Y-%m-%d')}: {value}")


Forecast for 2023-04-06: -0.19599519499760143


  next_days_data = pd.DataFrame(columns=selected_features, index=pd.date_range(last_day_data.index[-1] + pd.DateOffset(1), periods=num_days, closed='left'))


### 5.2 Random Split

In [None]:
from sklearn.model_selection import train_test_split

X = raw_dataset.iloc[:-2,:]
y = raw_dataset[config_data['target']].shift(periods=-2).iloc[:-2]

X_train_ran, X_test_ran, y_train_ran, y_test_ran = train_test_split(X, y, test_size = 0.3, random_state = 123)
X_val_ran, X_test_ran, y_val_ran, y_test_ran = train_test_split(X_test_ran, y_test_ran, test_size = 0.5, random_state = 123)


In [None]:
util.pickle_dump(X_train_ran, config_data["train_ran_set_path"][0])
util.pickle_dump(y_train_ran, config_data["train_ran_set_path"][1])

util.pickle_dump(X_val_ran, config_data["valid_ran_set_path"][0])
util.pickle_dump(y_val_ran, config_data["valid_ran_set_path"][1])

util.pickle_dump(X_test_ran, config_data["test_ran_set_path"][0])
util.pickle_dump(y_test_ran, config_data["test_ran_set_path"][1])