In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(data, sorted_index, prefix, column, cut_points=(33, 66)):
    """Overwrite ``column`` with rank-based bucket labels, in place.

    Rows are visited in the order given by ``sorted_index``. The row visited
    at rank ``i`` receives the label ``prefix * 10 + bucket``, where ``bucket``
    starts at 1 and is one higher for every entry of ``cut_points`` that is
    ``<= i``. With the default cut points the buckets are 1 for ranks 0-32,
    2 for ranks 33-65 and 3 from rank 66 onwards.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place and also returned.
    sorted_index : sequence of int
        0-based row positions (for example the result of ``np.argsort`` on
        the column's values), listed in the order in which ranks are assigned.
    prefix : int
        Multiplied by 10 and added to the bucket number.
    column : str
        Name of the column whose values are replaced.
    cut_points : iterable of int, optional
        Ranks at which the bucket number goes up by one. Defaults to
        ``(33, 66)``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Raises
    ------
    KeyError
        If ``column`` is not a column of ``data``.
    """
    cut_points = tuple(cut_points)

    # sorted_index holds positions, not index labels, so rows are addressed
    # positionally. Label-based access would only be right on a 0..n-1 index
    # and could append new rows on any other index.
    column_position = data.columns.get_loc(column)

    for rank, row_position in enumerate(sorted_index):
        # Count how many cut points this rank has reached or passed.
        bucket = 1 + sum(rank >= cut for cut in cut_points)
        data.iat[int(row_position), column_position] = (prefix * 10) + bucket

    return data

In [3]:
eurusd_raw = pd.read_csv("../results/EURUSD.csv")

# Preprocessing
# Every step returns a new frame, so the raw data is left untouched and no
# filtered copy is assigned to afterwards.
data = (
    eurusd_raw
    # Remove the first row since it is an outlier - return is 0
    .drop(eurusd_raw.index[0])
    # Delete any rows with null values
    .dropna()
    # Delete any rows with 0 Vol or 0 FD
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by 100000 to have a usable column
    .assign(return_val=lambda frame: frame["return_val"] * 100000)
    # Renumber the rows 0..n-1 after the deletions
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:24,2,1.0542,1.053025,1.05361,0.001175,525957.446809,74.130009
1,2022-12-02 02:50:04,3,1.05418,1.05355,1.053971,0.00063,526984.126984,34.217177
2,2022-12-02 02:57:32,4,1.053655,1.05296,1.053365,0.000695,818705.035971,-57.476468
3,2022-12-02 03:05:09,5,1.05312,1.051465,1.05232,0.001655,433836.858006,-99.246035
4,2022-12-02 03:12:50,6,1.05195,1.05133,1.051634,0.00062,485483.870968,-65.163673


In [4]:
# Each feature is ranked on its own, independently of the other (this worked best)
volatility_array = data["vol"].to_numpy(copy=True)
# argsort lists the positions of the values in ascending order of value
sorted_index = volatility_array.argsort()
# Walk the positions in rank order and overwrite each vol value with its label
data = substitute_values(data, sorted_index, prefix=0, column="vol")

# Same procedure for the Fractal Dimension column
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, prefix=0, column="fd")

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:24,2,1.0542,1.053025,1.05361,3.0,2.0,74.130009
1,2022-12-02 02:50:04,3,1.05418,1.05355,1.053971,1.0,2.0,34.217177
2,2022-12-02 02:57:32,4,1.053655,1.05296,1.053365,2.0,2.0,-57.476468
3,2022-12-02 03:05:09,5,1.05312,1.051465,1.05232,3.0,1.0,-99.246035
4,2022-12-02 03:12:50,6,1.05195,1.05133,1.051634,1.0,2.0,-65.163673


In [5]:
# Set up the PyCaret regression experiment with return_val as the target.
# session_id pins the experiment's random seed so a fresh run is repeatable;
# 7184 is the session_id reported by the recorded run of this cell.
regression = setup(data=data, target="return_val", session_id=7184)

Unnamed: 0,Description,Value
0,session_id,7184
1,Target,return_val
2,Original Data,"(99, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(69, 25)"


In [6]:
# Compare the candidate model types; `best` holds whatever compare_models() returns
best = compare_models()
# Bare expression so the notebook displays the returned estimator
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,41.5103,5123.5444,58.3869,-0.0045,1.2883,1.6891,0.017
huber,Huber Regressor,42.0982,5492.2429,60.5644,-0.1252,1.1379,1.8804,0.006
par,Passive Aggressive Regressor,48.4521,7152.1107,68.0016,-0.259,1.5071,3.4479,0.006
en,Elastic Net,45.1205,6902.4251,68.011,-0.299,1.8739,1.2139,0.005
dummy,Dummy Regressor,46.5709,7257.1735,69.6049,-0.3625,2.041,1.1327,0.004
lightgbm,Light Gradient Boosting Machine,53.0845,8304.2444,76.6893,-0.7635,1.5645,2.0557,0.007
br,Bayesian Ridge,47.7219,7395.4761,70.9105,-0.7814,1.6775,2.121,0.005
llar,Lasso Least Angle Regression,47.1764,5926.4135,67.2996,-0.8262,1.4621,2.3358,0.005
ridge,Ridge Regression,47.5935,5575.7774,65.7001,-0.9326,1.3528,2.3904,0.005
rf,Random Forest Regressor,46.6589,5904.8806,65.7172,-0.9337,1.536,1.8695,0.088


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                    weights='uniform')

In [7]:
# Create the model from its PyCaret estimator ID
# NOTE(review): "par" is hard-coded here rather than taken from `best` - confirm this choice is intended
model = create_model(estimator="par")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,32.9871,1898.0226,43.5663,-0.3256,1.3595,1.3372
1,22.9401,872.0068,29.5298,-0.4573,1.3487,5.845
2,98.8948,22467.1066,149.8903,-0.6123,1.7938,1.9438
3,24.0536,746.8702,27.3289,0.1005,1.5684,1.5368
4,26.5496,1113.5301,33.3696,-0.5015,1.3907,5.5886
5,32.7448,1402.8698,37.4549,-0.1292,1.3655,4.235
6,28.426,1300.8176,36.0668,0.2727,1.1493,3.8211
7,95.906,30358.2274,174.2361,-0.0139,2.0369,5.8324
8,51.9355,3787.9595,61.5464,-0.6989,1.6145,3.3002
9,70.0829,7573.6961,87.027,-0.224,1.4438,1.0394


In [8]:
# Tune the hyperparameters of the created model to look for a better fit
tuned_model = tune_model(estimator=model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,42.7768,3242.9172,56.9466,-1.2649,1.351,2.1889
1,15.5797,285.2482,16.8893,0.5233,0.7541,1.9555
2,79.8593,18252.1854,135.1006,-0.3098,1.736,0.8946
3,31.8172,1264.8946,35.5654,-0.5235,1.8514,1.3515
4,21.9439,853.3789,29.2126,-0.1507,1.0782,3.4996
5,22.8696,541.9281,23.2793,0.5638,1.0979,2.2436
6,25.5595,882.9697,29.7148,0.5063,1.4074,2.2455
7,93.7203,29701.4879,172.3412,0.0081,2.2655,2.4035
8,38.8436,2677.5234,51.7448,-0.2009,1.419,1.9838
9,68.1026,6510.6013,80.6883,-0.0522,2.5329,1.0656


In [9]:
# Finalize the tuned model and persist it to disk
# (`model` is rebound here: from this point on it is the finalized version)
model = finalize_model(estimator=tuned_model)
save_model(model=model, model_name="../models/EURUSD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  PassiveAggressiveRegressor(C=0.274, average=False,
                       