In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(data, sorted_index, prefix, column, boundaries=(33, 66)):
    """Overwrite ``column`` with rank-bucket labels, in place.

    Rows are visited in the order given by ``sorted_index``. Each visited row
    receives the label ``prefix * 10 + bucket``. ``bucket`` starts at 1 and is
    incremented every time the running rank (0, 1, 2, ...) equals one of
    ``boundaries``. With the default boundaries the first 33 visited rows get
    bucket 1, the next 33 get bucket 2 and all remaining rows get bucket 3.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place and also returned.
    sorted_index : iterable of int
        Row *positions* (0-based, e.g. the result of ``np.argsort``) in the
        order in which the rows should be ranked.
    prefix : int
        Multiplied by 10 and added to the bucket number to form the label.
    column : str
        Name of the column whose values are replaced.
    boundaries : iterable of int, optional
        Ranks at which the bucket number is incremented. Duplicate entries
        count once. Defaults to ``(33, 66)``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    # sorted_index holds positions, not index labels, so address rows with
    # the positional accessor. The label-based ``.at`` only works when the
    # frame has a default 0..n-1 index and would otherwise relabel the wrong
    # rows or append new ones.
    column_pos = data.columns.get_loc(column)
    cut_points = set(boundaries)

    # Original column values found at each cut point, i.e. the values at
    # which the label changes. Only consumed by the optional CSV export below.
    divider_list = ["GBPUSD", column]

    bucket = 1
    for rank, row_pos in enumerate(sorted_index):
        if rank in cut_points:
            bucket += 1
            # Read the value before it is overwritten just below.
            divider_list.append(data.iat[row_pos, column_pos])

        data.iat[row_pos, column_pos] = (prefix * 10) + bucket

    # with open("divider_list.csv", "a", newline="") as csvfile:
    #     writer = csv.writer(csvfile)
    #     writer.writerow(divider_list)

    return data

In [3]:
# Keep the raw frame under its own name so this cell always starts from the
# file contents and produces the same `data` however often it is re-run.
raw_data = pd.read_csv("../results/GBPUSD.csv")

# Preprocessing
data = (
    raw_data
    # Removing the first row since it is an outlier - return is 0
    .iloc[1:]
    # Delete any rows with null values
    .dropna()
    # Delete any rows with 0 Vol or 0 FD
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Resetting the indexes in the dataframe
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,1.226365,1.224,1.225241,0.002365,215644.820296,12.65844
1,2022-12-02 02:50:06,3,1.22676,1.22587,1.22632,0.00089,273033.707865,8.812349
2,2022-12-02 02:57:32,4,1.226725,1.2257,1.226235,0.001025,512195.121951,-0.700845
3,2022-12-02 03:05:09,5,1.226805,1.2248,1.225611,0.002005,326184.538653,-5.085517
4,2022-12-02 03:12:50,6,1.22567,1.224815,1.225292,0.000855,385964.912281,-2.60477


In [4]:
# Each feature is ranked on its own, as that worked the best.
# argsort returns, for the original column, the row positions that would put
# its values in ascending order.
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
# Walking the rows in that order, substitute_values replaces each volatility
# value with its bucket label.
data = substitute_values(data, sorted_index, 0, "vol")

# Fractal Dimension gets exactly the same treatment
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, 0, "fd")

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,1.226365,1.224,1.225241,3.0,1.0,12.65844
1,2022-12-02 02:50:06,3,1.22676,1.22587,1.22632,1.0,1.0,8.812349
2,2022-12-02 02:57:32,4,1.226725,1.2257,1.226235,2.0,2.0,-0.700845
3,2022-12-02 03:05:09,5,1.226805,1.2248,1.225611,3.0,1.0,-5.085517
4,2022-12-02 03:12:50,6,1.22567,1.224815,1.225292,1.0,2.0,-2.60477


In [5]:
# Using PyCaret functions to setup the regression model trainer.
# session_id pins the experiment's random seed so a Restart & Run All gives
# the same results every time; 6704 is the value the recorded run reported
# after drawing it at random.
regression = setup(data=data, target="return_val", session_id=6704)

Unnamed: 0,Description,Value
0,session_id,6704
1,Target,return_val
2,Original Data,"(97, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(67, 23)"


In [6]:
# Compare the candidate regressors and keep the top-ranked one in `best`.
# NOTE(review): in the recorded output every listed model has a negative R2
# and the Dummy Regressor ranks fourth, so the selected model shows little
# predictive value on this data - confirm before relying on it.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
omp,Orthogonal Matching Pursuit,4.678,62.8254,6.2202,-0.3077,0.9378,3.3706,0.005
en,Elastic Net,4.8954,72.4218,6.6296,-0.3343,1.2921,1.1586,0.005
br,Bayesian Ridge,4.9865,69.9072,6.6114,-0.3461,1.3058,1.079,0.004
dummy,Dummy Regressor,4.9011,72.4198,6.6694,-0.3518,1.3245,1.0194,0.006
llar,Lasso Least Angle Regression,4.9011,72.4198,6.6694,-0.3518,1.3245,1.0194,0.005
lasso,Lasso Regression,4.9029,72.4377,6.6726,-0.3554,1.3193,1.0126,0.219
knn,K Neighbors Regressor,4.6461,66.7075,6.2161,-0.4184,0.9696,1.7768,0.019
ridge,Ridge Regression,4.9555,67.5219,6.5371,-0.4808,0.8893,3.3903,0.005
lightgbm,Light Gradient Boosting Machine,5.056,76.5361,6.916,-0.5044,1.1697,1.1264,0.009
rf,Random Forest Regressor,4.9839,68.9075,6.6639,-0.5836,0.8803,2.4528,0.091


OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=None,
                          normalize=True, precompute='auto', tol=None)

In [7]:
# Create a model from the estimator picked by compare_models; the table
# printed below it lists the metrics for each fold.
model = create_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.6853,10.8812,3.2987,0.3286,0.8248,0.6233
1,3.8933,21.828,4.672,-1.2344,0.6419,1.2016
2,4.6876,36.9545,6.079,0.2289,1.3042,0.7915
3,2.9076,12.4176,3.5239,0.4618,0.7568,0.7591
4,5.3015,35.222,5.9348,-1.0023,1.4385,20.2795
5,2.9405,12.7032,3.5642,-1.0157,0.54,1.4638
6,4.7401,33.8425,5.8174,0.1336,1.1339,0.8915
7,4.1651,31.8229,5.6412,-1.2937,0.9745,5.5308
8,2.8659,9.6444,3.1055,0.2849,0.7476,1.0458
9,12.5928,422.9374,20.5654,0.0309,1.0155,1.1194


In [8]:
# Run model tune to get a better fit.
# choose_better=True makes tune_model hand back the input model whenever the
# tuned candidate does not improve on it. In the recorded run the tuned folds
# averaged a lower R2 than the untuned ones, so without this flag a worse
# model would be carried forward.
tuned_model = tune_model(model, choose_better=True)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.4194,16.2631,4.0327,-0.0035,1.1587,0.8913
1,4.9091,30.9793,5.5659,-2.1711,0.8398,1.4201
2,4.8441,40.223,6.3422,0.1607,1.6696,0.8658
3,4.2529,30.0174,5.4788,-0.3009,1.5855,1.0
4,5.4877,38.0845,6.1713,-1.165,1.8133,20.2795
5,2.4632,9.5755,3.0944,-0.5194,0.8698,1.097
6,5.9085,57.1776,7.5616,-0.4638,1.8301,1.0
7,2.626,14.6657,3.8296,-0.057,1.235,1.0
8,3.2241,13.4139,3.6625,0.0054,1.0941,1.0268
9,12.9076,495.8734,22.2682,-0.1362,1.4273,1.311


In [9]:
# Mark the tuned model as final and save it under ../models/GBPUSD; the
# recorded output reports that the transformation pipeline is saved together
# with the model.
# NOTE(review): this rebinds `model`, which until here held the result of
# create_model. Re-running the tuning cell after this one would therefore
# operate on the finalized model - consider a distinct name.
model = finalize_model(tuned_model)
save_model(model, "../models/GBPUSD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('dummy', Dummify(target='return_val')),
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
            