In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    *,
    breakpoints=(33, 66),
    symbol="GBPUSD",
    divider_path="../divider_list.csv",
):
    """Replace a numeric column with ordinal bucket labels based on rank.

    Rows are visited in the order given by ``sorted_index``. Rows ranked before
    the first breakpoint get label ``prefix * 10 + 1``; the label increases by
    one at every breakpoint rank. The original column value found at each
    breakpoint rank is appended, as one row, to the CSV at ``divider_path``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column``. It is modified in place.
    sorted_index : array-like of int
        Row *positions* (as returned by ``np.argsort``) in ascending order of
        the column's values.
    prefix : int
        Multiplied by 10 and added to the bucket number to form the label.
    column : str
        Name of the column to relabel.
    breakpoints : iterable of int, optional
        Zero-based ranks at which the label is incremented. The default
        ``(33, 66)`` keeps the historical behaviour; note that these are
        absolute ranks, not percentiles, so buckets are only equally sized
        for a frame of about 100 rows.
    symbol : str, optional
        Tag written in the first field of the divider row.
    divider_path : str, optional
        CSV file the divider row is appended to. The file is opened in append
        mode, so every call adds another row.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``column`` overwritten by the labels.
    """
    positions = np.asarray(sorted_index, dtype=np.intp)
    row_count = len(positions)

    # Negative ranks can never be reached, so they are ignored.
    cuts = sorted(cut for cut in breakpoints if cut >= 0)

    # Read the boundary values first: the column is overwritten further down.
    original_values = data[column].to_numpy()
    divider_list = [symbol, column]
    divider_list.extend(
        original_values[positions[cut]] for cut in cuts if cut < row_count
    )

    if row_count:
        # Label = prefix*10 + 1 + (number of breakpoints at or below the rank).
        ranks = np.arange(row_count)
        labels = (prefix * 10) + 1 + np.searchsorted(cuts, ranks, side="right")

        # sorted_index holds positions, not index labels, so write with iloc.
        # Label-based access (.at/.loc) is only equivalent for a 0..n-1 index.
        column_position = data.columns.get_loc(column)
        data.iloc[positions, column_position] = labels

    with open(divider_path, "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
# Load the raw GBPUSD table.
data = pd.read_csv("../data/GBPUSD.csv")

# Preprocessing, expressed as one chain so the cell can be re-run safely:
#   1. drop the first row (its return is 0, treated as an outlier),
#   2. drop rows containing nulls,
#   3. drop rows whose vol or fd is exactly 0,
#   4. scale return_val by 10,000 so the target is on a usable scale
#      (presumably a fractional return, i.e. basis points afterwards - confirm),
#   5. renumber the rows from 0,
#   6. keep only the modelling columns: mean, vol, fd, return_val.
data = (
    data.drop(data.index[0])
    .dropna()
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    .reset_index(drop=True)
    .loc[:, ["mean", "vol", "fd", "return_val"]]
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.22702,0.001059,695626.003144,3.209945
1,1.228177,0.001653,308556.797815,9.429112
2,1.228799,0.000916,345156.011681,5.064695
3,1.228575,0.00105,553335.042393,-1.821852
4,1.228367,0.000639,747973.58811,-1.697548


In [4]:
# Volatility and fractal dimension are ranked independently of each other
# (this worked best). For each column, np.argsort yields the row positions in
# ascending order of value; substitute_values then overwrites the column with
# rank-based bucket labels using prefix 0.
for feature in ("vol", "fd"):
    sorted_index = np.argsort(data[feature].to_numpy())
    data = substitute_values(data, sorted_index, 0, feature)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.22702,3.0,2.0,3.209945
1,1.228177,3.0,1.0,9.429112
2,1.228799,3.0,1.0,5.064695
3,1.228575,3.0,1.0,-1.821852
4,1.228367,2.0,2.0,-1.697548


In [6]:
# Set up the PyCaret regression experiment with return_val as the target.
# session_id pins PyCaret's random seed so the train/test split and the
# cross-validation folds are the same on every Restart & Run All. 7575 is the
# seed the recorded run happened to draw, so the saved outputs stay reproducible.
regression = setup(data=data, target="return_val", session_id=7575)

Unnamed: 0,Description,Value
0,session_id,7575
1,Target,return_val
2,Original Data,"(139, 4)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(97, 3)"


In [7]:
# Compare the candidate regressors and keep the top-ranked one in `best`;
# the bare `best` on the last line echoes the chosen estimator.
# NOTE(review): in the recorded run the winner (Lasso Least Angle Regression)
# has exactly the same metrics as the Dummy Regressor and every listed R2 is
# negative, so the features appear to carry little signal for return_val -
# confirm before relying on the saved model.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
llar,Lasso Least Angle Regression,4.1211,54.6656,6.2643,-0.1525,1.0119,1.3639,0.005
dummy,Dummy Regressor,4.1211,54.6656,6.2643,-0.1525,1.0119,1.3639,0.005
lasso,Lasso Regression,4.1293,54.8218,6.2781,-0.1604,1.0069,1.3645,0.25
en,Elastic Net,4.145,55.1073,6.3024,-0.173,0.9956,1.3722,0.005
br,Bayesian Ridge,4.1739,55.4325,6.3368,-0.1944,0.9938,1.3914,0.006
ridge,Ridge Regression,4.2721,55.356,6.3772,-0.2422,0.9527,1.5474,0.005
huber,Huber Regressor,4.1759,57.4594,6.4537,-0.2534,1.1257,1.196,0.007
omp,Orthogonal Matching Pursuit,4.2858,56.1137,6.4349,-0.2736,0.9501,1.5777,0.005
lar,Least Angle Regression,4.3087,56.6234,6.4649,-0.2897,0.9409,1.6267,0.005
lr,Linear Regression,4.3087,56.6235,6.4649,-0.2897,0.9409,1.6266,0.423


LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,
          fit_path=True, jitter=None, max_iter=500, normalize=True,
          positive=False, precompute='auto', random_state=7575, verbose=False)

In [20]:
# Create a model from the estimator selected by compare_models().
# The recorded output reports metrics for ten folds (0-9).
model = create_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8.2431,248.7038,15.7703,-0.0556,1.6275,1.1805
1,3.4517,19.7031,4.4388,-0.0497,0.9558,2.311
2,6.4979,99.9097,9.9955,-0.0652,1.5148,1.2525
3,1.569,4.3461,2.0847,-0.0001,0.549,1.1001
4,2.9372,14.3536,3.7886,-0.0711,0.8234,1.0196
5,2.4505,11.5099,3.3926,-0.093,0.8683,0.8906
6,4.5597,40.8772,6.3935,-0.7756,0.8719,1.5689
7,3.1373,13.1788,3.6303,-0.0112,0.861,1.2474
8,3.3346,21.3464,4.6202,-0.3606,0.8334,1.1146
9,5.03,72.7271,8.528,-0.0434,1.2135,1.9541


In [21]:
# Run model tuning to try to get a better fit.
# NOTE(review): the recorded per-fold scores are close to the untuned ones
# and R2 is still negative in most folds - check that tuning actually helps
# before finalising.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,8.1752,251.1394,15.8474,-0.0659,1.8551,1.0105
1,3.5546,19.9579,4.4674,-0.0633,1.0052,2.2788
2,6.5416,101.4543,10.0725,-0.0816,1.7066,0.9578
3,1.5336,4.3034,2.0745,0.0098,0.6221,0.9607
4,3.0677,14.5783,3.8182,-0.0878,0.9104,1.0926
5,2.5274,11.9308,3.4541,-0.133,1.0333,0.9164
6,4.4206,40.1801,6.3388,-0.7453,0.9178,1.3725
7,3.0269,12.6854,3.5617,0.0267,0.994,1.1065
8,3.2493,20.8251,4.5634,-0.3274,0.8629,0.9812
9,4.908,73.5205,8.5744,-0.0548,1.3599,1.4259


In [22]:
# Mark the tuned model as final and save it under ../models/GBPUSD.
# `model` is rebound here, so from this point it no longer refers to the
# untuned estimator returned by create_model().
# presumably finalize_model refits on the full dataset - confirm against the PyCaret docs
model = finalize_model(tuned_model)
save_model(model, "../models/GBPUSD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  LassoLars(alpha=1.0, copy_X=True, eps=1e-05,
             