In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    breakpoints=(33, 66),
    currency="EURUSD",
    divider_path="../divider_list.csv",
):
    """Replace the values of ``column`` with rank-bucket labels, in place.

    Rows are visited in the order given by ``sorted_index``. Each visited row
    receives the label ``prefix * 10 + bucket``; ``bucket`` starts at 1 and is
    incremented every time the running rank equals one of ``breakpoints``.
    The original value found at each breakpoint is recorded and, together with
    ``currency`` and ``column``, appended as one row to ``divider_path``.

    Args:
        data: DataFrame to relabel. It is modified in place.
        sorted_index: Row labels of ``data`` in the desired visiting order
            (ascending by ``column`` in the notebook). Entries are used with
            ``DataFrame.at``, i.e. as index *labels*; positions coming from
            ``np.argsort`` are therefore only valid when ``data`` has a
            default ``0..n-1`` index (e.g. right after ``reset_index``).
        prefix: Integer placed in the tens digit of every label.
        column: Name of the column to relabel.
        breakpoints: Ranks (0-based positions within ``sorted_index``) at
            which a new bucket starts. These are absolute ranks, not
            percentiles: with the default ``(33, 66)`` the first 33 rows get
            bucket 1, the next 33 get bucket 2 and every remaining row gets
            bucket 3. For equal thirds of ``n`` rows pass
            ``(n // 3, 2 * n // 3)``. A breakpoint beyond the number of rows
            is never reached and contributes no divider value.
        currency: Tag written in the first field of the divider row.
        divider_path: CSV file the divider row is appended to.

    Returns:
        The same ``data`` object that was passed in.
    """
    cut_ranks = frozenset(breakpoints)

    # First two fields identify the row; the values at which the label
    # changed are appended behind them.
    divider_list = [currency, column]
    bucket = 1

    for rank, row_label in enumerate(sorted_index):
        if rank in cut_ranks:
            bucket += 1
            # Must be read before the cell is overwritten just below.
            divider_list.append(data.at[row_label, column])

        data.at[row_label, column] = (prefix * 10) + bucket

    with open(divider_path, "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
# Factor applied to return_val so the returns are on a usable numeric scale
RETURN_SCALE = 100000

# Start a fresh divider file containing only the header row;
# substitute_values() appends one row per relabelled column to it.
with open("../divider_list.csv", "w", newline="") as csvfile:
    csv.writer(csvfile).writerow(["Currency", "Column", "33rd", "66th"])

raw_data = pd.read_csv("../data/EURUSD.csv")

# Preprocessing: every step returns a new frame, so raw_data stays untouched
# and re-running the cell always starts from the file contents.
data = (
    raw_data
    # Remove the first row since it is an outlier - its return is 0
    .drop(raw_data.index[0])
    # Delete any rows with null values (checked across all loaded columns)
    .dropna()
    # Delete any rows with 0 Vol or 0 FD
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by RETURN_SCALE (100000)
    .assign(return_val=lambda frame: frame["return_val"] * RETURN_SCALE)
    # Reset the index to 0..n-1
    .reset_index(drop=True)
    # Select only the required columns - mean, vol, fd, return
    .loc[:, ["mean", "vol", "fd", "return_val"]]
    # Own the data so later cell-wise writes do not target a slice
    .copy()
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.053042,0.000522,1223443.0,-28.699135
1,1.053346,0.000778,1001963.0,28.893472
2,1.05366,0.000569,792001.4,29.833513
3,1.053411,0.000911,717636.4,-23.653078
4,1.053035,0.000456,809520.8,-35.696425


In [4]:
# Each feature is ranked on its own (independent sorting worked best) and its
# raw values are then replaced by the bucket labels from substitute_values().

# Volatility: np.argsort yields the row positions that order the values ascending
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = np.argsort(volatility_array)
data = substitute_values(
    data=data, sorted_index=sorted_index, prefix=0, column="vol"
)

# Fractal Dimension: same procedure
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = np.argsort(fd_array)
data = substitute_values(
    data=data, sorted_index=sorted_index, prefix=0, column="fd"
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.053042,2.0,3.0,-28.699135
1,1.053346,3.0,2.0,28.893472
2,1.05366,3.0,2.0,29.833513
3,1.053411,3.0,2.0,-23.653078
4,1.053035,2.0,2.0,-35.696425


In [5]:
# Configure the PyCaret regression experiment with return_val as the target.
# session_id fixes the experiment's random seed; 5621 is the id PyCaret drew
# at random in the recorded run, so pinning it makes Restart & Run All
# reproduce the train/test split and cross-validation shown below.
regression = setup(data=data, target="return_val", session_id=5621)

Unnamed: 0,Description,Value
0,session_id,5621
1,Target,return_val
2,Original Data,"(139, 4)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(97, 3)"


In [6]:
# Cross-validate the available regressors and keep the top-ranked one in `best`.
# In the recorded run below the Dummy Regressor (strategy='mean') ranks first
# and every listed model has a negative R2.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dummy,Dummy Regressor,36.8153,4699.46,57.382,-0.1006,1.2746,2.0821,0.006
en,Elastic Net,36.7494,4721.4551,57.635,-0.1274,1.2706,1.5952,0.005
huber,Huber Regressor,36.3916,4778.0178,58.0454,-0.1321,1.9615,1.2734,0.006
llar,Lasso Least Angle Regression,37.1542,4801.6547,58.1317,-0.1367,1.215,1.8089,0.005
lasso,Lasso Regression,37.0348,4797.4725,58.2857,-0.1755,1.391,1.5192,0.21
br,Bayesian Ridge,37.6943,4889.7458,58.8565,-0.1758,1.2652,1.6982,0.005
lr,Linear Regression,37.1171,4827.0902,58.4345,-0.1766,1.1237,1.6381,0.411
lar,Least Angle Regression,37.1171,4827.0917,58.4345,-0.1766,1.1237,1.6381,0.005
ridge,Ridge Regression,36.8695,4785.5312,58.2602,-0.1796,1.2254,1.6304,0.005
omp,Orthogonal Matching Pursuit,37.4151,4797.9662,58.3759,-0.1882,1.4759,1.6646,0.005


DummyRegressor(constant=None, quantile=None, strategy='mean')

In [7]:
# Build an Elastic Net ("en") model. The model id is hard-coded rather than
# taken from `best`: in the recorded comparison the Dummy Regressor ranked
# first and Elastic Net second.
model = create_model("en")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.8483,1182.8912,34.3932,0.1695,1.0903,1.1485
1,26.4763,998.9518,31.6062,0.103,1.4421,1.5375
2,58.8305,5817.7837,76.2744,-0.6541,0.8969,0.9904
3,26.4349,1824.3215,42.7121,-0.2757,1.3224,1.5037
4,31.851,2240.8936,47.3381,-0.0263,1.2981,1.145
5,71.5694,24434.7266,156.3161,-0.0376,1.629,1.2832
6,30.2447,1460.5652,38.2173,-0.0295,1.102,1.0971
7,23.444,999.4082,31.6134,-0.1453,1.2871,2.2426
8,45.471,7141.2739,84.5061,0.0372,1.3411,1.9811
9,25.3235,1113.735,33.3727,-0.415,1.297,3.0231


In [8]:
# Tune the Elastic Net model created above to try to get a better fit;
# the per-fold scores of the tuned model are displayed below.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,30.7746,1494.3002,38.6562,-0.0492,1.6546,1.167
1,27.2998,1127.1171,33.5726,-0.0121,1.6477,1.2742
2,51.8154,4364.2021,66.0621,-0.2408,1.7204,0.8662
3,25.6026,1506.2986,38.8111,-0.0533,1.4349,1.5037
4,33.7128,2618.2581,51.1689,-0.1992,1.8266,1.1453
5,71.5682,25015.6641,158.1634,-0.0622,2.3343,1.1587
6,28.4647,1465.2194,38.2782,-0.0328,1.3601,0.9367
7,23.4098,979.912,31.3035,-0.123,1.561,2.0661
8,44.6897,7586.5981,87.1011,-0.0229,1.7898,1.2797
9,22.5288,899.6753,29.9946,-0.143,1.2865,1.932


In [9]:
# Mark the tuned model as final and save it under the name ../models/EURUSD.
# NOTE(review): this rebinds `model`, which until here held the untuned
# estimator from create_model(); re-running the tuning cell after this one
# would pass the finalized model to tune_model() instead.
model = finalize_model(tuned_model)
save_model(model, "../models/EURUSD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  ElasticNet(alpha=9.94, copy_X=True, fit_intercept=False,
 