In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(data, sorted_index, prefix, column, boundaries=(33, 66)):
    """Replace the values of ``column`` with ordinal bucket labels, in place.

    Rows are visited in the order given by ``sorted_index``. The first rows
    receive the label ``prefix * 10 + 1``; every time the running position
    reaches one of ``boundaries`` the label grows by one. With the default
    ``(33, 66)`` this produces three buckets labelled 1, 2 and 3 (plus the
    prefix offset).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place and also returned.
    sorted_index : iterable
        Row identifiers in the desired visiting order (e.g. the result of
        ``np.argsort`` on the column). Each entry is used as an index *label*
        through ``DataFrame.at``, so positional indices only line up with the
        intended rows when the frame carries a default 0..n-1 index.
    prefix : int
        Offset for the labels: every label is ``prefix * 10 + bucket``.
    column : str
        Name of the column whose values are overwritten.
    boundaries : iterable of int, optional
        Positions in the visiting order at which the bucket number increases.
        Defaults to ``(33, 66)``, the cut points that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``column`` relabelled.
    """
    # Set membership keeps the per-row check cheap for any number of cut points.
    cut_points = frozenset(boundaries)
    bucket = 1

    for position, row_label in enumerate(sorted_index):
        # Crossing a cut point moves every following row into the next bucket.
        if position in cut_points:
            bucket += 1

        data.at[row_label, column] = (prefix * 10) + bucket

    return data

In [3]:
# Load the raw USDCAD results and clean them in one chain; every step returns
# a new frame, so the cell can be re-run safely from the CSV.
data = (
    pd.read_csv("../results/USDCAD.csv")
    # Remove the first row since it is an outlier - its return is 0
    .pipe(lambda frame: frame.drop(frame.index[0]))
    # Delete any rows with null values
    .dropna()
    # Delete any rows where either vol or fd is 0
    .loc[lambda frame: (frame.vol != 0) & (frame.fd != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda frame: frame.return_val * 10000)
    # Reset the index to 0..n-1 after the row removals above
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:23,2,1.34345,1.342305,1.342894,0.001145,474235.80786,-54.340198
1,2022-12-02 02:50:07,3,1.342805,1.342095,1.342329,0.00071,354929.577465,-42.105733
2,2022-12-02 02:57:32,4,1.342955,1.342215,1.34254,0.00074,656756.756757,15.738083
3,2022-12-02 03:05:07,5,1.344845,1.34245,1.343838,0.002395,163674.321503,96.665982
4,2022-12-02 03:12:50,6,1.34515,1.344305,1.344679,0.000845,208284.023669,62.612088


In [4]:
# Each feature is ranked and bucketed on its own, as sorting the values
# independently worked the best.
for feature in ("vol", "fd"):
    # argsort returns the positions that would put the feature in ascending
    # order; substitute_values walks them and replaces the raw values with
    # bucket labels (prefix 0).
    ranking = np.argsort(data[feature].to_numpy())
    data = substitute_values(data, ranking, 0, feature)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:23,2,1.34345,1.342305,1.342894,3.0,2.0,-54.340198
1,2022-12-02 02:50:07,3,1.342805,1.342095,1.342329,1.0,1.0,-42.105733
2,2022-12-02 02:57:32,4,1.342955,1.342215,1.34254,2.0,3.0,15.738083
3,2022-12-02 03:05:07,5,1.344845,1.34245,1.343838,3.0,1.0,96.665982
4,2022-12-02 03:12:50,6,1.34515,1.344305,1.344679,2.0,1.0,62.612088


In [5]:
# Using PyCaret functions to setup the regression model trainer.
# session_id pins PyCaret's random seed so the train/test split and the
# cross-validation results repeat from run to run; 7247 is the session id
# shown in this cell's saved output.
regression = setup(data=data, target="return_val", session_id=7247)

Unnamed: 0,Description,Value
0,session_id,7247
1,Target,return_val
2,Original Data,"(99, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(69, 25)"


In [6]:
# Train and cross-validate PyCaret's candidate regressors and keep the
# top-ranked one in `best`.
best = compare_models()
# Bare expression so the notebook displays the selected estimator.
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
dummy,Dummy Regressor,31.7017,1632.0617,39.114,-0.2329,1.9738,1.1173,0.005
en,Elastic Net,32.1429,1670.3735,39.4777,-0.2713,1.9417,1.1452,0.005
br,Bayesian Ridge,32.4885,1705.6441,39.8658,-0.2872,2.0105,1.1538,0.007
llar,Lasso Least Angle Regression,32.5976,1718.0868,40.1892,-0.3195,1.8858,1.1763,0.005
ada,AdaBoost Regressor,32.8601,1735.7151,40.4168,-0.3823,1.4378,1.5551,0.014
lightgbm,Light Gradient Boosting Machine,33.3321,1793.0998,40.8694,-0.4167,1.8112,1.4787,0.01
par,Passive Aggressive Regressor,33.4428,1815.7311,40.4494,-0.4234,1.4428,1.9424,0.006
lasso,Lasso Regression,33.5961,1799.9815,41.0877,-0.4245,1.7488,1.38,0.236
rf,Random Forest Regressor,33.0232,1719.4942,40.5025,-0.4384,1.4625,1.777,0.09
ridge,Ridge Regression,33.5527,1809.0763,41.2933,-0.4753,1.4396,1.7645,0.005


DummyRegressor(constant=None, quantile=None, strategy='mean')

In [10]:
# Create an Elastic Net ("en") model.
# NOTE(review): the saved comparison table ranks the Dummy Regressor first and
# Elastic Net second, so "en" looks like the best non-baseline candidate rather
# than the overall best - confirm this is the intended choice.
model = create_model("en")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,36.6041,2134.8906,46.2049,0.0505,1.9897,1.1602
1,39.2936,2203.9304,46.946,-0.799,2.8878,0.9282
2,31.7513,2274.2104,47.6887,-0.0686,2.2528,1.2758
3,28.9548,1075.2072,32.7904,-0.0056,2.0502,0.9773
4,24.1205,947.4097,30.78,-0.2711,1.7324,1.1935
5,25.8286,873.4994,29.555,-0.9,1.4803,1.521
6,22.2401,717.6767,26.7895,0.0723,1.6287,1.0773
7,34.7983,1575.1663,39.6884,-0.1952,1.4309,1.0784
8,49.8506,3869.4021,62.2045,-0.6246,1.4926,1.4069
9,27.9872,1032.3419,32.1301,0.0287,2.4715,0.833


In [11]:
# Run PyCaret's hyperparameter tuning on the Elastic Net model to look for a
# better fit; the per-fold metrics of the tuned model are shown as the output.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,36.9432,2247.6855,47.4098,0.0003,2.4523,0.9398
1,39.9953,2303.9915,47.9999,-0.8807,3.471,0.9968
2,32.3209,2407.1133,49.0623,-0.1311,2.6692,1.0656
3,29.1213,1068.8772,32.6937,0.0003,2.228,0.9783
4,24.2005,984.8137,31.3817,-0.3213,2.4927,0.9919
5,22.2484,661.5956,25.7215,-0.4391,1.6568,1.1699
6,22.4005,761.3616,27.5928,0.0158,1.9352,1.0723
7,33.5562,1448.7852,38.0629,-0.0993,2.0549,1.0013
8,45.4363,3245.5559,56.9698,-0.3626,2.1095,1.2064
9,29.1643,1076.4203,32.8088,-0.0128,2.4761,1.0511


In [12]:
# Mark the tuned model as final and save it.
# NOTE: `model` is rebound here, from the untuned estimator to the finalized one.
model = finalize_model(tuned_model)
# Persist the pipeline and model under ../models/USDCAD; the value returned by
# save_model is echoed as the cell output.
save_model(model, "../models/USDCAD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  ElasticNet(alpha=6.