In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(data, sorted_index, prefix, column, breakpoints=(33, 66)):
    """Replace the values of one column with rank-bucket labels, in place.

    Rows are visited in the order given by ``sorted_index``. Each visited row
    receives the label ``prefix * 10 + bucket``, where ``bucket`` starts at 1
    and increases by one every time the row's rank (its position within
    ``sorted_index``) reaches one of ``breakpoints``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place and also returned.
    sorted_index : sequence of int
        0-based row positions (for example the result of ``np.argsort`` on
        the column) in the order in which the rows should be ranked.
    prefix : int
        Multiplied by 10 and added to the bucket number to form the label.
    column : str
        Name of the column whose values are overwritten.
    breakpoints : iterable of int, optional
        Ranks at which the bucket number goes up by one. The default
        ``(33, 66)`` reproduces the cut points this function always used.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    thresholds = tuple(breakpoints)

    # sorted_index contains row positions, so the write must be positional
    # (iat). A label-based write (at) would hit the wrong row, or append a
    # new one, whenever the frame's index is not exactly 0..n-1.
    column_position = data.columns.get_loc(column)

    for rank, row_position in enumerate(sorted_index):
        # Bucket = 1 + number of breakpoints this rank has already reached.
        bucket = 1 + sum(rank >= threshold for threshold in thresholds)
        data.iat[int(row_position), column_position] = (prefix * 10) + bucket

    return data

In [3]:
# Load and preprocess the USDAUD results in one chain. Every step returns a
# new frame, so re-running the cell always starts again from the file and no
# column is ever assigned on a filtered slice.
data = (
    pd.read_csv("../results/USDAUD.csv")
    # Remove the first row since it is an outlier - return is 0
    .iloc[1:]
    # Delete any rows with null values
    .dropna()
    # Delete any rows where either vol or fd is 0
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Reset the index so the rows are numbered 0..n-1 again
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,1.466706,1.46455,1.46584,0.002156,303810.385213,-5.79555
1,2022-12-02 02:50:07,3,1.465631,1.463985,1.464539,0.001646,306178.239475,-8.871618
2,2022-12-02 02:57:32,4,1.465846,1.46455,1.465032,0.001296,425170.724179,3.364743
3,2022-12-02 03:05:09,5,1.46845,1.4654,1.467009,0.00305,238360.655738,13.492343
4,2022-12-02 03:12:50,6,1.468695,1.467495,1.468151,0.0012,259166.666667,7.78231


In [4]:
# Each feature is ranked on its own (this worked best) and its raw values are
# then overwritten with the labels produced by substitute_values.

# Volatility: argsort gives the row positions ordered from the smallest to
# the largest value.
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
data = substitute_values(
    data=data, sorted_index=sorted_index, prefix=0, column="vol"
)

# Fractal dimension: same treatment.
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(
    data=data, sorted_index=sorted_index, prefix=0, column="fd"
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,1.466706,1.46455,1.46584,3.0,1.0,-5.79555
1,2022-12-02 02:50:07,3,1.465631,1.463985,1.464539,2.0,1.0,-8.871618
2,2022-12-02 02:57:32,4,1.465846,1.46455,1.465032,2.0,2.0,3.364743
3,2022-12-02 03:05:09,5,1.46845,1.4654,1.467009,3.0,1.0,13.492343
4,2022-12-02 03:12:50,6,1.468695,1.467495,1.468151,1.0,1.0,7.78231


In [5]:
# Set up the PyCaret regression experiment with return_val as the target.
# session_id pins the random seed so the results can be reproduced on a
# re-run; 8481 is the seed PyCaret drew at random for the recorded run.
regression = setup(data=data, target="return_val", session_id=8481)

Unnamed: 0,Description,Value
0,session_id,8481
1,Target,return_val
2,Original Data,"(97, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(67, 25)"


In [6]:
# Train and score the candidate regressors with PyCaret's compare_models and
# keep the returned estimator in `best`; the bare `best` line displays it.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,4.527,59.9662,6.3695,-0.4617,1.263,1.4102,0.235
llar,Lasso Least Angle Regression,4.527,59.9662,6.3695,-0.4617,1.263,1.4102,0.005
dummy,Dummy Regressor,4.527,59.9662,6.3695,-0.4617,1.263,1.4102,0.005
br,Bayesian Ridge,4.5545,62.2083,6.4283,-0.4694,1.2557,1.4153,0.005
en,Elastic Net,4.5299,60.7837,6.4047,-0.4703,1.241,1.4188,0.005
knn,K Neighbors Regressor,4.8072,72.3207,6.7651,-0.8005,0.8882,3.2224,0.017
huber,Huber Regressor,4.9719,61.2056,6.6153,-0.9581,0.9497,3.4568,0.006
ridge,Ridge Regression,5.1206,70.0678,6.9344,-0.9836,0.984,4.2531,0.006
lightgbm,Light Gradient Boosting Machine,4.9492,69.2676,6.9853,-0.9849,1.0135,2.4057,0.006
omp,Orthogonal Matching Pursuit,4.9953,71.5381,7.0368,-1.0413,1.1511,2.2499,0.004


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=8481,
      selection='cyclic', tol=0.0001, warm_start=False)

In [16]:
# Create a Bayesian Ridge model. The "br" ID is hard-coded here; it is not
# taken from `best`.
# NOTE(review): Lasso, not Bayesian Ridge, is the top row of the recorded
# comparison table - confirm "br" is the intended choice.
model = create_model("br")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.1695,40.1152,6.3337,-0.0424,1.5695,1.0351
1,3.0646,15.8973,3.9871,-0.0195,1.0647,4.4227
2,1.6986,4.8355,2.199,-0.1167,0.8011,1.0189
3,3.7301,16.2141,4.0267,-0.1368,1.1019,1.1356
4,10.9796,374.7885,19.3595,-0.2811,1.3996,1.2041
5,4.76,31.8862,5.6468,-2.4553,1.5492,1.0499
6,3.7189,39.0024,6.2452,-0.222,1.3656,1.0466
7,5.1267,31.6668,5.6273,-0.1039,1.3846,0.9699
8,2.8154,11.1567,3.3402,-0.8331,0.8097,1.2456
9,4.4815,56.5198,7.518,-0.483,1.5111,1.0246


In [17]:
# Run PyCaret's tune_model on the model to try to get a better fit; the
# result is kept separately in `tuned_model`, leaving `model` unchanged.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.2009,40.7252,6.3816,-0.0583,1.7465,1.001
1,3.0082,15.61,3.9509,-0.0011,1.3452,0.9185
2,1.7292,5.3072,2.3037,-0.2256,1.0004,1.0003
3,3.5206,15.0883,3.8844,-0.0579,1.4743,1.0042
4,10.5427,352.7424,18.7814,-0.2057,1.5031,0.9586
5,4.6255,30.624,5.5339,-2.3185,1.6669,1.0013
6,3.7529,39.4294,6.2793,-0.2354,1.4266,1.001
7,5.1267,30.1922,5.4947,-0.0525,1.7755,0.9991
8,2.4512,8.9957,2.9993,-0.4781,1.1981,1.0063
9,4.4654,56.3135,7.5042,-0.4776,1.5302,1.0009


In [18]:
# Mark the tuned model as final and save it. finalize_model must receive
# `tuned_model`; passing the untuned `model` would silently discard the
# tuning step.
model = finalize_model(tuned_model)
save_model(model, "../models/USDAUD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  BayesianRidge(alpha