In [10]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [11]:
def substitute_values(data, sorted_index, prefix, column, boundaries=(33, 66)):
    """Overwrite ``column`` with rank-bucket labels, modifying ``data`` in place.

    Rows are visited in the order given by ``sorted_index``; a row's rank is
    its place in that order. Each row receives ``prefix * 10 + bucket``, where
    ``bucket`` is 1 plus the number of ``boundaries`` the rank has reached.
    With the default boundaries, ranks 0-32 get bucket 1, ranks 33-65 get
    bucket 2 and every later rank gets bucket 3.

    Args:
        data: DataFrame containing ``column``. It is mutated in place.
        sorted_index: Iterable of 0-based row positions in ranking order (the
            notebook passes the result of ``np.argsort``). Positions are used
            positionally, so the frame does not need a 0..n-1 index.
        prefix: Number multiplied by 10 and added to the bucket number.
        column: Name of the column whose values are replaced.
        boundaries: Ranks at which the bucket number increases by one.
            Defaults to ``(33, 66)``, the cut points that used to be
            hard-coded in the loop.

    Returns:
        The same ``data`` object, with ``column`` holding the labels.

    Raises:
        KeyError: If ``column`` is not a column of ``data``.
    """
    cut_points = tuple(boundaries)
    # argsort yields positions, not index labels, so address cells by position.
    column_position = data.columns.get_loc(column)

    # This list stores the values at which the labels were changed. It is only
    # consumed by the optional CSV export that is commented out below.
    divider_list = ["USDNZD", column]

    for rank, row_position in enumerate(sorted_index):
        if rank in cut_points:
            # Record the original value before it is overwritten.
            divider_list.append(data.iat[row_position, column_position])

        bucket = 1 + sum(rank >= cut for cut in cut_points)
        data.iat[row_position, column_position] = (prefix * 10) + bucket

    # with open("divider_list.csv", "a", newline="") as csvfile:
    #     writer = csv.writer(csvfile)
    #     writer.writerow(divider_list)

    return data

In [12]:
# Preprocessing
#
# Every step returns a new frame instead of mutating in place, so this cell
# always rebuilds `data` from the CSV on disk when it is re-run.
data = (
    pd.read_csv("../results/USDNZD.csv")
    # Removing the first row since it is an outlier - return is 0
    .pipe(lambda frame: frame.drop(frame.index[0]))
    # Delete any rows with null values
    .dropna()
    # Delete any rows where either Vol or FD is 0
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Resetting the indexes in the dataframe so row labels run 0..n-1 again
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:24,2,1.564595,1.562665,1.563645,0.00193,165803.108808,-8.058157
1,2022-12-02 02:50:05,3,1.563295,1.561465,1.562178,0.00183,118579.234973,-9.38663
2,2022-12-02 02:57:32,4,1.563585,1.562365,1.562896,0.00122,174590.163934,4.599151
3,2022-12-02 03:05:09,5,1.565935,1.563275,1.564799,0.00266,212781.954887,12.176223
4,2022-12-02 03:12:50,6,1.566115,1.564865,1.565463,0.00125,165600.0,4.241603


In [13]:
# Each feature is ranked on its own, since sorting them independently worked best.
# Copy the volatility values out of the frame, then argsort the copy: the result
# lists row positions ordered from the smallest value to the largest.
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
# Walk the rows in that order and overwrite "vol" with its bucket label
data = substitute_values(data, sorted_index, prefix=0, column="vol")

# Same procedure for the Fractal Dimension column
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, prefix=0, column="fd")

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:24,2,1.564595,1.562665,1.563645,3.0,2.0,-8.058157
1,2022-12-02 02:50:05,3,1.563295,1.561465,1.562178,2.0,1.0,-9.38663
2,2022-12-02 02:57:32,4,1.563585,1.562365,1.562896,2.0,2.0,4.599151
3,2022-12-02 03:05:09,5,1.565935,1.563275,1.564799,3.0,2.0,12.176223
4,2022-12-02 03:12:50,6,1.566115,1.564865,1.565463,2.0,2.0,4.241603


In [14]:
# Using PyCaret functions to setup the regression model trainer.
# session_id pins the experiment's random seed so a fresh run is repeatable;
# 2136 is the seed PyCaret drew at random for the run recorded in this notebook.
regression = setup(data=data, target="return_val", session_id=2136)

Unnamed: 0,Description,Value
0,session_id,2136
1,Target,return_val
2,Original Data,"(99, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(69, 24)"


In [15]:
# Compare the best models after training on different types
# NOTE(review): in the recorded output the top-ranked Lasso has exactly the same
# metrics as the Dummy Regressor, and every listed model has a negative R2, so
# no listed model outperforms the dummy baseline on this data.
best = compare_models()
# Bare expression so the notebook displays the selected estimator
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,4.6649,62.4686,6.6227,-0.1884,1.2007,1.2166,0.218
llar,Lasso Least Angle Regression,4.6649,62.4686,6.6227,-0.1884,1.2007,1.2166,0.411
dummy,Dummy Regressor,4.6649,62.4686,6.6227,-0.1884,1.2007,1.2166,0.006
br,Bayesian Ridge,4.6949,64.1058,6.6685,-0.1939,1.1781,1.249,0.005
en,Elastic Net,4.6892,63.2601,6.6612,-0.2014,1.1875,1.2383,0.005
lightgbm,Light Gradient Boosting Machine,4.9551,69.4805,7.1164,-0.4682,0.9121,1.379,0.009
knn,K Neighbors Regressor,4.98,76.2638,7.3846,-0.6129,0.9103,1.7211,0.017
ridge,Ridge Regression,5.647,84.8489,7.7834,-0.9031,0.9511,2.2519,0.005
rf,Random Forest Regressor,5.4664,97.398,8.2548,-1.0439,0.8699,1.9591,0.086
omp,Orthogonal Matching Pursuit,5.764,97.9574,8.2908,-1.3591,1.0783,1.6851,0.217


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=2136,
      selection='cyclic', tol=0.0001, warm_start=False)

In [33]:
# Choose the best model technique and create a model
# `best` is the estimator returned by compare_models(); the recorded output
# below this cell lists its scores for folds 0-9.
model = create_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.6599,53.1176,7.2882,-0.1593,1.5906,0.9358
1,9.6852,313.6906,17.7113,-0.0337,1.5924,0.5914
2,5.91,60.8635,7.8015,-0.4435,1.2064,2.5769
3,2.6861,9.6606,3.1082,-0.2042,0.7697,1.6776
4,3.18,21.173,4.6014,-0.2771,0.908,0.7348
5,2.7984,12.9616,3.6002,-0.0719,1.0325,1.237
6,6.0306,62.3699,7.8975,-0.2927,1.6976,1.034
7,5.9312,74.8312,8.6505,-0.2648,1.6615,1.0638
8,2.6082,10.8379,3.2921,-0.0067,0.9226,1.0798
9,2.1592,5.18,2.276,-0.13,0.626,1.2345


In [34]:
# Run model tune to get a better fit
# NOTE(review): every fold in the recorded output reports MAPE = 1.0, which
# may mean the tuned model predicts (near) zero for every row; confirm
# before relying on it.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5.7828,54.7494,7.3993,-0.1949,1.8132,1.0
1,10.2435,308.8105,17.573,-0.0176,2.0856,1.0
2,5.4816,53.2181,7.2951,-0.2621,1.7658,1.0
3,2.3901,8.3696,2.893,-0.0433,1.2115,1.0
4,3.2912,18.4424,4.2945,-0.1124,1.4032,1.0
5,2.9982,14.0486,3.7481,-0.1618,1.358,1.0
6,6.0563,63.7521,7.9845,-0.3213,1.8501,1.0
7,5.9539,76.1176,8.7245,-0.2866,1.7899,1.0
8,2.6709,11.4094,3.3778,-0.0598,1.2783,1.0
9,1.9498,4.6046,2.1458,-0.0045,1.0791,1.0


In [35]:
# Mark the tuned model as final and save it
# Note: this rebinds `model`, which previously held the untuned estimator from
# create_model, so re-running the tuning cell afterwards would tune this object.
model = finalize_model(tuned_model)
# Last expression of the cell, so save_model's return value is displayed
save_model(model, "../models/USDNZD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
                  Lasso(alpha=6.17, c