In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data, sorted_index, prefix, column, boundaries=(33, 66), divider_path=None
):
    """Replace the values of ``column`` with ordinal bucket labels, in place.

    Rows are visited in the order given by ``sorted_index``. Every visited row
    has its ``column`` value overwritten with ``prefix * 10 + bucket``, where
    ``bucket`` starts at 1 and is incremented each time the visiting rank
    reaches one of ``boundaries``.

    Args:
        data: DataFrame to relabel. It is mutated in place.
        sorted_index: Row *positions* in visiting order, e.g. the result of
            ``np.argsort`` on the column's values.
        prefix: Integer tens-digit prepended to every bucket label.
        column: Name of the column to relabel.
        boundaries: Ranks (0-based positions within ``sorted_index``) at which
            the bucket number increases. The default ``(33, 66)`` keeps the
            original hard-coded cut points.
        divider_path: Optional path of a CSV file. When given, one row
            ``["USDSGD", column, <value at each boundary>...]`` is appended
            to it, recording the original values at which the label changed.

    Returns:
        The same ``data`` object that was passed in.
    """
    # ``sorted_index`` holds positions, not index labels, so cells are
    # addressed positionally. Label-based access only agrees with this when
    # the frame has a freshly reset 0..n-1 index.
    column_position = data.columns.get_loc(column)
    cut_ranks = frozenset(boundaries)

    sub = 1

    # This list stores the values at which the labels were changed
    divider_list = ["USDSGD", column]

    for rank, position in enumerate(sorted_index):
        if rank in cut_ranks:
            sub += 1
            # Read the original value before it is overwritten below
            divider_list.append(data.iat[position, column_position])

        data.iat[position, column_position] = (prefix * 10) + sub

    if divider_path is not None:
        with open(divider_path, "a", newline="") as csvfile:
            csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
# Load the USDSGD results and clean them in a single pass
data = (
    pd.read_csv("../results/USDSGD.csv")
    # The first row is an outlier (its return is 0), so discard it
    .pipe(lambda frame: frame.drop(frame.index[0]))
    # Discard every row that contains a null value
    .dropna()
    # Discard rows whose vol or fd is 0
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Scale return_val by 10,000 so the column has a usable magnitude
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Renumber the surviving rows 0..n-1
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:24,2,1.352095,1.351185,1.351638,0.00091,929670.32967,-4.026274
1,2022-12-02 02:50:05,3,1.3516,1.3509,1.351139,0.0007,720000.0,-3.687591
2,2022-12-02 02:57:32,4,1.3518,1.35115,1.351397,0.00065,729230.769231,1.906111
3,2022-12-02 03:05:09,5,1.352615,1.351475,1.352035,0.00114,504385.964912,4.721448
4,2022-12-02 03:12:50,6,1.352615,1.352135,1.352374,0.00048,600000.0,2.509882


In [4]:
# Each column is ranked on its own (sorting them independently worked best)
volatility_array = data["vol"].to_numpy(copy=True)
# argsort() yields the row positions that would put the values in ascending order
sorted_index = volatility_array.argsort()
# Walk the rows in that order and overwrite vol with its bucket label
data = substitute_values(data=data, sorted_index=sorted_index, prefix=0, column="vol")

# Fractal Dimension gets exactly the same treatment
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data=data, sorted_index=sorted_index, prefix=0, column="fd")

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:24,2,1.352095,1.351185,1.351638,2.0,2.0,-4.026274
1,2022-12-02 02:50:05,3,1.3516,1.3509,1.351139,2.0,2.0,-3.687591
2,2022-12-02 02:57:32,4,1.3518,1.35115,1.351397,1.0,2.0,1.906111
3,2022-12-02 03:05:09,5,1.352615,1.351475,1.352035,3.0,1.0,4.721448
4,2022-12-02 03:12:50,6,1.352615,1.352135,1.352374,1.0,2.0,2.509882


In [16]:
# Using PyCaret functions to setup the regression model trainer.
# session_id pins the experiment's randomness so a re-run repeats the same
# results; 8125 is the seed shown in this cell's recorded output.
regression = setup(data=data, target="return_val", session_id=8125)

Unnamed: 0,Description,Value
0,session_id,8125
1,Target,return_val
2,Original Data,"(97, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(67, 24)"


In [17]:
# Train and cross-validate PyCaret's candidate regressors, keeping the model
# that compare_models() returns.
# NOTE(review): every row of the recorded leaderboard has a negative R2 and the
# Dummy Regressor is close to the top — confirm the features carry any signal.
best = compare_models()
# Bare expression so the notebook displays the selected estimator
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
ada,AdaBoost Regressor,2.2217,8.2105,2.6842,-0.4029,0.6621,1.9709,0.012
par,Passive Aggressive Regressor,2.3419,9.2715,2.8109,-0.4098,0.6916,2.468,0.006
ridge,Ridge Regression,2.3564,8.7148,2.7702,-0.4103,0.7508,2.6258,0.006
omp,Orthogonal Matching Pursuit,2.2677,8.6299,2.7027,-0.4544,0.7641,2.6,0.006
llar,Lasso Least Angle Regression,2.2231,8.313,2.7277,-0.4564,0.8248,1.8751,0.006
dummy,Dummy Regressor,2.2231,8.313,2.7277,-0.4564,0.8248,1.8751,0.006
en,Elastic Net,2.2231,8.313,2.7277,-0.4564,0.8248,1.8751,0.005
lasso,Lasso Regression,2.2231,8.313,2.7277,-0.4564,0.8248,1.8751,0.005
rf,Random Forest Regressor,2.3482,9.0585,2.8261,-0.4932,0.6563,2.3782,0.09
br,Bayesian Ridge,2.2971,8.6349,2.7852,-0.5127,0.8433,1.9951,0.007


AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=8125)

In [18]:
# Build a model from the estimator selected by compare_models(); the recorded
# output lists its metrics per cross-validation fold.
model = create_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.0873,5.1369,2.2665,0.1276,0.546,2.3292
1,2.673,8.2954,2.8802,-0.123,0.8634,0.9303
2,3.0754,14.102,3.7553,-1.0862,0.7867,2.11
3,3.1602,12.7211,3.5667,-0.357,1.0974,1.1829
4,1.59,3.7326,1.932,-2.1554,0.5409,5.696
5,3.245,18.2212,4.2686,0.1096,0.6791,1.3863
6,1.1167,1.8833,1.3723,-0.0536,0.5052,1.9149
7,0.8148,0.9528,0.9761,0.4984,0.243,1.1949
8,2.2159,7.2587,2.6942,0.1038,0.9894,0.9499
9,2.2389,9.8007,3.1306,-1.0936,0.3702,2.0147


In [19]:
# Run PyCaret's tune_model on the created model to look for a better fit;
# the result is kept under a separate name so `model` stays available.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.2415,5.9761,2.4446,-0.015,0.5217,2.8898
1,2.5571,7.4324,2.7262,-0.0061,0.8571,0.9123
2,2.7778,11.7625,3.4297,-0.7401,0.9318,1.7169
3,3.181,12.7566,3.5716,-0.3608,1.0241,1.2021
4,1.388,2.1593,1.4695,-0.8254,0.4376,10.2478
5,3.2943,18.603,4.3131,0.0909,0.7245,1.4674
6,0.9366,1.4542,1.2059,0.1865,0.5116,1.5929
7,1.0385,1.5222,1.2338,0.1987,0.3458,1.1791
8,2.1947,6.9505,2.6364,0.1418,0.9841,0.9472
9,2.3217,9.9719,3.1578,-1.1302,0.3379,1.8074


In [20]:
# Mark the tuned model as final and save it.
# NOTE(review): this rebinds `model` (previously the create_model() result), so
# re-running the tune_model cell after this one would tune the finalized object.
model = finalize_model(tuned_model)
# Persist the finalized pipeline under ../models (relative to the notebook's working directory)
save_model(model, "../models/USDSGD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('dummy', Dummify(target='return_val')),
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
            