In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(data, sorted_index, prefix, column, boundaries=(33, 66)):
    """Overwrite the values of ``column`` with ordinal bin labels, in place.

    Rows are visited in the order given by ``sorted_index``. A row's rank in
    that order decides its bin: ranks below the first boundary get bin 1, the
    bin number then increases by one at every boundary rank. The value written
    to the row is ``prefix * 10 + bin``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place and also returned.
    sorted_index : sequence of int
        Row *positions* (0-based, e.g. the output of ``np.argsort``) listed in
        the order in which the rows should be ranked. Positions are used
        rather than index labels, so the frame does not need a fresh
        ``RangeIndex`` for the right rows to be updated.
    prefix : int
        Offset multiplied by 10 and added to every bin number.
    column : str
        Label of the column to overwrite; it must identify a single column.
    boundaries : iterable of int, optional
        Ranks at which the bin number increases by one. The default
        ``(33, 66)`` yields three bins: ranks 0-32, 33-65 and 66 onwards.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Notes
    -----
    The original values found at each boundary used to be collected for an
    export to ``divider_list.csv``. That export was disabled, so the values
    are no longer collected.
    """
    # Materialise once so a generator argument can be scanned for every row.
    cut_points = tuple(boundaries)
    column_position = data.columns.get_loc(column)

    for rank, row_position in enumerate(sorted_index):
        # Bin 1 plus one step for every boundary this rank has reached.
        bin_number = 1 + sum(rank >= cut for cut in cut_points)
        # .iat is positional, matching what np.argsort returns; label-based
        # .at would hit the wrong rows (or append new ones) on any frame
        # whose index is not exactly 0..n-1.
        data.iat[row_position, column_position] = (prefix * 10) + bin_number

    return data

In [3]:
# Load the USDHKD table produced by the earlier pipeline stage
data = pd.read_csv("../results/USDHKD.csv")

# Preprocessing: every step returns a new frame, so nothing is mutated in place
# and no filtered slice is ever assigned into.
data = (
    data
    # The first row is an outlier (its return is 0), so drop it
    .drop(data.index[0])
    # Discard rows that contain any null value
    .dropna()
    # Discard rows whose volatility or fractal dimension is exactly 0
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Renumber the rows so the index runs 0..n-1 again after the removals
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,7.78115,7.78005,7.780652,0.0011,1662727.0,-0.526521
1,2022-12-02 02:50:04,3,7.780135,7.778345,7.779159,0.00179,326257.0,-1.918902
2,2022-12-02 02:57:32,4,7.78065,7.77828,7.779545,0.00237,142194.1,0.496553
3,2022-12-02 03:05:08,5,7.78055,7.779345,7.779933,0.001205,162655.6,0.49893
4,2022-12-02 03:12:49,6,7.77975,7.77905,7.779465,0.0007,644285.7,-0.602043


In [4]:
# "vol" and "fd" are ranked independently of each other, as that worked best.

# Volatility: copy the raw values out of the frame, then let argsort give the
# row positions ordered from the smallest value to the largest.
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
# Walk the rows in that order and overwrite each value with its bin label
data = substitute_values(data, sorted_index, 0, "vol")

# Fractal dimension: same procedure on the "fd" column
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, 0, "fd")

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,7.78115,7.78005,7.780652,2.0,3.0,-0.526521
1,2022-12-02 02:50:04,3,7.780135,7.778345,7.779159,3.0,1.0,-1.918902
2,2022-12-02 02:57:32,4,7.78065,7.77828,7.779545,3.0,1.0,0.496553
3,2022-12-02 03:05:08,5,7.78055,7.779345,7.779933,2.0,1.0,0.49893
4,2022-12-02 03:12:49,6,7.77975,7.77905,7.779465,1.0,2.0,-0.602043


In [5]:
# Set up the PyCaret regression experiment with `return_val` as the target.
# session_id fixes the seed PyCaret uses for the experiment, so a fresh
# Restart & Run All does not draw a new random id each time. 5772 is the id
# PyCaret picked at random in the recorded run of this notebook.
regression = setup(data=data, target="return_val", session_id=5772)

Unnamed: 0,Description,Value
0,session_id,5772
1,Target,return_val
2,Original Data,"(98, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(68, 25)"


In [6]:
# Train the candidate regressors and compare their cross-validated metrics;
# the estimator returned by compare_models() is kept as `best`.
# NOTE(review): in the recorded run every listed model, the Dummy Regressor
# included, has a negative R2 - confirm the features carry signal before
# relying on the selected model.
best = compare_models()
# Display the selected estimator and its hyperparameters
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,0.7044,1.3466,0.9744,-0.2403,0.3795,1.6726,0.017
llar,Lasso Least Angle Regression,0.7708,1.3716,0.9912,-0.3028,0.5382,1.0481,0.418
lasso,Lasso Regression,0.7708,1.3716,0.9912,-0.3028,0.5382,1.0481,0.222
dummy,Dummy Regressor,0.7708,1.3716,0.9912,-0.3028,0.5382,1.0481,0.006
en,Elastic Net,0.7708,1.3716,0.9912,-0.3028,0.5382,1.0481,0.005
br,Bayesian Ridge,0.7706,1.3727,0.9915,-0.303,0.5383,1.0449,0.005
lightgbm,Light Gradient Boosting Machine,0.7763,1.4045,1.0334,-0.6557,0.3957,2.2862,0.011
ridge,Ridge Regression,0.8008,1.6009,1.1014,-0.9427,0.4232,1.8432,0.005
omp,Orthogonal Matching Pursuit,0.8178,1.5001,1.0914,-1.0414,0.4357,2.3301,0.221
rf,Random Forest Regressor,0.8278,1.6693,1.1258,-1.2073,0.4362,1.8163,0.085


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                    weights='uniform')

In [7]:
# Build a model from the estimator selected by compare_models(); the output
# table lists its metrics for each cross-validation fold.
model = create_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.4188,0.2637,0.5135,-0.159,0.2945,1.9549
1,0.5932,0.7169,0.8467,-0.5485,0.3366,1.3593
2,0.1571,0.0428,0.2068,0.2177,0.1093,1.4037
3,1.0805,1.6377,1.2797,-0.0053,0.4018,5.083
4,0.3768,0.2685,0.5182,-0.5425,0.2838,0.9353
5,0.4057,0.2209,0.47,-0.8979,0.2094,1.5263
6,1.4156,5.7394,2.3957,-0.1367,0.536,0.9306
7,0.7389,1.0371,1.0184,-0.3998,0.4012,1.7144
8,1.2436,2.9224,1.7095,-0.1939,0.7965,0.7703
9,0.6137,0.6163,0.785,0.2628,0.4259,1.048


In [8]:
# Tune the model's hyperparameters to try to get a better fit.
# NOTE(review): in the recorded run the tuned folds still have mostly negative
# R2 and are not clearly better than the untuned ones - confirm tuning helps.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.3675,0.2189,0.4678,0.0381,0.3185,1.0839
1,0.4696,0.4571,0.6761,0.0126,0.4186,0.8995
2,0.2287,0.085,0.2915,-0.5551,0.2338,0.8512
3,1.1107,1.6536,1.2859,-0.0151,0.718,1.5866
4,0.4413,0.2661,0.5159,-0.529,0.3518,1.082
5,0.306,0.1275,0.3571,-0.0953,0.2422,0.9727
6,1.6316,5.4357,2.3315,-0.0766,0.9188,1.0737
7,0.8392,1.4893,1.2204,-1.0102,0.5752,0.9443
8,1.3381,3.1457,1.7736,-0.2852,0.756,0.9526
9,0.7465,0.8614,0.9281,-0.0305,0.5519,1.4175


In [9]:
# Mark the tuned model as final and save it (transformation pipeline plus
# model) under ../models/USDHKD.
# NOTE: this rebinds `model`, which until here held the untuned estimator
# returned by create_model(); re-running earlier cells afterwards will see the
# finalized object instead.
model = finalize_model(tuned_model)
save_model(model, "../models/USDHKD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
     