In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    boundaries=(33, 66),
    divider_path=None,
):
    """Replace the values of ``column`` with ordinal bin labels, in place.

    Rows are visited in the order given by ``sorted_index``. Every visited row
    receives the label ``prefix * 10 + sub``, where ``sub`` starts at 1 and is
    incremented each time the running rank reaches one of ``boundaries``.
    With the default boundaries the first 33 rows get ``sub == 1``, the next
    33 get ``sub == 2`` and every remaining row gets ``sub == 3``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is mutated in place and also returned.
    sorted_index : iterable of int
        Row *positions* (0-based, as produced by ``np.argsort``) in the order
        the rows should be ranked. Positions are applied positionally, so the
        frame's index labels do not have to be 0..n-1.
    prefix : int
        Added to every label as ``prefix * 10``.
    column : str
        Name of the column whose values are replaced.
    boundaries : iterable of int, optional
        0-based ranks at which the label steps up by one. Repeated ranks count
        once. Defaults to ``(33, 66)``.
    divider_path : str or path-like, optional
        When given, one row ``["USDCHF", column, <divider>, ...]`` is appended
        to this CSV file, where each divider is the original column value of
        the row at which the label changed. Nothing is written when ``None``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``column`` overwritten.
    """
    # argsort yields positions, not index labels, so address cells by position.
    column_position = data.columns.get_loc(column)
    step_ranks = frozenset(boundaries)

    sub = 1

    # Original values at which the label changed, after two identifying fields.
    divider_list = ["USDCHF", column]

    for rank, row_position in enumerate(sorted_index):
        if rank in step_ranks:
            sub += 1
            # Read the raw value before it is overwritten just below.
            divider_list.append(data.iat[row_position, column_position])

        data.iat[row_position, column_position] = (prefix * 10) + sub

    if divider_path is not None:
        # Append mode: successive calls accumulate one row each in the file.
        with open(divider_path, "a", newline="") as csvfile:
            csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
# Load the raw USDCHF results.
raw_data = pd.read_csv("../results/USDCHF.csv")

# Preprocessing, written as one chain: every step returns a new frame, so
# re-running this cell always starts from the raw file and nothing is modified
# in place on a filtered slice.
data = (
    raw_data
    # Skip the first row since it is an outlier - return is 0
    .iloc[1:]
    # Delete any rows with null values
    .dropna()
    # Delete any rows where either vol or fd is 0
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Renumber the rows 0..n-1 so index labels match row positions again
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,0.9365,0.9358,0.936178,0.0007,722857.142857,-1.044095
1,2022-12-02 02:50:06,3,0.9361,0.93483,0.935454,0.00127,600000.0,-7.738212
2,2022-12-02 02:57:32,4,0.9359,0.935075,0.935412,0.000825,444848.484849,-0.444419
3,2022-12-02 03:05:09,5,0.93653,0.9352,0.935636,0.00133,461654.135338,2.390416
4,2022-12-02 03:12:50,6,0.9362,0.935495,0.935781,0.000705,499290.780142,1.551614


In [4]:
# vol and fd are each ranked on their own, as that worked the best.
# Snapshot the raw volatility values before the column is overwritten.
volatility_array = np.array(data["vol"])
# argsort gives the row positions ordered from the smallest to the largest value
sorted_index = volatility_array.argsort()
# Visit the rows in that order and replace each volatility with its bin label
data = substitute_values(data, sorted_index, 0, "vol")

# Repeat the same steps for Fractal Dimension
fd_array = np.array(data["fd"])
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, 0, "fd")

data.head(5)

Unnamed: 0,inserttime,period,max,min,mean,vol,fd,return_val
0,2022-12-02 02:42:25,2,0.9365,0.9358,0.936178,2.0,2.0,-1.044095
1,2022-12-02 02:50:06,3,0.9361,0.93483,0.935454,3.0,2.0,-7.738212
2,2022-12-02 02:57:32,4,0.9359,0.935075,0.935412,2.0,1.0,-0.444419
3,2022-12-02 03:05:09,5,0.93653,0.9352,0.935636,3.0,1.0,2.390416
4,2022-12-02 03:12:50,6,0.9362,0.935495,0.935781,2.0,2.0,1.551614


In [5]:
# Using PyCaret functions to set up the regression experiment.
# session_id pins the random seed so the results are reproducible on a fresh
# kernel; 217 is the seed reported in the setup summary of the recorded run.
regression = setup(data=data, target="return_val", session_id=217)

Unnamed: 0,Description,Value
0,session_id,217
1,Target,return_val
2,Original Data,"(99, 8)"
3,Missing Values,False
4,Numeric Features,6
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(69, 25)"


In [6]:
# Train the available regression model types and rank them by their metrics;
# `best` holds the top-ranked estimator, and the bare name on the last line
# displays it under the comparison table.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
llar,Lasso Least Angle Regression,4.642,74.6669,7.2421,-0.1493,1.0914,1.2311,0.005
dummy,Dummy Regressor,4.642,74.6669,7.2421,-0.1493,1.0914,1.2311,0.006
lasso,Lasso Regression,4.721,75.8916,7.2941,-0.1611,1.1885,1.1706,0.224
huber,Huber Regressor,4.6779,65.7908,6.89,-0.2157,0.844,2.2521,0.008
en,Elastic Net,4.728,74.59,7.3252,-0.2196,1.0801,1.2446,0.005
br,Bayesian Ridge,4.9689,79.5439,7.599,-0.4148,0.9977,1.3504,0.005
ridge,Ridge Regression,4.9747,66.4357,7.2733,-0.5844,0.838,2.3127,0.004
lightgbm,Light Gradient Boosting Machine,5.2005,81.7615,7.8859,-0.6054,0.9542,1.5002,0.011
omp,Orthogonal Matching Pursuit,4.8884,69.9027,7.4984,-0.6118,1.1792,1.4888,0.006
par,Passive Aggressive Regressor,5.0806,71.1319,7.4513,-0.6136,0.7175,2.1779,0.005


LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,
          fit_path=True, jitter=None, max_iter=500, normalize=True,
          positive=False, precompute='auto', random_state=217, verbose=False)

In [13]:
# Create the model to carry forward. "huber" is picked by hand here rather than
# taken from `best`. In the recorded comparison the top-ranked entry scored the
# same as the dummy regressor, while Huber had the lowest MSE and RMSE of the
# models listed.
# NOTE(review): presumably that is why it was chosen - confirm the criterion.
model = create_model("huber")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.436,7.6993,2.7748,0.3572,0.7405,1.3486
1,4.3714,35.6259,5.9687,-2.3935,0.8006,4.0786
2,2.7576,12.5144,3.5376,0.2111,0.4209,1.2735
3,2.3466,7.0309,2.6516,0.5051,0.8532,1.5272
4,8.0794,198.4845,14.0885,0.0468,1.1541,2.0039
5,3.4687,18.1972,4.2658,-0.7705,0.9944,2.9798
6,5.9237,69.7684,8.3527,-0.7112,0.9269,1.3439
7,6.5458,52.2954,7.2316,0.143,0.8078,5.7685
8,7.1641,233.8611,15.2925,0.1273,0.9108,1.1375
9,3.686,22.4306,4.7361,0.3272,0.8305,1.0596


In [14]:
# Tune the Huber model created above to try to get a better fit; the result is
# stored under its own name, `tuned_model`.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.8451,10.5812,3.2529,0.1166,0.9743,1.0876
1,2.8068,18.1717,4.2628,-0.7309,0.6379,1.7709
2,2.628,9.6441,3.1055,0.392,0.5839,1.0685
3,2.5076,9.9714,3.1578,0.2982,0.9578,0.7736
4,8.5034,232.6412,15.2526,-0.1172,1.5364,1.7498
5,2.8982,12.8384,3.5831,-0.2491,0.9935,2.3105
6,5.4471,48.0324,6.9305,-0.1781,1.2868,1.0298
7,6.8412,63.7283,7.983,-0.0444,1.1152,4.4646
8,7.8347,277.2583,16.6511,-0.0347,1.2709,0.9016
9,3.149,22.036,4.6943,0.3391,0.9616,0.8108


In [15]:
# Mark the tuned model as final and save it under ../models/USDCHF.
# NOTE: this rebinds `model`, which until now held the untuned Huber model, so
# re-running the tuning cell after this one would tune the finalized model.
model = finalize_model(tuned_model)
save_model(model, "../models/USDCHF")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=['period'],
                                       ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numer...
                 ('dummy', Dummify(target='return_val')),
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
            