In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    symbol="USDCHF",
    cut_positions=(33, 66),
    divider_path="../divider_list.csv",
):
    """Replace the values of ``column`` with ordinal bucket labels.

    Rows are visited in the order given by ``sorted_index``. Every row gets
    the label ``prefix * 10 + bucket``, where ``bucket`` starts at 1 and is
    incremented each time the running rank reaches one of ``cut_positions``.
    The original value found at each cut is recorded and appended, together
    with ``symbol`` and ``column``, as one row of the CSV at ``divider_path``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place and also returned.
    sorted_index : iterable of int
        Row *positions* (e.g. the result of ``np.argsort``) in the order in
        which the rows should be visited.
    prefix : int
        Tens digit of the generated labels.
    column : str
        Name of the column to overwrite.
    symbol : str, optional
        First field of the divider row. Defaults to ``"USDCHF"``.
    cut_positions : iterable of int, optional
        Ranks (0-based positions within ``sorted_index``) at which a new
        bucket starts. Defaults to ``(33, 66)``.
        NOTE(review): these are absolute ranks, not percentiles; on a frame
        with more than ~100 rows the last bucket is larger than the first
        two. Confirm that this is intended.
    divider_path : str or None, optional
        CSV file the divider row is appended to. ``None`` skips the write.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``column`` relabelled.
    """
    # sorted_index holds positions, while DataFrame.at expects labels.
    # Translating through the index keeps the function correct for frames
    # whose index is not 0..n-1 (e.g. after filtering without a reset).
    row_labels = data.index
    cuts = frozenset(cut_positions)

    sub = 1
    # This list stores the values at which the labels were changed
    divider_list = [symbol, column]

    for rank, position in enumerate(sorted_index):
        label = row_labels[position]
        if rank in cuts:
            sub += 1
            # Read the original value before it is overwritten below.
            divider_list.append(data.at[label, column])

        data.at[label, column] = (prefix * 10) + sub

    if divider_path is not None:
        # Append mode: one row is added per call, earlier rows are kept.
        with open(divider_path, "a", newline="") as csvfile:
            csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
# Load and preprocess the USDCHF observations in one non-mutating chain, so
# no step writes into a filtered slice (which can raise
# SettingWithCopyWarning).
data = (
    pd.read_csv("../data/USDCHF.csv")
    # Remove the first row since it is an outlier - return is 0
    .iloc[1:]
    # Delete any rows with null values
    .dropna()
    # Delete any rows with 0 Vol or 0 FD
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Reset the index so that row labels equal row positions again
    .reset_index(drop=True)
    # Select only the required columns - mean, vol, fd, return
    [["mean", "vol", "fd", "return_val"]]
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,0.937113,0.000534,967100.4,0.665564
1,0.936845,0.000582,1172346.0,-2.853271
2,0.936648,0.000673,1137358.0,-2.107489
3,0.936691,0.000641,980403.2,0.459192
4,0.937013,0.000747,752287.8,3.440575


In [4]:
# Each feature is bucketed on its own ordering (independent sorting worked
# the best).

# Volatility: argsort gives the row positions that would sort the column in
# ascending order; substitute_values then walks the rows in that order and
# replaces each value with its bucket label.
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
data = substitute_values(data=data, sorted_index=sorted_index, prefix=0, column="vol")

# Fractal Dimension: same procedure
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data=data, sorted_index=sorted_index, prefix=0, column="fd")

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,0.937113,2.0,3.0,0.665564
1,0.936845,2.0,3.0,-2.853271
2,0.936648,3.0,3.0,-2.107489
3,0.936691,3.0,3.0,0.459192
4,0.937013,3.0,2.0,3.440575


In [5]:
# Set up the PyCaret regression experiment with return_val as the target.
# session_id fixes the random seed so the train/test split and the
# cross-validation folds are the same on every Restart & Run All; 4246 is
# the seed that the recorded run reported.
regression = setup(data=data, target="return_val", session_id=4246)

Unnamed: 0,Description,Value
0,session_id,4246
1,Target,return_val
2,Original Data,"(139, 4)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(97, 3)"


In [6]:
# Train and cross-validate the candidate regressors, show the leaderboard
# and keep the estimator returned by compare_models in `best`.
# NOTE(review): in the recorded leaderboard every candidate, including the
# Dummy Regressor baseline, has a negative R2 - check whether the features
# carry any signal before relying on the selected model.
best = compare_models()
# Display the selected estimator and its hyperparameters
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,3.7079,47.4307,5.8754,-0.2367,1.076,4.1535,0.007
llar,Lasso Least Angle Regression,3.8143,47.2075,5.875,-0.2476,0.879,7.5549,0.005
dummy,Dummy Regressor,3.8143,47.2075,5.875,-0.2476,0.879,7.5549,0.005
lasso,Lasso Regression,3.8329,47.6613,5.9169,-0.2718,0.8708,6.8674,0.204
en,Elastic Net,3.8214,47.2993,5.9067,-0.2783,0.8566,5.9984,0.005
br,Bayesian Ridge,3.8391,47.8276,5.967,-0.3264,0.8928,5.3702,0.005
omp,Orthogonal Matching Pursuit,3.8758,47.0522,5.9885,-0.3866,0.8716,7.2447,0.005
ridge,Ridge Regression,3.8875,47.2009,6.0029,-0.3994,0.8519,7.9548,0.005
lar,Least Angle Regression,3.9422,47.801,6.0472,-0.4246,0.8314,8.3231,0.005
lr,Linear Regression,3.9422,47.801,6.0472,-0.4246,0.8314,8.3231,0.449


HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
               tol=1e-05, warm_start=False)

In [7]:
# Build a model from the estimator selected by compare_models; the per-fold
# cross-validation metrics are displayed as the cell output.
model = create_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.4615,8.0991,2.8459,-0.1855,0.9921,4.003
1,3.1436,26.9039,5.1869,-0.1407,1.0903,1.3189
2,2.3551,12.0082,3.4653,-0.1417,0.8161,14.193
3,2.7975,15.338,3.9164,-0.4658,0.7653,14.5514
4,5.7097,81.0288,9.0016,-0.2972,1.4984,1.0413
5,5.296,146.9211,12.1211,-0.0534,1.3007,0.9658
6,7.8136,149.4345,12.2243,-0.0646,1.7311,1.1494
7,1.9971,7.7739,2.7882,-0.1537,0.8654,1.0657
8,2.4782,8.7107,2.9514,-0.1449,0.9224,1.2545
9,3.0271,18.089,4.2531,-0.719,0.7785,1.9916


In [8]:
# Run PyCaret's hyperparameter tuning on the model to try to get a better fit.
# NOTE(review): compare the per-fold metrics with the untuned table above to
# confirm that tuning actually improved the model.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2.5237,8.6811,2.9464,-0.2707,1.0324,2.9989
1,3.093,25.7197,5.0715,-0.0905,1.1421,1.4176
2,2.3228,11.0163,3.3191,-0.0473,0.9379,20.8937
3,2.6364,13.6035,3.6883,-0.3,0.8516,9.2352
4,5.82,83.4448,9.1348,-0.3359,1.6325,1.0771
5,5.3152,149.2429,12.2165,-0.0701,1.399,0.9402
6,7.7905,150.4993,12.2678,-0.0722,1.817,1.1609
7,2.0693,8.2935,2.8798,-0.2309,0.9088,1.1429
8,2.3519,8.1164,2.8489,-0.0668,0.944,1.0233
9,2.7821,16.4003,4.0497,-0.5585,0.7987,1.5967


In [9]:
# Mark the tuned model as final and save it.
# Per the PyCaret docs, finalize_model refits the estimator on the entire
# dataset including the hold-out set - TODO confirm for the installed version.
# Note that `model` is rebound here: it no longer refers to the untuned model.
model = finalize_model(tuned_model)
# Persist the transformation pipeline together with the model.
save_model(model, "../models/USDCHF")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('dummy', Dummify(target='return_val')),
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'p