In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    breakpoints=(33, 66),
    pair="USDAUD",
    divider_path="../divider_list.csv",
):
    """Replace the values of ``column`` with ordinal bucket labels.

    Rows are visited in the order given by ``sorted_index``. The label starts
    at ``prefix * 10 + 1`` and is bumped by one each time the running rank
    reaches one of ``breakpoints``. The value found at each breakpoint (before
    it is overwritten) is recorded, and the record is appended as one CSV row
    ``[pair, column, divider_1, divider_2, ...]`` to ``divider_path``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place and also returned.
    sorted_index : iterable
        Row labels of ``data`` in the order they should be ranked (they are
        used with ``data.at``, so they must be valid index labels).
    prefix : int
        Tens digit of the generated labels (label = ``prefix * 10 + bucket``).
    column : str
        Name of the column to overwrite.
    breakpoints : iterable of int, optional
        Rank positions (0-based) at which a new bucket starts. The defaults
        are absolute positions, not percentages: they only split the rows into
        equal thirds when there are about 100 rows.
    pair : str, optional
        Identifier written as the first field of the divider row.
    divider_path : str, optional
        CSV file the divider row is appended to.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``column`` relabelled.
    """
    bucket_starts = set(breakpoints)
    bucket = 1

    # Values at which the label changed, prefixed by pair and column name
    divider_row = [pair, column]

    for rank, index in enumerate(sorted_index):
        if rank in bucket_starts:
            bucket += 1
            # Read the boundary value before it is replaced by its label
            divider_row.append(data.at[index, column])

        data.at[index, column] = (prefix * 10) + bucket

    # Append mode: every call adds one more row to the shared divider file
    with open(divider_path, "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(divider_row)

    return data

In [3]:
# Preprocessing
#
# Load the USDAUD file and clean it in one non-mutating chain, so re-running
# this cell always rebuilds `data` from the CSV and no filtered slice is
# modified in place (which can trigger pandas' SettingWithCopyWarning).
data = (
    pd.read_csv("../data/USDAUD.csv")
    # Remove the first row since it is an outlier - return is 0
    .iloc[1:]
    # Delete any rows with null values
    .dropna()
    # Delete any rows with 0 Vol or 0 FD
    .loc[lambda df: (df["vol"] != 0) & (df["fd"] != 0)]
    # Multiply the returns column by 10000 to have a usable column
    .assign(return_val=lambda df: df["return_val"] * 10000)
    # Select only the required columns - mean, vol, fd, return
    .loc[:, ["mean", "vol", "fd", "return_val"]]
    # Reset the index so row labels equal row positions again
    .reset_index(drop=True)
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.480386,0.000873,914935.2,0.357907
1,1.479257,0.001351,665491.5,-7.627976
2,1.478147,0.001131,564282.9,-7.499591
3,1.477415,0.000582,1187293.0,-4.955366
4,1.477271,0.000979,1405563.0,-0.972375


In [4]:
# Vol and FD are ranked independently of each other, as that worked the best

# Snapshot the volatility values, then get the row positions that would sort
# them in ascending order
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
# Walk the rows in that order and replace each volatility with its bucket label
data = substitute_values(data, sorted_index, prefix=0, column="vol")

# Same procedure for the Fractal Dimension column
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, prefix=0, column="fd")

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.480386,2.0,2.0,0.357907
1,1.479257,3.0,1.0,-7.627976
2,1.478147,3.0,1.0,-7.499591
3,1.477415,1.0,3.0,-4.955366
4,1.477271,3.0,3.0,-0.972375


In [5]:
# Set up the PyCaret regression experiment with return_val as the target.
# session_id fixes the random seed that controls the experiment so a fresh
# "Restart & Run All" is repeatable; 5639 is the seed reported in this
# notebook's recorded setup summary.
regression = setup(data=data, target="return_val", session_id=5639)

Unnamed: 0,Description,Value
0,session_id,5639
1,Target,return_val
2,Original Data,"(139, 4)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(97, 3)"


In [6]:
# Train and score the candidate regressors and keep the top-ranked one.
# NOTE(review): in the recorded output below every candidate has a negative R2
# and the Dummy Regressor ranks among the best, so treat "best" with caution.
best = compare_models()
# Bare expression: display the selected estimator and its hyperparameters
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
huber,Huber Regressor,4.2836,70.8513,6.4778,-0.2766,1.1107,2.7167,0.005
llar,Lasso Least Angle Regression,4.3855,71.6964,6.5866,-0.3456,1.0389,3.6342,0.005
dummy,Dummy Regressor,4.3855,71.6964,6.5866,-0.3456,1.0389,3.6342,0.006
lasso,Lasso Regression,4.3621,72.0959,6.6186,-0.3712,1.0488,2.4316,0.205
en,Elastic Net,4.3408,72.0136,6.6171,-0.3737,1.055,1.7707,0.005
br,Bayesian Ridge,4.3534,72.6458,6.6831,-0.4227,1.043,1.8313,0.005
lr,Linear Regression,4.4797,72.7763,6.7531,-0.4729,0.8336,3.9174,0.385
lar,Least Angle Regression,4.4797,72.7764,6.7531,-0.4729,0.8336,3.9174,0.005
omp,Orthogonal Matching Pursuit,4.4413,74.0666,6.7971,-0.5188,0.888,2.7229,0.005
ridge,Ridge Regression,4.4808,73.5551,6.8169,-0.5386,0.9225,3.4284,0.005


HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
               tol=1e-05, warm_start=False)

In [7]:
# Create a model from the estimator selected by compare_models; the output
# table reports the metrics per cross-validation fold
model = create_model(best)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.5753,17.5176,4.1854,-0.2261,1.1119,1.5265
1,5.8084,114.5004,10.7005,-0.0699,1.3462,1.6129
2,10.7084,454.5288,21.3197,-0.2378,1.6279,1.1145
3,3.7568,21.5166,4.6386,-1.1035,1.2663,1.0387
4,3.1536,15.6047,3.9503,-0.0704,0.9663,1.5981
5,4.0069,29.2642,5.4096,-0.0961,1.1152,15.6529
6,3.2389,17.0223,4.1258,0.1412,1.08,0.9663
7,3.7953,21.4901,4.6357,-0.3003,1.1338,1.0553
8,2.6939,10.2518,3.2018,-0.878,0.6671,1.0603
9,2.0981,6.8159,2.6107,0.0748,0.7924,1.5409


In [8]:
# Run PyCaret's hyperparameter tuning on the model to look for a better fit.
# The result is stored under its own name, so the untuned `model` stays
# available for comparison.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.5007,16.8302,4.1025,-0.1779,1.0978,1.6274
1,5.8674,114.2996,10.6911,-0.068,1.3575,1.7545
2,10.8153,458.3354,21.4088,-0.2481,1.5519,1.1403
3,3.6258,20.5119,4.529,-1.0053,1.246,0.9294
4,3.1795,16.0042,4.0005,-0.0978,0.8466,1.685
5,3.9826,29.0635,5.3911,-0.0885,1.0943,14.3689
6,3.2386,17.1312,4.139,0.1357,1.0384,0.9504
7,3.7543,21.0179,4.5845,-0.2717,1.0904,1.0209
8,2.658,10.2211,3.197,-0.8724,0.5511,1.0476
9,2.1316,6.7348,2.5951,0.0858,0.7461,1.6931


In [9]:
# Mark the tuned model as final and save it.
# Finalizing `tuned_model` (rather than re-finalizing `model`) means the
# hyperparameter search result is the one that is persisted, and re-running
# this cell always starts from the same input.
model = finalize_model(tuned_model)
save_model(model, "../models/USDAUD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('dummy', Dummify(target='return_val')),
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'p