In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    symbol="USDCAD",
    cut_points=(33, 66),
    divider_path="../divider_list.csv",
):
    """Replace the values of ``column`` with rank-based bucket labels.

    Rows are visited in the order given by ``sorted_index``. Every visited row
    has its ``column`` value overwritten with ``prefix * 10 + bucket``, where
    ``bucket`` starts at 1 and is incremented each time the running rank hits
    one of ``cut_points``. The original value found at each cut point is
    recorded, and one row ``[symbol, column, <divider values...>]`` is appended
    to the CSV file at ``divider_path``.

    Note that ``cut_points`` are absolute rank positions, not percentiles: they
    are not scaled to ``len(data)``, so with the defaults every row ranked 66
    or later lands in the last bucket regardless of how many rows there are.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to relabel. It is modified in place.
    sorted_index : iterable of int
        Row *positions* (as produced by ``np.argsort``) in ascending order of
        the column's values.
    prefix : int
        Multiplied by 10 and added to the bucket number to form the label.
    column : str
        Name of the column to relabel.
    symbol : str, optional
        First field of the row written to the divider file.
    cut_points : collection of int, optional
        Ranks at which the bucket number is incremented.
    divider_path : str, optional
        CSV file the divider row is appended to.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    bucket = 1

    # This list stores the values at which the labels were changed
    divider_list = [symbol, column]

    # sorted_index holds positions, so address cells positionally; label-based
    # access would only be correct for a default 0..n-1 index.
    col_pos = data.columns.get_loc(column)

    for rank, position in enumerate(sorted_index):
        if rank in cut_points:
            bucket += 1
            # Read the boundary value before it is overwritten below
            divider_list.append(data.iat[position, col_pos])

        data.iat[position, col_pos] = (prefix * 10) + bucket

    with open(divider_path, "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
# Load the raw USDCAD rows and clean them in one pipeline
data = (
    pd.read_csv("../data/USDCAD.csv")
    # The first row is an outlier (its return is 0), so discard it
    .pipe(lambda frame: frame.drop(frame.index[0]))
    # Discard rows that contain any null value
    .dropna()
    # Discard rows whose vol or fd is exactly 0
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    # Scale return_val by 10000 so the column has a usable magnitude
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    # Renumber the surviving rows from 0
    .reset_index(drop=True)
    # Keep only the required columns - mean, vol, fd, return
    .loc[:, ["mean", "vol", "fd", "return_val"]]
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.362958,0.000367,1286632.0,0.867757
1,1.362658,0.000514,2281479.0,-2.197698
2,1.362616,0.000371,1921154.0,-0.308281
3,1.362692,0.00044,2087190.0,0.558158
4,1.362129,0.000532,1397826.0,-4.133921


In [4]:
# Each feature is ranked independently, as that worked the best.
# argsort gives the row positions that put the vol values in ascending order.
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
# Walking the rows in that order, replace each vol value with its bucket label
data = substitute_values(data, sorted_index, prefix=0, column="vol")

# Same ranking and relabelling for Fractal Dimension
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, prefix=0, column="fd")

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.362958,1.0,2.0,0.867757
1,1.362658,2.0,3.0,-2.197698
2,1.362616,2.0,3.0,-0.308281
3,1.362692,2.0,3.0,0.558158
4,1.362129,3.0,3.0,-4.133921


In [5]:
# Using PyCaret functions to setup the regression model trainer.
# session_id pins the experiment's random seed so a Restart & Run All gives
# the same results; 5848 is the seed reported in this notebook's recorded
# setup output.
regression = setup(data=data, target="return_val", session_id=5848)

Unnamed: 0,Description,Value
0,session_id,5848
1,Target,return_val
2,Original Data,"(139, 4)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(97, 3)"


In [6]:
# Train and compare the available regression models; `best` holds the
# top-ranked one and is displayed as the cell's output.
# In the recorded run every listed model has a negative mean R2, and the
# first-ranked Lasso ties with the Dummy Regressor on every reported metric.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,2.575,16.6246,3.7116,-0.2072,0.8467,1.4357,0.202
llar,Lasso Least Angle Regression,2.575,16.6246,3.7116,-0.2072,0.8467,1.4357,0.005
dummy,Dummy Regressor,2.575,16.6246,3.7116,-0.2072,0.8467,1.4357,0.005
en,Elastic Net,2.588,16.7857,3.732,-0.2214,0.8359,1.4225,0.005
knn,K Neighbors Regressor,2.722,18.6533,3.8456,-0.2613,0.739,1.833,0.018
br,Bayesian Ridge,2.6289,17.2751,3.7957,-0.2727,0.8237,1.3932,0.005
huber,Huber Regressor,2.699,17.2663,3.8102,-0.301,0.8947,1.3199,0.005
omp,Orthogonal Matching Pursuit,2.6205,17.2083,3.8066,-0.3046,0.8396,1.2636,0.005
ridge,Ridge Regression,2.652,17.3642,3.8286,-0.3257,0.8115,1.3363,0.005
ada,AdaBoost Regressor,2.6713,17.3659,3.8029,-0.3423,0.8547,1.7621,0.012


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=5848,
      selection='cyclic', tol=0.0001, warm_start=False)

In [7]:
# Create the model to carry forward (K Neighbors Regressor, id "knn").
# NOTE(review): "knn" is hard-coded, while compare_models() ranked Lasso first
# in the recorded run - confirm that choosing knn here is intentional.
model = create_model("knn")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.6841,69.5975,8.3425,-0.2397,0.874,1.4446
1,2.7028,9.8953,3.1457,-0.1364,0.6709,1.3871
2,2.4876,10.3017,3.2096,-0.237,0.6144,1.1007
3,1.4585,4.4427,2.1078,0.0185,0.8377,4.2983
4,2.1667,6.5195,2.5533,-0.048,0.759,1.4645
5,1.9292,5.8911,2.4271,-0.0738,0.7271,3.4285
6,2.8117,11.0438,3.3232,-0.3624,0.6602,1.519
7,4.6161,36.8523,6.0706,-1.3412,0.7182,1.3844
8,3.5995,28.0837,5.2994,0.0261,0.8646,1.0673
9,1.7635,3.9058,1.9763,-0.2195,0.6636,1.2352


In [8]:
# Run PyCaret's tune_model on the knn model to look for a better fit;
# the result is kept separately in `tuned_model`.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.5298,68.9797,8.3054,-0.2287,0.9068,1.4119
1,2.8659,11.2837,3.3591,-0.2958,0.8171,1.311
2,2.7832,12.4494,3.5284,-0.4949,0.755,1.1062
3,1.3366,3.8063,1.951,0.1591,0.6744,2.8956
4,1.8949,5.3641,2.3161,0.1377,0.864,1.3484
5,1.734,4.9275,2.2198,0.1018,0.8659,1.908
6,2.6929,10.8124,3.2882,-0.3339,0.8411,1.3047
7,4.0836,27.9343,5.2853,-0.7747,0.8318,1.199
8,2.9954,20.4843,4.526,0.2897,0.9446,0.8194
9,1.6215,3.3424,1.8282,-0.0436,0.6295,1.0697


In [9]:
# Mark the tuned model as final and save it under ../models/USDCAD.
# NOTE(review): this rebinds `model`, which until now held the untuned knn
# model from create_model; re-running the tune_model cell after this one
# would therefore tune the finalized model instead.
model = finalize_model(tuned_model)
save_model(model, "../models/USDCAD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
         