In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    pair="USDSGD",
    breakpoints=(33, 66),
    divider_path="../divider_list.csv",
):
    """Replace the values of ``column`` with ordinal bucket labels.

    Rows are visited in the order given by ``sorted_index``. Every row is
    overwritten with ``prefix * 10 + bucket``; ``bucket`` starts at 1 and is
    incremented each time the running rank reaches one of ``breakpoints``.
    The original value found at each breakpoint rank (the first value of the
    next bucket) is recorded, and one row ``[pair, column, *divider_values]``
    is appended to the CSV file at ``divider_path``.

    Args:
        data: DataFrame holding ``column``. It is modified in place.
        sorted_index: Row *positions* (0-based, e.g. the result of
            ``np.argsort``) in the order the rows should be ranked.
        prefix: Number combined into the label as ``prefix * 10 + bucket``.
        column: Name of the column to relabel.
        pair: Identifier written as the first cell of the divider CSV row.
        breakpoints: Ranks at which the bucket number increases. The default
            ``(33, 66)`` are absolute ranks, so the buckets are equal-sized
            thirds only for roughly 100 rows.
        divider_path: CSV file the divider row is appended to.

    Returns:
        The same ``data`` object, with ``column`` relabelled.
    """
    breakpoints = tuple(breakpoints)

    # sorted_index holds positions, not index labels, so address the rows
    # positionally; this stays correct even if the frame's index is not 0..n-1.
    column_position = data.columns.get_loc(column)

    # This list stores the values at which the labels were changed
    divider_list = [pair, column]

    bucket = 1
    for rank, row_position in enumerate(sorted_index):
        if rank in breakpoints:
            bucket += 1
            # Read the original value before it is overwritten below.
            divider_list.append(data.iat[row_position, column_position])

        data.iat[row_position, column_position] = (prefix * 10) + bucket

    with open(divider_path, "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
# Load the USDSGD data and preprocess it in a single pass:
#   1. drop the first row, since it is an outlier (its return is 0)
#   2. drop any rows with null values
#   3. drop any rows where vol or fd is 0
#   4. multiply the returns column by 10000 to have a usable column
#   5. renumber the index from 0
#   6. keep only the required columns - mean, vol, fd, return_val
data = (
    pd.read_csv("../data/USDSGD.csv")
    .pipe(lambda frame: frame.drop(frame.index[0]))
    .dropna()
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    .reset_index(drop=True)
    .loc[:, ["mean", "vol", "fd", "return_val"]]
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.356294,0.00059,1376639.0,-0.3384
1,1.355756,0.000572,1182569.0,-3.971043
2,1.355464,0.000572,1273261.0,-2.154857
3,1.355154,0.000295,1704107.0,-2.2804
4,1.355041,0.000402,3227235.0,-0.838683


In [4]:
# Each feature is ranked on its own (independent sorting worked the best).

# Volatility: take a copy of the column, get the row positions that would sort
# it in ascending order, then let substitute_values overwrite every value with
# its bucket label.
volatility_array = data["vol"].to_numpy(copy=True)
sorted_index = volatility_array.argsort()
data = substitute_values(data, sorted_index, 0, "vol")

# Fractal Dimension: same steps as for volatility
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, 0, "fd")

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,1.356294,3.0,2.0,-0.3384
1,1.355756,3.0,2.0,-3.971043
2,1.355464,3.0,2.0,-2.154857
3,1.355154,1.0,2.0,-2.2804
4,1.355041,2.0,3.0,-0.838683


In [5]:
# Using PyCaret functions to setup the regression model trainer, with
# return_val as the target.
# session_id pins the random seed so the experiment is repeatable on
# Restart & Run All; 5420 is the seed of the run whose outputs are recorded
# in this notebook.
regression = setup(data=data, target="return_val", session_id=5420)

Unnamed: 0,Description,Value
0,session_id,5420
1,Target,return_val
2,Original Data,"(139, 4)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(97, 3)"


In [6]:
# Compare the candidate model types after training each of them.
# `best` holds the top-ranked estimator; ending the cell with it displays
# its repr below the comparison table.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,2.0699,14.4517,3.2102,-0.1191,0.7687,2.2791,0.198
llar,Lasso Least Angle Regression,2.0699,14.4517,3.2102,-0.1191,0.7687,2.2791,0.004
dummy,Dummy Regressor,2.0699,14.4517,3.2102,-0.1191,0.7687,2.2791,0.005
en,Elastic Net,2.0742,14.487,3.2157,-0.1235,0.766,2.2757,0.005
huber,Huber Regressor,2.0782,14.9399,3.2505,-0.1403,0.8931,1.7529,0.004
br,Bayesian Ridge,2.0957,14.6804,3.2494,-0.1551,0.7502,2.1683,0.005
ridge,Ridge Regression,2.1506,15.0303,3.3118,-0.2317,0.7188,2.0966,0.004
omp,Orthogonal Matching Pursuit,2.1355,15.0113,3.3187,-0.2375,0.7543,2.4053,0.005
lar,Least Angle Regression,2.1407,15.0282,3.3176,-0.2423,0.6837,2.0221,0.005
lr,Linear Regression,2.1407,15.0282,3.3176,-0.2423,0.6837,2.0221,0.383


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=5420,
      selection='cyclic', tol=0.0001, warm_start=False)

In [7]:
# Create a Huber Regressor model; its per-fold scores are displayed below.
# NOTE(review): compare_models ranked Lasso first, not Huber. Lasso, Lasso
# Least Angle and the Dummy Regressor report identical metrics, so Huber was
# presumably picked as the first model that differs from the dummy baseline
# - confirm.
model = create_model("huber")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.8469,74.25,8.6168,-0.1282,1.1526,7.19
1,2.0297,9.0412,3.0069,-0.1177,0.9802,0.9733
2,3.2201,32.5206,5.7027,-0.0925,1.1456,1.052
3,1.7234,4.4965,2.1205,-0.1793,0.8823,1.537
4,1.4002,2.6519,1.6285,-0.0933,0.7695,1.0548
5,1.8341,5.2542,2.2922,-0.2091,0.8484,1.1026
6,1.4972,4.4275,2.1042,-0.2884,0.6559,0.9576
7,1.5731,5.147,2.2687,-0.0442,0.9303,0.9113
8,2.1252,7.5211,2.7425,-0.1776,0.9358,0.9539
9,1.5323,4.0891,2.0222,-0.0724,0.6309,1.7966


In [8]:
# Tune the Huber model created above to look for a better fit;
# the per-fold scores of the tuned model are displayed below.
tuned_model = tune_model(model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.8388,73.5985,8.579,-0.1183,1.1839,6.0904
1,2.0071,8.9023,2.9837,-0.1005,1.0138,0.9289
2,3.2209,32.5988,5.7095,-0.0952,1.1516,0.9994
3,1.7441,4.4773,2.116,-0.1742,0.8834,1.5017
4,1.3671,2.5584,1.5995,-0.0547,0.8058,0.9553
5,1.8139,5.3291,2.3085,-0.2264,0.8715,1.047
6,1.5327,4.3352,2.0821,-0.2615,0.7105,1.3451
7,1.6058,5.2268,2.2862,-0.0604,0.9145,0.9681
8,2.1006,7.4374,2.7272,-0.1645,0.9735,0.9291
9,1.5043,4.06,2.015,-0.0648,0.7119,1.5912


In [9]:
# Mark the tuned model as final and save it under ../models/USDSGD.
# Note: this rebinds `model`, which until now referred to the untuned
# estimator returned by create_model, so re-running the tuning cell after
# this one would tune the finalized model instead.
model = finalize_model(tuned_model)
save_model(model, "../models/USDSGD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('dummy', Dummify(target='return_val')),
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'p