In [1]:
import csv

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(
    data,
    sorted_index,
    prefix,
    column,
    breakpoints=(33, 66),
    pair="USDHKD",
    divider_path="../divider_list.csv",
):
    """Replace the values of ``column`` with rank-based bucket labels.

    Rows are visited in the order given by ``sorted_index``. The bucket
    number starts at 1 and is incremented each time the visiting rank hits
    one of ``breakpoints``; every visited row then has its ``column`` value
    overwritten with ``prefix * 10 + bucket``.

    The original value found at each breakpoint rank (i.e. the first value
    of the next bucket) is recorded, and one row of the form
    ``[pair, column, divider_1, divider_2, ...]`` is appended to the CSV
    file at ``divider_path``.

    Args:
        data: DataFrame to relabel. It is modified in place.
        sorted_index: Row *positions* (as produced by ``np.argsort``) in the
            order the rows should be visited.
        prefix: Number multiplied by 10 and added to the bucket number.
        column: Name of the column to overwrite.
        breakpoints: Ranks (0-based positions within ``sorted_index``) at
            which the bucket number increases. The defaults are absolute
            ranks, not percentiles, so they only split the rows into equal
            thirds when there are roughly 100 rows.
        pair: Tag written as the first field of the CSV row.
        divider_path: CSV file the divider row is appended to.

    Returns:
        The same ``data`` object that was passed in, after relabelling.

    Note:
        The CSV is opened in append mode, so every call (including re-running
        a notebook cell) adds another row to the file.
    """
    rank_cutoffs = set(breakpoints)
    bucket = 1

    # Row written to the divider file: tag, column name, then one value per cutoff
    divider_row = [pair, column]

    # ``sorted_index`` holds positions, so address cells positionally; using
    # label-based access here would hit the wrong rows (or append new ones)
    # whenever the frame does not carry a default 0..n-1 index.
    column_position = data.columns.get_loc(column)

    for rank, row_position in enumerate(sorted_index):
        if rank in rank_cutoffs:
            bucket += 1
            # Read the original value before it is overwritten just below
            divider_row.append(data.iat[row_position, column_position])

        data.iat[row_position, column_position] = (prefix * 10) + bucket

    with open(divider_path, "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(divider_row)

    return data

In [3]:
# Load and preprocess the USDHKD data in a single chain:
#   1. drop the first row (treated as an outlier - its return is 0)
#   2. drop every row containing a null
#   3. keep only rows where both vol and fd are non-zero
#   4. scale return_val by 10000 so the values are in a usable range
#      NOTE(review): an earlier comment on this step said 100000, but the
#      code multiplies by 10000 - confirm which factor is intended
#   5. renumber the index from 0
#   6. keep only the columns used downstream - mean, vol, fd, return_val
data = (
    pd.read_csv("../data/USDHKD.csv")
    .pipe(lambda frame: frame.drop(index=frame.index[0]))
    .dropna()
    .loc[lambda frame: (frame["vol"] != 0) & (frame["fd"] != 0)]
    .assign(return_val=lambda frame: frame["return_val"] * 10000)
    .reset_index(drop=True)
    [["mean", "vol", "fd", "return_val"]]
)

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,7.779482,0.000283,7146515.0,1.069147
1,7.779008,0.000251,15398450.0,-0.608964
2,7.779402,0.000298,10817230.0,0.506919
3,7.780386,8.3e-05,1785267.0,1.264704
4,7.779536,0.000246,2441515.0,-1.09289


In [4]:
# Each column is ranked on its own, independently of the other
# (noted by the author as the variant that worked best).
volatility_array = data["vol"].to_numpy(copy=True)
# argsort gives the row positions ordered from smallest to largest value
sorted_index = volatility_array.argsort()
# Walk the rows in that order and overwrite "vol" with its bucket label
data = substitute_values(data, sorted_index, 0, "vol")

# Same procedure for the fractal dimension column
fd_array = data["fd"].to_numpy(copy=True)
sorted_index = fd_array.argsort()
data = substitute_values(data, sorted_index, 0, "fd")

data.head(5)

Unnamed: 0,mean,vol,fd,return_val
0,7.779482,3.0,1.0,1.069147
1,7.779008,3.0,2.0,-0.608964
2,7.779402,3.0,1.0,0.506919
3,7.780386,1.0,1.0,1.264704
4,7.779536,3.0,1.0,-1.09289


In [5]:
# Initialise the PyCaret regression experiment with return_val as the target.
# session_id pins the random seed so the train/test split and CV folds are
# the same on every run; 5105 is the session_id shown in the recorded output.
regression = setup(data=data, target="return_val", session_id=5105)

Unnamed: 0,Description,Value
0,session_id,5105
1,Target,return_val
2,Original Data,"(135, 4)"
3,Missing Values,False
4,Numeric Features,3
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(94, 3)"


In [6]:
# Train and cross-validate PyCaret's candidate regressors, keep the result in
# `best`, and display it as the cell output.
# NOTE(review): in the recorded run every listed model has a negative R2 and
# the top rows are metric-for-metric identical to the Dummy Regressor, so the
# features appear to carry little signal here - confirm before relying on it.
best = compare_models()
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lasso,Lasso Regression,0.4195,0.4399,0.604,-0.1468,0.3165,2.0454,0.199
en,Elastic Net,0.4195,0.4399,0.604,-0.1468,0.3165,2.0454,0.005
dummy,Dummy Regressor,0.4195,0.4399,0.604,-0.1468,0.3165,2.0454,0.005
llar,Lasso Least Angle Regression,0.4195,0.4399,0.604,-0.1468,0.3165,2.0454,0.006
br,Bayesian Ridge,0.4225,0.4551,0.6168,-0.221,0.3058,1.8505,0.005
ridge,Ridge Regression,0.4188,0.4511,0.6184,-0.2761,0.2837,1.575,0.005
lr,Linear Regression,0.4291,0.4483,0.6179,-0.2776,0.2756,2.291,0.385
lar,Least Angle Regression,0.4291,0.4483,0.6179,-0.2776,0.2756,2.2909,0.005
omp,Orthogonal Matching Pursuit,0.4298,0.4638,0.628,-0.3017,0.2958,1.6201,0.004
huber,Huber Regressor,0.4285,0.4534,0.624,-0.3119,0.2929,1.6318,0.006


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=5105,
      selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
# Build and cross-validate a model from the hard-coded "knn" estimator ID.
# NOTE(review): "knn" is not among the rows of the compare_models() table
# shown above, and the recorded execution counts jump from 6 to 13 - confirm
# why this estimator was chosen over the `best` model.
model = create_model(estimator="knn")

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2669,0.2105,0.4588,-0.6914,0.1892,1.6068
1,0.5666,1.419,1.1912,0.1042,0.4327,1.2074
2,0.453,0.3401,0.5832,-0.0258,0.2045,1.2304
3,0.2695,0.1006,0.3172,-0.3817,0.179,2.4874
4,0.3235,0.225,0.4744,0.3992,0.2435,0.7277
5,0.5166,0.4459,0.6678,-0.1768,0.3479,1.4185
6,0.5208,0.5597,0.7481,-0.7723,0.1438,1.025
7,0.4485,0.2936,0.5419,-2.1834,0.2404,2.259
8,0.5862,0.7568,0.8699,-0.2698,0.4503,1.3795
9,0.5382,0.7742,0.8799,-3.5931,0.237,2.2541


In [14]:
# Search hyperparameters for the model created above, looking for a better fit
tuned_model = tune_model(estimator=model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.2316,0.1089,0.33,0.1248,0.2346,1.0308
1,0.5985,1.6135,1.2702,-0.0186,0.5059,1.5888
2,0.4786,0.3217,0.5671,0.0299,0.3503,0.8935
3,0.2518,0.1326,0.3641,-0.8204,0.1554,2.7362
4,0.4346,0.4481,0.6694,-0.1965,0.3809,1.0346
5,0.475,0.3581,0.5984,0.0551,0.337,1.0548
6,0.4776,0.3119,0.5585,0.0123,0.3273,1.1912
7,0.3244,0.1382,0.3718,-0.4988,0.2427,0.9776
8,0.4746,0.7029,0.8384,-0.1795,0.381,1.1852
9,0.3988,0.2471,0.4971,-0.4662,0.2556,3.1644


In [15]:
# Finalize the tuned model, then persist it under ../models/USDHKD.
# Note that `model` is rebound here from the untuned model to the final one.
model = finalize_model(estimator=tuned_model)
save_model(model=model, model_name="../models/USDHKD")

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[], ml_usecase='regression',
                                       numerical_features=[],
                                       target='return_val', time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strat...
                 ('fix_perfect', Remove_100(target='return_val')),
                 ('clean_names', Clean_Colum_Names()),
                 ('feature_select', 'passthrough'), ('fix_multi', 'passthrough'),
                 ('dfs', 'passthrough'), ('pca', 'passthrough'),
                 ['trained_model',
         