In [1]:
import csv
import json
import os

import numpy as np
import pandas as pd
from pycaret.regression import *

In [2]:
def substitute_values(data, sorted_index, currency, column, *,
                      boundaries=(33, 66), divider_path="../divider_list.csv"):
    """
    Replace the values of ``column`` with ordinal bucket labels (1, 2, 3, ...).
    Eg, fd values [1,2,3,...100] are converted to [1,1,1,1,1,..,2,2,2,2,...,3,3,3,3]

    Rows are visited in the order given by ``sorted_index``. The label starts at 1
    and is bumped by one every time the running position hits one of
    ``boundaries``. With the default ``(33, 66)`` the first 33 visited rows get 1,
    the next 33 get 2 and every remaining row gets 3.

    The boundaries are absolute positions, not percentiles: the three buckets are
    only (roughly) equal in size when the frame has about 99-100 rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column``. It is modified in place.
    sorted_index : iterable
        Row labels in ascending order of ``column``. Each entry is passed to
        ``data.at`` as a *label*, so when this comes from ``np.argsort`` the frame
        must have an index whose labels equal the row positions (e.g. right after
        ``reset_index(drop=True)``).
    currency : str
        Identifier written as the first field of the divider row.
    column : str
        Name of the column to bucket; also written as the second field.
    boundaries : iterable of int, keyword-only
        Positions (0-based, in visiting order) at which the label increases.
    divider_path : str, keyword-only
        CSV file the divider row is appended to.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.

    Side effects
    ------------
    Appends one row ``[currency, column, <original value at each boundary>]`` to
    ``divider_path``. A boundary that lies beyond the last row contributes no
    field, so the row is shorter when the frame is small.
    """
    # Label assigned to the rows visited before the first boundary
    sub = 1

    # CSV row: identifying prefix, then the original value found at each boundary
    divider_list = [currency, column]

    cut_positions = frozenset(boundaries)

    for position, index in enumerate(sorted_index):
        # Entering a new bucket: remember the original value that opens it
        # (read before it is overwritten below) and move on to the next label.
        if position in cut_positions:
            sub += 1
            divider_list.append(data.at[index, column])

        # Overwrite the original value with its bucket label
        data.at[index, column] = sub

    # Store the values used to divide the list in a csv file. To be used on day 2 for real time predictions.
    with open(divider_path, "a", newline="") as csvfile:
        csv.writer(csvfile).writerow(divider_list)

    return data

In [3]:
def _load_clean_data(csv_path):
    """Read one currency CSV and return the cleaned rows with a fresh 0..n-1 index."""
    raw = pd.read_csv(csv_path)
    return (
        # Removing the first row since it is an outlier - return is 0
        raw.iloc[1:]
        # Delete any rows with null values (in any column)
        .dropna()
        # Delete any rows with 0 Vol and 0 FD
        .loc[lambda df: (df["vol"] != 0) & (df["fd"] != 0)]
        # Multiply the returns column by 100000 to have a usable column
        .assign(return_val=lambda df: df["return_val"] * 100000)
        # Labels must equal positions again: the bucketing step below feeds
        # np.argsort output to label-based lookups.
        .reset_index(drop=True)
    )


def main(currency_pairs, session_id=None):
    """
    Main function that performs data cleaning, pre processing, model building and model saving.

    For every entry of ``currency_pairs`` this:
      1. reads ``../data/<currency>.csv`` and cleans it,
      2. replaces the ``vol`` and ``fd`` columns by bucket labels via
         ``substitute_values`` (which appends the divider values to
         ``../divider_list.csv``),
      3. lets PyCaret compare regressors on ``mean``, ``vol``, ``fd`` with
         ``return_val`` as target, then re-creates, tunes and finalizes the best one,
      4. saves the finalized model under ``./models/<currency>``.

    Finally the string representation of the model picked for each currency is
    printed as JSON.

    Parameters
    ----------
    currency_pairs : iterable of str
        Currency identifiers; each must have a matching CSV in ``../data``.
    session_id : int, optional
        Seed forwarded to PyCaret's ``setup``. The default ``None`` leaves the
        seed choice to PyCaret, so repeated runs may select different models.

    Side effects
    ------------
    ``../divider_list.csv`` is truncated and rewritten, ``./models`` is created if
    it is missing, and one model file per currency is written there.
    """
    selected_models = {}

    # Start a fresh divider file: substitute_values() only ever appends to it.
    with open("../divider_list.csv", "w", newline="") as csvfile:
        csv.writer(csvfile).writerow(["Currency", "Column", "33rd", "66th"])

    # Make sure the output folder exists before any training starts, so a missing
    # directory cannot fail the save step at the very end of a currency's run.
    os.makedirs("./models", exist_ok=True)

    for currency in currency_pairs:
        data = _load_clean_data("../data/{}.csv".format(currency))

        # Sorting the values independently as it worked the best.
        # Argsort returns the positions that would sort the column; those positions
        # are then used to overwrite the column with bucket labels.
        for column in ("vol", "fd"):
            sorted_index = np.argsort(data[column].to_numpy())
            data = substitute_values(data, sorted_index, currency, column)

        # Select only the required columns - mean, fd, vol, return
        data = data[["mean", "vol", "fd", "return_val"]]

        # Using PyCaret functions to setup the regression model trainer
        setup(data=data, target="return_val", session_id=session_id)

        # Compare the best models after training on different types
        best = compare_models(exclude=["dummy"])

        # For understanding purposes, we print the selected models in the end
        selected_models[currency] = str(best)

        # Choose the best regressor technique to create the model
        model = create_model(best)

        # Run model tune to get a better fit
        tuned_model = tune_model(model)

        # Mark the tuned model as final and save it
        final_model = finalize_model(tuned_model)
        save_model(final_model, "./models/{}".format(currency))

    print(json.dumps(selected_models, indent=4))

In [4]:
# Currency pairs to build one model for, in processing order.
# main() reads ../data/<PAIR>.csv for every entry listed here.
currency_pairs = [
    "EURUSD", "GBPUSD", "USDCAD", "USDCHF",
    "USDHKD", "USDAUD", "USDNZD", "USDSGD",
]

main(currency_pairs)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,7.5009,80.5865,8.977,-0.0823,0.9872,2.5788
1,48.9815,10036.9429,100.1845,-0.1992,2.4624,0.9767
2,28.7994,1581.4945,39.768,-0.0552,1.8463,1.0498
3,26.5606,919.5196,30.3236,-0.06,1.5346,1.1378
4,20.7708,1192.1758,34.5279,-0.1201,1.8691,0.9845
5,11.4726,239.5075,15.476,0.029,1.3229,1.3214
6,48.8239,7887.6976,88.8127,-0.0594,2.6873,0.9965
7,18.1188,767.6742,27.7069,-0.2954,1.703,4.9865
8,18.2584,751.2196,27.4084,-0.4655,1.6315,5.813
9,14.8345,309.3321,17.5878,0.0444,1.5545,0.9642


{
    "EURUSD": "LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,\n          fit_path=True, jitter=None, max_iter=500, normalize=True,\n          positive=False, precompute='auto', random_state=3229, verbose=False)",
    "GBPUSD": "HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,\n               tol=1e-05, warm_start=False)",
    "USDCAD": "LassoLars(alpha=1.0, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True,\n          fit_path=True, jitter=None, max_iter=500, normalize=True,\n          positive=False, precompute='auto', random_state=6532, verbose=False)",
    "USDCHF": "HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,\n               tol=1e-05, warm_start=False)",
    "USDHKD": "OrthogonalMatchingPursuit(fit_intercept=True, n_nonzero_coefs=None,\n                          normalize=True, precompute='auto', tol=None)",
    "USDAUD": "PassiveAggressiveRegressor(C=1.0, average=False, early