# Forex Trading Prediction - EURUSD Pair (4H):

Performing Forex Market Prediction for the EUR/USD pair (4H timeframe data) using Random Forest, SVM, and LSTM machine learning models and implementing cross-validation techniques to validate the results.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.metrics import BinaryAccuracy
import warnings
warnings.filterwarnings("ignore")


In [2]:
# importing data
data = pd.read_csv("/kaggle/input/eurusd-15-min/EURUSD_15m.csv")


In [3]:
data.columns=['Date', 'Open', 'High', 'Low', 'Close', 'Volume']

# converting the date column to a datetime object and setting it as the index
data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True) 
avg_vol = data.loc[data['Volume'].ne(0) & data['Volume'].notna(), 'Volume'].mean()

# Replace 0 and null values with the calculated average
data['Volume'] = data['Volume'].replace(0, avg_vol).fillna(avg_vol)
data.isna().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
dtype: int64

In [4]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01 00:00:00,1.43283,1.43293,1.43224,1.43293,608600007.0
2010-01-01 00:15:00,1.43285,1.43295,1.43229,1.43275,535600003.0
2010-01-01 00:30:00,1.4328,1.43303,1.43239,1.43281,436299999.0
2010-01-01 00:45:00,1.43285,1.43294,1.43229,1.43276,614299997.0
2010-01-01 01:00:00,1.43287,1.43292,1.43206,1.43282,705300009.0


In [5]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-12-31 22:45:00,1.0515,1.0515,1.0515,1.0515,2281625000.0
2016-12-31 23:00:00,1.0515,1.0515,1.0515,1.0515,2281625000.0
2016-12-31 23:15:00,1.0515,1.0515,1.0515,1.0515,2281625000.0
2016-12-31 23:30:00,1.0515,1.0515,1.0515,1.0515,2281625000.0
2016-12-31 23:45:00,1.0515,1.0515,1.0515,1.0515,2281625000.0


In [6]:
!pip install pandas_ta # technical analysis library

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pandas_ta
  Building wheel for pandas_ta (setup.py) ... [?25ldone
[?25h  Created wheel for pandas_ta: filename=pandas_ta-0.3.14b0-py3-none-any.whl size=218923 sha256=ef1c8cd3129d31057c563d04a8100273fa3dc428868a943942f7721b2528ffc8
  Stored in directory: /root/.cache/pip/wheels/7e/c3/40/fb36bba6c91caf81c39791388c71baca9635cbefd8e3bd48a7
Successfully built pandas_ta
Installing collected packages: pandas_ta
Successfully installed pandas_ta-0.3.14b0
[0m

In [7]:
import pandas_ta as ta # technical analysis package

data['RSI'] = data.ta.rsi(length = 14) 
data.head(15)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01 00:00:00,1.43283,1.43293,1.43224,1.43293,608600007.0,
2010-01-01 00:15:00,1.43285,1.43295,1.43229,1.43275,535600003.0,
2010-01-01 00:30:00,1.4328,1.43303,1.43239,1.43281,436299999.0,
2010-01-01 00:45:00,1.43285,1.43294,1.43229,1.43276,614299997.0,
2010-01-01 01:00:00,1.43287,1.43292,1.43206,1.43282,705300009.0,
2010-01-01 01:15:00,1.4329,1.43299,1.43212,1.43292,427300006.0,
2010-01-01 01:30:00,1.43267,1.43305,1.43209,1.43282,399200001.0,
2010-01-01 01:45:00,1.4328,1.43302,1.43218,1.43249,481200008.0,
2010-01-01 02:00:00,1.43279,1.43303,1.43237,1.43301,607599997.0,
2010-01-01 02:15:00,1.43301,1.43303,1.43222,1.43271,469600000.0,


In [8]:
data = data.dropna()
data.isna().sum()

Open      0
High      0
Low       0
Close     0
Volume    0
RSI       0
dtype: int64

In [9]:
# Create a new column that represents the difference between the opening and closing prices of the currency pair
data['Target'] = data.apply(lambda row: 1 if row['Close'] - row['Open'] > 0 else 0, axis=1)
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,RSI,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-12-31 22:45:00,1.0515,1.0515,1.0515,1.0515,2281625000.0,37.429023,0
2016-12-31 23:00:00,1.0515,1.0515,1.0515,1.0515,2281625000.0,37.429023,0
2016-12-31 23:15:00,1.0515,1.0515,1.0515,1.0515,2281625000.0,37.429023,0
2016-12-31 23:30:00,1.0515,1.0515,1.0515,1.0515,2281625000.0,37.429023,0
2016-12-31 23:45:00,1.0515,1.0515,1.0515,1.0515,2281625000.0,37.429023,0


In [10]:
data = data.drop_duplicates()

In [11]:
# Create the input features (X) and target variable (y).
X = data.drop(['Target'], axis=1)
y = data['Target']

In [12]:
# Perform feature scaling using MinMaxScaler.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, shuffle=False)


In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Let's build and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_train_pred = rf_model.predict(X_train)

train_accuracy = accuracy_score(y_train, rf_train_pred)
test_accuracy = accuracy_score(y_test, rf_pred)

print(f'RF Train Accuracy: {train_accuracy}')
print(f'RF Test Accuracy: {test_accuracy}')

# Check for underfitting or overfitting
if train_accuracy < test_accuracy:
    if abs(train_accuracy - test_accuracy) > 0.1 * train_accuracy:
        print("The model might be overfitting.")
    else:
        print("The model seems to be performing well.")
elif train_accuracy > test_accuracy:
    if abs(train_accuracy - test_accuracy) > 0.1 * test_accuracy:
        print("The model might be underfitting.")
    else:
        print("The model seems to be performing well.")
else:
    print("The model seems to be performing well.")



RF Train Accuracy: 1.0
RF Test Accuracy: 0.5967503361721201
The model might be underfitting.


In [15]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVM
svm_model = SVC(kernel='rbf', C=1e3, gamma=0.1)
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_train_pred = svm_model.predict(X_train)

train_accuracy = accuracy_score(y_train, svm_train_pred)
test_accuracy = accuracy_score(y_test, svm_pred)

print(f'SVM Train Accuracy: {train_accuracy}')
print(f'SVM Test Accuracy: {test_accuracy}')

# Check for underfitting or overfitting
if train_accuracy < test_accuracy:
    if abs(train_accuracy - test_accuracy) > 0.1 * train_accuracy:
        print("The model might be overfitting.")
    else:
        print("The model seems to be performing well.")
elif train_accuracy > test_accuracy:
    if abs(train_accuracy - test_accuracy) > 0.1 * test_accuracy:
        print("The model might be underfitting.")
    else:
        print("The model seems to be performing well.")
else:
    print("The model seems to be performing well.")

SVM Train Accuracy: 0.9562446771952368
SVM Test Accuracy: 0.9316001792917974
The model seems to be performing well.


In [None]:
from tensorflow.keras.metrics import BinaryAccuracy

# Let's create and train the LSTM model.
# Reshape the input data for the LSTM model
X_train_lstm = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

# LSTM
lstm_model = Sequential()
lstm_model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
lstm_model.add(Dropout(0.2))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dropout(0.2))
lstm_model.add(Dense(1, activation='sigmoid'))

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[BinaryAccuracy()])
lstm_model.fit(X_train_lstm, y_train, epochs=50, batch_size=32)
lstm_pred = np.round(lstm_model.predict(X_test_lstm))
lstm_train_pred = np.round(lstm_model.predict(X_train_lstm))

train_accuracy = accuracy_score(y_train, lstm_train_pred)
test_accuracy = accuracy_score(y_test, lstm_pred)

print(f'LSTM Train Accuracy: {train_accuracy}')
print(f'LSTM Test Accuracy: {test_accuracy}')

# Check for underfitting or overfitting
if train_accuracy < test_accuracy:
    if abs(train_accuracy - test_accuracy) > 0.1 * train_accuracy:
        print("The model might be overfitting.")
    else:
        print("The model seems to be performing well.")
elif train_accuracy > test_accuracy:
    if abs(train_accuracy - test_accuracy) > 0.1 * test_accuracy:
        print("The model might be underfitting.")
    else:
        print("The model seems to be performing well.")
else:
    print("The model seems to be performing well.")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50

In [None]:
# Evaluate the performance of each model using accuracy.
rf_accuracy = accuracy_score(y_test, rf_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)
lstm_accuracy = accuracy_score(y_test, lstm_pred)
print(f'Random Forest Accuracy: {rf_accuracy}')
print(f'SVM Accuracy: {svm_accuracy}')
print(f'LSTM Accuracy: {lstm_accuracy}')

# Validating the results using  cross-validation techniques.
To validate the results using cross-validation techniques, we can use TimeSeriesSplit from scikit-learn. TimeSeriesSplit is a variation of k-fold cross-validation specifically designed for time series data. Let's update the code for the Random Forest and SVM models with cross-validation.

# Performing Gridsearch CV on the 3 models

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

tscv = TimeSeriesSplit(n_splits=5)


def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

rmse_scorer = make_scorer(rmse, greater_is_better=False)
scoring_methods = {'MSE': 'neg_mean_squared_error', 'RMSE': rmse_scorer, 'MAE': 'neg_mean_absolute_error', 'R-squared': 'r2'}


In [None]:
# RF with GS CV
def calc_accuracy(y_true, y_pred):
    return accuracy_score(y_true, np.round(y_pred))

accuracy_scorer = make_scorer(calc_accuracy, greater_is_better=True)

# Random Forest with GridSearchCV
rf_param_grid = {
    'n_estimators': [200],
    'max_depth': [None],
    'min_samples_split': [2]
}

rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=tscv, scoring=accuracy_scorer, n_jobs=-1)
rf_grid.fit(X_train, y_train)
rf_best = rf_grid.best_params_
rf_best_accuracy = rf_grid.best_score_
print(f'RF Best Parms: {rf_best}')

# Random Forest with best parameters
rf_train_pred = rf_grid.predict(X_train)
rf_test_pred = rf_grid.predict(X_test)
rf_train_accuracy = calc_accuracy(y_train, rf_train_pred)
rf_test_accuracy = calc_accuracy(y_test, rf_test_pred)
print(f'RF Train Accuracy: {rf_train_accuracy}')
print(f'RF Test Accuracy: {rf_test_accuracy}')


In [None]:
# SVM with GridSearchCV
svm_param_grid = {
    'C': [1e1, 1e2, 1e3],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf']
}
# SVM Best Parms: {'C': 1000.0, 'gamma': 0.1, 'kernel': 'rbf'}
svm_grid = GridSearchCV(SVC(), svm_param_grid, cv=tscv, scoring=accuracy_scorer, n_jobs=-1)
svm_grid.fit(X_train, y_train)
svm_best = svm_grid.best_params_
svm_best_accuracy = svm_grid.best_score_
print(f'SVM Best Parms: {svm_best}')

# SVM with best parameters
svm_train_pred = svm_grid.predict(X_train)
svm_test_pred = svm_grid.predict(X_test)
svm_train_accuracy = calc_accuracy(y_train, svm_train_pred)
svm_test_accuracy = calc_accuracy(y_test, svm_test_pred)
print(f'SVM Train Accuracy: {svm_train_accuracy}')
print(f'SVM Test Accuracy: {svm_test_accuracy}')


For the LSTM model, since Keras doesn't have direct support for GridSearchCV, we need to create a custom wrapper and use KerasRegressor from keras.wrappers.scikit_learn. Let's update the code for the LSTM model with GridSearchCV.

In [None]:
from keras.wrappers.scikit_learn import KerasClassifier

# Reshape the input data for the LSTM model
X_train_lstm = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

def create_lstm_model(units=50, dropout_rate=0.2, optimizer='adam'):
    model = Sequential()
    model.add(LSTM(units=units, return_sequences=True, input_shape=(1, X_train.shape[1])))
    model.add(Dropout(dropout_rate))
    model.add(LSTM(units=units))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

lstm_model_grid = KerasClassifier(build_fn=create_lstm_model)

lstm_param_grid = {
    'units': [50],
    'dropout_rate': [0.2],
    'optimizer': ['adam'],
    'epochs': [50],
    'batch_size': [32]
}

lstm_grid = GridSearchCV(lstm_model_grid, lstm_param_grid, cv=tscv, scoring='accuracy', n_jobs=-1)
lstm_grid.fit(X_train_lstm, y_train)
lstm_best = lstm_grid.best_params_
lstm_best_accuracy = lstm_grid.best_score_
print(f'LSTM Best Params: {lstm_best}')

# LSTM with best parameters
lstm_train_pred = (lstm_grid.predict(X_train_lstm) > 0.5).astype(int)
lstm_test_pred = (lstm_grid.predict(X_test_lstm) > 0.5).astype(int)
lstm_train_accuracy = calc_accuracy(y_train, lstm_train_pred)
lstm_test_accuracy = calc_accuracy(y_test, lstm_test_pred)
print(f'LSTM Train Accuracy: {lstm_train_accuracy}')
print(f'LSTM Test Accuracy: {lstm_test_accuracy}')


def get_best_metric(cv_results, scoring_methods):
    best_scores = {}
    for metric in scoring_methods:
        if metric == 'R-squared':
            best_scores[metric] = cv_results[f'mean_test_{metric}'].max()
        else:
            best_scores[metric] = -cv_results[f'mean_test_{metric}'].min()
    best_metric = max(best_scores, key=best_scores.get)
    return best_metric

# Get the best metric for each model
rf_best_metric = get_best_metric(rf_grid.cv_results_, scoring_methods)
svm_best_metric = get_best_metric(svm_grid.cv_results_, scoring_methods)
lstm_best_metric = get_best_metric(lstm_grid.cv_results_, scoring_methods)

print("Best metric for Random Forest:", rf_best_metric)
print("Best metric for SVM:", svm_best_metric)
print("Best metric for LSTM:", lstm_best_metric)

def best_scoring_method(scores):
    best_metric = max(scores, key=scores.get)
    best_value = scores[best_metric]
    return f"{best_metric}: {best_value:.4f}"

In [None]:
def check_fit(train_accuracy, test_accuracy):
    if train_accuracy < test_accuracy:
        if abs(train_accuracy - test_accuracy) > 0.1 * train_accuracy:
            return "Overfitting"
        else:
            return "Good fit"
    elif train_accuracy > test_accuracy:
        if abs(train_accuracy - test_accuracy) > 0.1 * test_accuracy:
            return "Underfitting"
        else:
            return "Good fit"
    else:
        return "Good fit"

rf_fit = check_fit(rf_train_accuracy, rf_test_accuracy)
svm_fit = check_fit(svm_train_accuracy, svm_test_accuracy)
lstm_fit = check_fit(lstm_train_accuracy, lstm_test_accuracy)

final_results = pd.DataFrame({
    'Model': ['Random Forest', 'SVM', 'LSTM'],
    'Best Parameters': [rf_best, svm_best, lstm_best],
    'Train Accuracy': [rf_train_accuracy, svm_train_accuracy, lstm_train_accuracy],
    'Test Accuracy': [rf_test_accuracy, svm_test_accuracy, lstm_test_accuracy],
    'Fit': [rf_fit, svm_fit, lstm_fit],
})

display(final_results)


In [None]:
from keras.models import clone_model

def train_ensemble(X_train, y_train, X_test, y_test, num_models, units, dropout_rate, optimizer, epochs, batch_size):
    models = []
    preds = np.zeros((num_models, y_test.shape[0]))
    best_accuracy = 0.0
    best_model = None
    best_params = None
    
    for i in range(num_models):
        print(f"Training model {i + 1}...")
        model = create_lstm_model(units=units, dropout_rate=dropout_rate, optimizer=optimizer)
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
        models.append(model)

        test_preds = model.predict(X_test)
        preds[i, :] = test_preds[:, 0]

        # Evaluate the model's accuracy
        train_preds = model.predict(X_train)
        train_preds_binary = threshold_predictions(train_preds, threshold=0.5)
        train_accuracy = accuracy_score(y_train, train_preds_binary)
        test_accuracy = accuracy_score(y_test, threshold_predictions(test_preds, threshold=0.5))

        # Check if this model has the highest accuracy so far
        if test_accuracy > best_accuracy:
            best_accuracy = test_accuracy
            best_model = model
            best_params = {
                'units': units,
                'dropout_rate': dropout_rate,
                'optimizer': optimizer,
                'epochs': epochs,
                'batch_size': batch_size
            }

        # Output this model's accuracy
        print(f"Model {i + 1} Train Accuracy: {train_accuracy:.4f}")
        print(f"Model {i + 1} Test Accuracy: {test_accuracy:.4f}")
        
    # Output the best model's hyperparameters and accuracy
    print(f"Best Model Hyperparameters: {best_params}")
    print(f"Best Model Test Accuracy: {best_accuracy:.4f}")
    
    # Average the predictions from all models
    ensemble_preds = np.mean(preds, axis=0)
    return models, ensemble_preds

def threshold_predictions(predictions, threshold=0.5):
    return (predictions >= threshold).astype(int)

# Scale the input features
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape the data for LSTM input
X_train_lstm = np.reshape(X_train_scaled, (X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_lstm = np.reshape(X_test_scaled, (X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Use the build_lstm_model and train_ensemble functions provided in the previous response

# Set the hyperparameters for the ensemble of LSTM models
num_models = 5
units = 50
dropout_rate = 0.2
optimizer = Adam()
epochs = 50
batch_size = 32

# Train the ensemble of LSTM models and obtain the averaged predictions
ensemble_models, ensemble_preds = train_ensemble(X_train_lstm, y_train, X_test_lstm, y_test, num_models, units, dropout_rate, optimizer, epochs, batch_size)

# Convert the predictions into binary (0 or 1) using a threshold of 0.5
ensemble_preds_binary = threshold_predictions(ensemble_preds, threshold=0.5)

# Calculate the ensemble's test accuracy
ensemble_accuracy = accuracy_score(y_test, ensemble_preds_binary)
print(f"Ensemble Accuracy: {ensemble_accuracy:.4f}")

In [None]:
#Export the results
#data.to_csv('EURUSD_final.csv', index=False)
#results.to_csv('final_results.csv', index=False)

##### 