# Time Series Cross Validation Experiments

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

from torch.utils.data import DataLoader, TensorDataset

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
import os

In [9]:
# Load datasets
pwd = os.getcwd()

mi_data = pd.read_csv(pwd + '\merged_data.csv')
mbw_data = pd.read_csv(fr'C:\Users\ivane\Desktop\Dissertation\data\merged_data.csv')

# Convert date columns to datetime
mi_data['date'] = pd.to_datetime(mi_data['date'])
mbw_data['Date'] = pd.to_datetime(mbw_data['Date'])

# Sort by date
mi_data = mi_data.sort_values('date')
mbw_data = mbw_data.sort_values('Date')

# Reset index
mi_data = mi_data.reset_index(drop=True)
mbw_data = mbw_data.reset_index(drop=True)

print("Malta Independent Data Shape:", mi_data.shape)
print("Malta Business Weekly Data Shape:", mbw_data.shape)

Malta Independent Data Shape: (1001, 6)
Malta Business Weekly Data Shape: (430, 6)


# Malta Independent Experiments

## LSTM (without sentiment variables)

In [12]:
# Create sequences for LSTM
def create_sequences(data, seq_length):
    xs = []
    ys = []
    for i in range(len(data) - seq_length - 1):
        x = data[i:(i + seq_length)]
        y = data[i + seq_length, -1]  # Target is the last column (Close price)
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mi_data[['Open', 'Close', 'Change']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.1169
R2 Score: -1.6497

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

## LSTM (with sentiment variables)

In [13]:
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mi_data[['Open', 'Close', 'Change', 'majority_sentiment', 'average_sentiment']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.1980
R2 Score: -0.4911

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

## CNN-LSTM (without sentiment variables)

In [14]:
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mi_data[['Open', 'Close', 'Change']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.1072
R2 Score: -1.2555

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

## CNN-LSTM (with sentiment variables)

In [15]:
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mi_data[['Open', 'Close', 'Change', 'majority_sentiment', 'average_sentiment']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.1833
R2 Score: -0.2701

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

# Malta Business Weekly Experiments

## LSTM (without sentiment variables)

In [16]:
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mbw_data[['Open', 'Close', 'Change']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.0955
R2 Score: -0.0353

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

## LSTM (with sentiment variables)

In [17]:
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mbw_data[['Open', 'Close', 'Change', 'sentiment', 'sentiment_score']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.1938
R2 Score: -0.1109

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

## CNN-LSTM (without sentiment variables)

In [18]:
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mbw_data[['Open', 'Close', 'Change']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.0960
R2 Score: -0.0572

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

## CNN-LSTM (with sentiment variables)

In [19]:
# Scale the data
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(mbw_data[['Open', 'Close', 'Change', 'sentiment', 'sentiment_score']])

# Create sequences
seq_length = 10
X, y = create_sequences(scaled_data, seq_length)

# Initialize time series cross validation
tscv = TimeSeriesSplit(n_splits=5)

# Store results for each fold
fold_results = []

# Perform cross validation
for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
    print(f'\nFold {fold + 1}')
    
    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    
    # Build and compile model
    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    
    # Train model
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_val, y_val),
        verbose=1
    )
    
    # Make predictions
    y_pred = model.predict(X_val)
    
    # Calculate metrics
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    
    fold_results.append({
        'fold': fold + 1,
        'mae': mae,
        'r2': r2
    })
    
    print(f'Fold {fold + 1} Results:')
    print(f'MAE: {mae:.4f}')
    print(f'R2 Score: {r2:.4f}')

# Print average results
avg_results = pd.DataFrame(fold_results).mean()
print('\nAverage Results:')
print(f'Average MAE: {avg_results["mae"]:.4f}')
print(f'Average R2 Score: {avg_results["r2"]:.4f}')


Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.1979
R2 Score: -0.1355

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30

# CNN


In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# Function to create sequences
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(data[i + seq_length, -1])  # Assuming the target variable is the last column
    return np.array(X), np.array(y)

# Load your datasets (replace with actual data)
mi_data = pd.read_csv(pwd + '\merged_data.csv')
mbw_data = pd.read_csv(fr'C:\Users\ivane\Desktop\Dissertation\data\merged_data.csv')

# Convert date columns to datetime
mi_data['date'] = pd.to_datetime(mi_data['date'])
mbw_data['Date'] = pd.to_datetime(mbw_data['Date'])

# Sort by date
mi_data = mi_data.sort_values('date')
mbw_data = mbw_data.sort_values('Date')

# Reset index
mi_data = mi_data.reset_index(drop=True)
mbw_data = mbw_data.reset_index(drop=True)

print("Malta Independent Data Shape:", mi_data.shape)
print("Malta Business Weekly Data Shape:", mbw_data.shape)

# Define a function to train and evaluate the CNN model
def train_cnn(data, features, title):
    print(f"\n### {title} ###")
    
    # Scale the data
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(data[features])
    
    # Create sequences
    seq_length = 10
    X, y = create_sequences(scaled_data, seq_length)
    
    # Initialize time series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Store results for each fold
    fold_results = []
    
    # Perform cross-validation
    for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
        print(f'\nFold {fold + 1}')
        
        # Split data
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        # Build and compile model
        model = Sequential([
            Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
            MaxPooling1D(pool_size=2),
            Conv1D(filters=32, kernel_size=3, activation='relu'),
            MaxPooling1D(pool_size=2),
            Flatten(),
            Dense(50, activation='relu'),
            Dense(1)
        ])
        model.compile(optimizer='adam', loss='mean_squared_error')
        
        # Train model
        history = model.fit(
            X_train, y_train,
            epochs=50,
            batch_size=32,
            validation_data=(X_val, y_val),
            verbose=1
        )
        
        # Make predictions
        y_pred = model.predict(X_val)
        
        # Calculate metrics
        mae = mean_absolute_error(y_val, y_pred)
        r2 = r2_score(y_val, y_pred)
        
        fold_results.append({'fold': fold + 1, 'mae': mae, 'r2': r2})
        
        print(f'Fold {fold + 1} Results:')
        print(f'MAE: {mae:.4f}')
        print(f'R2 Score: {r2:.4f}')
    
    # Print average results
    avg_results = pd.DataFrame(fold_results).mean()
    print('\nAverage Results:')
    print(f'Average MAE: {avg_results["mae"]:.4f}')
    print(f'Average R2 Score: {avg_results["r2"]:.4f}')

# Train CNN models for Malta Independent (MI)
train_cnn(mi_data, ['Open', 'Close', 'Change'], "CNN (MI - Without Sentiment)")
train_cnn(mi_data, ['Open', 'Close', 'Change', 'majority_sentiment', 'average_sentiment'], "CNN (MI - With Sentiment)")

# Train CNN models for Malta Business Weekly (MBW)
train_cnn(mbw_data, ['Open', 'Close', 'Change'], "CNN (MBW - Without Sentiment)")
train_cnn(mbw_data, ['Open', 'Close', 'Change', 'sentiment', 'sentiment_score'], "CNN (MBW - With Sentiment)")


Malta Independent Data Shape: (1001, 6)
Malta Business Weekly Data Shape: (430, 6)

### CNN (MI - Without Sentiment) ###

Fold 1
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Fold 1 Results:
MAE: 0.1100
R2 Score: -1.3713

Fold 2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 2