In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import mplfinance as mpf

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import statsmodels.api as sm
from sklearn.metrics import r2_score

from shared import read_dataset

2023-11-04 19:14:10.224666: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-04 19:14:10.510179: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-04 19:14:10.510210: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-04 19:14:10.511556: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-04 19:14:10.621498: I tensorflow/core/platform/cpu_feature_g

In [2]:
start = "2010-01-01"
end = "2021-01-01"

commodities = read_dataset('../data/commodities.csv', start, end)
company = read_dataset('../data/company.csv', start, end)
company = company.drop('CS', axis=1)
fixed_income = read_dataset('../data/fixed_income.csv', start, end)
forex = read_dataset('../data/forex.csv', start, end)
GS = read_dataset('../data/GS.csv', start, end)
us_macro = read_dataset('../data/us_macro.csv', start, end)
vix = read_dataset('../data/vix.csv', start, end)

GS['Close_Diff'] = GS['Close'].diff()
GS.dropna(inplace=True)

data = pd.concat([
    # company,
    # commodities,
    # fixed_income,
    # forex,
    # us_macro,
    GS,
    # vix
    ], axis=1)

target_column_name = 'Adj Close'

# Reorder columns to have 'target_column' as the first column:
cols = [target_column_name] + [ col for col in data if col != target_column_name]
target_column = list(data.columns).index(target_column_name)
data = data[cols]

In [None]:
def evaluate_prediction_power(target_series, predictor_series):
    # Ensure the indices are datetime objects and the series are aligned by date
    target_series.index = pd.to_datetime(target_series.index)
    predictor_series.index = pd.to_datetime(predictor_series.index)
    merged_data = pd.merge(target_series, predictor_series, left_index=True, right_index=True)
    
    # Drop rows with missing or infinite values
    merged_data = merged_data.dropna().replace([np.inf, -np.inf], np.nan).dropna()

    # Perform linear regression
    X = merged_data.iloc[:, 1]  # Predictor Series
    y = merged_data.iloc[:, 0]  # Target Series
    X = sm.add_constant(X)  # Adds a constant term to the predictor

    model = sm.OLS(y, X)
    results = model.fit()

    # Evaluate the model
    predictions = results.predict(X)
    r_squared = r2_score(y, predictions)
    
    return r_squared, results.summary()

def get_top_five(dataframe, target_series):
    results_list = []
    for column in dataframe.columns:
        predictor_series = dataframe[column]
        r_squared, summary = evaluate_prediction_power(target_series, predictor_series)
        results_list.append((column, r_squared))
    sorted_results = sorted(results_list, key=lambda x: x[1], reverse=True)
    top_5_columns = [item[0] for item in sorted_results[:5]]
    return dataframe[top_5_columns]

## Using a Generative Adversarial Network (GAN)
- LSTM Generator
- CNN Discriminator


Unconventional GAN.

> Feed the LSTM Generator with the real data and make it output an y: a four days price forecast.
> Than use the CNN discriminator to see if the forecast is feasible or not.

The discriminator is trained with the real data and its forecast and has to classify binary if the forecast is real or generated.


In [8]:
# # Data refactoring
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(method='bfill', inplace=True)
data.fillna(method='ffill', inplace=True)

data_array = np.array(data.values)
target_array = np.array(data[target_column_name].values).reshape(-1, 1)

# # Data scaling
# data_array = data[:-1].values
# target_array = data[target_column_name].shift(-1).values[:-1]

# data_array = np.array([i for i in range(2000)]).reshape(-1, 1)
# target_array = np.array([a_ + 1 for a_ in data_array]).reshape(-1, 1)

scaler_data = MinMaxScaler()
scaler_data.fit(data_array)
data_array = scaler_data.transform(data_array)

scaler_target = MinMaxScaler()
scaler_target.fit(target_array)
target_array = scaler_target.transform(target_array)

# Data splitting
train_size = int(len(data_array) * 0.70)
evaluation_size = int(len(data_array) * 0.10)


def create_sequences(data, target, seq_length):
    sequence_data = []
    sequence_target = []
    for i in range(seq_length, len(data)+1):
        sequence_data.append(data[i-seq_length:i])
        sequence_target.append(target[i-1])
    return np.array(sequence_data), np.array(sequence_target)

SEQUENCE_LENGTH = 17
data_sequences, target_sequences = create_sequences(data_array, target_array, SEQUENCE_LENGTH)

shuffle_idxs = np.random.permutation(len(data_sequences))
revert_idxs = np.argsort(shuffle_idxs)

data_sequences = data_sequences[shuffle_idxs]
target_sequences = target_sequences[shuffle_idxs]

train_data, eval_data, test_data = data_sequences[:train_size], data_sequences[train_size:train_size+evaluation_size], data_sequences[train_size+evaluation_size:]
train_target, eval_target, test_target = target_sequences[:train_size], target_sequences[train_size:train_size+evaluation_size], target_sequences[train_size+evaluation_size:]


  data.fillna(method='bfill', inplace=True)
  data.fillna(method='ffill', inplace=True)


In [None]:
# last_day_trading = []
# for i in range (train_data.shape[0]):
#     w = scaler_data.inverse_transform(train_data[i])
#     last_day_trading.append(w[-1])

# denormalized_target_array = scaler_target.inverse_transform(train_target)

# print(last_day_trading)
# print(denormalized_target_array)

In [19]:
lr_d = 0.005  # discriminator learning rate
lr_g = 0.001   # generator learning rate
NOISE_DIM = 100

def build_generator():
    model = keras.Sequential()
    model.add(layers.LSTM(units=256, return_sequences=True, input_shape=(SEQUENCE_LENGTH, data.shape[1])))
    model.add(layers.Dropout(0.2))
    model.add(layers.LSTM(units=128, return_sequences=True, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.LSTM(units=64, return_sequences=True, activation='relu'))
    model.add(layers.Dropout(0.2))
    model.add(layers.LSTM(units=32, return_sequences=True))
    model.add(layers.Dropout(0.2))
    model.add(layers.LSTM(units=16, return_sequences=False))
    model.add(layers.Dense(1))
    return model

def train_just_generator(model, train_features, train_target, eval_features, eval_target, epochs=1000, batch_size=10):
    #early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20, verbose=1, restore_best_weights=True)

    model.compile(optimizer='adam', loss='mean_squared_error')
    model.fit(train_features, train_target)
    # model.fit(train_features, train_target,
    #           validation_data=(eval_features, eval_target),
    #           epochs=epochs,
    #           batch_size=batch_size,
    #           verbose=2,
    #           #callbacks=[early_stopping]
    #           )

def build_discriminator():
    # Input for the sequence
    sequence_input = keras.Input(shape=(SEQUENCE_LENGTH, data.shape[1]))
    x = layers.LSTM(64, return_sequences=False)(sequence_input)
    
    # Input for the forecasted value
    forecast_input = keras.Input(shape=(1,))
    
    # Concatenate the sequence features and the forecasted value
    merged = keras.layers.Concatenate()([x, forecast_input])
    
    x = layers.Dense(64, activation='relu')(merged)
    validity = layers.Dense(1, activation='sigmoid')(x)
    
    model = keras.Model(inputs=[sequence_input, forecast_input], outputs=validity)
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr_d), metrics=['accuracy'])
    
    return model


def build_gan(generator, discriminator):
    discriminator.trainable = False
    noise = keras.Input(shape=(SEQUENCE_LENGTH, data.shape[1]))
    generated_sequence = generator(noise)
    validity = discriminator([noise, generated_sequence])
    combined = keras.Model(noise, validity)
    combined.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr_g))
    return combined

def train_gan(data, generator, discriminator, combined, epochs=1000, batch_size=128):
    d_losses = []
    g_losses = []

    for epoch in range(epochs):
        idx = np.random.randint(0, data.shape[0], batch_size)
        real_sequences_input = data[idx, :, :]
        real_forecast = data[idx, -1, target_column].reshape(batch_size, 1)  # One-day forecast

        generated_forecast = generator.predict(real_sequences_input)

        # Train discriminator
        d_loss_real = discriminator.train_on_batch([real_sequences_input, real_forecast], np.ones((batch_size, 1)))
        d_loss_fake = discriminator.train_on_batch([real_sequences_input, generated_forecast], np.zeros((batch_size, 1)))
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

        # Train generator
        g_loss = combined.train_on_batch(real_sequences_input, np.ones((batch_size, 1)))

        d_losses.append(d_loss[0])
        g_losses.append(g_loss)
        # print(f"{epoch}/{epochs} [D loss: {d_loss[0]:.4f}] [G loss: {g_loss[0]:.4f}]")

    plt.figure(figsize=(10,5))
    plt.plot(d_losses, label="Discriminator Loss")
    plt.plot(g_losses, label="Generator Loss")
    plt.title("GAN Training Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


### Model Import

In [None]:
from tensorflow.keras.models import load_model

generator = load_model('../models/generator.h5')

### Model Generation

In [20]:
generator = build_generator()
train_just_generator(generator, train_data, train_target, eval_data, eval_target, epochs=1000, batch_size=128)



ValueError: in user code:

    File "/home/andrea/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/home/andrea/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/andrea/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/home/andrea/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1126, in train_step
        y_pred = self(x, training=True)
    File "/home/andrea/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/andrea/.local/lib/python3.10/site-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_4" is incompatible with the layer: expected shape=(None, 2, 21), found shape=(None, 17, 21)


In [None]:
generator = build_generator()
discriminator = build_discriminator()
combined = build_gan(generator, discriminator)

train_gan(train_data, generator, discriminator, combined, epochs=2000)

In [None]:
# loss = generator.evaluate(test_data, test_target)

recomposed_target = np.concatenate([train_target, eval_target, test_target])
recomposed_target = recomposed_target[revert_idxs]

predicted_target = np.concatenate([train_target, eval_target, generator.predict(test_data)])
predicted_target = predicted_target[revert_idxs]

# Reverse the scaling transformation to get the original price values
price_predicted_array = scaler_target.inverse_transform(predicted_target)
price_actual_array = scaler_target.inverse_transform(recomposed_target.reshape(-1, 1)).flatten()

# print(f"Testa data evaluation: loss {loss}")

In [None]:
# Plotting the first 200 actual vs predicted prices
plt.figure(figsize=(15, 6))

plt.plot(price_actual_array, label="Real", color='blue')
plt.scatter(range(len(price_predicted_array)), price_predicted_array, label="Predicted", color='red', marker='o')

# Title and labels
#plt.title(f"Actual vs Predicted {target_column_name}")
plt.xlabel("Time Step")
#plt.ylabel(f"{target_column_name}")
plt.legend()

plt.show()

In [None]:
generator.save('../models/light/price_generator.h5')