In [7]:
import pandas as pd

# Read the merged hourly dataset, order it chronologically, and use the
# 'datetime' column as the index -- done as one chain, with no in-place mutation.
merged_data = (
    pd.read_parquet('../datasets/final_merged_dataset_hourly.parquet')
    .sort_values('datetime')
    .set_index('datetime')
)

# Preview the first rows as a quick sanity check of the load.
print(merged_data.head())


                         Open      High       Low     Close    Volume  \
datetime                                                                
2021-01-01 00:00:00  0.250730  0.251106  0.250628  0.250493  0.076344   
2021-01-01 01:00:00  0.251433  0.251695  0.251353  0.251520  0.015258   
2021-01-01 02:00:00  0.256154  0.256011  0.255263  0.255877  0.034476   
2021-01-01 03:00:00  0.252147  0.252928  0.252351  0.253123  0.010795   
2021-01-01 04:00:00  0.254932  0.255284  0.254180  0.254066  0.023016   

                     sentiment_score  bert_sentiment  prev_close  \
datetime                                                           
2021-01-01 00:00:00              0.0             0.0    0.250493   
2021-01-01 01:00:00              0.0             0.0    0.250493   
2021-01-01 02:00:00              0.0             0.0    0.251520   
2021-01-01 03:00:00              0.0             0.0    0.255877   
2021-01-01 04:00:00              0.0             0.0    0.253123   

           

In [8]:
import numpy as np

# Columns fed to the model as input features.
feature_cols = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'sentiment_score', 'bert_sentiment',
]

# Feature matrix (one row per timestamp) and the target series ('Close').
X, y = merged_data[feature_cols].values, merged_data['Close'].values

print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")


Shape of X: (18265, 7)
Shape of y: (18265,)


In [9]:
def create_sliding_windows(X, y, input_window=24, output_window=1):
    """Slice two aligned series into overlapping (input, target) windows.

    Window ``i`` pairs the inputs ``X[i : i + input_window]`` with the targets
    that immediately follow them,
    ``y[i + input_window : i + input_window + output_window]``. Consecutive
    windows are shifted by one step.

    Args:
        X: Array-like whose first axis is the step axis,
            e.g. shape ``(n_steps, n_features)``.
        y: Array-like of targets aligned with ``X`` along the first axis.
        input_window: Number of consecutive steps in each input window.
        output_window: Number of target steps taken right after each input
            window.

    Returns:
        Tuple ``(X_seq, y_seq)`` of NumPy arrays, each holding
        ``len(X) - input_window - output_window + 1`` windows:
        ``X_seq`` has shape ``(n_windows, input_window, *X.shape[1:])`` and
        ``y_seq`` has shape ``(n_windows, output_window, *y.shape[1:])``.
        When the series is too short to fit a single window, both arrays have
        zero windows but keep their trailing dimensions, so downstream shape
        lookups (e.g. ``X_seq.shape[2]``) still work.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_windows = len(X) - input_window - output_window + 1

    if n_windows <= 0:
        # No complete window fits. Stacking an empty list would give shape
        # (0,), so build empty arrays with the expected trailing axes instead.
        return (
            np.empty((0, input_window) + X.shape[1:], dtype=X.dtype),
            np.empty((0, output_window) + y.shape[1:], dtype=y.dtype),
        )

    X_seq = [X[start:start + input_window] for start in range(n_windows)]
    y_seq = [
        y[start + input_window:start + input_window + output_window]
        for start in range(n_windows)
    ]
    return np.array(X_seq), np.array(y_seq)

# Window configuration: 24 hourly steps as input, the next 1 hour as target.
input_window, output_window = 24, 1

# Build the windowed inputs and targets from the feature matrix and target series.
X_seq, y_seq = create_sliding_windows(
    X, y, input_window=input_window, output_window=output_window
)

print(f"Shape of X_seq: {X_seq.shape}")
print(f"Shape of y_seq: {y_seq.shape}")


Shape of X_seq: (18241, 24, 7)
Shape of y_seq: (18241, 1)


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows for testing. shuffle=False keeps the original
# (time-sorted) order, so the split is not randomised.
X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, shuffle=False)

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


Training samples: 14592
Testing samples: 3649


In [11]:
import torch
import torch.nn as nn
from informer.models.informer import Informer

# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'informer.models'", raised by the
# Informer import above -- confirm the package is installed and that the
# import path matches its layout before relying on this cell.

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# --- Hyperparameters ---------------------------------------------------------
# Dimensions taken from the training windows: axis 1 is the input sequence
# length (24), axis 2 is the number of features.
seq_len, enc_in = X_train.shape[1], X_train.shape[2]
dec_in = enc_in        # decoder uses the same input size as the encoder
c_out = 1              # output size (predict Close price)
label_len = 12         # how many previous labels are known to the decoder
out_len = 1            # predict next 1 hour

# Model capacity and regularisation.
d_model = 512          # hidden size
n_heads = 8
e_layers = 2
d_layers = 1
dropout = 0.05
factor = 5             # ProbSparse attention factor

# Defined here but not used within this cell.
learning_rate = 1e-3

# --- Model ---------------------------------------------------------------------
# Gather the constructor arguments in one place, then build the model and
# move it to the selected device.
informer_config = dict(
    enc_in=enc_in,
    dec_in=dec_in,
    c_out=c_out,
    seq_len=seq_len,
    label_len=label_len,
    out_len=out_len,
    d_model=d_model,
    n_heads=n_heads,
    e_layers=e_layers,
    d_layers=d_layers,
    dropout=dropout,
    factor=factor,
)
model = Informer(**informer_config, device=device).to(device)

print(model)


ModuleNotFoundError: No module named 'informer.models'