In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import torch.optim as optim

# ---------------------------------------------------
# 1. TemporalBlock Class with Corrected Padding
# ---------------------------------------------------
class TemporalBlock(nn.Module):
    """
    Residual block made of two dilated 1-D convolutions.

    Each convolution is followed by normalization, the activation and dropout.
    The block input (sent through a 1x1 convolution when the channel count
    changes) is added to the result before a final activation.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by both convolutions.
        kernel_size: Kernel size of both convolutions.
        stride: Stride of both convolutions.
        dilation: Dilation of both convolutions.
        padding: Zero padding each convolution adds on both sides. The residual
            addition in ``forward`` requires the convolutions to keep the
            sequence length, so callers must pick a matching value.
        dropout: Dropout probability applied after each activation.
        activation: 'GELU', 'LeakyReLU' or 'ELU'; any other value selects ReLU.
        normalization: 'BatchNorm' or 'LayerNorm'; any other value disables
            normalization.
    """

    # Activation name -> module class; unknown names fall back to ReLU.
    _ACTIVATIONS = {'GELU': nn.GELU, 'LeakyReLU': nn.LeakyReLU, 'ELU': nn.ELU}

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        dilation: int,
        padding: int,
        dropout: float,
        activation: str = 'GELU',
        normalization: str = 'BatchNorm'
    ):
        super(TemporalBlock, self).__init__()

        self.activation = self._ACTIVATIONS.get(activation, nn.ReLU)()

        # One normalization layer per convolution
        if normalization == 'BatchNorm':
            self.norm1 = nn.BatchNorm1d(out_channels)
            self.norm2 = nn.BatchNorm1d(out_channels)
        elif normalization == 'LayerNorm':
            # Normalizes over the channel axis; see _normalize for the layout handling.
            self.norm1 = nn.LayerNorm(out_channels)
            self.norm2 = nn.LayerNorm(out_channels)
        else:
            self.norm1 = nn.Identity()
            self.norm2 = nn.Identity()

        self.conv1 = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation
        )
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(
            out_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation
        )
        self.dropout2 = nn.Dropout(dropout)

        # 1x1 convolution so the residual matches out_channels; not needed when
        # the channel count is unchanged.
        self.downsample = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=1
        ) if in_channels != out_channels else None

        self.init_weights()

    def init_weights(self):
        """
        Re-initialize the convolution weights with Kaiming-normal.

        torch's gain table has no entry for GELU or ELU, so both use the 'relu'
        gain as an approximation; LeakyReLU uses 'leaky_relu'. The 1x1 residual
        convolution is followed by no activation and uses the 'linear' gain.
        """
        if isinstance(self.activation, nn.LeakyReLU):
            init_nonlinearity = 'leaky_relu'
        else:
            init_nonlinearity = 'relu'

        nn.init.kaiming_normal_(self.conv1.weight, nonlinearity=init_nonlinearity)
        nn.init.kaiming_normal_(self.conv2.weight, nonlinearity=init_nonlinearity)

        if self.downsample is not None:
            nn.init.kaiming_normal_(self.downsample.weight, nonlinearity='linear')

    @staticmethod
    def _normalize(norm, out):
        """Apply ``norm`` to a channels-first tensor ``(..., channels, seq_len)``."""
        if isinstance(norm, nn.LayerNorm):
            # LayerNorm normalizes the trailing dimension, so move channels last
            # for the call and restore the channels-first layout afterwards.
            return norm(out.transpose(-1, -2)).transpose(-1, -2)
        return norm(out)

    def forward(self, x):
        """
        Run the block.

        Args:
            x: Tensor of shape (batch_size, in_channels, seq_len).

        Returns:
            Tensor with ``out_channels`` channels; the residual addition requires
            the convolutions to have preserved the sequence length.
        """
        out = self.conv1(x)
        out = self._normalize(self.norm1, out)
        out = self.activation(out)
        out = self.dropout1(out)

        out = self.conv2(out)
        out = self._normalize(self.norm2, out)
        out = self.activation(out)
        out = self.dropout2(out)

        residual = x if self.downsample is None else self.downsample(x)
        return self.activation(out + residual)

# ---------------------------------------------------
# 2. PositionalEncoding Class
# ---------------------------------------------------
class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding for sequence-first tensors.

    Even feature indices hold sines and odd indices hold cosines of the
    position scaled by geometrically spaced frequencies. The table is stored
    as the non-trainable buffer ``pe`` of shape (max_len, 1, d_model).

    Args:
        d_model: Feature size of the tensors the encoding is added to
            (odd sizes are supported).
        max_len: Number of positions precomputed; inputs to ``forward`` must
            not be longer than this.
    """
    def __init__(self, d_model: int, max_len: int = 5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))  # (ceil(d_model / 2),)
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # Even indices
        # With an odd d_model there is one odd column fewer than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # Odd indices
        self.register_buffer('pe', pe.unsqueeze(1))  # (max_len, 1, d_model)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, d_model) with
                seq_len <= max_len.
        Returns:
            Tensor of the same shape with the positional encoding added.
        """
        # The singleton batch dimension of the buffer broadcasts over the batch.
        return x + self.pe[:x.size(0)]

# ---------------------------------------------------
# 3. TCNTransformer Model Class
# ---------------------------------------------------
class TCNTransformer(nn.Module):
    """
    Temporal Convolutional Network followed by a Transformer encoder for
    time-series classification.

    The input passes through a stack of TemporalBlock layers with dilation
    doubling per level, is optionally projected to the transformer width,
    receives a sinusoidal positional encoding, is processed by the Transformer
    encoder, mean-pooled over time and classified by a linear head.

    Args:
        num_inputs: Number of input features (channels).
        num_tcn_channels: Output channels of each TCN level, in order.
        tcn_kernel_size: Kernel size of every TCN convolution; must be odd so
            the symmetric padding preserves the sequence length.
        tcn_dropout: Dropout probability inside the TCN blocks.
        transformer_hidden_size: Model width of the Transformer encoder.
        transformer_num_heads: Number of attention heads.
        transformer_num_layers: Number of encoder layers.
        transformer_dropout: Dropout probability in the encoder and the head.
        num_classes: Number of output classes.
        activation: Activation name forwarded to TemporalBlock.
        normalization: Normalization name forwarded to TemporalBlock.
        max_seq_len: Number of positions prepared by the positional encoding.

    Raises:
        ValueError: If ``tcn_kernel_size`` is even.
    """
    def __init__(
        self,
        num_inputs: int,
        num_tcn_channels: list,
        tcn_kernel_size: int,
        tcn_dropout: float,
        transformer_hidden_size: int,
        transformer_num_heads: int,
        transformer_num_layers: int,
        transformer_dropout: float,
        num_classes: int,
        activation: str = 'GELU',
        normalization: str = 'BatchNorm',
        max_seq_len: int = 1000
    ):
        super(TCNTransformer, self).__init__()

        # With an even kernel, (kernel_size - 1) * dilation is odd at dilation 1,
        # so the floored padding below shortens the sequence and the residual
        # addition inside TemporalBlock no longer lines up.
        if tcn_kernel_size % 2 == 0:
            raise ValueError(
                f"tcn_kernel_size must be odd to preserve the sequence length, got {tcn_kernel_size}"
            )

        # Build TCN layers
        layers = []
        num_levels = len(num_tcn_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_tcn_channels[i-1]
            out_channels = num_tcn_channels[i]
            # Symmetric padding that keeps the sequence length at stride 1
            padding = (tcn_kernel_size - 1) * dilation_size // 2
            layers += [TemporalBlock(
                in_channels,
                out_channels,
                tcn_kernel_size,
                stride=1,
                dilation=dilation_size,
                padding=padding,
                dropout=tcn_dropout,
                activation=activation,
                normalization=normalization
            )]
        self.tcn = nn.Sequential(*layers)  # Output shape: (batch_size, num_tcn_channels[-1], seq_len)

        # The encoding is added AFTER the projection in forward(), so it has to
        # be sized for the transformer width rather than the last TCN width.
        self.positional_encoding = PositionalEncoding(d_model=transformer_hidden_size, max_len=max_seq_len)

        # Projection to transformer hidden size if necessary
        if num_tcn_channels[-1] != transformer_hidden_size:
            self.projection = nn.Linear(num_tcn_channels[-1], transformer_hidden_size)
        else:
            self.projection = nn.Identity()

        # Transformer Encoder (sequence-first layout, the layer's default)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=transformer_hidden_size,
            nhead=transformer_num_heads,
            dim_feedforward=transformer_hidden_size * 4,
            dropout=transformer_dropout,
            activation='gelu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=transformer_num_layers)

        # Classification Head
        self.fc = nn.Sequential(
            nn.Dropout(transformer_dropout),
            nn.Linear(transformer_hidden_size, num_classes)
        )

        self.init_weights()

    def init_weights(self):
        """
        Xavier-initialize the projection (when present) and the linear layers of
        the classification head, and zero their biases.
        """
        if not isinstance(self.projection, nn.Identity):
            nn.init.xavier_uniform_(self.projection.weight)
            nn.init.zeros_(self.projection.bias)
        for layer in self.fc:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """
        Forward pass through the TCN-Transformer model.

        Args:
            x: Input tensor of shape (batch_size, num_features, seq_len)

        Returns:
            Output logits of shape (batch_size, num_classes)
        """
        # Pass through TCN
        tcn_out = self.tcn(x)  # (batch_size, num_tcn_channels[-1], seq_len)

        # Permute for Transformer: (seq_len, batch_size, num_tcn_channels[-1])
        tcn_out = tcn_out.permute(2, 0, 1)

        # Project to transformer hidden size if necessary
        transformer_input = self.projection(tcn_out)  # (seq_len, batch_size, transformer_hidden_size)

        # Add positional encoding
        transformer_input = self.positional_encoding(transformer_input)  # (seq_len, batch_size, transformer_hidden_size)

        # Pass through Transformer Encoder
        transformer_out = self.transformer_encoder(transformer_input)  # (seq_len, batch_size, transformer_hidden_size)

        # Aggregate over time by averaging the sequence dimension
        transformer_out = transformer_out.mean(dim=0)  # (batch_size, transformer_hidden_size)

        # Classification Head
        out = self.fc(transformer_out)  # (batch_size, num_classes)

        return out

# ---------------------------------------------------
# 4. TimeSeriesDataset Class
# ---------------------------------------------------
class TimeSeriesDataset(Dataset):
    """Dataset of fixed-length sequences served channels-first, with integer labels."""

    def __init__(self, sequences, labels):
        """
        Copy the inputs into tensors.

        Args:
            sequences: Numpy array of shape (num_samples, seq_len, num_features);
                stored as float32.
            labels: Numpy array of shape (num_samples,); stored as int64.
        """
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.sequences = torch.tensor(sequences, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return self.sequences.shape[0]

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` with the sequence as (num_features, seq_len)."""
        sample = self.sequences[idx]
        target = self.labels[idx]
        # Swap the two axes so the features come first, as the TCN input expects
        return sample.transpose(0, 1), target

# ---------------------------------------------------
# 5. Example Usage and Verification
# ---------------------------------------------------
def main():
    """
    Build a TCNTransformer with example hyperparameters, print the model, run a
    single forward pass on random input and print the shape of the output.
    """
    batch_size = 32
    sequence_length = 100  # Number of time steps (n minutes candles)

    # All constructor arguments in one place
    hyperparams = dict(
        num_inputs=10,                      # Number of features
        num_tcn_channels=[64, 128, 256],    # TCN channels per layer
        tcn_kernel_size=3,
        tcn_dropout=0.3,
        transformer_hidden_size=256,        # Equal to the last TCN width here, so no projection layer
        transformer_num_heads=8,
        transformer_num_layers=3,
        transformer_dropout=0.1,
        num_classes=3,                      # long, short, flat
        activation='GELU',
        normalization='BatchNorm',
        max_seq_len=sequence_length,        # Positional encoding covers the whole sequence
    )
    model = TCNTransformer(**hyperparams)

    # Show the architecture
    print(model)

    # Random batch in the layout the model expects: (batch_size, num_features, seq_len)
    x = torch.rand(batch_size, hyperparams['num_inputs'], sequence_length)

    # Logits come back as (batch_size, num_classes)
    output = model(x)
    print(output.shape)


In [None]:

# Run the smoke test defined in main().
main()
# Disabled call; no train_and_evaluate definition is visible in this part of the notebook.
# train_and_evaluate()

In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

# ------------------------------
# 1. Load and Inspect the Data
# ------------------------------

# Location of the minute-level candle export
file_path = '../Data/Binance_BTCUSDT_2024_minute — копия.csv'

# Stop early with an explicit message when the export is missing
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist.")

# Read the raw candles and preview the first rows
df = pd.read_csv(file_path)
print("Initial Data:")
print(df.head())

# ------------------------------
# 2. Preprocess the Data
# ------------------------------

# Parse the timestamps, index the frame by them and order it chronologically
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

# Independent copy holding only the price columns
price_columns = ['open', 'high', 'low', 'close']
ochl_df = df[price_columns].copy()

# Per-row range (high - low) and its rolling mean over 500 rows
ochl_df['volatility'] = ochl_df['high'] - ochl_df['low']
ochl_df['avg_volatility'] = ochl_df['volatility'].rolling(window=500).mean()

# Hourly series holding the last value of each hourly bin.
# NOTE(review): pandas labels hourly bins with their left edge by default, so
# each index value should be the START of the hour it summarizes - confirm.
hourly_close = ochl_df['close'].resample('h').last()
hourly_avg_volatility = ochl_df['avg_volatility'].resample('h').last()

# Close of the following hourly bin, aligned with the current bin
next_hour_close = hourly_close.shift(-1)

# One row per hour; rows with any missing value are discarded (this includes
# the final hour, which has no following close)
label_df = pd.DataFrame({
    'current_close': hourly_close,
    'next_close': next_hour_close,
    'avg_volatility': hourly_avg_volatility
}).dropna()

# Define labels based on the criteria
def assign_label(row):
    """
    Classify one hourly row by where the next close falls relative to a band
    of +/- avg_volatility around the current close.

    Args:
        row: Mapping with 'current_close', 'next_close' and 'avg_volatility'.

    Returns:
        'long' when next_close is above the band, 'short' when it is below,
        'flat' otherwise (band edges included).
    """
    upper = row['current_close'] + row['avg_volatility']
    lower = row['current_close'] - row['avg_volatility']
    if row['next_close'] > upper:
        return 'long'
    if row['next_close'] < lower:
        return 'short'
    return 'flat'

# Label every hourly row by applying assign_label row-wise (axis=1).
label_df['label'] = label_df.apply(assign_label, axis=1)

# ------------------------------
# 3. Generate Input Sequences
# ------------------------------

# Collect one (window, label) pair per usable hourly row
sequences = []
labels = []

window_minutes = 500
feature_columns = ['open', 'high', 'low', 'close']
one_minute = timedelta(minutes=1)

# Rows are visited in index order, so the samples stay chronological.
for idx, label in zip(label_df.index, label_df['label']):
    # resample('h') labels each bin with its LEFT edge: idx is the start of the
    # hour whose last close is 'current_close'. That hour therefore ends one
    # hour after idx; using idx itself would end the window 60 minutes before
    # the close the label is measured from.
    current_hour_end = idx + timedelta(hours=1)

    # Start of the window that ends with the hour's final minute
    start_time = current_hour_end - timedelta(minutes=window_minutes)

    # Skip samples whose window would begin at a timestamp missing from the data
    if start_time not in ochl_df.index:
        continue

    # Both slice bounds are inclusive, hence the one-minute step back
    window_data = ochl_df.loc[start_time:current_hour_end - one_minute, feature_columns]

    # Keep only windows with exactly one row per minute (no gaps)
    if len(window_data) == window_minutes:
        sequences.append(window_data.values)  # Shape: (500, 4)
        labels.append(label)

# Convert lists to numpy arrays
sequences = np.array(sequences)  # Shape: (num_samples, 500, 4)
labels = np.array(labels)        # Shape: (num_samples,)

# ------------------------------
# 4. Encode Labels
# ------------------------------

# Encode the string labels as integer class ids
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

# ------------------------------
# 5. Split the Data
# ------------------------------

# Chronological 80/10/10 split. The samples are ordered in time and consecutive
# windows share most of their rows, so a shuffled split would place nearly
# identical windows in train and in validation/test and inflate the metrics.
num_samples = len(sequences)
train_end = int(num_samples * 0.8)
val_end = int(num_samples * 0.9)

# Drop the samples right after each boundary whose input window still overlaps
# the previous partition: ceil(window_minutes / 60) samples, assuming the
# samples are one hour apart (gaps in the data only make the margin larger).
window_minutes = sequences.shape[1]
purge = -(-window_minutes // 60)

X_train, y_train = sequences[:train_end], encoded_labels[:train_end]

# Everything after the training partition (kept for inspection)
X_temp, y_temp = sequences[train_end:], encoded_labels[train_end:]

X_val = sequences[train_end + purge:val_end]
y_val = encoded_labels[train_end + purge:val_end]

X_test = sequences[val_end + purge:]
y_test = encoded_labels[val_end + purge:]

# ------------------------------
# 6. Create PyTorch Datasets and DataLoaders
# ------------------------------

class TimeSeriesDataset(Dataset):
    """
    Dataset of fixed-length sequences with integer labels.

    Samples are returned channels-first, i.e. (num_features, seq_len), because
    TCNTransformer.forward expects batches of shape
    (batch_size, num_features, seq_len). Returning the stored
    (seq_len, num_features) layout makes the model's first Conv1d fail on the
    channel dimension.
    """
    def __init__(self, sequences, labels):
        """
        Args:
            sequences (numpy.ndarray): Array of shape (num_samples, seq_len, num_features)
            labels (numpy.ndarray): Array of shape (num_samples,)
        """
        self.sequences = torch.tensor(sequences, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(sequence, label)`` with the sequence as (num_features, seq_len)."""
        # Transpose to (num_features, seq_len) for the TCN input
        return self.sequences[idx].permute(1, 0), self.labels[idx]

# Mini-batch size shared by the three loaders
batch_size = 32

# Wrap each split in a dataset
train_dataset, val_dataset, test_dataset = (
    TimeSeriesDataset(features, targets)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=batch_size, shuffle=False)
    for split in (val_dataset, test_dataset)
)

# ------------------------------
# 7. Verify the DataLoaders
# ------------------------------

# Pull one training batch and report its shapes; the sequence layout is
# whatever TimeSeriesDataset.__getitem__ returns, stacked along a new batch axis
sample_sequence, sample_label = next(iter(train_loader))
print(f"Sample sequence shape: {sample_sequence.shape}")
print(f"Sample label shape: {sample_label.shape}")        # Expected: (batch_size,)
print(f"Sample labels: {sample_label}")                  # Tensor of labels

# ------------------------------
# 8. Summary of Label Encoding
# ------------------------------

# LabelEncoder stores the classes in sorted order: ['flat', 'long', 'short']
# when all three labels occur
print(f"Classes: {le.classes_}")


In [None]:
# Hyperparameters (they have to agree with the prepared data)
num_inputs = 4  # OCHL features
num_tcn_channels = [64, 128, 256]
tcn_kernel_size = 3
tcn_dropout = 0.3
transformer_hidden_size = 256
transformer_num_heads = 8
transformer_num_layers = 3
transformer_dropout = 0.1
num_classes = 3  # 'long', 'short', 'flat'
max_seq_len = 500  # Number of minutes in the input sequence

# Gather the constructor arguments, then build the model from them
model_kwargs = dict(
    num_inputs=num_inputs,
    num_tcn_channels=num_tcn_channels,
    tcn_kernel_size=tcn_kernel_size,
    tcn_dropout=tcn_dropout,
    transformer_hidden_size=transformer_hidden_size,
    transformer_num_heads=transformer_num_heads,
    transformer_num_layers=transformer_num_layers,
    transformer_dropout=transformer_dropout,
    num_classes=num_classes,
    activation='GELU',
    normalization='BatchNorm',
    max_seq_len=max_seq_len,
)
model = TCNTransformer(**model_kwargs)

# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


In [None]:
from sklearn.preprocessing import StandardScaler

# Flatten the training windows to (num_samples * seq_len, num_features) so each
# feature column gets a single mean and standard deviation
X_train_reshaped = X_train.reshape(-1, X_train.shape[-1])

# Fit on the training split only; fit() returns the fitted scaler itself
scaler = StandardScaler().fit(X_train_reshaped)

# Define a function to scale sequences
def scale_sequences(sequences, scaler):
    """
    Apply a fitted scaler to every time step of a batch of sequences.

    Args:
        sequences: Array of shape (num_samples, seq_len, num_features).
        scaler: Object whose ``transform`` accepts a 2-D array of shape
            (rows, num_features).

    Returns:
        The transformed values reshaped to (num_samples, seq_len, num_features).
    """
    # Unpacking also enforces that the input has exactly three dimensions
    n_samples, n_steps, n_features = sequences.shape
    flat = sequences.reshape(-1, n_features)
    scaled_flat = scaler.transform(flat)
    return scaled_flat.reshape(n_samples, n_steps, n_features)

# Scale every split with the scaler fitted on the training data
X_train_scaled, X_val_scaled, X_test_scaled = (
    scale_sequences(split, scaler) for split in (X_train, X_val, X_test)
)

# Rebuild the datasets on top of the scaled arrays
train_dataset, val_dataset, test_dataset = (
    TimeSeriesDataset(features, targets)
    for features, targets in (
        (X_train_scaled, y_train),
        (X_val_scaled, y_val),
        (X_test_scaled, y_test),
    )
)

# Rebuild the loaders; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader, test_loader = (
    DataLoader(split, batch_size=32, shuffle=False)
    for split in (val_dataset, test_dataset)
)


In [None]:
# Sanity check: the scaler was fitted on these values, so the statistics over
# the whole array should print as approximately
#   Mean: 0.0000
#   Std: 1.0000
print("Scaled training data statistics:")
train_mean = X_train_scaled.mean()
train_std = X_train_scaled.std()
print(f"Mean: {train_mean:.4f}")
print(f"Std: {train_std:.4f}")


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import torch.optim as optim

# Class weights inversely proportional to class frequency in the training
# labels, to counter class imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

# Convert class weights to a tensor on the model's device
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# Weighted cross-entropy on the raw logits
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

# AdamW with a small decoupled weight decay
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Halve the learning rate after 5 epochs without improvement of the monitored
# (minimized) metric. `verbose` is intentionally not passed: it is deprecated
# since PyTorch 2.2 and newer releases may reject it. Read the current rate
# from optimizer.param_groups[0]['lr'] when needed.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5
)


In [None]:
from sklearn.metrics import classification_report

# Number of epochs
num_epochs = 50

# Path to save the best model
checkpoint_path = 'best_tcn_transformer_model.pth'

# Best validation F1 so far. Starts below any attainable score so the first
# epoch always writes a checkpoint, even if its F1 is exactly 0.
best_f1 = -1.0

# Fixed label set for the report: without it, classification_report derives the
# classes from the data and raises when fewer classes than target_names appear.
class_ids = np.arange(len(le.classes_))

for epoch in range(num_epochs):
    # -------------------
    # Training Phase
    # -------------------
    model.train()
    running_loss = 0.0
    for sequences_batch, labels_batch in train_loader:
        # Move data to device; the model expects (batch_size, num_features, seq_len)
        sequences_batch = sequences_batch.to(device)
        labels_batch = labels_batch.to(device)        # Shape: (batch_size,)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(sequences_batch)  # Shape: (batch_size, num_classes)

        # Compute loss
        loss = criterion(outputs, labels_batch)

        # Backward pass and optimization
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
        optimizer.step()

        # Weight the batch loss by the batch size so the epoch figure is a
        # per-sample average even when the last batch is smaller
        running_loss += loss.item() * sequences_batch.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)

    # -------------------
    # Validation Phase
    # -------------------
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for sequences_batch, labels_batch in val_loader:
            # Move data to device
            sequences_batch = sequences_batch.to(device)
            labels_batch = labels_batch.to(device)

            # Forward pass
            outputs = model(sequences_batch)

            # Compute loss
            loss = criterion(outputs, labels_batch)
            val_loss += loss.item() * sequences_batch.size(0)

            # Predicted class = index of the largest logit
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels_batch.cpu().numpy())

    val_loss /= len(val_loader.dataset)

    # Support-weighted F1 over all classes; zero_division=0 scores classes that
    # are never predicted as 0 without emitting a warning each epoch
    report = classification_report(
        all_labels,
        all_preds,
        labels=class_ids,
        target_names=le.classes_,
        output_dict=True,
        zero_division=0
    )
    f1_score_val = report['weighted avg']['f1-score']

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_loss:.4f} | Val Loss: {val_loss:.4f} | Val F1-Score: {f1_score_val:.4f}")

    # Adjust learning rate based on validation loss
    scheduler.step(val_loss)

    # Save the model if it has the best F1-Score so far
    if f1_score_val > best_f1:
        best_f1 = f1_score_val
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Saved Best Model at Epoch {epoch+1} with F1-Score: {f1_score_val:.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Load the best checkpoint. map_location places the tensors on the current
# device, so a checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# Switch to evaluation mode (disables dropout, uses BatchNorm running statistics)
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for sequences_batch, labels_batch in test_loader:
        # Only the inputs are needed on the device; the labels are just collected
        sequences_batch = sequences_batch.to(device)

        # Forward pass
        outputs = model(sequences_batch)

        # Predicted class = index of the largest logit
        _, preds = torch.max(outputs, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels_batch.numpy())

# Fixed label set so the matrix always has one row/column per known class and
# the report does not raise when a class is absent from labels and predictions
class_ids = np.arange(len(le.classes_))

# Compute confusion matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)
print("Confusion Matrix:")
print(conf_matrix)

# Compute classification report
class_report = classification_report(
    all_labels,
    all_preds,
    labels=class_ids,
    target_names=le.classes_,
    zero_division=0
)
print("\nClassification Report:")
print(class_report)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ---------------------------------------------------
# 1. TemporalBlock Class with Corrected Padding
# ---------------------------------------------------
class TemporalBlock(nn.Module):
    """
    Residual block made of two dilated 1-D convolutions.

    Each convolution is followed by normalization, the activation and dropout.
    The block input (sent through a 1x1 convolution when the channel count
    changes) is added to the result before a final activation.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by both convolutions.
        kernel_size: Kernel size of both convolutions.
        stride: Stride of both convolutions.
        dilation: Dilation of both convolutions.
        padding: Zero padding each convolution adds on both sides. The residual
            addition in ``forward`` requires the convolutions to keep the
            sequence length, so callers must pick a matching value.
        dropout: Dropout probability applied after each activation.
        activation: 'GELU', 'LeakyReLU' or 'ELU'; any other value selects ReLU.
        normalization: 'BatchNorm' or 'LayerNorm'; any other value disables
            normalization.
    """

    # Activation name -> module class; unknown names fall back to ReLU.
    _ACTIVATIONS = {'GELU': nn.GELU, 'LeakyReLU': nn.LeakyReLU, 'ELU': nn.ELU}

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int,
        stride: int,
        dilation: int,
        padding: int,
        dropout: float,
        activation: str = 'GELU',
        normalization: str = 'BatchNorm'
    ):
        super(TemporalBlock, self).__init__()

        self.activation = self._ACTIVATIONS.get(activation, nn.ReLU)()

        # One normalization layer per convolution
        if normalization == 'BatchNorm':
            self.norm1 = nn.BatchNorm1d(out_channels)
            self.norm2 = nn.BatchNorm1d(out_channels)
        elif normalization == 'LayerNorm':
            # Normalizes over the channel axis; see _normalize for the layout handling.
            self.norm1 = nn.LayerNorm(out_channels)
            self.norm2 = nn.LayerNorm(out_channels)
        else:
            self.norm1 = nn.Identity()
            self.norm2 = nn.Identity()

        self.conv1 = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation
        )
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(
            out_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation
        )
        self.dropout2 = nn.Dropout(dropout)

        # 1x1 convolution so the residual matches out_channels; not needed when
        # the channel count is unchanged.
        self.downsample = nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=1
        ) if in_channels != out_channels else None

        self.init_weights()

    def init_weights(self):
        """
        Re-initialize the convolution weights with Kaiming-normal.

        torch's gain table has no entry for GELU or ELU, so both use the 'relu'
        gain as an approximation; LeakyReLU uses 'leaky_relu'. The 1x1 residual
        convolution is followed by no activation and uses the 'linear' gain.
        """
        if isinstance(self.activation, nn.LeakyReLU):
            init_nonlinearity = 'leaky_relu'
        else:
            init_nonlinearity = 'relu'

        nn.init.kaiming_normal_(self.conv1.weight, nonlinearity=init_nonlinearity)
        nn.init.kaiming_normal_(self.conv2.weight, nonlinearity=init_nonlinearity)

        if self.downsample is not None:
            nn.init.kaiming_normal_(self.downsample.weight, nonlinearity='linear')

    @staticmethod
    def _normalize(norm, out):
        """Apply ``norm`` to a channels-first tensor ``(..., channels, seq_len)``."""
        if isinstance(norm, nn.LayerNorm):
            # LayerNorm normalizes the trailing dimension, so move channels last
            # for the call and restore the channels-first layout afterwards.
            return norm(out.transpose(-1, -2)).transpose(-1, -2)
        return norm(out)

    def forward(self, x):
        """
        Run the block.

        Args:
            x: Tensor of shape (batch_size, in_channels, seq_len).

        Returns:
            Tensor with ``out_channels`` channels; the residual addition requires
            the convolutions to have preserved the sequence length.
        """
        out = self.conv1(x)
        out = self._normalize(self.norm1, out)
        out = self.activation(out)
        out = self.dropout1(out)

        out = self.conv2(out)
        out = self._normalize(self.norm2, out)
        out = self.activation(out)
        out = self.dropout2(out)

        residual = x if self.downsample is None else self.downsample(x)
        return self.activation(out + residual)

# ---------------------------------------------------
# 2. PositionalEncoding Class
# ---------------------------------------------------
class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding for sequence-first tensors.

    Even feature indices hold sines and odd indices hold cosines of the
    position scaled by geometrically spaced frequencies. The table is stored
    as the non-trainable buffer ``pe`` of shape (max_len, 1, d_model).

    Args:
        d_model: Feature size of the tensors the encoding is added to
            (odd sizes are supported).
        max_len: Number of positions precomputed; inputs to ``forward`` must
            not be longer than this.
    """
    def __init__(self, d_model: int, max_len: int = 5000):
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))  # (ceil(d_model / 2),)
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)  # Even indices
        # With an odd d_model there is one odd column fewer than even columns,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])  # Odd indices
        self.register_buffer('pe', pe.unsqueeze(1))  # (max_len, 1, d_model)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, d_model) with
                seq_len <= max_len.
        Returns:
            Tensor of the same shape with the positional encoding added.
        """
        # The singleton batch dimension of the buffer broadcasts over the batch.
        return x + self.pe[:x.size(0)]

# ---------------------------------------------------
# 3. TCNTransformer Model Class
# ---------------------------------------------------
class TCNTransformer(nn.Module):
    """
    Temporal Convolutional Network combined with Transformer Encoder for Time Series Classification.

    Pipeline: stacked TemporalBlocks (dilation doubles at every level) ->
    optional linear projection to the transformer width -> sinusoidal
    positional encoding -> Transformer encoder -> mean over the sequence ->
    dropout + linear classification head.
    """
    def __init__(
        self, 
        num_inputs: int, 
        num_tcn_channels: list, 
        tcn_kernel_size: int, 
        tcn_dropout: float, 
        transformer_hidden_size: int, 
        transformer_num_heads: int, 
        transformer_num_layers: int, 
        transformer_dropout: float, 
        num_classes: int,
        activation: str = 'GELU',
        normalization: str = 'BatchNorm',
        max_seq_len: int = 1000
    ):
        """
        Args:
            num_inputs: Number of input feature channels.
            num_tcn_channels: Output channels of each TCN level, in order.
            tcn_kernel_size: Kernel size shared by all TCN convolutions.
            tcn_dropout: Dropout probability inside the TCN blocks.
            transformer_hidden_size: Model width of the Transformer encoder.
            transformer_num_heads: Number of attention heads.
            transformer_num_layers: Number of encoder layers.
            transformer_dropout: Dropout used in the encoder and the head.
            num_classes: Number of output classes.
            activation: Activation name forwarded to every TemporalBlock.
            normalization: Normalization name forwarded to every TemporalBlock.
            max_seq_len: Number of positions the positional encoding covers.
        """
        super(TCNTransformer, self).__init__()
        
        # Build TCN layers
        layers = []
        num_levels = len(num_tcn_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_tcn_channels[i-1]
            out_channels = num_tcn_channels[i]
            # Symmetric padding; this keeps the sequence length unchanged
            # when (tcn_kernel_size - 1) * dilation_size is even, e.g. for
            # odd kernel sizes.
            padding = (tcn_kernel_size - 1) * dilation_size // 2
            layers += [TemporalBlock(
                in_channels, 
                out_channels, 
                tcn_kernel_size, 
                stride=1, 
                dilation=dilation_size,
                padding=padding, 
                dropout=tcn_dropout,
                activation=activation,
                normalization=normalization
            )]
        self.tcn = nn.Sequential(*layers)  # Output shape: (batch_size, num_tcn_channels[-1], seq_len)
        
        # Positional Encoding. It is added AFTER the projection in forward(),
        # so it must be as wide as the transformer input rather than the last
        # TCN level; otherwise the addition fails whenever the two differ.
        self.positional_encoding = PositionalEncoding(d_model=transformer_hidden_size, max_len=max_seq_len)
        
        # Projection to transformer hidden size if necessary
        if num_tcn_channels[-1] != transformer_hidden_size:
            self.projection = nn.Linear(num_tcn_channels[-1], transformer_hidden_size)
        else:
            self.projection = nn.Identity()
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=transformer_hidden_size, 
            nhead=transformer_num_heads, 
            dim_feedforward=transformer_hidden_size * 4, 
            dropout=transformer_dropout, 
            activation='gelu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=transformer_num_layers)
        
        # Classification Head
        self.fc = nn.Sequential(
            nn.Dropout(transformer_dropout),
            nn.Linear(transformer_hidden_size, num_classes)
        )
        
        self.init_weights()

    def init_weights(self):
        """
        Initialize weights for projection and classification layers.

        Linear weights get Xavier-uniform initialization and biases are
        zeroed; TCN and Transformer parameters are left untouched here.
        """
        if not isinstance(self.projection, nn.Identity):
            nn.init.xavier_uniform_(self.projection.weight)
            nn.init.zeros_(self.projection.bias)
        for layer in self.fc:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """
        Forward pass through the TCN-Transformer model.
        
        Args:
            x: Input tensor of shape (batch_size, num_features, seq_len)
        
        Returns:
            Output logits of shape (batch_size, num_classes)
        """
        # Pass through TCN
        tcn_out = self.tcn(x)  # (batch_size, num_tcn_channels[-1], seq_len)
        
        # Permute for Transformer: (seq_len, batch_size, num_tcn_channels[-1])
        tcn_out = tcn_out.permute(2, 0, 1)
        
        # Project to transformer hidden size if necessary
        transformer_input = self.projection(tcn_out)  # (seq_len, batch_size, transformer_hidden_size)
        
        # Add positional encoding
        transformer_input = self.positional_encoding(transformer_input)  # (seq_len, batch_size, transformer_hidden_size)
        
        # Pass through Transformer Encoder
        transformer_out = self.transformer_encoder(transformer_input)  # (seq_len, batch_size, transformer_hidden_size)
        
        # Aggregate Transformer outputs (e.g., take the mean over the sequence)
        transformer_out = transformer_out.mean(dim=0)  # (batch_size, transformer_hidden_size)
        
        # Classification Head
        out = self.fc(transformer_out)  # (batch_size, num_classes)
        
        return out

# ---------------------------------------------------
# 4. TimeSeriesDataset Class
# ---------------------------------------------------
class TimeSeriesDataset(Dataset):
    """Tensor-backed dataset that serves channel-first windows with their labels."""

    def __init__(self, sequences, labels):
        """
        Copy the inputs into tensors held on the instance.

        Args:
            sequences: Numpy array of shape (num_samples, seq_len, num_features)
            labels: Numpy array of shape (num_samples,)
        """
        feature_tensor = torch.tensor(sequences, dtype=torch.float32)
        target_tensor = torch.tensor(labels, dtype=torch.long)
        self.sequences = feature_tensor
        self.labels = target_tensor

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return (window, label); the window is flipped to (num_features, seq_len) for TCN input."""
        window = self.sequences[idx]
        target = self.labels[idx]
        return window.permute(1, 0), target

# ---------------------------------------------------
# 5. Example Usage and Verification
# ---------------------------------------------------
def main():
    """Build a TCNTransformer with example settings and run one random batch through it."""
    sequence_length = 500  # Number of time steps (n minutes candles)
    batch_size = 32

    # Hyperparameters gathered in one place and unpacked into the constructor
    hyperparams = dict(
        num_inputs=4,                      # OCHL features
        num_tcn_channels=[64, 128, 256],   # TCN channels per layer
        tcn_kernel_size=3,
        tcn_dropout=0.3,
        transformer_hidden_size=256,       # Must match the last TCN channel or use projection
        transformer_num_heads=8,
        transformer_num_layers=3,
        transformer_dropout=0.1,
        num_classes=3,                     # long, short, flat
        activation='GELU',
        normalization='BatchNorm',
        max_seq_len=sequence_length,       # Ensure positional encoding covers sequence length
    )

    model = TCNTransformer(**hyperparams)

    # Show the assembled architecture
    print(model)

    # Random batch laid out as (batch_size, num_features, seq_len)
    x = torch.rand(batch_size, hyperparams['num_inputs'], sequence_length)

    # One forward pass; the printed shape is expected to be (batch_size, num_classes)
    output = model(x)
    print(output.shape)

# Run the example forward pass as soon as this cell executes.
main()
# train_and_evaluate()
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

# ------------------------------
# 1. Load and Inspect the Data
# ------------------------------

# Location of the minute-candle CSV export
file_path = '../Data/Binance_BTCUSDT_2024_minute — копия.csv'

# Stop early with a clear message when the CSV is missing
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist.")

# Read the raw table
df = pd.read_csv(file_path)

# Preview the untouched data
print("Initial Data:")
print(df.head())

# ------------------------------
# 2. Preprocess the Data
# ------------------------------

# Parse the timestamps, index the frame by them and order it chronologically
df['date'] = pd.to_datetime(df['date'])
df = df.set_index('date').sort_index()

# Work on an independent copy of the four price columns
ochl_df = df.loc[:, ['open', 'high', 'low', 'close']].copy()

# Per-candle range and its mean over the trailing 500 rows
ochl_df['volatility'] = ochl_df['high'] - ochl_df['low']
ochl_df['avg_volatility'] = ochl_df['volatility'].rolling(window=500).mean()

# Hourly series: last close and last rolling volatility seen in each hour
hourly_close = ochl_df['close'].resample('h').last()
hourly_avg_volatility = ochl_df['avg_volatility'].resample('h').last()

# Close of the following hour, aligned to the current hour's row
next_hour_close = hourly_close.shift(periods=-1)

# Assemble the labelling table and discard incomplete rows
# (for instance the final hour, which has no next close)
label_df = pd.DataFrame({
    'current_close': hourly_close,
    'next_close': next_hour_close,
    'avg_volatility': hourly_avg_volatility
}).dropna()

# Define labels based on the criteria
def assign_label(row):
    """
    Classify one hourly row by comparing the next close with the current one.

    Returns 'long' when the next close is above the current close by more than
    the average volatility, 'short' when it is below by more than that amount,
    and 'flat' otherwise.
    """
    upcoming = row['next_close']
    if upcoming > row['current_close'] + row['avg_volatility']:
        return 'long'
    if upcoming < row['current_close'] - row['avg_volatility']:
        return 'short'
    return 'flat'

# Tag every hourly row as 'long', 'short' or 'flat'
label_df['label'] = label_df.apply(assign_label, axis=1)

# ------------------------------
# 3. Generate Input Sequences
# ------------------------------

# Accumulators for the model inputs and their targets
sequences = []
labels = []

# Walk the hourly rows and cut one minute-level window per row
for idx, row in label_df.iterrows():
    # Timestamp of the hourly row; the window ends one minute before it
    current_hour_end = idx

    # The window opens 500 minutes earlier
    start_time = current_hour_end - timedelta(minutes=500)

    # Skip rows whose window start is absent from the minute-level data
    if start_time not in ochl_df.index:
        continue

    # Slice the candidate window (label-based, both ends inclusive)
    window_data = ochl_df.loc[
        start_time:current_hour_end - timedelta(minutes=1),
        ['open', 'high', 'low', 'close']
    ]

    # Keep only windows holding exactly 500 rows
    if len(window_data) != 500:
        continue

    window_array = window_data.values  # Shape: (500, 4)
    sequences.append(window_array)
    labels.append(row['label'])

# Stack the collected windows and labels
sequences = np.array(sequences)  # Shape: (num_samples, 500, 4)
labels = np.array(labels)        # Shape: (num_samples,)

# ------------------------------
# 4. Encode Labels
# ------------------------------

# Map the string labels onto integer class ids
le = LabelEncoder()
encoded_labels = le.fit_transform(labels)

# ------------------------------
# 5. Split the Data
# ------------------------------

# NOTE(review): consecutive windows overlap heavily, so a shuffled split may
# place near-duplicate windows on both sides — confirm this is acceptable.

# Stage one: 80% train, 20% held out, preserving class proportions
X_train, X_temp, y_train, y_temp = train_test_split(
    sequences,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)

# Stage two: halve the held-out part into validation and test (10% of the total each)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# ------------------------------
# 6. Create PyTorch Datasets and DataLoaders
# ------------------------------

class TimeSeriesDataset(Dataset):
    """Tensor-backed dataset that serves windows in their stored (seq_len, num_features) layout."""

    def __init__(self, sequences, labels):
        """
        Copy the inputs into tensors held on the instance.

        Args:
            sequences (numpy.ndarray): Array of shape (num_samples, seq_len, num_features)
            labels (numpy.ndarray): Array of shape (num_samples,)
        """
        self.labels = torch.tensor(labels, dtype=torch.long)
        self.sequences = torch.tensor(sequences, dtype=torch.float32)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the (window, label) pair stored at position idx, without reordering axes."""
        window = self.sequences[idx]
        target = self.labels[idx]
        return window, target

# Mini-batch size shared by all three loaders
batch_size = 32

# Wrap each split in a dataset
train_dataset = TimeSeriesDataset(sequences=X_train, labels=y_train)
val_dataset = TimeSeriesDataset(sequences=X_val, labels=y_val)
test_dataset = TimeSeriesDataset(sequences=X_test, labels=y_test)

# Only the training loader shuffles; validation and test keep their order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# ------------------------------
# 7. Verify the DataLoaders
# ------------------------------

# Pull a single training batch and report what it looks like
sample_sequence, sample_label = next(iter(train_loader))
print(f"Sample sequence shape: {sample_sequence.shape}")  # Expected: (batch_size, 500, 4)
print(f"Sample label shape: {sample_label.shape}")        # Expected: (batch_size,)
print(f"Sample labels: {sample_label}")                  # Tensor of labels

# ------------------------------
# 8. Summary of Label Encoding
# ------------------------------

# Class names in the order of their integer ids
print(f"Classes: {le.classes_}")  # ['flat', 'long', 'short']


In [None]:
# ---------------------------------------------------
# 1. Import Necessary Libraries
# ---------------------------------------------------
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings  # For handling warnings

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix

from datetime import timedelta  # For time-based operations

# Silence the two noisiest warning categories for the rest of the session
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=UserWarning)

# Fix both random number generators so runs are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when one is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# ---------------------------------------------------
# 2. Define Model Components
# ---------------------------------------------------

# 2.1 TemporalBlock Class
class TemporalBlock(nn.Module):
    """
    A Temporal Block comprising two dilated convolutional layers with
    normalization, activation, dropout, and a residual connection.

    The activation and normalization are selected by name; unrecognized
    activation names fall back to ReLU and unrecognized normalization names
    fall back to no normalization.
    """
    def __init__(
        self, 
        in_channels: int, 
        out_channels: int, 
        kernel_size: int, 
        stride: int, 
        dilation: int, 
        padding: int, 
        dropout: float,
        activation: str = 'GELU',
        normalization: str = 'BatchNorm'
    ):
        """
        Args:
            in_channels: Channels of the input tensor.
            out_channels: Channels produced by both convolutions.
            kernel_size: Kernel size of both convolutions.
            stride: Stride of both convolutions.
            dilation: Dilation of both convolutions.
            padding: Zero padding applied on both sides by each convolution.
            dropout: Dropout probability after each activation.
            activation: 'GELU', 'LeakyReLU' or 'ELU'; anything else gives ReLU.
            normalization: 'BatchNorm' or 'LayerNorm'; anything else disables it.
        """
        super(TemporalBlock, self).__init__()
        
        # Select activation function
        if activation == 'GELU':
            self.activation = nn.GELU()
        elif activation == 'LeakyReLU':
            self.activation = nn.LeakyReLU()
        elif activation == 'ELU':
            self.activation = nn.ELU()
        else:
            self.activation = nn.ReLU()
        
        # Select normalization layer
        if normalization == 'BatchNorm':
            self.norm1 = nn.BatchNorm1d(out_channels)
            self.norm2 = nn.BatchNorm1d(out_channels)
        elif normalization == 'LayerNorm':
            # Applied over the channel axis; see _apply_norm
            self.norm1 = nn.LayerNorm(out_channels)
            self.norm2 = nn.LayerNorm(out_channels)
        else:
            self.norm1 = nn.Identity()
            self.norm2 = nn.Identity()
        
        self.conv1 = nn.Conv1d(
            in_channels, 
            out_channels, 
            kernel_size, 
            stride=stride, 
            padding=padding, 
            dilation=dilation
        )
        self.dropout1 = nn.Dropout(dropout)

        self.conv2 = nn.Conv1d(
            out_channels, 
            out_channels, 
            kernel_size, 
            stride=stride, 
            padding=padding, 
            dilation=dilation
        )
        self.dropout2 = nn.Dropout(dropout)

        # A 1x1 convolution aligns the residual only when channel counts differ
        self.downsample = nn.Conv1d(
            in_channels, 
            out_channels, 
            kernel_size=1
        ) if in_channels != out_channels else None

        self.init_weights()

    def init_weights(self):
        """
        Initialize weights using Kaiming Normal for convolutional layers.
        'gelu' activation is mapped to 'relu' for initialization purposes.
        """
        # Determine the appropriate nonlinearity for initialization
        if isinstance(self.activation, nn.GELU):
            init_nonlinearity = 'relu'  # Approximation
        elif isinstance(self.activation, nn.LeakyReLU):
            init_nonlinearity = 'leaky_relu'
        elif isinstance(self.activation, nn.ELU):
            init_nonlinearity = 'relu'  # 'ELU' not directly supported
        else:
            init_nonlinearity = 'relu'  # Default to 'relu'

        nn.init.kaiming_normal_(self.conv1.weight, nonlinearity=init_nonlinearity)
        nn.init.kaiming_normal_(self.conv2.weight, nonlinearity=init_nonlinearity)
        
        if self.downsample is not None:
            nn.init.kaiming_normal_(self.downsample.weight, nonlinearity='linear')

    @staticmethod
    def _apply_norm(norm, tensor):
        """
        Apply `norm` to a channel-first tensor.

        nn.LayerNorm normalizes the trailing dimension, while the channel axis
        of a Conv1d output is second-to-last. The axes are swapped around the
        call so LayerNorm(out_channels) acts on channels instead of failing
        whenever seq_len != out_channels. Other layers are applied directly.
        """
        if isinstance(norm, nn.LayerNorm):
            return norm(tensor.transpose(-1, -2)).transpose(-1, -2)
        return norm(tensor)

    def forward(self, x):
        """
        Forward pass through the Temporal Block.

        Args:
            x: Input tensor fed to the first Conv1d, i.e. laid out as
               (batch_size, in_channels, seq_len).

        Returns:
            The activated sum of the two-convolution branch and the residual
            branch.
        """
        out = self.conv1(x)
        out = self._apply_norm(self.norm1, out)
        out = self.activation(out)
        out = self.dropout1(out)

        out = self.conv2(out)
        out = self._apply_norm(self.norm2, out)
        out = self.activation(out)
        out = self.dropout2(out)

        residual = x if self.downsample is None else self.downsample(x)
        return self.activation(out + residual)

# 2.2 PositionalEncoding Class
class PositionalEncoding(nn.Module):
    """
    Implements the sinusoidal positional encoding for Transformers.

    The encoding table is computed once for `max_len` positions and kept as
    the buffer 'pe' of shape (max_len, 1, d_model); the singleton middle axis
    broadcasts over the batch dimension of sequence-first inputs.
    """
    def __init__(self, d_model: int, max_len: int = 5000):
        """
        Args:
            d_model: Size of the feature dimension the encoding is added to
                     (even or odd).
            max_len: Number of positions to precompute.
        """
        super(PositionalEncoding, self).__init__()

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))  # (ceil(d_model/2),)
        phase = position * div_term  # (max_len, ceil(d_model/2))

        pe = torch.zeros(max_len, d_model)  # (max_len, d_model)
        pe[:, 0::2] = torch.sin(phase)  # Even indices
        # An odd d_model has one fewer odd column than even column, so the
        # last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(phase[:, :d_model // 2])  # Odd indices
        self.register_buffer('pe', pe.unsqueeze(1))  # (max_len, 1, d_model)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, d_model)
        Returns:
            Tensor with positional encoding added.
        Raises:
            ValueError: If seq_len exceeds the number of precomputed positions.
        """
        seq_len = x.size(0)
        max_len = self.pe.size(0)
        if seq_len > max_len:
            # Fail with an explicit message rather than an opaque broadcast
            # error (or a silent broadcast when only one position exists).
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported "
                f"length {max_len} of the positional encoding."
            )
        return x + self.pe[:seq_len]

# 2.3 Custom TransformerEncoderLayerWithWeights Class
class TransformerEncoderLayerWithWeights(nn.TransformerEncoderLayer):
    """
    Encoder layer variant whose forward pass also hands back the attention weights.

    The computation is self-attention followed by a feed-forward block, each
    wrapped as residual-add then layer norm.
    """
    def __init__(self, *args, **kwargs):
        """Pass every argument straight through to nn.TransformerEncoderLayer."""
        super().__init__(*args, **kwargs)

    def forward(self, src, src_mask=None, src_key_padding_mask=None, is_causal=False):
        """
        Run the layer on `src`.

        Returns:
            A tuple (output, attn_weights), where attn_weights is what the
            self-attention module returns when called with need_weights=True.
        """
        # Self-attention sub-layer; request the weights explicitly
        attn_out, attn_weights = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
            need_weights=True,
            is_causal=is_causal,
        )
        hidden = self.norm1(src + self.dropout1(attn_out))

        # Position-wise feed-forward sub-layer
        ff_out = self.linear2(self.dropout(self.activation(self.linear1(hidden))))
        hidden = self.norm2(hidden + self.dropout2(ff_out))

        return hidden, attn_weights

# 2.4 TCNTransformerWithWeights Model Class
class TCNTransformerWithWeights(nn.Module):
    """
    Temporal Convolutional Network combined with Transformer Encoder for Time Series Classification.
    Returns both output logits and attention weights.

    Pipeline: stacked TemporalBlocks (dilation doubles at every level) ->
    optional linear projection to the transformer width -> sinusoidal
    positional encoding -> encoder layers run one by one so that each layer's
    attention weights can be collected -> mean over the sequence -> dropout +
    linear classification head.
    """
    def __init__(
        self, 
        num_inputs: int, 
        num_tcn_channels: list, 
        tcn_kernel_size: int, 
        tcn_dropout: float, 
        transformer_hidden_size: int, 
        transformer_num_heads: int, 
        transformer_num_layers: int, 
        transformer_dropout: float, 
        num_classes: int,
        activation: str = 'GELU',
        normalization: str = 'BatchNorm',
        max_seq_len: int = 1000
    ):
        """
        Args:
            num_inputs: Number of input feature channels.
            num_tcn_channels: Output channels of each TCN level, in order.
            tcn_kernel_size: Kernel size shared by all TCN convolutions.
            tcn_dropout: Dropout probability inside the TCN blocks.
            transformer_hidden_size: Model width of the Transformer encoder.
            transformer_num_heads: Number of attention heads.
            transformer_num_layers: Number of encoder layers.
            transformer_dropout: Dropout used in the encoder and the head.
            num_classes: Number of output classes.
            activation: Activation name forwarded to every TemporalBlock.
            normalization: Normalization name forwarded to every TemporalBlock.
            max_seq_len: Number of positions the positional encoding covers.
        """
        super(TCNTransformerWithWeights, self).__init__()
        
        # Build TCN layers
        layers = []
        num_levels = len(num_tcn_channels)
        for i in range(num_levels):
            dilation_size = 2 ** i
            in_channels = num_inputs if i == 0 else num_tcn_channels[i-1]
            out_channels = num_tcn_channels[i]
            # Symmetric padding; this keeps the sequence length unchanged
            # when (tcn_kernel_size - 1) * dilation_size is even, e.g. for
            # odd kernel sizes.
            padding = (tcn_kernel_size - 1) * dilation_size // 2
            layers += [TemporalBlock(
                in_channels, 
                out_channels, 
                tcn_kernel_size, 
                stride=1, 
                dilation=dilation_size,
                padding=padding, 
                dropout=tcn_dropout,
                activation=activation,
                normalization=normalization
            )]
        self.tcn = nn.Sequential(*layers)  # Output shape: (batch_size, num_tcn_channels[-1], seq_len)
        
        # Positional Encoding. It is added AFTER the projection in forward(),
        # so it must be as wide as the transformer input rather than the last
        # TCN level; otherwise the addition fails whenever the two differ.
        self.positional_encoding = PositionalEncoding(d_model=transformer_hidden_size, max_len=max_seq_len)
        
        # Projection to transformer hidden size if necessary
        if num_tcn_channels[-1] != transformer_hidden_size:
            self.projection = nn.Linear(num_tcn_channels[-1], transformer_hidden_size)
        else:
            self.projection = nn.Identity()
        
        # Transformer Encoder with custom layers
        encoder_layer = TransformerEncoderLayerWithWeights(
            d_model=transformer_hidden_size, 
            nhead=transformer_num_heads, 
            dim_feedforward=transformer_hidden_size * 4, 
            dropout=transformer_dropout, 
            activation='gelu'
        )
        # Used here as a container of layer copies; forward() iterates over
        # .layers itself because each custom layer returns a tuple.
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=transformer_num_layers)
        
        # Classification Head
        self.fc = nn.Sequential(
            nn.Dropout(transformer_dropout),
            nn.Linear(transformer_hidden_size, num_classes)
        )
        
        self.init_weights()

    def init_weights(self):
        """
        Initialize weights for projection and classification layers.

        Linear weights get Xavier-uniform initialization and biases are
        zeroed; TCN and Transformer parameters are left untouched here.
        """
        if not isinstance(self.projection, nn.Identity):
            nn.init.xavier_uniform_(self.projection.weight)
            nn.init.zeros_(self.projection.bias)
        for layer in self.fc:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """
        Forward pass through the TCN-Transformer model.
        
        Args:
            x: Input tensor of shape (batch_size, num_features, seq_len)
        
        Returns:
            out: Output logits of shape (batch_size, num_classes)
            attention_weights: List of attention weights from each Transformer layer
        """
        # Pass through TCN
        tcn_out = self.tcn(x)  # (batch_size, num_tcn_channels[-1], seq_len)
        
        # Permute for Transformer: (seq_len, batch_size, num_tcn_channels[-1])
        tcn_out = tcn_out.permute(2, 0, 1)
        
        # Project to transformer hidden size if necessary
        transformer_input = self.projection(tcn_out)  # (seq_len, batch_size, transformer_hidden_size)
        
        # Add positional encoding
        transformer_input = self.positional_encoding(transformer_input)  # (seq_len, batch_size, transformer_hidden_size)
        
        # Pass through Transformer Encoder and capture attention weights
        attention_weights = []
        transformer_out = transformer_input
        for layer in self.transformer_encoder.layers:
            transformer_out, attn_weights = layer(transformer_out)
            attention_weights.append(attn_weights)
        
        # Aggregate Transformer outputs (e.g., take the mean over the sequence)
        transformer_out = transformer_out.mean(dim=0)  # (batch_size, transformer_hidden_size)
        
        # Classification Head
        out = self.fc(transformer_out)  # (batch_size, num_classes)
        
        return out, attention_weights

# 2.5 TimeSeriesDataset Class
class TimeSeriesDataset(Dataset):
    """In-memory dataset of fixed-length windows, served channel-first together with their labels."""

    def __init__(self, sequences, labels):
        """
        Convert the inputs to tensors and keep them on the instance.

        Args:
            sequences: Numpy array of shape (num_samples, seq_len, num_features)
            labels: Numpy array of shape (num_samples,)
        """
        as_float = torch.tensor(sequences, dtype=torch.float32)
        as_long = torch.tensor(labels, dtype=torch.long)
        self.sequences, self.labels = as_float, as_long

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return (window, label) with the window reordered to (num_features, seq_len) for TCN input."""
        sample = self.sequences[idx]
        channel_first = sample.permute(1, 0)
        return channel_first, self.labels[idx]

# 2.6 Focal Loss Class
class FocalLoss(nn.Module):
    """
    Multi-class focal loss: class_weight * (1 - p_t) ** gamma * (-log p_t).

    p_t is the softmax probability assigned to the target class. It is always
    taken from the UNWEIGHTED cross-entropy, so class weights scale the loss
    without distorting the (1 - p_t) ** gamma modulating factor.
    """
    def __init__(self, alpha=None, gamma=2, reduction='mean'):
        """
        Args:
            alpha (tensor, optional): Weights for each class. Shape: (num_classes,)
            gamma (float): Focusing parameter.
            reduction (str): 'mean' | 'sum' | 'none'
        """
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """
        Args:
            inputs: Logits accepted by F.cross_entropy, e.g. (batch_size, num_classes).
            targets: Targets accepted by F.cross_entropy, e.g. class indices of shape (batch_size,).

        Returns:
            The per-element loss for reduction 'none' (or any unrecognized
            value), otherwise its plain mean or sum.
        """
        # Unweighted cross-entropy equals -log(p_t), which recovers p_t exactly
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)

        if self.alpha is None:
            weighted_ce = ce_loss
        else:
            # Match the logits' dtype and device so weights created on the CPU
            # also work with a model running elsewhere.
            alpha = torch.as_tensor(self.alpha, dtype=inputs.dtype, device=inputs.device)
            weighted_ce = F.cross_entropy(inputs, targets, reduction='none', weight=alpha)

        focal_loss = (1 - pt) ** self.gamma * weighted_ce
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

# ---------------------------------------------------
# 3. Data Loading and Preprocessing
# ---------------------------------------------------
def load_and_preprocess_data(file_path):
    """
    Loads data from a CSV file, preprocesses it to generate input sequences and labels.

    Steps: read the CSV, index it by its 'date' column, regularize it to a
    one-minute grid (forward-filling gaps), compute a 500-minute rolling mean
    of (high - low), derive an hourly 'long' / 'short' / 'flat' label from the
    next hourly close, and cut one 500-minute open/high/low/close window per
    labelled hour.

    NOTE(review): pandas labels hourly bins by their start by default, so each
    window presumably ends one hour before the 'current_close' its label is
    measured from — confirm this gap is intended.
    
    Args:
        file_path (str): Path to the CSV file.
    
    Returns:
        sequences (numpy.ndarray): Array of shape (num_samples, seq_len, num_features)
        labels (numpy.ndarray): Array of shape (num_samples,)
        label_encoder (LabelEncoder): Fitted label encoder
        example_sequence (numpy.ndarray): An example input sequence
        example_label (int): Encoded label for the example sequence

    Raises:
        FileNotFoundError: If `file_path` does not exist.
        ValueError: If no complete 500-minute window could be extracted.
    """
    # Check if the file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist.")
    
    # Load the data into a pandas DataFrame
    df = pd.read_csv(file_path)
    
    # Display the first few rows
    print("Initial Data:")
    print(df.head())
    
    # Index by timestamp in chronological order
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', inplace=True)
    df.sort_index(inplace=True)
    
    # Put the minute-level data on a fixed one-minute grid. 'min' is the
    # current spelling of the minute alias; the older 'T' is deprecated.
    feature_columns = ['open', 'high', 'low', 'close']
    ochl_df = df[feature_columns].copy()
    ochl_df = ochl_df.asfreq('min')

    # Handle missing data by forward filling
    ochl_df.ffill(inplace=True)
    
    # Calculate volatility as the difference between high and low
    ochl_df['volatility'] = ochl_df['high'] - ochl_df['low']
    
    # Calculate average volatility over a rolling window of 500 minutes
    ochl_df['avg_volatility'] = ochl_df['volatility'].rolling(window=500).mean()
    
    # Drop rows with NaN values resulting from rolling window
    ochl_df.dropna(inplace=True)
    
    # Resample to hourly data, taking the last closing price as the hour's
    # close ('h' replaces the deprecated 'H' alias).
    hourly_close = ochl_df['close'].resample('h').last()
    hourly_avg_volatility = ochl_df['avg_volatility'].resample('h').last()
    
    # Shift the hourly closes by one to get the next hour's close
    next_hour_close = hourly_close.shift(-1)
    
    # Create a DataFrame to hold the necessary data
    label_df = pd.DataFrame({
        'current_close': hourly_close,
        'next_close': next_hour_close,
        'avg_volatility': hourly_avg_volatility
    })
    
    # Drop rows with NaN values (especially the last row where next_close is NaN)
    label_df.dropna(inplace=True)
    
    # Vectorized labelling: 'long' / 'short' when the next close leaves the
    # +/- avg_volatility band around the current close, otherwise 'flat'.
    moved_up = label_df['next_close'] > label_df['current_close'] + label_df['avg_volatility']
    moved_down = label_df['next_close'] < label_df['current_close'] - label_df['avg_volatility']
    label_df['label'] = np.select(
        [moved_up.to_numpy(), moved_down.to_numpy()],
        ['long', 'short'],
        default='flat',
    )
    
    # Initialize lists to hold sequences and labels
    sequences = []
    labels = []
    
    # Print total number of data points
    print(f"\nTotal data points after resampling and rolling: {len(ochl_df)} minutes")
    print(f"Total samples after windowing: {len(label_df)}")
    
    window_span = timedelta(minutes=500)
    one_minute = timedelta(minutes=1)
    
    # One candidate window per labelled hour
    for hour_stamp, label in label_df['label'].items():
        # Label-based slice, both ends inclusive: the 500 minutes before hour_stamp
        window_data = ochl_df.loc[hour_stamp - window_span:hour_stamp - one_minute, feature_columns]
        
        # Skip windows that are not exactly 500 minutes long
        if len(window_data) != 500:
            continue
        
        sequences.append(window_data.values.astype(np.float32))  # Shape: (500, 4), float32
        labels.append(label)
    
    if not sequences:
        # Fail with a clear message instead of an IndexError further down
        raise ValueError(
            "No complete 500-minute windows could be built from the data; "
            "at least 500 contiguous minutes before a labelled hour are required."
        )
    
    # Convert lists to numpy arrays
    sequences = np.array(sequences)  # Shape: (num_samples, 500, 4)
    labels = np.array(labels)        # Shape: (num_samples,)
    
    print(f"\nNumber of valid samples: {sequences.shape[0]}")
    print(f"Sequence shape: {sequences.shape[1:]}")
    print(f"Labels shape: {labels.shape}")
    print(f"\nSample label distribution:")
    print(pd.Series(labels).value_counts())
    
    # Encode labels
    le = LabelEncoder()
    encoded_labels = le.fit_transform(labels)
    print(f"\nClasses: {le.classes_}")
    print(f"Encoded labels sample: {encoded_labels[:10]}")
    
    # Print an example input and expected result
    example_idx = 0  # You can change this index to view different samples
    example_sequence = sequences[example_idx]  # Shape: (500, 4)
    example_label = encoded_labels[example_idx]
    print("\nExample Input Sequence (first 5 minutes):")
    print(example_sequence[:5])  # Display first 5 minutes for brevity
    print(f"Expected Label: {labels[example_idx]} (Encoded: {example_label})")
    
    return sequences, encoded_labels, le, example_sequence, example_label

# ---------------------------------------------------
# 4. Scaling the Data
# ---------------------------------------------------
def scale_data(X_train, X_val, X_test):
    """
    Standardizes the three splits per feature with one shared StandardScaler.

    The scaler is fitted on the training split only (flattened to
    (num_samples * seq_len, num_features)) and then applied to every split,
    so validation/test statistics never influence the fit.

    Args:
        X_train (numpy.ndarray): Training data of shape (num_samples, seq_len, num_features)
        X_val (numpy.ndarray): Validation data of shape (num_samples, seq_len, num_features)
        X_test (numpy.ndarray): Test data of shape (num_samples, seq_len, num_features)

    Returns:
        tuple: (X_train_scaled, X_val_scaled, X_test_scaled, scaler) where the
        scaled arrays are float32 with the same shapes as the inputs and
        scaler is the fitted StandardScaler.
    """
    scaler = StandardScaler()

    # Fit on the flattened training rows only.
    n_features = X_train.shape[-1]
    scaler.fit(X_train.reshape(-1, n_features).astype(np.float32))
    print("\nScaling the data using StandardScaler...")

    def _standardize(batch):
        # Flatten to 2-D for the scaler, then restore the original 3-D layout.
        n_samples, n_steps, n_cols = batch.shape
        flat = batch.reshape(-1, n_cols).astype(np.float32)
        restored = scaler.transform(flat).reshape(n_samples, n_steps, n_cols)
        restored = restored.astype(np.float32)
        print(f"Scaled sequences shape: {restored.shape}")
        return restored

    # Transform the splits in train -> val -> test order.
    X_train_scaled, X_val_scaled, X_test_scaled = [
        _standardize(split) for split in (X_train, X_val, X_test)
    ]

    # Report per-feature statistics as a sanity check: the training split
    # should be close to mean 0 / std 1; the other splits only approximately.
    reports = (
        ("\nScaled training data statistics:", X_train_scaled),
        ("\nScaled validation data statistics:", X_val_scaled),
        ("\nScaled test data statistics:", X_test_scaled),
    )
    for header, scaled_split in reports:
        print(header)
        print(f"Mean per feature: {scaled_split.mean(axis=(0,1))}")
        print(f"Std per feature: {scaled_split.std(axis=(0,1))}")

    return X_train_scaled, X_val_scaled, X_test_scaled, scaler

# ---------------------------------------------------
# 5. Addressing Class Imbalance
# ---------------------------------------------------
def create_weighted_sampler(y_train):
    """
    Creates a WeightedRandomSampler to handle class imbalance.

    Every sample is weighted by the inverse of its class frequency, so each
    class present in ``y_train`` is drawn with equal total probability.

    Args:
        y_train (numpy.ndarray): 1-D array of non-negative integer class labels.

    Returns:
        sampler (WeightedRandomSampler): Sampler for DataLoader that draws
        ``len(y_train)`` indices per epoch with replacement.

    Raises:
        ValueError: If ``y_train`` is empty.
    """
    labels = np.asarray(y_train)
    if labels.size == 0:
        raise ValueError("y_train must contain at least one label")

    class_counts = np.bincount(labels)

    # np.bincount reports a count of 0 for any unused label id below the
    # maximum label. Invert only the non-zero counts so those gaps do not
    # trigger a divide-by-zero warning or produce inf weights.
    class_weights = np.zeros(class_counts.shape, dtype=np.float64)
    present = class_counts > 0
    class_weights[present] = 1.0 / class_counts[present]

    samples_weights = class_weights[labels]
    sampler = WeightedRandomSampler(
        weights=samples_weights,
        num_samples=len(samples_weights),
        replacement=True,
    )
    return sampler

# ---------------------------------------------------
# 6. Data Augmentation (Optional)
# ---------------------------------------------------
def augment_data(sequences, noise_factor=0.01):
    """
    Returns a jittered copy of the sequences (zero-mean Gaussian noise added element-wise).

    The noise is drawn from NumPy's global random state, so results are
    reproducible after ``np.random.seed(...)``.

    Args:
        sequences (numpy.ndarray): Input sequences; the noise array is drawn
            with exactly ``sequences.shape``.
        noise_factor (float): Standard deviation of the Gaussian noise.

    Returns:
        numpy.ndarray: ``sequences`` plus float32 noise; the input array is
        not modified.
    """
    jitter = np.random.normal(loc=0, scale=noise_factor, size=sequences.shape)
    return sequences + jitter.astype(np.float32)

# ---------------------------------------------------
# 7. Training and Evaluation Pipeline
# ---------------------------------------------------
def train_and_evaluate(model,
                       train_loader,
                       val_loader,
                       test_loader,
                       criterion,
                       optimizer,
                       scheduler,
                       device,
                       num_epochs=50,
                       checkpoint_path='best_tcn_transformer_model.pth',
                       patience=10,
                       example_input=None,
                       example_label=None,
                       le=None):
    """
    Trains the model with early stopping on validation F1, then evaluates it.

    Each epoch runs one training pass (with gradient-norm clipping at 1.0) and
    one validation pass. The weighted-average validation F1 decides
    checkpointing and early stopping; the validation loss drives the
    scheduler. After training, the best checkpoint is reloaded, the optional
    test set is evaluated (confusion matrix plot + classification report), and
    the optional example input is run through the model.

    Args:
        model (nn.Module): Model whose forward pass returns a 2-tuple
            ``(logits, attention_weights)``.
        train_loader (DataLoader): DataLoader for training data.
        val_loader (DataLoader): DataLoader for validation data.
        test_loader (DataLoader or None): DataLoader for test data; the test
            phase is skipped when None.
        criterion (nn.Module): Loss function called as ``criterion(logits, labels)``.
        optimizer (torch.optim.Optimizer): Optimizer.
        scheduler: Learning rate scheduler stepped with the validation loss
            (e.g. ReduceLROnPlateau); may be None to disable scheduling.
        device (torch.device): Device to train on.
        num_epochs (int): Maximum number of training epochs.
        checkpoint_path (str): Path where the best model state dict is saved.
        patience (int): Number of consecutive non-improving epochs tolerated
            before early stopping.
        example_input (numpy.ndarray): Example input sequence of shape
            (seq_len, num_features); it is batched and permuted here.
        example_label (int): Encoded label of the example input.
        le (LabelEncoder): Fitted label encoder. When it exposes ``classes_``
            those names label the confusion matrix and classification report;
            otherwise the names 'flat', 'long', 'short' are used.

    Returns:
        model (nn.Module): The model with the best checkpoint loaded, or None
        if a forward pass or the checkpoint reload failed (the error is
        printed and the run is abandoned).
    """
    # Start below any attainable F1 so the first completed epoch always writes
    # a checkpoint. With 0.0 and a strict '>' comparison, a run whose F1 never
    # rises above zero would reload a stale file from an earlier run (or fail).
    best_f1 = float('-inf')
    counter = 0

    # Take display names from the fitted encoder when available. Encoded
    # labels are indices into classes_, so fixing the label ids keeps the
    # report well-formed even if a class is absent from the test split.
    fitted_classes = getattr(le, 'classes_', None)
    if fitted_classes is not None:
        class_names = [str(name) for name in fitted_classes]
        class_ids = list(range(len(class_names)))
    else:
        class_names = ['flat', 'long', 'short']
        class_ids = None  # let sklearn infer the labels, as before

    print("\nStarting Training and Evaluation...")

    for epoch in range(num_epochs):
        print(f"\n--- Epoch {epoch+1}/{num_epochs} ---")
        # -------------------
        # Training Phase
        # -------------------
        model.train()
        running_loss = 0.0
        for batch_idx, (sequences_batch, labels_batch) in enumerate(train_loader):
            # Move data to device
            sequences_batch = sequences_batch.to(device)  # expected (batch_size, num_features, seq_len)
            labels_batch = labels_batch.to(device)        # expected (batch_size,)

            # Zero the parameter gradients
            optimizer.zero_grad()

            try:
                # Forward pass
                outputs, _ = model(sequences_batch)  # logits, attention_weights
            except Exception as e:
                # Best-effort behavior: report the failure and abandon the run.
                print(f"Error during model forward pass: {e}")
                return

            # Compute loss
            loss = criterion(outputs, labels_batch)

            # Backward pass and optimization
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping
            optimizer.step()

            # Accumulate the batch loss weighted by batch size
            running_loss += loss.item() * sequences_batch.size(0)

            if (batch_idx + 1) % 100 == 0 or (batch_idx + 1) == len(train_loader):
                print(f"  Training Batch {batch_idx+1}/{len(train_loader)} | Loss: {loss.item():.4f}")

        epoch_loss = running_loss / len(train_loader.dataset)
        print(f"  Epoch {epoch+1} Training Loss: {epoch_loss:.4f}")

        # -------------------
        # Validation Phase
        # -------------------
        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch_idx, (sequences_batch, labels_batch) in enumerate(val_loader):
                # Move data to device
                sequences_batch = sequences_batch.to(device)
                labels_batch = labels_batch.to(device)

                try:
                    # Forward pass
                    outputs, _ = model(sequences_batch)
                except Exception as e:
                    print(f"Error during model forward pass (validation): {e}")
                    return

                # Compute loss
                loss = criterion(outputs, labels_batch)
                val_loss += loss.item() * sequences_batch.size(0)

                # Predictions: index of the largest logit per sample
                _, preds = torch.max(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels_batch.cpu().numpy())

        val_loss /= len(val_loader.dataset)
        print(f"  Epoch {epoch+1} Validation Loss: {val_loss:.4f}")

        # Compute F1-Score (support-weighted average over classes)
        report = classification_report(all_labels, all_preds, output_dict=True, zero_division=0)
        f1_score_val = report['weighted avg']['f1-score']
        print(f"  Epoch {epoch+1} Validation F1-Score: {f1_score_val:.4f}")

        # Adjust learning rate based on validation loss
        if scheduler is not None:
            scheduler.step(val_loss)

        # Early Stopping and Checkpointing (both keyed on validation F1)
        if f1_score_val > best_f1:
            best_f1 = f1_score_val
            counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            print(f"  [Checkpoint] Saved Best Model at Epoch {epoch+1} with F1-Score: {f1_score_val:.4f}")
        else:
            counter += 1
            print(f"  Early Stopping Counter: {counter}/{patience}")
            if counter >= patience:
                print("  [Early Stopping] Triggered.")
                break

    # Load the best model; map_location keeps the reload valid even when the
    # checkpoint tensors were saved from a different device.
    try:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        print("\nLoaded Best Model for Testing.")
    except Exception as e:
        print(f"Error loading the best model: {e}")
        return

    # -------------------
    # Test Phase
    # -------------------
    if test_loader is not None:
        print("\n--- Testing Phase ---")
        model.eval()
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for batch_idx, (sequences_batch, labels_batch) in enumerate(test_loader):
                # Move data to device
                sequences_batch = sequences_batch.to(device)
                labels_batch = labels_batch.to(device)

                try:
                    # Forward pass
                    outputs, _ = model(sequences_batch)
                except Exception as e:
                    print(f"Error during model forward pass (testing): {e}")
                    return

                # Predictions
                _, preds = torch.max(outputs, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels_batch.cpu().numpy())

        # Compute confusion matrix
        conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)
        print("\nConfusion Matrix:")
        print(conf_matrix)

        # Plot confusion matrix. Imported locally (as the attention
        # visualization helper does) so this function does not rely on
        # plotting names being bound at module level.
        import seaborn as sns
        import matplotlib.pyplot as plt

        plt.figure(figsize=(8,6))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.show()

        # Compute classification report
        class_report = classification_report(
            all_labels,
            all_preds,
            labels=class_ids,
            target_names=class_names,
            zero_division=0,
        )
        print("\nClassification Report:")
        print(class_report)
    else:
        print("\nNo test loader provided. Skipping test phase.")

    # -------------------
    # Print Model Output for Example Input
    # -------------------
    if example_input is not None and le is not None:
        print("\n--- Model Output for Example Input ---")
        try:
            # Prepare the example input: add a batch axis, then swap the last
            # two axes so features come before time steps.
            example_tensor = torch.tensor(example_input, dtype=torch.float32).unsqueeze(0)  # e.g. (1, 500, 4)
            print(f"Example Input Shape before permute: {example_tensor.shape}")
            example_tensor = example_tensor.permute(0, 2, 1).to(device)  # e.g. (1, 4, 500)
            print(f"Example Tensor Shape after permute: {example_tensor.shape}")

            # Display the example input as received (the caller is expected to pass it already scaled)
            print("\n--- Scaled Example Input Sequence (first 5 minutes) ---")
            print(example_input[:5])

            # Display the tensor shape and a snippet
            print("\n--- Example Tensor Passed to the Model ---")
            print(f"Tensor Shape: {example_tensor.shape}")
            print(example_tensor[0, :, :5])  # First 5 time steps for each feature

            # Get model output
            model.eval()
            with torch.no_grad():
                output_logits, _ = model(example_tensor)
                probabilities = F.softmax(output_logits, dim=1).cpu().numpy()
                predicted_class = np.argmax(probabilities, axis=1)[0]

            # Print results
            expected_label_name = le.inverse_transform([example_label])[0]
            predicted_label_name = le.inverse_transform([predicted_class])[0]
            print(f"\nExpected Label: {expected_label_name} (Encoded: {example_label})")
            print(f"Model Predicted Class: {predicted_label_name} (Encoded: {predicted_class})")
            print(f"Probabilities: {probabilities[0]}")
        except Exception as e:
            # The demonstration is optional; report and continue.
            print(f"Error during example input processing: {e}")
    else:
        print("No example input provided for model output demonstration.")

    return model

# ---------------------------------------------------
# 8. Model Interpretability
# ---------------------------------------------------
def visualize_attention_weights(model, device, X_test_scaled):
    """
    Plots a heatmap of attention weights for the first test sequence.

    The first sample of ``X_test_scaled`` is run through the model in eval
    mode without gradients, and ``attn_weights[0][0]`` from the model's second
    return value is drawn as a heatmap. Any exception raised while running the
    model is printed and the function returns without plotting.

    Args:
        model (nn.Module): Trained model returning ``(outputs, attn_weights)``.
        device (torch.device): Device used for computation.
        X_test_scaled (numpy.ndarray): Scaled test data; presumably
            (num_samples, seq_len, num_features) - TODO confirm against caller.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Keep a batch axis of size 1 by slicing rather than indexing.
    first = 0
    batch = torch.tensor(X_test_scaled[first:first + 1], dtype=torch.float32).to(device)

    try:
        model.eval()
        with torch.no_grad():
            # Swap the last two axes before the forward pass.
            _, attn_weights = model(batch.permute(0, 2, 1))
    except Exception as e:
        print(f"Error during attention weights extraction: {e}")
        return

    if not attn_weights:
        print("No attention weights captured.")
        return

    # NOTE(review): the layout of attn_weights is defined by the model, which
    # is not visible here; [0][0] is assumed to select a 2-D
    # (query, key) map for the first layer / first head - confirm.
    attn_map = attn_weights[0][0].cpu().numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(attn_map, cmap='viridis')
    plt.xlabel('Key Positions')
    plt.ylabel('Query Positions')
    plt.title('Attention Weights for Sample Sequence (Layer 1, Head 1)')
    plt.show()

# ---------------------------------------------------
# 9. Execute the Pipeline
# ---------------------------------------------------
def main_pipeline():
    """
    Executes the full pipeline: data loading, preprocessing, scaling,
    noise augmentation of the 'flat' class, class-imbalance handling,
    model training with early stopping, evaluation, and attention
    visualization.

    Relies on a module-level ``device`` and on the helper functions/classes
    defined elsewhere in this file. Returns None; results are printed,
    plotted, and the best checkpoint is written by ``train_and_evaluate``.
    """
    # File path to the CSV data
    file_path = '../Data/Binance_BTCUSDT_2024_minute — копия.csv'

    # Load and preprocess data
    sequences, encoded_labels, le, example_sequence, example_label = load_and_preprocess_data(file_path)

    # Check if any sequences were created
    if sequences.size == 0:
        print("No valid samples were created. Please check your window extraction logic.")
        return

    # Split the data: 80% train, 10% validation, 10% test (stratified).
    # NOTE(review): this is a random split; if the windows overlap in time,
    # neighbouring samples can land in different splits - confirm this is intended.
    print("\nSplitting the data into Training, Validation, and Test sets...")
    X_train, X_temp, y_train, y_temp = train_test_split(
        sequences, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
    )
    print(f"Training samples: {X_train.shape[0]}")
    print(f"Validation samples: {X_val.shape[0]}")
    print(f"Test samples: {X_test.shape[0]}")

    # Scale the data
    X_train_scaled, X_val_scaled, X_test_scaled, scaler = scale_data(X_train, X_val, X_test)

    # Scale the example input using the same scaler
    example_sequence_scaled = scaler.transform(example_sequence)  # Shape: (500, 4)
    # Do NOT reshape here; let the train_and_evaluate function handle batching

    # Data augmentation: add a jittered copy of every 'flat' training sample.
    # Skipped when the label encoder has no 'flat' class, instead of failing
    # with an IndexError on the class lookup.
    flat_matches = np.where(le.classes_ == 'flat')[0]
    if flat_matches.size > 0:
        flat_class_index = flat_matches[0]
        flat_indices = np.where(y_train == flat_class_index)[0]
        flat_sequences = X_train_scaled[flat_indices]
        flat_labels = y_train[flat_indices]

        print(f"\nNumber of 'flat' class samples before augmentation: {len(flat_sequences)}")

        # Augment 'flat' sequences
        augmented_flat_sequences = augment_data(flat_sequences, noise_factor=0.01)
        augmented_flat_labels = flat_labels.copy()

        # Combine augmented data with original training data
        X_train_combined = np.concatenate((X_train_scaled, augmented_flat_sequences), axis=0)
        y_train_combined = np.concatenate((y_train, augmented_flat_labels), axis=0)

        print(f"Number of 'flat' class samples after augmentation: {len(augmented_flat_sequences) + len(flat_sequences)}")
    else:
        print("\nNo 'flat' class found; skipping augmentation.")
        X_train_combined = X_train_scaled
        y_train_combined = y_train

    # Create WeightedRandomSampler to address class imbalance
    sampler = create_weighted_sampler(y_train_combined)

    # Create Dataset instances
    print("\nCreating Dataset instances...")
    train_dataset = TimeSeriesDataset(X_train_combined, y_train_combined)
    val_dataset = TimeSeriesDataset(X_val_scaled, y_val)
    test_dataset = TimeSeriesDataset(X_test_scaled, y_test)

    # Starting with a smaller batch size to prevent memory issues
    batch_size = 32
    print(f"\nUsing batch size: {batch_size}")

    # Create DataLoader instances; the training loader draws through the
    # weighted sampler, the evaluation loaders iterate in order.
    print("Creating DataLoader instances...")
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    # Define model parameters
    num_inputs = 4  # OCHL features
    num_tcn_channels = [64, 128, 256]  # TCN channels per layer
    tcn_kernel_size = 3
    tcn_dropout = 0.3
    transformer_hidden_size = 256  # Must match the last TCN channel or use projection
    transformer_num_heads = 8
    transformer_num_layers = 3
    transformer_dropout = 0.1
    # Derived from the fitted encoder so the output layer always matches the
    # encoded label range (3 for long / short / flat).
    num_classes = len(le.classes_)
    max_seq_len = 500  # Number of minutes in the input sequence

    # Initialize the model
    print("\nInitializing the model...")
    model = TCNTransformerWithWeights(
        num_inputs=num_inputs,
        num_tcn_channels=num_tcn_channels,
        tcn_kernel_size=tcn_kernel_size,
        tcn_dropout=tcn_dropout,
        transformer_hidden_size=transformer_hidden_size,
        transformer_num_heads=transformer_num_heads,
        transformer_num_layers=transformer_num_layers,
        transformer_dropout=transformer_dropout,
        num_classes=num_classes,
        activation='GELU',
        normalization='BatchNorm',
        max_seq_len=max_seq_len
    )

    # Move the model to the appropriate device
    model.to(device)
    print("Model moved to device.")

    # Compute class weights based on training data
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train_combined),
        y=y_train_combined
    )
    print(f"\nComputed Class Weights: {class_weights}")

    # Normalize the class weights so they sum to 1
    alpha = class_weights / class_weights.sum()
    print(f"Normalized Class Weights (Alpha): {alpha}")

    # Convert class weights to tensor
    alpha_tensor = torch.tensor(alpha, dtype=torch.float).to(device)

    # Define the loss function with class-specific alpha
    # NOTE(review): imbalance is compensated both by the weighted sampler and
    # by these loss weights - confirm the double correction is intended.
    criterion = FocalLoss(alpha=alpha_tensor, gamma=2, reduction='mean')
    print("Defined Focal Loss with class-specific alpha.")
    # Alternatively, use CrossEntropyLoss with class weights
    # criterion = nn.CrossEntropyLoss(weight=alpha_tensor)

    # Define the optimizer
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-5)
    print("Initialized AdamW optimizer.")

    # Define a learning rate scheduler (halves the LR when validation loss plateaus)
    # NOTE(review): `verbose` is reported as deprecated in recent PyTorch
    # releases - confirm against the installed version.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=5,
        verbose=True
    )
    print("Initialized ReduceLROnPlateau scheduler.")

    # Start training and evaluation
    trained_model = train_and_evaluate(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        num_epochs=50,
        checkpoint_path='best_tcn_transformer_model.pth',
        patience=10,
        example_input=example_sequence_scaled,  # Pass the scaled example input without batch dimension
        example_label=example_label,
        le=le  # Passed 'le' here
    )

    # train_and_evaluate returns None when it aborts on an error; there is no
    # trained model to inspect in that case.
    if trained_model is None:
        print("Training did not return a model; skipping attention visualization.")
        return

    # -------------------
    # Model Interpretability
    # -------------------
    visualize_attention_weights(trained_model, device, X_test_scaled)

# ---------------------------------------------------
# Execute the Pipeline
# ---------------------------------------------------
if __name__ == "__main__":
    try:
        main_pipeline()
    except Exception as e:
        # Top-level boundary: keep the run from raising, but also print the
        # traceback so the failing line can be located, not just the message.
        import traceback

        print(f"\nAn error occurred: {e}")
        traceback.print_exc()


Using device: cuda
Initial Data:
            unix                 date   symbol      open      high       low  \
0  1711583940000  2024-03-27 23:59:00  BTCUSDT  69492.00  69500.00  69430.81   
1  1711583880000  2024-03-27 23:58:00  BTCUSDT  69484.59  69516.00  69454.86   
2  1711583820000  2024-03-27 23:57:00  BTCUSDT  69548.61  69548.61  69470.58   
3  1711583760000  2024-03-27 23:56:00  BTCUSDT  69579.37  69614.96  69536.64   
4  1711583700000  2024-03-27 23:55:00  BTCUSDT  69559.99  69637.66  69557.93   

      close    volume   volume_from  tradecount  
0  69469.99  24.22811  1.682992e+06         944  
1  69492.00  41.93422  2.913823e+06        1168  
2  69484.58  57.01773  3.963247e+06        1653  
3  69548.61  26.47675  1.841978e+06         974  
4  69579.37  68.19374  4.746594e+06        3120  

Total data points after resampling and rolling: 650381 minutes
Total samples after windowing: 10839

Number of valid samples: 10830
Sequence shape: (500, 4)
Labels shape: (10830,)

Samp