In [None]:
import math
import torch
import time
import random
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.parameter import Parameter
from torch.nn import init
from torch import Tensor
from scipy.special import gamma 
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import os

# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    When CUDA is available the CUDA generators are seeded as well.  cuDNN is
    additionally switched to deterministic mode with auto-tuning (benchmark)
    turned off.

    Args:
        seed: Value handed to every seeding call.
    """
    # CPU-side generators: Python stdlib, NumPy, then PyTorch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # GPU-side generators (current device, then all devices).
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Deterministic cuDNN kernels; no benchmark-driven algorithm selection.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def clip_matrix_norm(matrix, max_norm):
    """Rescale ``matrix`` so that ``torch.norm(matrix)`` does not exceed ``max_norm``.

    Args:
        matrix: Tensor to clip.
        max_norm: Upper bound on the norm.

    Returns:
        ``matrix`` itself when its norm is not greater than ``max_norm``;
        otherwise a new tensor equal to ``matrix * (max_norm / norm)``.
    """
    current = torch.norm(matrix)
    exceeds = current > max_norm
    return matrix * (max_norm / current) if exceeds else matrix


class Fractional_Order_Matrix_Differential_Solver(torch.autograd.Function):
    """Affine map ``input1 @ w + b`` with a custom, fractional-order backward pass.

    The forward pass is an ordinary linear transform.  The backward pass
    returns the usual gradients for the input and the bias, but replaces the
    weight gradient with the surrogate built by
    ``Fractional_Order_Matrix_Differential_Linear`` (a "main" term plus a
    norm-clipped "partial" term, both controlled by the order ``alpha``).

    ``b`` may be ``None`` (layer without bias): the forward pass then skips the
    bias addition, the bias term in the surrogate is treated as zero, and no
    bias gradient is returned.
    """

    @staticmethod
    def forward(ctx, input1, w, b, alpha, c):
        """Compute ``input1 @ w (+ b)`` and stash everything backward needs.

        Args:
            ctx: Autograd context.
            input1: Input tensor; assumed 2-D ``(batch, in_features)`` — the
                backward pass sums the bias gradient over dim 0.
            w: Weight laid out as ``(in_features, out_features)``.
            b: Bias tensor, or ``None``.
            alpha: Fractional order (number or tensor).
            c: Norm bound applied to the partial term (number or tensor).

        Returns:
            The affine output tensor.
        """
        # as_tensor accepts both Python numbers and tensors without re-wrapping
        # (torch.tensor(tensor) emits a copy-construct warning).
        alpha = torch.as_tensor(alpha)
        c = torch.as_tensor(c)
        ctx.save_for_backward(input1, w, b, alpha, c)
        outputs = input1 @ w
        if b is not None:
            outputs = outputs + b
        return outputs

    @staticmethod
    def backward(ctx, grad_outputs):
        """Return gradients for ``(input1, w, b, alpha, c)``.

        ``alpha`` and ``c`` are hyper-parameters and receive ``None``; ``b``
        receives ``None`` when the layer has no bias.
        """
        input1, w, b, alpha, c = ctx.saved_tensors
        solver = Fractional_Order_Matrix_Differential_Solver
        x_fractional, w_fractional = solver.Fractional_Order_Matrix_Differential_Linear(
            input1, w, b, alpha, c)
        x_grad = grad_outputs @ x_fractional
        w_grad = w_fractional @ grad_outputs
        b_grad = grad_outputs.sum(dim=0) if b is not None else None
        return x_grad, w_grad, b_grad, None, None

    @staticmethod
    def Fractional_Order_Matrix_Differential_Linear(xs, ws, b, alpha, c):
        """Build the factors used by ``backward``.

        Only the first column of ``ws`` and the first element of ``b`` enter
        the fractional terms.

        Args:
            xs: Layer input saved in the forward pass.
            ws: Weight saved in the forward pass.
            b: Bias saved in the forward pass, or ``None`` (treated as 0).
            alpha: Fractional order as a tensor.
            c: Norm bound for the partial term.

        Returns:
            ``(ws.T, surrogate)`` where ``surrogate`` is
            ``(w_main + clip_matrix_norm(w_partial, c))`` with its last two
            dimensions swapped.
        """
        wf = ws[:, 0].view(1, -1)
        # Without a bias the additive bias term of the partial part vanishes.
        bias0 = b[0] if b is not None else 0.0
        # main: xs * |w|^(1-alpha) / Gamma(2-alpha); 1e-8 keeps the power finite at w == 0
        w_main = torch.mul(xs, (torch.abs(wf) + 1e-8) ** (1 - alpha) / gamma(2 - alpha))
        # partial: (x.w broadcast over features - x*w + b0) * sgn(w) * |w|^(-alpha) / Gamma(1-alpha)
        w_partial = torch.mul(
            (xs @ wf.T).expand(xs.shape) - torch.mul(xs, wf) + bias0,
            torch.sgn(wf) * (torch.abs(wf) + 1e-8) ** (-alpha) / gamma(1 - alpha))
        return ws.T, (w_main + clip_matrix_norm(w_partial, c)).transpose(-2, -1)

class FLinear(nn.Module):
    """Linear layer whose computation is routed through
    ``Fractional_Order_Matrix_Differential_Solver``.

    The weight is stored as ``(out_features, in_features)`` and initialised
    with Kaiming-uniform (``a=sqrt(5)``); the bias, when present, is drawn
    uniformly from ``[-1/sqrt(fan_in), 1/sqrt(fan_in)]``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        alpha: Fractional order forwarded to the solver.
        c: Norm bound forwarded to the solver.
        bias: When ``False`` the ``bias`` parameter is registered as ``None``.
        device: Device for the parameters.
        dtype: Dtype for the parameters.
    """

    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: Tensor

    def __init__(self, in_features: int, out_features: int, alpha=0.9, c=1.0, bias: bool = True,
                 device=None, dtype=None) -> None:
        super().__init__()
        tensor_opts = dict(device=device, dtype=dtype)
        self.in_features, self.out_features = in_features, out_features
        self.alpha, self.c = alpha, c
        self.weight = Parameter(torch.empty((out_features, in_features), **tensor_opts))
        if not bias:
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.empty(out_features, **tensor_opts))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """(Re)initialise the weight and, when present, the bias in place."""
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is None:
            return
        fan_in = init._calculate_fan_in_and_fan_out(self.weight)[0]
        limit = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
        init.uniform_(self.bias, -limit, limit)

    def forward(self, x):
        """Apply the solver to ``x`` using the transposed weight, the bias, ``alpha`` and ``c``."""
        solver = Fractional_Order_Matrix_Differential_Solver
        return solver.apply(x, self.weight.T, self.bias, self.alpha, self.c)

    def extra_repr(self) -> str:
        """Summarise the layer dimensions and whether a bias is present."""
        has_bias = self.bias is not None
        return f"in_features={self.in_features}, out_features={self.out_features}, bias={has_bias}"
    

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
slide_windows_size = 192       # length of the input window (time steps)
pred_length = 384              # length of the forecast horizon (time steps)
stock = 'ETTh1'                # dataset name; the file read is <root>/data/<stock>.csv
features_j = 6                 # column index of the forecast target (author's note: DJI:4, ETTm2:6)
num_feature = features_j + 1   # feature columns per time step, sizes the MLP input (author's note: DJI:5, ETTm2:7)

# ---------------------------------------------------------------------------
# Load the raw table
# ---------------------------------------------------------------------------
root = r'C:\Users\Administrator\torch_zxj\博士第四篇代码'
df_DJIA = pd.read_csv(f"{root}/data/{stock}.csv")
# Alternative, relative to the working directory:
# df_DJIA = pd.read_csv(r'./data/'+stock+'.csv')
df_DJIA.drop(columns='date', inplace=True)  # the date column is not used as a feature

# 1. Split data first (7:1:2 ratio)
def split_time_series(data, train_ratio=0.7, val_ratio=0.1):
    """Cut ``data`` into consecutive train / validation / test segments.

    The order of the samples is preserved (no shuffling), which matters for
    time series.

    Args:
        data: Sliceable sequence of samples.
        train_ratio: Fraction of samples assigned to the training segment.
        val_ratio: Fraction of samples assigned to the validation segment.

    Returns:
        ``(train, val, test)``; the test segment receives every sample left
        over after the first two cuts.
    """
    total = len(data)
    first_cut = int(total * train_ratio)
    second_cut = first_cut + int(total * val_ratio)
    return data[:first_cut], data[first_cut:second_cut], data[second_cut:]

# Chronological 70 % / 10 % / 20 % split of the raw values, done before any scaling
train_raw, val_raw, test_raw = split_time_series(df_DJIA.values, train_ratio=0.7, val_ratio=0.1)

# 2. Standardise: the scaler is fitted on the training segment only
# scaler = MinMaxScaler()
scaler = StandardScaler()
scaler.fit(train_raw)

# Validation and test data are transformed with the training-set statistics
train_scaled, val_scaled, test_scaled = (
    scaler.transform(segment) for segment in (train_raw, val_raw, test_raw)
)

# 3. Create sequences for time series forecasting
def create_sequences(data, slide_windows_size, pred_length, target_idx):
    """Turn a 2-D series into sliding (input window, future target) pairs.

    Args:
        data: 2-D array indexed as ``[time, feature]``.
        slide_windows_size: Number of consecutive rows in each input window.
        pred_length: Number of rows, directly after the window, to predict.
        target_idx: Column of ``data`` used as the prediction target.

    Returns:
        ``(X, y)`` as float32 arrays: ``X[i]`` is rows
        ``i .. i+slide_windows_size-1`` with all columns, and ``y[i]`` is the
        next ``pred_length`` values of column ``target_idx``.
    """
    # One sample per start index for which window + horizon still fit in data.
    starts = range(len(data) - slide_windows_size - pred_length + 1)
    windows = [data[s:s + slide_windows_size, :] for s in starts]
    horizons = [
        data[s + slide_windows_size:s + slide_windows_size + pred_length, target_idx]
        for s in starts
    ]
    return np.array(windows, dtype=np.float32), np.array(horizons, dtype=np.float32)

# Build (input window, target horizon) pairs for each split
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    create_sequences(segment, slide_windows_size, pred_length, features_j)
    for segment in (train_scaled, val_scaled, test_scaled)
)

# 4. Move everything to the compute device as float tensors
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"\nUsing device: {device}")

(X_train_tensor, y_train_tensor,
 X_val_tensor, y_val_tensor,
 X_test_tensor, y_test_tensor) = (
    torch.FloatTensor(array).to(device)
    for array in (X_train, y_train, X_val, y_val, X_test, y_test)
)

# Mini-batch loader over the training pairs, reshuffled on every pass
batch_size = 256
train_data = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Optimisation hyper-parameters
lr = 1e-4
weight_decay = 1e-5
num_epochs = 200
# Settings for the fractional-order layers: norm bound and fractional order
c = 1.0
alpha = 1.0

class MLP(nn.Module):
    """Fully connected forecaster: flatten -> Linear -> LeakyReLU -> Linear -> LeakyReLU -> Linear.

    Args:
        input_size: Number of values per sample after flattening.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        output_size: Number of predicted values (defaults to ``pred_length``).
    """

    def __init__(self, input_size, hidden_size1=512, hidden_size2=256,output_size=pred_length):
        super().__init__()
        self.flatten = nn.Flatten()

        # Fractional-order variant: swap the three nn.Linear layers below for
        # self.linear1 = FLinear(input_size, hidden_size1, alpha,c)
        # self.linear2 = FLinear(hidden_size1, hidden_size2, alpha,c)
        # self.linear3 = FLinear(hidden_size2, output_size, alpha,c)

        # Baseline: standard linear layers
        self.linear1 = nn.Linear(input_size, hidden_size1)
        self.leakrelu1 = nn.LeakyReLU()
        self.linear2 = nn.Linear(hidden_size1, hidden_size2)
        self.leakrelu2 = nn.LeakyReLU()
        self.linear3 = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        """Map a batch of windows to a batch of forecasts of length ``output_size``."""
        flat = self.flatten(x)  # (batch_size, seq_len*num_features)
        hidden = self.leakrelu1(self.linear1(flat))
        hidden = self.leakrelu2(self.linear2(hidden))
        return self.linear3(hidden)

# ---------------------------------------------------------------------------
# Benchmark: train the MLP, then report peak GPU memory and the average
# wall-clock time per epoch.
# ---------------------------------------------------------------------------
# `device` falls back to the CPU when no GPU is present; the torch.cuda
# bookkeeping below is only meaningful (and only safe to call) on CUDA, so it
# is guarded. On the CPU the memory figures are reported as 0.
use_cuda = device.type == 'cuda'

peak_memory_max = 0
initial_memory = 0.0
if use_cuda:
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    initial_memory = torch.cuda.memory_allocated() / 1024**2
    # Drain pending GPU work so it is not charged to the timed region.
    torch.cuda.synchronize()
time_start = time.time()

set_seed()
model = MLP(input_size=slide_windows_size*num_feature).to(device)
criterion = nn.MSELoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay)   #adam
optimizer = torch.optim.SGD(model.parameters(),lr=lr) #sgd
for ii in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()   #The default value of retain_graph is False.
        optimizer.step()
        if use_cuda:
            # Peak allocation (MB) since reset_peak_memory_stats() above.
            peak_memory = torch.cuda.max_memory_allocated() / 1024**2
            peak_memory_max = max(peak_memory_max, peak_memory)
if use_cuda:
    # Wait for queued kernels to finish before stopping the clock.
    torch.cuda.synchronize()
time_end = time.time()
print(f"Initial Memory: {initial_memory:.4f} MB")
print(f"Peak Memory: {peak_memory_max:.4f} MB")
print(f"Memory: {(peak_memory_max-initial_memory):.4f} MB")
# Average per epoch; divide by num_epochs rather than a hard-coded 200 so the
# figure stays correct when the epoch count is changed.
print(f"Training time: {(time_end-time_start)/num_epochs:.4f} s")
###########To ensure fair comparisons, restart the kernel before running each experiment.



Using device: cuda:0
Initial Memory: 104.8687 MB
Peak Memory: 141.2490 MB
Memory: 36.3804 MB
Training time: 0.2647 s
