In [1]:
import argparse
import torch
from accelerate import Accelerator, DeepSpeedPlugin
from accelerate import DistributedDataParallelKwargs
from torch import nn, optim
from torch.optim import lr_scheduler
from tqdm import tqdm

from models import Autoformer, DLinear, TimeLLM, TimeLLM_lora_bnb

from data_provider.data_factory import data_provider
import time
import random
import numpy as np
import os

os.environ['CURL_CA_BUNDLE'] = ''
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

from utils.tools import del_files, EarlyStopping, adjust_learning_rate, vali, load_content

fix_seed = 2021
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)

class Args:
    """Plain attribute container holding the experiment configuration.

    Stands in for an ``argparse`` namespace inside the notebook: every
    hyper-parameter is a public instance attribute populated with a default.

    Args:
        **overrides: optional ``field=value`` pairs applied after the defaults,
            e.g. ``Args(batch_size=4, pred_len=192)``. Only fields that already
            have a default may be overridden.

    Raises:
        TypeError: if an override names a field that has no default (this guards
            against silently ignored typos such as ``bacth_size=...``).
    """

    def __init__(self, **overrides):
        # Task parameters
        self.task_name = 'long_term_forecast'  # options: [long_term_forecast, short_term_forecast, imputation, classification, anomaly_detection]
        self.is_training = 1
        self.model_id = 'test'
        self.model_comment = 'none'
        self.model = 'TimeLLM'  # model name; this notebook imports Autoformer, DLinear, TimeLLM, TimeLLM_lora_bnb
        self.seed = 2021

        # Data loader parameters
        self.data = 'ETTm1'  # dataset type
        self.root_path = "../data/dataset/ETT-small"  # root path of the data file
        self.data_path = 'ETTm1.csv'  # data file
        self.features = 'M'  # options: [M, S, MS]
        self.target = 'OT'  # target feature in S or MS task
        self.loader = 'modal'  # dataset type
        self.freq = 'h'  # options: [s, t, h, d, b, w, m]
        self.checkpoints = './checkpoints/'  # location of model checkpoints

        # Forecasting task parameters
        self.seq_len = 96  # input sequence length
        self.label_len = 48  # start token length
        self.pred_len = 96  # prediction sequence length
        self.seasonal_patterns = 'Monthly'  # subset for M4

        # Model definition parameters
        self.enc_in = 7  # encoder input size
        self.dec_in = 7  # decoder input size
        self.c_out = 7   # output size
        self.d_model = 16  # dimension of model
        self.n_heads = 8   # num of heads
        self.e_layers = 2   # num of encoder layers
        self.d_layers = 1   # num of decoder layers
        self.d_ff = 32      # dimension of fcn
        self.moving_avg = 25   # window size of moving average
        self.factor = 1      # attention factor
        self.dropout = 0.1   # dropout rate
        self.embed = 'timeF'   # time features encoding options: [timeF, fixed, learned]
        self.activation = 'gelu'   # activation function
        self.output_attention = False   # whether to output attention in encoder
        self.patch_len = 16   # patch length
        self.stride = 8       # stride length
        self.prompt_domain = 0   # prompt domain (if applicable)
        self.llm_model = 'LLAMA'   # LLM model options: [LLAMA, GPT2, BERT]
        self.llm_dim = 4096    # LLM model dimension

        # Optimization parameters
        self.num_workers = 10   # data loader num workers
        self.itr = 1            # experiments times
        self.train_epochs = 10   # train epochs
        self.align_epochs = 10    # alignment epochs
        self.batch_size = 16     # batch size of train input data
        self.eval_batch_size = 8   # batch size of model evaluation
        self.patience = 10       # early stopping patience
        self.learning_rate = 0.0001    # optimizer learning rate
        self.des = 'test'       # experiment description
        self.loss = 'MSE'       # loss function options: ['MSE', ...]
        self.lradj = 'type1'    # adjust learning rate type options: ['type1', ...]
        self.pct_start = 0.2     # pct_start for learning rate adjustment
        self.use_amp = False      # use automatic mixed precision training
        self.llm_layers = 6       # number of LLM layers
        self.percent = 100
        self.model_name = "meta-llama/Llama-3.1-8B"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Apply caller overrides last so they win over the defaults above.
        for field, value in overrides.items():
            if not hasattr(self, field):
                raise TypeError(f"Args() got an unexpected configuration field {field!r}")
            setattr(self, field, value)


# Example usage: build the default configuration and echo two of its fields.
args = Args()
print(args.task_name)   # prints: long_term_forecast
print(args.batch_size)  # prints: 16

# Accelerate setup: DDP kwargs handler plus a DeepSpeed plugin configured from a local JSON file.
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config='./ds_config_zero2.json')
accelerator = Accelerator(
    kwargs_handlers=[ddp_kwargs],
    deepspeed_plugin=deepspeed_plugin,
)

# Experiment identifier assembled from the configuration; it is later used to name the
# checkpoint directory. The trailing literal 0 fills the last positional placeholder.
setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_{}_{}'.format(
    args.task_name, args.model_id, args.model, args.data,
    args.features, args.seq_len, args.label_len, args.pred_len,
    args.d_model, args.n_heads, args.e_layers, args.d_layers,
    args.d_ff, args.factor, args.embed, args.des,
    0,
)

  warn(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


long_term_forecast
16


In [7]:
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    AutoModelForCausalLM
)
# bitsandbytes quantisation settings.
# NOTE(review): load_in_4bit is False, so presumably the bnb_4bit_* options below are
# inert until it is switched on — confirm against the transformers documentation.
q_config = BitsAndBytesConfig(
    load_in_4bit=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Displayed as the cell result: whether transformers can see a bitsandbytes install.
from transformers.utils import is_bitsandbytes_available
is_bitsandbytes_available()

True

In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the Hugging Face access token in the notebook: anything saved in a cell
# ends up in version control and shared copies. Read it from the HF_TOKEN environment
# variable, falling back to an interactive prompt that does not echo the input.
# NOTE(review): the token that was previously committed in this cell must be treated as
# leaked — revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Build the model from the shared configuration and move it to the configured device.
device = args.device
model = TimeLLM_lora_bnb.Model(args).to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
# Scratch cell: prints 1 (presumably a quick check that the kernel responds — TODO confirm or delete).
print(1)

1


In [4]:
# Assumes the other required imports and helpers (EarlyStopping, adjust_learning_rate,
# vali, data_provider, ...) have already been defined in earlier cells.

print(device)

# One (dataset, loader) pair per split.
train_data, train_loader = data_provider(args, 'train')
vali_data, vali_loader = data_provider(args, 'val')
test_data, test_loader = data_provider(args, 'test')

early_stopping = EarlyStopping(accelerator=accelerator, patience=args.patience)

# unique checkpoint saving path
path = os.path.join(args.checkpoints, setting + '-' + args.model_comment)
args.content = load_content(args)
# Only the local main process creates the checkpoint directory.
if accelerator.is_local_main_process and not os.path.exists(path):
    os.makedirs(path)

time_now = time.time()
train_steps = len(train_loader)

# Optimiser and one-cycle schedule sized for train_epochs * train_steps updates.
model_optim = optim.Adam(model.parameters(), lr=args.learning_rate)
scheduler = lr_scheduler.OneCycleLR(
    optimizer=model_optim,
    max_lr=args.learning_rate,
    epochs=args.train_epochs,
    steps_per_epoch=train_steps,
    pct_start=args.pct_start,
)

# MSE is the training objective; L1 is passed to vali() as an additional metric.
criterion = nn.MSELoss()
mae_metric = nn.L1Loss()

model.train()
epoch_time = time.time()

# Bookkeeping read by the training loop that follows.
epoch = 0
iter_count = 0
train_loss = []

# One instrumented training pass over `train_loader` (the epoch index stays at `epoch`),
# followed by a single end-of-epoch evaluation. Every stage is timed and printed.
#
# Fix: the end-of-epoch work (loss averaging, validation, test, early stopping) used to sit
# inside the batch loop. That ran a full validation + test sweep after every batch and
# rebound `train_loss` from a list to a scalar, so the second batch failed on
# `train_loss.append(...)`. It now runs once, after the loop. To train several epochs,
# wrap this cell's body in `for epoch in range(args.train_epochs):` and `break` on early stop.
for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(tqdm(train_loader)):
    step = i + 1      # 1-based batch index used in every log label
    iter_count += 1   # batches processed since the last throughput report

    start_time = time.time()
    model_optim.zero_grad()
    print(f"step {step} 1 - Zero grad: {time.time() - start_time:.4f}s")

    start_time = time.time()
    batch_x = batch_x.float().to(device)
    batch_y = batch_y.float().to(device)
    batch_x_mark = batch_x_mark.float().to(device)
    batch_y_mark = batch_y_mark.float().to(device)
    print(f"step {step} 2 - Data to device: {time.time() - start_time:.4f}s")

    start_time = time.time()
    # Decoder input: the first label_len steps of batch_y followed by zeros for the horizon.
    # batch_y is already float and on `device`, so no further casts or moves are needed.
    dec_inp = torch.zeros_like(batch_y[:, -args.pred_len:, :])
    dec_inp = torch.cat([batch_y[:, :args.label_len, :], dec_inp], dim=1)
    print(f"step {step} 3 - Prepare decoder input: {time.time() - start_time:.4f}s")

    start_time = time.time()
    # NOTE(review): the recorded run of this cell stopped after stage 3 with
    # "mat1 and mat2 must have the same dtype, but got Half and Float"; presumably raised
    # by this forward call — confirm the model's parameter dtype matches the float32 inputs.
    outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
    if args.output_attention:
        outputs = outputs[0]  # the first element is taken as the prediction in this mode
    print(f"step {step} 4 - Model forward: {time.time() - start_time:.4f}s")

    start_time = time.time()
    # Keep only the forecast horizon; for 'MS' only the last feature channel is scored.
    f_dim = -1 if args.features == 'MS' else 0
    outputs = outputs[:, -args.pred_len:, f_dim:]
    batch_y = batch_y[:, -args.pred_len:, f_dim:]
    loss = criterion(outputs, batch_y)
    train_loss.append(loss.item())
    print(f"step {step} 5 - Calculate loss: {time.time() - start_time:.4f}s")

    if step % 100 == 0:
        print(f"\titers: {step}, epoch: {epoch + 1} | loss: {loss.item():.7f}")
        speed = (time.time() - time_now) / iter_count
        left_time = speed * ((args.train_epochs - epoch) * train_steps - i)
        print(f'\tspeed: {speed:.4f}s/iter; left time: {left_time:.4f}s')
        iter_count = 0
        time_now = time.time()

    start_time = time.time()
    loss.backward()
    print(f"step {step} 6 - Backward: {time.time() - start_time:.4f}s")

    start_time = time.time()
    model_optim.step()
    print(f"step {step} 7 - Optimizer step: {time.time() - start_time:.4f}s")

    # NOTE(review): adjust_learning_rate and scheduler.step() both run on every batch;
    # confirm the helper does not override the OneCycleLR schedule for args.lradj.
    start_time = time.time()
    adjust_learning_rate(accelerator, model_optim, scheduler, epoch + 1, args, printout=False)
    print(f"step {step} 8 - Adjust learning rate: {time.time() - start_time:.4f}s")

    start_time = time.time()
    scheduler.step()
    print(f"step {step} 9 - Scheduler step: {time.time() - start_time:.4f}s")

# ---- end of epoch: summarise, evaluate, checkpoint ----
start_time = time.time()
print(f"Epoch: {epoch + 1} cost time: {time.time() - epoch_time}")
train_loss = np.average(train_loss)  # list of per-batch losses -> scalar mean
print(f"epoch {epoch + 1} 10 - Average train loss: {time.time() - start_time:.4f}s")

start_time = time.time()
vali_loss, vali_mae_loss = vali(args, accelerator, model, vali_data, vali_loader, criterion, mae_metric)
print(f"epoch {epoch + 1} 11 - Validation: {time.time() - start_time:.4f}s")

start_time = time.time()
test_loss, test_mae_loss = vali(args, accelerator, model, test_data, test_loader, criterion, mae_metric)
print(f"epoch {epoch + 1} 12 - Test: {time.time() - start_time:.4f}s")

start_time = time.time()
print(f"Epoch: {epoch + 1} | Train Loss: {train_loss:.7f} Vali Loss: {vali_loss:.7f} Test Loss: {test_loss:.7f} MAE Loss: {test_mae_loss:.7f}")
early_stopping(vali_loss, model, path)
print(f"epoch {epoch + 1} 13 - Early stopping: {time.time() - start_time:.4f}s")

if early_stopping.early_stop:
    print("Early stopping")
else:
    start_time = time.time()
    print(f'Updating learning rate to {scheduler.get_last_lr()[0]}')
    print(f"epoch {epoch + 1} 14 - Learning rate updated: {time.time() - start_time:.4f}s")

cuda


  0%|          | 0/15036 [00:00<?, ?it/s]

step 1 1 - Zero grad: 0.0013s
step 1 2 - Data to device: 0.0012s
step 1 3 - Prepare decoder input: 0.0003s
cuda:0





RuntimeError: mat1 and mat2 must have the same dtype, but got Half and Float

In [36]:
# Inspection cell: display the repr of the EarlyStopping helper created earlier.
early_stopping

<utils.tools.EarlyStopping at 0x7fa472f83910>

In [38]:
# Collect only the parameters whose requires_grad flag is True.
trained_parameters = [
    param for param in model.parameters() if param.requires_grad is True
]

In [43]:
# Inspection cell: show which device the notebook is currently targeting.
device

device(type='mps')

In [46]:
# Shape probe: cast and move the first three batches to `device`, printing the shape of each tensor.
for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in tqdm(enumerate(train_loader)):
    if i >= 3:
        break

    batch_x, batch_y, batch_x_mark, batch_y_mark = (
        tensor.float().to(device)
        for tensor in (batch_x, batch_y, batch_x_mark, batch_y_mark)
    )

    print(batch_x.shape, batch_y.shape, batch_x_mark.shape, batch_y_mark.shape)

3it [00:01,  1.98it/s]

torch.Size([32, 96, 1]) torch.Size([32, 144, 1]) torch.Size([32, 96, 4]) torch.Size([32, 144, 4])
torch.Size([32, 96, 1]) torch.Size([32, 144, 1]) torch.Size([32, 96, 4]) torch.Size([32, 144, 4])
torch.Size([32, 96, 1]) torch.Size([32, 144, 1]) torch.Size([32, 96, 4]) torch.Size([32, 144, 4])





In [63]:
# Inspection cell: show the configured start-token (label) length.
args.label_len

48

In [60]:
# Decoder input: the first label_len steps of batch_y followed by a zero block shaped like
# the last pred_len steps, concatenated along dim 1.
dec_inp = torch.cat(
    [
        batch_y[:, :args.label_len, :].float().to(device),
        torch.zeros_like(batch_y[:, -args.pred_len:, :], dtype=torch.float32).float().to(device),
    ],
    dim=1,
).to(device)

In [61]:
# Inspection cell: display the shape of the decoder input built in the previous cell.
dec_inp.shape

torch.Size([32, 144, 1])