In [2]:
import argparse
import torch
from accelerate import Accelerator, DeepSpeedPlugin
from accelerate import DistributedDataParallelKwargs
from torch import nn, optim
from torch.optim import lr_scheduler
from tqdm import tqdm

from models import Autoformer, DLinear, TimeLLM, TimeLLM_lora_bnb

from data_provider.data_factory import data_provider
import time
import random
import numpy as np
import os

os.environ['CURL_CA_BUNDLE'] = ''
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

from utils.tools import del_files, EarlyStopping, adjust_learning_rate, vali, load_content

fix_seed = 2021
random.seed(fix_seed)
torch.manual_seed(fix_seed)
np.random.seed(fix_seed)

class Args:
    def __init__(self):
        # Task parameters
        self.task_name = 'long_term_forecast'  # options: [long_term_forecast, short_term_forecast, imputation, classification, anomaly_detection]
        self.is_training = 1
        self.model_id = 'test'
        self.model_comment = 'none'
        self.model = 'TimeLLM'  # options: [Autoformer, DLinear]
        self.seed = 2021

        # Data loader parameters
        self.data = 'ETTm1'  # dataset type
        self.root_path = "../data/dataset/ETT-small"  # root path of the data file
        self.data_path = 'ETTm1.csv'  # data file
        self.features = 'M'  # options: [M, S, MS]
        self.target = 'OT'  # target feature in S or MS task
        self.loader = 'modal'  # dataset type
        self.freq = 'h'  # options: [s, t, h, d, b, w, m]
        self.checkpoints = './checkpoints/'  # location of model checkpoints

        # Forecasting task parameters
        self.seq_len = 96  # input sequence length
        self.label_len = 48  # start token length
        self.pred_len = 96  # prediction sequence length
        self.seasonal_patterns = 'Monthly'  # subset for M4

        # Model definition parameters
        self.enc_in = 7  # encoder input size
        self.dec_in = 7  # decoder input size
        self.c_out = 7   # output size
        self.d_model = 16  # dimension of model
        self.n_heads = 8   # num of heads
        self.e_layers = 2   # num of encoder layers
        self.d_layers = 1   # num of decoder layers
        self.d_ff = 32      # dimension of fcn
        self.moving_avg = 25   # window size of moving average
        self.factor = 1      # attention factor
        self.dropout = 0.1   # dropout rate
        self.embed = 'timeF'   # time features encoding options: [timeF, fixed, learned]
        self.activation = 'gelu'   # activation function
        self.output_attention = False   # whether to output attention in encoder
        self.patch_len = 16   # patch length
        self.stride = 8       # stride length
        self.prompt_domain = 0   # prompt domain (if applicable)
        self.llm_model = 'LLAMA'   # LLM model options: [LLAMA, GPT2, BERT]
        self.llm_dim = 4096    # LLM model dimension

        # Optimization parameters
        self.num_workers = 10   # data loader num workers
        self.itr = 1            # experiments times
        self.train_epochs = 10   # train epochs
        self.align_epochs = 10    # alignment epochs
        self.batch_size = 16     # batch size of train input data
        self.eval_batch_size = 8   # batch size of model evaluation
        self.patience = 10       # early stopping patience
        self.learning_rate = 0.0001    # optimizer learning rate
        self.des = 'test'       # experiment description
        self.loss = 'MSE'       # loss function options: ['MSE', ...]
        self.lradj = 'type1'    # adjust learning rate type options: ['type1', ...]
        self.pct_start = 0.2     # pct_start for learning rate adjustment
        self.use_amp = True      # use automatic mixed precision training
        self.llm_layers = 6       # number of LLM layers
        self.percent = 100
        self.model_name = "meta-llama/Llama-3.1-8B"
        self.device = "cuda" if torch.cuda.is_available() else "cpu"


# 使用示例：
args = Args()
print(args.task_name)           # 输出: long_term_forecast
print(args.batch_size)
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
deepspeed_plugin = DeepSpeedPlugin(hf_ds_config='./ds_config_zero2.json')
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs], deepspeed_plugin=deepspeed_plugin)
setting = '{}_{}_{}_{}_ft{}_sl{}_ll{}_pl{}_dm{}_nh{}_el{}_dl{}_df{}_fc{}_eb{}_{}_{}'.format(
    args.task_name,
    args.model_id,
    args.model,
    args.data,
    args.features,
    args.seq_len,
    args.label_len,
    args.pred_len,
    args.d_model,
    args.n_heads,
    args.e_layers,
    args.d_layers,
    args.d_ff,
    args.factor,
    args.embed,
    args.des, 0)# 输出: 32

  warn(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


long_term_forecast
16


In [2]:
from transformers import (
    AutoModel,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    AutoModelForCausalLM
)
q_config = BitsAndBytesConfig(load_in_4bit=False,
                                bnb_4bit_quant_type='nf4',
                                bnb_4bit_use_double_quant=True,
                                bnb_4bit_compute_dtype=torch.float16
                                )
from transformers.utils import is_bitsandbytes_available
is_bitsandbytes_available()     

True

In [3]:
hf_token = "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"  #Put your own HF token here, do not publish it
from huggingface_hub import login

# Login directly with your Token (remember not to share this Token publicly)
login(token=hf_token)
import os
device = args.device
model = TimeLLM_lora_bnb.Model(args).to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
from time import sleep
from torch.cuda.amp import autocast, GradScaler
from collections import defaultdict

print(device)
train_data, train_loader = data_provider(args, 'train')
vali_data, vali_loader = data_provider(args, 'val')
test_data, test_loader = data_provider(args, 'test')
early_stopping = EarlyStopping(accelerator=accelerator, patience=args.patience)
time_now = time.time()
train_steps = len(train_loader)
model_optim = optim.Adam(model.parameters(), lr=args.learning_rate)
criterion = nn.MSELoss()
scaler = GradScaler()
scheduler = lr_scheduler.OneCycleLR(optimizer=model_optim,
                            steps_per_epoch=train_steps,
                            pct_start=args.pct_start,
                            epochs=args.train_epochs,
                            max_lr=args.learning_rate)

model.train()
time_now = time.time()
epoch = 0

iter_count = 0
train_loss = []

time_cost_map = defaultdict(list)
for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(tqdm(train_loader)):
    model_optim.zero_grad()
    batch_x = batch_x.float().to(device)
    batch_y = batch_y.float().to(device)
    batch_x_mark = batch_x_mark.float().to(device)
    batch_y_mark = batch_y_mark.float().to(device)
    dec_inp = torch.zeros_like(batch_y[:, -args.pred_len:, :]).float().to(device)
    dec_inp = torch.cat([batch_y[:, :args.label_len, :], dec_inp], dim=1).float().to(device)

    with autocast():  # autocast context manager
        outputs = model(batch_x, batch_x_mark, dec_inp, batch_y_mark)
        start_time = time.time()
        f_dim = -1 if args.features == 'MS' else 0
        outputs = outputs[:, -args.pred_len:, f_dim:]
        batch_y = batch_y[:, -args.pred_len:, f_dim:]
        loss = criterion(outputs, batch_y)
        train_loss.append(loss.item())
    
    scaler.scale(loss).backward()  # scale the loss
    scaler.step(model_optim)  # update parameters
    scaler.update()
    if (i + 1) % 100 == 0:
        print(f"\titers: {i + 1}, epoch: {epoch + 1} | loss: {loss.item():.7f}")
        speed = (time.time() - time_now) / iter_count
        left_time = speed * ((args.train_epochs - epoch) * train_steps - i)
        print(f'\tspeed: {speed:.4f}s/iter; left time: {left_time:.4f}s')
        iter_count = 0
        time_now = time.time()
    
    adjust_learning_rate(accelerator, model_optim, scheduler, epoch + 1, args, printout=False)
    scheduler.step()
    iter_count += 1
train_loss = np.average(train_loss)

start_time = time.time()
vali_loss, vali_mae_loss = vali(args, accelerator, model, vali_data, vali_loader, criterion, mae_metric)
time_cost_map["Validation"].append(f"{time.time() - start_time:.4f}s")

start_time = time.time()
test_loss, test_mae_loss = vali(args, accelerator, model, test_data, test_loader, criterion, mae_metric)
time_cost_map["Test"].append(f"{time.time() - start_time:.4f}s")

print(f"Epoch: {epoch + 1} | Train Loss: {train_loss:.7f} Vali Loss: {vali_loss:.7f} Test Loss: {test_loss:.7f} MAE Loss: {test_mae_loss:.7f}")

cuda


  1%|          | 100/15036 [01:56<4:36:16,  1.11s/it]

	iters: 100, epoch: 1 | loss: 0.7790316
	speed: 1.1789s/iter; left time: 177149.5179s


  1%|▏         | 200/15036 [03:47<4:36:26,  1.12s/it]

	iters: 200, epoch: 1 | loss: 0.4822383
	speed: 1.1123s/iter; left time: 167026.3161s


  2%|▏         | 300/15036 [05:39<4:33:46,  1.11s/it]

	iters: 300, epoch: 1 | loss: 0.1686792
	speed: 1.1141s/iter; left time: 167176.8720s


  3%|▎         | 400/15036 [07:30<4:32:02,  1.12s/it]

	iters: 400, epoch: 1 | loss: 0.1516259
	speed: 1.1145s/iter; left time: 167125.0346s


  3%|▎         | 500/15036 [09:22<4:29:44,  1.11s/it]

	iters: 500, epoch: 1 | loss: 0.3683558
	speed: 1.1140s/iter; left time: 166938.2233s


  4%|▍         | 600/15036 [11:13<4:31:28,  1.13s/it]

	iters: 600, epoch: 1 | loss: 0.4165886
	speed: 1.1158s/iter; left time: 167107.8969s


  5%|▍         | 700/15036 [13:05<4:26:47,  1.12s/it]

	iters: 700, epoch: 1 | loss: 0.5344632
	speed: 1.1135s/iter; left time: 166654.4696s


  5%|▌         | 800/15036 [14:56<4:25:21,  1.12s/it]

	iters: 800, epoch: 1 | loss: 0.5059830
	speed: 1.1127s/iter; left time: 166413.2673s


  6%|▌         | 900/15036 [16:47<4:22:58,  1.12s/it]

	iters: 900, epoch: 1 | loss: 0.5508135
	speed: 1.1117s/iter; left time: 166155.7364s


  7%|▋         | 1000/15036 [18:39<4:20:21,  1.11s/it]

	iters: 1000, epoch: 1 | loss: 0.3299023
	speed: 1.1146s/iter; left time: 166481.2035s


  7%|▋         | 1100/15036 [20:30<4:18:42,  1.11s/it]

	iters: 1100, epoch: 1 | loss: 0.5405400
	speed: 1.1141s/iter; left time: 166290.1415s


  8%|▊         | 1200/15036 [22:21<4:16:37,  1.11s/it]

	iters: 1200, epoch: 1 | loss: 0.4565525
	speed: 1.1118s/iter; left time: 165832.2378s


  9%|▊         | 1300/15036 [24:13<4:15:13,  1.11s/it]

	iters: 1300, epoch: 1 | loss: 0.5180427
	speed: 1.1149s/iter; left time: 166190.5465s


  9%|▉         | 1400/15036 [26:04<4:13:18,  1.11s/it]

	iters: 1400, epoch: 1 | loss: 0.6829585
	speed: 1.1131s/iter; left time: 165812.8471s


 10%|▉         | 1500/15036 [27:55<4:11:17,  1.11s/it]

	iters: 1500, epoch: 1 | loss: 0.4100318
	speed: 1.1127s/iter; left time: 165637.1644s


 11%|█         | 1600/15036 [29:47<4:09:22,  1.11s/it]

	iters: 1600, epoch: 1 | loss: 0.3144052
	speed: 1.1135s/iter; left time: 165650.1632s


 11%|█▏        | 1700/15036 [31:38<4:09:37,  1.12s/it]

	iters: 1700, epoch: 1 | loss: 0.3712424
	speed: 1.1133s/iter; left time: 165505.7245s


 12%|█▏        | 1800/15036 [33:29<4:06:09,  1.12s/it]

	iters: 1800, epoch: 1 | loss: 0.3833300
	speed: 1.1132s/iter; left time: 165382.8010s


 13%|█▎        | 1900/15036 [35:21<4:04:24,  1.12s/it]

	iters: 1900, epoch: 1 | loss: 0.2031572
	speed: 1.1132s/iter; left time: 165269.4387s


 13%|█▎        | 2000/15036 [37:12<4:02:23,  1.12s/it]

	iters: 2000, epoch: 1 | loss: 0.3133192
	speed: 1.1128s/iter; left time: 165095.7674s


 14%|█▍        | 2100/15036 [39:03<4:00:52,  1.12s/it]

	iters: 2100, epoch: 1 | loss: 0.3756426
	speed: 1.1155s/iter; left time: 165392.1341s


 15%|█▍        | 2200/15036 [40:55<3:58:12,  1.11s/it]

	iters: 2200, epoch: 1 | loss: 0.4869344
	speed: 1.1133s/iter; left time: 164948.2322s


 15%|█▌        | 2300/15036 [42:46<3:59:35,  1.13s/it]

	iters: 2300, epoch: 1 | loss: 0.4825015
	speed: 1.1143s/iter; left time: 164990.9026s


 16%|█▌        | 2400/15036 [44:37<3:54:32,  1.11s/it]

	iters: 2400, epoch: 1 | loss: 0.5821098
	speed: 1.1121s/iter; left time: 164549.5511s


 17%|█▋        | 2500/15036 [46:29<3:52:28,  1.11s/it]

	iters: 2500, epoch: 1 | loss: 0.3402213
	speed: 1.1133s/iter; left time: 164613.8722s


 17%|█▋        | 2600/15036 [48:20<3:51:13,  1.12s/it]

	iters: 2600, epoch: 1 | loss: 0.2619679
	speed: 1.1122s/iter; left time: 164346.0388s


 18%|█▊        | 2700/15036 [50:11<3:49:47,  1.12s/it]

	iters: 2700, epoch: 1 | loss: 0.3653626
	speed: 1.1136s/iter; left time: 164436.8364s


 19%|█▊        | 2800/15036 [52:03<3:48:37,  1.12s/it]

	iters: 2800, epoch: 1 | loss: 0.2858514
	speed: 1.1136s/iter; left time: 164330.2016s


 19%|█▉        | 2900/15036 [53:54<3:45:29,  1.11s/it]

	iters: 2900, epoch: 1 | loss: 0.2733112
	speed: 1.1135s/iter; left time: 164203.7281s


 20%|█▉        | 3000/15036 [55:45<3:43:12,  1.11s/it]

	iters: 3000, epoch: 1 | loss: 0.3777665
	speed: 1.1115s/iter; left time: 163794.4302s


 21%|██        | 3100/15036 [57:36<3:41:18,  1.11s/it]

	iters: 3100, epoch: 1 | loss: 0.5116787
	speed: 1.1134s/iter; left time: 163953.6506s


 21%|██▏       | 3200/15036 [59:28<3:41:19,  1.12s/it]

	iters: 3200, epoch: 1 | loss: 0.3827979
	speed: 1.1137s/iter; left time: 163896.1363s


 22%|██▏       | 3300/15036 [1:01:19<3:38:25,  1.12s/it]

	iters: 3300, epoch: 1 | loss: 0.4038167
	speed: 1.1141s/iter; left time: 163837.9282s


 23%|██▎       | 3400/15036 [1:03:10<3:35:41,  1.11s/it]

	iters: 3400, epoch: 1 | loss: 0.2527933
	speed: 1.1120s/iter; left time: 163418.1037s


 23%|██▎       | 3500/15036 [1:05:02<3:34:26,  1.12s/it]

	iters: 3500, epoch: 1 | loss: 0.4422641
	speed: 1.1124s/iter; left time: 163371.3260s


 24%|██▍       | 3600/15036 [1:06:53<3:34:24,  1.12s/it]

	iters: 3600, epoch: 1 | loss: 0.5528213
	speed: 1.1160s/iter; left time: 163789.5734s


 25%|██▍       | 3700/15036 [1:08:45<3:30:52,  1.12s/it]

	iters: 3700, epoch: 1 | loss: 0.4187749
	speed: 1.1123s/iter; left time: 163127.7053s


 25%|██▌       | 3800/15036 [1:10:36<3:29:42,  1.12s/it]

	iters: 3800, epoch: 1 | loss: 0.4177473
	speed: 1.1132s/iter; left time: 163152.8257s


 26%|██▌       | 3900/15036 [1:12:27<3:26:24,  1.11s/it]

	iters: 3900, epoch: 1 | loss: 0.4784898
	speed: 1.1132s/iter; left time: 163040.3160s


 27%|██▋       | 4000/15036 [1:14:18<3:24:45,  1.11s/it]

	iters: 4000, epoch: 1 | loss: 0.4323361
	speed: 1.1133s/iter; left time: 162939.5246s


 27%|██▋       | 4100/15036 [1:16:10<3:22:36,  1.11s/it]

	iters: 4100, epoch: 1 | loss: 0.4223856
	speed: 1.1125s/iter; left time: 162712.6624s


 28%|██▊       | 4200/15036 [1:18:01<3:21:18,  1.11s/it]

	iters: 4200, epoch: 1 | loss: 0.2253703
	speed: 1.1149s/iter; left time: 162957.2501s


 29%|██▊       | 4300/15036 [1:19:53<3:20:56,  1.12s/it]

	iters: 4300, epoch: 1 | loss: 0.2471094
	speed: 1.1139s/iter; left time: 162704.1004s


 29%|██▉       | 4400/15036 [1:21:44<3:17:30,  1.11s/it]

	iters: 4400, epoch: 1 | loss: 0.5374485
	speed: 1.1126s/iter; left time: 162400.8516s


 30%|██▉       | 4500/15036 [1:23:35<3:15:20,  1.11s/it]

	iters: 4500, epoch: 1 | loss: 0.3688613
	speed: 1.1140s/iter; left time: 162486.9107s


 31%|███       | 4600/15036 [1:25:27<3:17:12,  1.13s/it]

	iters: 4600, epoch: 1 | loss: 0.7511730
	speed: 1.1125s/iter; left time: 162165.5437s


 31%|███▏      | 4700/15036 [1:27:18<3:11:56,  1.11s/it]

	iters: 4700, epoch: 1 | loss: 0.3102842
	speed: 1.1133s/iter; left time: 162168.7114s


 32%|███▏      | 4800/15036 [1:29:09<3:09:48,  1.11s/it]

	iters: 4800, epoch: 1 | loss: 0.5028665
	speed: 1.1148s/iter; left time: 162273.4608s


 33%|███▎      | 4900/15036 [1:31:01<3:08:26,  1.12s/it]

	iters: 4900, epoch: 1 | loss: 0.4520597
	speed: 1.1129s/iter; left time: 161882.1738s


 33%|███▎      | 5000/15036 [1:32:52<3:06:06,  1.11s/it]

	iters: 5000, epoch: 1 | loss: 0.2699903
	speed: 1.1138s/iter; left time: 161901.3543s


 34%|███▍      | 5100/15036 [1:34:44<3:05:32,  1.12s/it]

	iters: 5100, epoch: 1 | loss: 0.2634360
	speed: 1.1153s/iter; left time: 162004.7765s


 35%|███▍      | 5200/15036 [1:36:35<3:02:33,  1.11s/it]

	iters: 5200, epoch: 1 | loss: 0.4614352
	speed: 1.1146s/iter; left time: 161800.0234s


 35%|███▌      | 5300/15036 [1:38:26<3:00:27,  1.11s/it]

	iters: 5300, epoch: 1 | loss: 0.2350820
	speed: 1.1142s/iter; left time: 161625.8810s


 36%|███▌      | 5400/15036 [1:40:18<2:58:57,  1.11s/it]

	iters: 5400, epoch: 1 | loss: 0.3661196
	speed: 1.1142s/iter; left time: 161513.7101s


 37%|███▋      | 5500/15036 [1:42:09<2:56:59,  1.11s/it]

	iters: 5500, epoch: 1 | loss: 0.4047999
	speed: 1.1117s/iter; left time: 161043.2759s


 37%|███▋      | 5600/15036 [1:44:00<2:54:50,  1.11s/it]

	iters: 5600, epoch: 1 | loss: 0.2813570
	speed: 1.1127s/iter; left time: 161072.0025s


 38%|███▊      | 5700/15036 [1:45:52<2:53:18,  1.11s/it]

	iters: 5700, epoch: 1 | loss: 0.2719160
	speed: 1.1133s/iter; left time: 161054.6619s


 39%|███▊      | 5800/15036 [1:47:43<2:53:08,  1.12s/it]

	iters: 5800, epoch: 1 | loss: 0.3513772
	speed: 1.1157s/iter; left time: 161286.4140s


 39%|███▉      | 5900/15036 [1:49:34<2:49:22,  1.11s/it]

	iters: 5900, epoch: 1 | loss: 0.5430110
	speed: 1.1128s/iter; left time: 160758.0210s


 40%|███▉      | 6000/15036 [1:51:26<2:47:25,  1.11s/it]

	iters: 6000, epoch: 1 | loss: 0.2517066
	speed: 1.1136s/iter; left time: 160765.6875s


 41%|████      | 6100/15036 [1:53:17<2:45:38,  1.11s/it]

	iters: 6100, epoch: 1 | loss: 0.2588045
	speed: 1.1124s/iter; left time: 160481.5630s


 41%|████      | 6200/15036 [1:55:08<2:43:49,  1.11s/it]

	iters: 6200, epoch: 1 | loss: 0.3999674
	speed: 1.1134s/iter; left time: 160505.3150s


 42%|████▏     | 6300/15036 [1:57:00<2:42:14,  1.11s/it]

	iters: 6300, epoch: 1 | loss: 0.2245929
	speed: 1.1138s/iter; left time: 160462.2916s


 43%|████▎     | 6400/15036 [1:58:51<2:40:40,  1.12s/it]

	iters: 6400, epoch: 1 | loss: 0.3238456
	speed: 1.1122s/iter; left time: 160106.5967s


 43%|████▎     | 6500/15036 [2:00:42<2:38:19,  1.11s/it]

	iters: 6500, epoch: 1 | loss: 0.5120806
	speed: 1.1120s/iter; left time: 159973.0716s


 44%|████▍     | 6600/15036 [2:02:33<2:36:19,  1.11s/it]

	iters: 6600, epoch: 1 | loss: 0.2335766
	speed: 1.1128s/iter; left time: 159980.3643s


 45%|████▍     | 6700/15036 [2:04:25<2:37:00,  1.13s/it]

	iters: 6700, epoch: 1 | loss: 0.4130335
	speed: 1.1138s/iter; left time: 160004.3757s


 45%|████▌     | 6800/15036 [2:06:16<2:34:53,  1.13s/it]

	iters: 6800, epoch: 1 | loss: 0.3596023
	speed: 1.1139s/iter; left time: 159913.7460s


 46%|████▌     | 6900/15036 [2:08:08<2:31:06,  1.11s/it]

	iters: 6900, epoch: 1 | loss: 0.3758933
	speed: 1.1163s/iter; left time: 160139.1396s


 47%|████▋     | 7000/15036 [2:09:59<2:29:49,  1.12s/it]

	iters: 7000, epoch: 1 | loss: 0.3825092
	speed: 1.1121s/iter; left time: 159427.6819s


 47%|████▋     | 7100/15036 [2:11:50<2:27:47,  1.12s/it]

	iters: 7100, epoch: 1 | loss: 0.4651513
	speed: 1.1138s/iter; left time: 159564.4968s


 48%|████▊     | 7200/15036 [2:13:42<2:26:38,  1.12s/it]

	iters: 7200, epoch: 1 | loss: 0.5144248
	speed: 1.1131s/iter; left time: 159356.6523s


 49%|████▊     | 7300/15036 [2:15:33<2:24:00,  1.12s/it]

	iters: 7300, epoch: 1 | loss: 0.1846963
	speed: 1.1148s/iter; left time: 159484.0563s


 49%|████▉     | 7400/15036 [2:17:25<2:24:32,  1.14s/it]

	iters: 7400, epoch: 1 | loss: 0.5874090
	speed: 1.1138s/iter; left time: 159236.9653s


 50%|████▉     | 7500/15036 [2:19:16<2:19:41,  1.11s/it]

	iters: 7500, epoch: 1 | loss: 0.2309799
	speed: 1.1115s/iter; left time: 158791.1481s


 51%|█████     | 7600/15036 [2:21:07<2:18:56,  1.12s/it]

	iters: 7600, epoch: 1 | loss: 0.1898429
	speed: 1.1144s/iter; left time: 159092.0920s


 51%|█████     | 7700/15036 [2:22:59<2:16:12,  1.11s/it]

	iters: 7700, epoch: 1 | loss: 0.5843292
	speed: 1.1146s/iter; left time: 159015.3489s


 52%|█████▏    | 7800/15036 [2:24:50<2:14:34,  1.12s/it]

	iters: 7800, epoch: 1 | loss: 0.3714770
	speed: 1.1111s/iter; left time: 158399.6024s


 53%|█████▎    | 7900/15036 [2:26:41<2:12:32,  1.11s/it]

	iters: 7900, epoch: 1 | loss: 0.1985273
	speed: 1.1127s/iter; left time: 158518.9242s


 53%|█████▎    | 8000/15036 [2:28:32<2:10:31,  1.11s/it]

	iters: 8000, epoch: 1 | loss: 0.3112651
	speed: 1.1118s/iter; left time: 158281.1109s


 54%|█████▍    | 8100/15036 [2:30:24<2:09:13,  1.12s/it]

	iters: 8100, epoch: 1 | loss: 0.2920770
	speed: 1.1145s/iter; left time: 158543.9703s


 55%|█████▍    | 8200/15036 [2:32:15<2:07:18,  1.12s/it]

	iters: 8200, epoch: 1 | loss: 0.3431327
	speed: 1.1153s/iter; left time: 158545.5831s


 55%|█████▌    | 8300/15036 [2:34:07<2:04:53,  1.11s/it]

	iters: 8300, epoch: 1 | loss: 0.4899525
	speed: 1.1137s/iter; left time: 158215.0331s


 56%|█████▌    | 8400/15036 [2:35:58<2:03:11,  1.11s/it]

	iters: 8400, epoch: 1 | loss: 0.2624430
	speed: 1.1140s/iter; left time: 158138.9421s


 57%|█████▋    | 8500/15036 [2:37:49<2:01:14,  1.11s/it]

	iters: 8500, epoch: 1 | loss: 0.4483156
	speed: 1.1116s/iter; left time: 157695.5694s


 57%|█████▋    | 8600/15036 [2:39:41<1:59:54,  1.12s/it]

	iters: 8600, epoch: 1 | loss: 0.3183158
	speed: 1.1155s/iter; left time: 158135.5334s


 58%|█████▊    | 8700/15036 [2:41:32<1:57:27,  1.11s/it]

	iters: 8700, epoch: 1 | loss: 0.2639415
	speed: 1.1127s/iter; left time: 157624.9079s


 59%|█████▊    | 8800/15036 [2:43:23<1:56:21,  1.12s/it]

	iters: 8800, epoch: 1 | loss: 0.2625238
	speed: 1.1126s/iter; left time: 157501.3856s


 59%|█████▉    | 8900/15036 [2:45:14<1:53:49,  1.11s/it]

	iters: 8900, epoch: 1 | loss: 0.2644734
	speed: 1.1120s/iter; left time: 157306.9693s


 60%|█████▉    | 9000/15036 [2:47:06<1:53:36,  1.13s/it]

	iters: 9000, epoch: 1 | loss: 0.3530653
	speed: 1.1138s/iter; left time: 157448.5422s


 61%|██████    | 9100/15036 [2:48:57<1:50:19,  1.12s/it]

	iters: 9100, epoch: 1 | loss: 0.4277333
	speed: 1.1143s/iter; left time: 157406.2500s


 61%|██████    | 9200/15036 [2:50:48<1:48:19,  1.11s/it]

	iters: 9200, epoch: 1 | loss: 0.3998293
	speed: 1.1124s/iter; left time: 157022.0045s


 62%|██████▏   | 9300/15036 [2:52:40<1:46:16,  1.11s/it]

	iters: 9300, epoch: 1 | loss: 0.2286516
	speed: 1.1107s/iter; left time: 156678.8922s


 63%|██████▎   | 9400/15036 [2:54:31<1:46:30,  1.13s/it]

	iters: 9400, epoch: 1 | loss: 0.3625774
	speed: 1.1129s/iter; left time: 156878.8741s


 63%|██████▎   | 9500/15036 [2:56:22<1:42:40,  1.11s/it]

	iters: 9500, epoch: 1 | loss: 0.2796750
	speed: 1.1128s/iter; left time: 156745.8462s


 64%|██████▍   | 9600/15036 [2:58:14<1:41:09,  1.12s/it]

	iters: 9600, epoch: 1 | loss: 0.3436350
	speed: 1.1152s/iter; left time: 156982.3103s


 65%|██████▍   | 9700/15036 [3:00:05<1:39:16,  1.12s/it]

	iters: 9700, epoch: 1 | loss: 0.2691304
	speed: 1.1122s/iter; left time: 156444.9598s


 65%|██████▌   | 9800/15036 [3:01:56<1:37:07,  1.11s/it]

	iters: 9800, epoch: 1 | loss: 0.2551854
	speed: 1.1113s/iter; left time: 156209.5849s


 66%|██████▌   | 9900/15036 [3:03:47<1:35:39,  1.12s/it]

	iters: 9900, epoch: 1 | loss: 0.3045796
	speed: 1.1119s/iter; left time: 156184.3895s


 67%|██████▋   | 10000/15036 [3:05:39<1:33:24,  1.11s/it]

	iters: 10000, epoch: 1 | loss: 0.3397721
	speed: 1.1134s/iter; left time: 156283.1374s


 67%|██████▋   | 10100/15036 [3:07:30<1:31:59,  1.12s/it]

	iters: 10100, epoch: 1 | loss: 0.2493875
	speed: 1.1113s/iter; left time: 155878.9730s


 68%|██████▊   | 10200/15036 [3:09:21<1:29:43,  1.11s/it]

	iters: 10200, epoch: 1 | loss: 0.4195319
	speed: 1.1120s/iter; left time: 155863.7278s


 69%|██████▊   | 10300/15036 [3:11:12<1:27:54,  1.11s/it]

	iters: 10300, epoch: 1 | loss: 0.3366079
	speed: 1.1134s/iter; left time: 155946.7233s


 69%|██████▉   | 10400/15036 [3:13:04<1:25:56,  1.11s/it]

	iters: 10400, epoch: 1 | loss: 0.2819530
	speed: 1.1135s/iter; left time: 155845.5049s


 70%|██████▉   | 10500/15036 [3:14:55<1:24:21,  1.12s/it]

	iters: 10500, epoch: 1 | loss: 0.4535101
	speed: 1.1129s/iter; left time: 155651.4732s


 70%|███████   | 10600/15036 [3:16:46<1:23:10,  1.13s/it]

	iters: 10600, epoch: 1 | loss: 0.4083638
	speed: 1.1147s/iter; left time: 155786.7771s


 71%|███████   | 10700/15036 [3:18:38<1:20:49,  1.12s/it]

	iters: 10700, epoch: 1 | loss: 0.2362381
	speed: 1.1170s/iter; left time: 155997.6430s


 72%|███████▏  | 10800/15036 [3:20:29<1:18:41,  1.11s/it]

	iters: 10800, epoch: 1 | loss: 0.4977838
	speed: 1.1130s/iter; left time: 155333.5478s


 72%|███████▏  | 10900/15036 [3:22:21<1:16:50,  1.11s/it]

	iters: 10900, epoch: 1 | loss: 0.5017090
	speed: 1.1129s/iter; left time: 155199.6777s


 73%|███████▎  | 11000/15036 [3:24:12<1:14:45,  1.11s/it]

	iters: 11000, epoch: 1 | loss: 0.3617222
	speed: 1.1154s/iter; left time: 155439.8008s


 74%|███████▍  | 11100/15036 [3:26:03<1:12:58,  1.11s/it]

	iters: 11100, epoch: 1 | loss: 0.2475175
	speed: 1.1121s/iter; left time: 154875.8413s


 74%|███████▍  | 11200/15036 [3:27:55<1:11:30,  1.12s/it]

	iters: 11200, epoch: 1 | loss: 0.2410468
	speed: 1.1145s/iter; left time: 155098.9485s


 75%|███████▌  | 11300/15036 [3:29:46<1:09:18,  1.11s/it]

	iters: 11300, epoch: 1 | loss: 0.5051583
	speed: 1.1132s/iter; left time: 154802.6783s


 76%|███████▌  | 11400/15036 [3:31:37<1:07:41,  1.12s/it]

	iters: 11400, epoch: 1 | loss: 0.1420971
	speed: 1.1130s/iter; left time: 154668.8800s


 76%|███████▋  | 11500/15036 [3:33:29<1:06:47,  1.13s/it]

	iters: 11500, epoch: 1 | loss: 0.8738269
	speed: 1.1162s/iter; left time: 154991.0198s


 77%|███████▋  | 11600/15036 [3:35:20<1:03:45,  1.11s/it]

	iters: 11600, epoch: 1 | loss: 0.2995484
	speed: 1.1126s/iter; left time: 154379.2382s


 78%|███████▊  | 11700/15036 [3:37:12<1:02:06,  1.12s/it]

	iters: 11700, epoch: 1 | loss: 0.2887154
	speed: 1.1156s/iter; left time: 154688.8270s


 78%|███████▊  | 11800/15036 [3:39:03<1:00:14,  1.12s/it]

	iters: 11800, epoch: 1 | loss: 0.4006651
	speed: 1.1115s/iter; left time: 154012.9054s


 79%|███████▉  | 11900/15036 [3:40:54<58:13,  1.11s/it]  

	iters: 11900, epoch: 1 | loss: 0.5487355
	speed: 1.1144s/iter; left time: 154300.8900s


 80%|███████▉  | 12000/15036 [3:42:46<57:12,  1.13s/it]

	iters: 12000, epoch: 1 | loss: 0.4862360
	speed: 1.1134s/iter; left time: 154051.8534s


 80%|████████  | 12100/15036 [3:44:37<54:48,  1.12s/it]

	iters: 12100, epoch: 1 | loss: 0.3040531
	speed: 1.1132s/iter; left time: 153911.7934s


 81%|████████  | 12200/15036 [3:46:28<52:58,  1.12s/it]

	iters: 12200, epoch: 1 | loss: 0.1508090
	speed: 1.1126s/iter; left time: 153717.7230s


 82%|████████▏ | 12300/15036 [3:48:20<50:55,  1.12s/it]

	iters: 12300, epoch: 1 | loss: 0.3646452
	speed: 1.1113s/iter; left time: 153420.9136s


 82%|████████▏ | 12400/15036 [3:50:11<49:08,  1.12s/it]

	iters: 12400, epoch: 1 | loss: 0.3966269
	speed: 1.1137s/iter; left time: 153647.2537s


 83%|████████▎ | 12500/15036 [3:52:02<47:03,  1.11s/it]

	iters: 12500, epoch: 1 | loss: 0.2710784
	speed: 1.1129s/iter; left time: 153419.6109s


 84%|████████▍ | 12600/15036 [3:53:53<45:39,  1.12s/it]

	iters: 12600, epoch: 1 | loss: 0.2310811
	speed: 1.1125s/iter; left time: 153256.4442s


 84%|████████▍ | 12700/15036 [3:55:45<43:18,  1.11s/it]

	iters: 12700, epoch: 1 | loss: 0.4007176
	speed: 1.1110s/iter; left time: 152935.7982s


 85%|████████▌ | 12800/15036 [3:57:36<41:38,  1.12s/it]

	iters: 12800, epoch: 1 | loss: 0.2717411
	speed: 1.1125s/iter; left time: 153042.4806s


 86%|████████▌ | 12900/15036 [3:59:27<39:44,  1.12s/it]

	iters: 12900, epoch: 1 | loss: 0.2357592
	speed: 1.1106s/iter; left time: 152668.0092s


 86%|████████▋ | 13000/15036 [4:01:18<37:44,  1.11s/it]

	iters: 13000, epoch: 1 | loss: 0.3230478
	speed: 1.1143s/iter; left time: 153055.3322s


 87%|████████▋ | 13100/15036 [4:03:10<35:54,  1.11s/it]

	iters: 13100, epoch: 1 | loss: 0.3218281
	speed: 1.1134s/iter; left time: 152824.9367s


 88%|████████▊ | 13200/15036 [4:05:01<34:15,  1.12s/it]

	iters: 13200, epoch: 1 | loss: 0.4455663
	speed: 1.1133s/iter; left time: 152703.9619s


 88%|████████▊ | 13300/15036 [4:06:52<32:12,  1.11s/it]

	iters: 13300, epoch: 1 | loss: 0.3098881
	speed: 1.1118s/iter; left time: 152385.1166s


 89%|████████▉ | 13400/15036 [4:08:43<30:19,  1.11s/it]

	iters: 13400, epoch: 1 | loss: 0.4407156
	speed: 1.1125s/iter; left time: 152365.3125s


 90%|████████▉ | 13500/15036 [4:10:35<28:31,  1.11s/it]

	iters: 13500, epoch: 1 | loss: 0.2413566
	speed: 1.1125s/iter; left time: 152256.5880s


 90%|█████████ | 13600/15036 [4:12:26<26:47,  1.12s/it]

	iters: 13600, epoch: 1 | loss: 0.2562832
	speed: 1.1135s/iter; left time: 152281.7196s


 91%|█████████ | 13700/15036 [4:14:17<24:47,  1.11s/it]

	iters: 13700, epoch: 1 | loss: 0.2525972
	speed: 1.1127s/iter; left time: 152060.1513s


 92%|█████████▏| 13800/15036 [4:16:09<22:59,  1.12s/it]

	iters: 13800, epoch: 1 | loss: 0.3085753
	speed: 1.1129s/iter; left time: 151975.7880s


 92%|█████████▏| 13900/15036 [4:18:00<21:06,  1.12s/it]

	iters: 13900, epoch: 1 | loss: 0.2734809
	speed: 1.1148s/iter; left time: 152126.7762s


 93%|█████████▎| 14000/15036 [4:19:51<19:14,  1.11s/it]

	iters: 14000, epoch: 1 | loss: 0.1986648
	speed: 1.1138s/iter; left time: 151875.4761s


 94%|█████████▍| 14100/15036 [4:21:43<17:23,  1.12s/it]

	iters: 14100, epoch: 1 | loss: 0.2712043
	speed: 1.1130s/iter; left time: 151655.7829s


 94%|█████████▍| 14200/15036 [4:23:34<15:52,  1.14s/it]

	iters: 14200, epoch: 1 | loss: 0.2487006
	speed: 1.1146s/iter; left time: 151761.2184s


 95%|█████████▌| 14300/15036 [4:25:25<13:38,  1.11s/it]

	iters: 14300, epoch: 1 | loss: 0.6236912
	speed: 1.1135s/iter; left time: 151501.1238s


 96%|█████████▌| 14400/15036 [4:27:17<11:51,  1.12s/it]

	iters: 14400, epoch: 1 | loss: 0.2466626
	speed: 1.1154s/iter; left time: 151644.3180s


 96%|█████████▋| 14500/15036 [4:29:08<09:58,  1.12s/it]

	iters: 14500, epoch: 1 | loss: 0.2723696
	speed: 1.1122s/iter; left time: 151101.7512s


 97%|█████████▋| 14600/15036 [4:31:00<08:05,  1.11s/it]

	iters: 14600, epoch: 1 | loss: 0.4484999
	speed: 1.1134s/iter; left time: 151152.9634s


 98%|█████████▊| 14700/15036 [4:32:51<06:14,  1.12s/it]

	iters: 14700, epoch: 1 | loss: 0.2301174
	speed: 1.1133s/iter; left time: 151027.6852s


 98%|█████████▊| 14800/15036 [4:34:42<04:27,  1.13s/it]

	iters: 14800, epoch: 1 | loss: 0.2866963
	speed: 1.1140s/iter; left time: 151018.1112s


 99%|█████████▉| 14900/15036 [4:36:34<02:31,  1.11s/it]

	iters: 14900, epoch: 1 | loss: 0.2902043
	speed: 1.1132s/iter; left time: 150793.5955s


100%|█████████▉| 15000/15036 [4:38:25<00:40,  1.11s/it]

	iters: 15000, epoch: 1 | loss: 0.2713231
	speed: 1.1135s/iter; left time: 150720.7582s


100%|██████████| 15036/15036 [4:39:05<00:00,  1.11s/it]


NameError: name 'mae_metric' is not defined

In [7]:
torch.save(model.state_dict(), '/root/FinAI/Time-LLM/checkpoint.pth')

In [8]:
args.use_amp = True
mae_metric = nn.L1Loss()
train_loss = np.average(train_loss)
print(train_loss)
start_time = time.time()
vali_loss, vali_mae_loss = vali(args, accelerator, model, vali_data, vali_loader, criterion, mae_metric)
time_cost_map["Validation"].append(f"{time.time() - start_time:.4f}s")

start_time = time.time()
test_loss, test_mae_loss = vali(args, accelerator, model, test_data, test_loader, criterion, mae_metric)
time_cost_map["Test"].append(f"{time.time() - start_time:.4f}s")

print(f"Epoch: {epoch + 1} | Train Loss: {train_loss:.7f} Vali Loss: {vali_loss:.7f} Test Loss: {test_loss:.7f} MAE Loss: {test_mae_loss:.7f}")

0.36592398014689625


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch: 1 | Train Loss: 0.3659240 Vali Loss: 0.4365394 Test Loss: 0.3471910 MAE Loss: 0.3816030





In [1]:
print(len(train_loader))

NameError: name 'train_loader' is not defined