In [1]:
import h5py
import numpy as np
from transformers import AutoTokenizer
import wandb
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


# Baseline


## (a) Preprocess and tokenize the dataset

In [2]:
from preprocessor import load_and_preprocess

### Use the preprocessor to preprocess the dataset, then tokenize the resulting text sequences

In [3]:
import os

# Configure tokenizers parallelism explicitly BEFORE the tokenizer is first used.
# Without this, every later process fork (the notebook output shows it right after
# wandb.init and at the start of training) prints the
# "The current process just got forked, after parallelism has already been used" warning.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

file_path = "lotka_volterra_data.h5"

# Load the dataset and turn it into train / validation / test text sequences.
# NOTE(review): the exact meaning of decimal_places / max_target_value is defined
# in preprocessor.load_and_preprocess — confirm there.
train_texts, val_texts, test_texts = load_and_preprocess(
    file_path,
    decimal_places=2,
    max_target_value=9.99
)

# Tokenize with the Qwen2.5 tokenizer.
# NOTE(review): trust_remote_code=True allows code from the model repository to run
# locally — confirm it is actually required for this checkpoint.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


def tokenize_texts(texts, tokenizer):
    """
    Tokenize each text independently and return a list of 1D token-ID tensors.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer used for encoding; no special tokens are added.

    Returns:
        List[torch.Tensor]: One 1D tensor of token IDs per input text, in input order.
    """
    return [
        # return_tensors="pt" yields shape (1, seq_len); [0] drops the batch axis.
        tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0]
        for text in texts
    ]


tokenized_train = tokenize_texts(train_texts, tokenizer)
tokenized_val = tokenize_texts(val_texts, tokenizer)
tokenized_test = tokenize_texts(test_texts, tokenizer)





### Show examples of preprocessed data and tokenized data

In [4]:
# Show one training example in both forms. The same index is used on every line
# so that the reported lengths describe the sequence that is actually printed.
example_idx = 1

print(f"Example Preprocessed Sequences: Train Sequence {example_idx}:", train_texts[example_idx])
# Length in characters of the preprocessed text
print("shape of train text:", len(train_texts[example_idx]))
print(f"Example Tokenized Sequence: Train Sequence {example_idx}:", tokenized_train[example_idx].tolist())
# Length in tokens of the tokenized sequence
print("shape of tokenized train text", len(tokenized_train[example_idx]))

Example Preprocessed Sequences: Train Sequence 1: 0.92,0.74;0.56,0.77;0.34,0.70;0.23,0.59;0.17,0.48;0.14,0.37;0.13,0.29;0.12,0.23;0.13,0.17;0.14,0.14;0.16,0.11;0.18,0.08;0.21,0.07;0.26,0.06;0.31,0.05;0.37,0.04;0.45,0.04;0.55,0.03;0.67,0.03;0.82,0.04;0.99,0.04;1.19,0.05;1.40,0.07;1.62,0.10;1.80,0.15;1.88,0.25;1.77,0.41;1.41,0.62;0.93,0.79;0.54,0.82;0.32,0.74;0.20,0.61;0.15,0.49;0.12,0.38;0.11,0.29;0.11,0.22;0.11,0.17;0.12,0.13;0.14,0.10;0.16,0.08;0.19,0.06;0.23,0.05;0.27,0.04;0.33,0.04;0.41,0.03;0.50,0.03;0.61,0.03;0.75,0.03;0.91,0.03;1.10,0.04;1.32,0.05;1.56,0.07;1.79,0.11;1.96,0.18;1.98,0.31;1.75,0.51;1.26,0.75;0.74,0.88;0.40,0.84;0.23,0.72;0.15,0.58;0.12,0.45;0.10,0.34;0.09,0.26;0.09,0.20;0.10,0.15;0.11,0.12;0.13,0.09;0.15,0.07;0.18,0.06;0.21,0.04;0.26,0.04;0.32,0.03;0.39,0.03;0.48,0.02;0.59,0.02;0.73,0.02;0.89,0.03;1.09,0.03;1.32,0.04;1.57,0.06;1.82,0.09;2.04,0.15;2.13,0.26;1.96,0.46;1.46,0.73;0.86,0.92;0.44,0.92;0.24,0.79;0.15,0.63;0.11,0.49;0.09,0.37;0.08,0.28;0.08,0.21;0.08,0.16;

## (b) Evaluate the untrained Qwen2.5-Instruct model’s forecasting ability on this tokenized dataset.

In [5]:
from qwen import load_qwen

In [6]:
import torch
# Pick the best available accelerator: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# NOTE: this rebinds `tokenizer`, replacing the AutoTokenizer instance created earlier.
original_model, tokenizer = load_qwen()
# Last expression of the cell: moves the model and displays its architecture.
original_model.to(device)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [7]:
def split_context_target(token_ids: torch.Tensor, context_ratio: float = 0.8):
    """
    Cut a 1D sequence of token IDs into a leading context and a trailing target.

    The cut position is the sequence length multiplied by ``context_ratio``,
    truncated towards zero.

    Args:
        token_ids (torch.Tensor): A 1D tensor of token IDs.
        context_ratio (float): Fraction of tokens to use as context.

    Returns:
        (context_ids, target_ids) (torch.Tensor, torch.Tensor): the tokens before
        the cut position and the tokens from the cut position onwards.
    """
    cut = int(len(token_ids) * context_ratio)
    return token_ids[:cut], token_ids[cut:]

def decode_tokens_to_numbers(text: str):
    """
    Parse an LLMTIME-formatted string into a flat list of floats.

    Example format: "0.25,1.50;0.27,1.47;0.31,1.42"

    Semicolons separate timesteps and commas separate the variables inside a
    timestep. Since the result is flat, both separators are treated alike:
    every field between separators is stripped and converted with ``float``.
    Fields that cannot be converted (for example the empty field after a
    trailing semicolon) are dropped silently.

    Args:
        text (str): The decoded text from the model's output.

    Returns:
        List[float]: A flat list of numeric values (prey, predator, prey, predator, ...).
    """
    values = []
    # Unify the two separators so one split yields the fields in their original order.
    for field in text.replace(";", ",").split(","):
        try:
            values.append(float(field.strip()))
        except ValueError:
            # Not a number (e.g. empty field) — ignore it.
            pass
    return values

###############################################################################
# Main Evaluation Function
###############################################################################

def _loss_curve_image(token_losses, seq_index):
    """Render the per-step loss curve of one sequence and return it as a wandb image."""
    fig, ax = plt.subplots()
    try:
        ax.plot(token_losses, label="Token Loss")
        ax.set_title(f"Loss Curve for Sequence {seq_index}")
        ax.set_xlabel("Prediction Step")
        ax.set_ylabel("Cross-Entropy Loss")
        ax.legend()
        return wandb.Image(fig)
    finally:
        # Always release the figure, even if plotting or conversion fails.
        plt.close(fig)


def _forecast_image(pred_numbers, true_numbers, seq_index):
    """Render predicted vs. true (prey, predator) series and return it as a wandb image."""
    # Values are interleaved prey, predator, prey, predator, ...; reshape raises
    # ValueError when a series holds an odd number of values.
    pred_array = np.array(pred_numbers).reshape(-1, 2)
    true_array = np.array(true_numbers).reshape(-1, 2)
    fig, ax = plt.subplots()
    try:
        ax.plot(true_array[:, 0], label="True Prey")
        ax.plot(true_array[:, 1], label="True Predator", linestyle="--")
        ax.plot(pred_array[:, 0], label="Predicted Prey")
        ax.plot(pred_array[:, 1], label="Predicted Predator", linestyle="--")
        ax.set_title(f"Forecast Comparison for Sequence {seq_index}")
        ax.legend()
        return wandb.Image(fig)
    finally:
        plt.close(fig)


def evaluation(model, tokenizer, tokenized_data, context_ratio: float = 0.7,
               num_eval: int = 10, max_target_tokens: int = 100):
    """
    Evaluates the model in a fully autoregressive manner using model.generate.

    For each of the first ``num_eval`` sequences:
      1. Split the sequence into context and target using split_context_target and
         keep at most ``max_target_tokens`` target tokens.
      2. Greedily generate as many tokens as the target holds, requesting the
         per-step scores from model.generate.
      3. Compute the cross-entropy of each step's scores against the ground-truth
         token at that position and average it over the generated steps. The model
         is conditioned on its OWN previous predictions (not teacher-forced), so
         this is not the usual next-token validation loss.
      4. Decode generated and ground-truth tokens into numbers and compute the MSE.
         The MSE is ``inf`` when the two number lists differ in length or are empty,
         which makes the overall average ``inf`` as well.
      5. Log the loss curve, the forecast plot and both scalars to wandb.

    Requires an active wandb run. Puts the model into eval mode.

    Args:
        model: The causal language model to evaluate.
        tokenizer: The corresponding tokenizer (used for decoding).
        tokenized_data (List[torch.Tensor]): List of 1D token ID tensors.
        context_ratio (float): Fraction of tokens used as context.
        num_eval (int): Maximum number of sequences to evaluate, taken from the
            start of ``tokenized_data``; clamped to the data length.
        max_target_tokens (int): Maximum number of target tokens to forecast per sequence.

    Returns:
        Tuple[float, float]: The average cross-entropy loss and MSE over evaluated
        sequences (``inf`` for both if no sequence could be evaluated).
    """
    model.eval()
    # Use the device the model actually lives on instead of a notebook-level global.
    model_device = next(model.parameters()).device

    num_eval = min(num_eval, len(tokenized_data))
    all_seq_losses = []
    all_seq_mses = []

    for i in range(num_eval):
        # Retrieve sequence and split into context and target
        seq = tokenized_data[i].to(model_device)
        context_ids, target_ids = split_context_target(seq, context_ratio)
        target_ids = target_ids[:max_target_tokens]
        if len(context_ids) == 0 or len(target_ids) == 0:
            # Nothing to condition on or nothing to forecast.
            print(f"Sequence {i}: skipped (empty context or target)")
            continue
        input_ids = context_ids.unsqueeze(0)  # Shape: (1, context_length)

        with torch.no_grad():
            gen_output = model.generate(
                input_ids,
                # Single unpadded sequence: every position is a real token. Passing
                # the mask explicitly avoids the "attention mask is not set" warning.
                attention_mask=torch.ones_like(input_ids),
                max_new_tokens=len(target_ids),
                do_sample=False,  # Greedy decoding
                output_scores=True,
                return_dict_in_generate=True,
            )

        # gen_output.sequences contains context + generated tokens.
        generated_ids = gen_output.sequences[0]
        # gen_output.scores holds one (batch_size, vocab_size) tensor per generated step.
        # Generation may stop early, so only the produced steps are scored.
        step_scores = torch.cat(gen_output.scores, dim=0)  # (num_steps, vocab_size)
        token_losses = torch.nn.functional.cross_entropy(
            step_scores, target_ids[: step_scores.shape[0]], reduction="none"
        ).tolist()
        avg_loss_seq = np.mean(token_losses)
        all_seq_losses.append(avg_loss_seq)

        # Forecast evaluation: compare generated tokens (excluding context) to the target
        generated_target_ids = generated_ids[len(context_ids):]
        pred_text = tokenizer.decode(generated_target_ids, skip_special_tokens=True)
        true_text = tokenizer.decode(target_ids, skip_special_tokens=True)
        pred_numbers = decode_tokens_to_numbers(pred_text)
        true_numbers = decode_tokens_to_numbers(true_text)
        mse = float("inf")
        if len(pred_numbers) == len(true_numbers) and len(pred_numbers) > 0:
            mse = np.mean((np.array(pred_numbers) - np.array(true_numbers)) ** 2)
        all_seq_mses.append(mse)

        # Everything for this sequence goes into ONE wandb.log call without an
        # explicit `step`: wandb advances its step counter on every log call and
        # discards values logged with a step lower than the current one.
        log_payload = {
            "sequence_index": i,
            "loss_per_sequence": avg_loss_seq,
            "mse_per_sequence": mse,
            f"loss_curve_seq_{i}": _loss_curve_image(token_losses, i),
        }
        try:
            log_payload[f"forecast_seq_{i}"] = _forecast_image(pred_numbers, true_numbers, i)
        except Exception as e:
            # Best effort: a malformed forecast must not abort the evaluation.
            print(f"Sequence {i}: Error in plotting forecast: {e}")
        wandb.log(log_payload)

    avg_loss_overall = np.mean(all_seq_losses) if all_seq_losses else float("inf")
    avg_mse_overall = np.mean(all_seq_mses) if all_seq_mses else float("inf")
    wandb.log({
        "avg_loss": avg_loss_overall,
        "avg_mse": avg_mse_overall,
    })
    return avg_loss_overall, avg_mse_overall



In [8]:

# Start the wandb run that receives the baseline evaluation logs.
wandb.init(
    project="Qwen_baseline",
    name="untrained-noLora_evaluation",
    reinit=True,
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Current

In [None]:
# Baseline: score the unmodified model on the validation sequences, using the
# first 70% of each sequence's tokens as context.
avg_loss, avg_mse = evaluation(
    model=original_model,
    tokenizer=tokenizer,
    tokenized_data=tokenized_val,
    context_ratio=0.7,
)

# Report the aggregate metrics
baseline_report = (
    "=== Baseline Evaluation on Untrained Qwen2.5-Instruct ===",
    f"Average Cross-Entropy Loss: {avg_loss:.4f}",
    f"Average MSE (Forecast):  {avg_mse:.4f}",
)
for report_line in baseline_report:
    print(report_line)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.




=== Baseline Evaluation on Untrained Qwen2.5-Instruct ===
Average Cross-Entropy Loss: 1.6455
Average MSE (Forecast):  0.2022


## (c) Map each operation to its FLOPs

In [29]:
from flops import flops_for_experiment
import numpy as np

# Number of steps the estimate covers.
# NOTE(review): what one "step" costs (forward only vs. forward + backward) is
# decided by flops_for_experiment via the `training` flag — confirm there.
num_steps = 1000

# Model configuration; hidden_dim and num_layers match the architecture printed above
batch_size = 4    # number of samples per batch
seq_len = 512     # input sequence length
hidden_dim = 896   # model hidden dimension
num_layers = 24    # number of Transformer blocks
num_heads = 14       # number of attention heads
r=4                 # Lora rank
# NOTE(review): the printed architecture has an MLP width of 4864 for hidden size
# 896 (ratio ~5.43), not 4.0 — confirm which value the estimate should use.
ffn_ratio = 4.0     # typical ratio for feed-forward layer size
training = True     # selects the training cost model in flops_for_experiment

# Compute total FLOPS for the experiment
total_flops = flops_for_experiment(
    num_steps=num_steps,
    batch_size=batch_size,
    seq_len=seq_len,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    ffn_ratio=ffn_ratio,
    r=r,
    training=training
)
# Original variable name, kept so any later cell that reads it still works.
total_inference_flops = total_flops

# Derive the labels from the flag so the printout always matches what was computed.
mode = "Training" if training else "Inference"

print(f"=== {mode} FLOPS Calculation ===")
print(f"Number of {mode.lower()} steps: {num_steps}")
print(f"Batch size: {batch_size}")
print(f"Sequence length: {seq_len}")
print(f"Hidden dimension: {hidden_dim}")
print(f"Transformer blocks: {num_layers}")
print(f"Attention heads: {num_heads}")
print(f"FFN ratio: {ffn_ratio}")
print("------------------------------------")
# The value shown is the base-10 logarithm of the total, not the raw FLOPS count.
print(f"log10(total FLOPS for {mode.lower()}): {np.log10(total_flops)}")




=== Inference FLOPS Calculation ===
Number of inference steps: 1000
Batch size: 4
Sequence length: 512
Hidden dimension: 896
Transformer blocks: 24
Attention heads: 14
FFN ratio: 4.0
------------------------------------
Total FLOPS for inference: 15.68259714362721


In [11]:
# Ratio of 10**17 to a weighted sum of three terms given as powers of ten.
# NOTE(review): presumably 10**17 is a total FLOPS budget and the denominator sums
# the cost of planned runs (exponents 12.68, 12.06, 12.87 with multiplicities 9, 1, 4);
# the origin of these numbers is not shown in this notebook — confirm and name them.
10**17 / (10**12.68 *9 + 10**12.06 +10**12.87 *4)

1353.5962801256442

# LORA

## Section 3(a): Adapt the LoRA implementation and train the 0.5B-parameter Qwen model with default hyperparameters.

In [12]:
from lora_skeleton import apply_lora, load_data, train_lora

### load data

In [13]:
# Build the train / validation / test token-ID sets with the helper from lora_skeleton.
# NOTE(review): evaluate_lora_mse documents val_ids as a tensor of shape
# [num_sequences, seq_len]; the exact format is defined by load_data — confirm.
train_ids, val_ids, test_ids = load_data(tokenizer)

### load model

In [14]:
# Load another model instance (the returned tokenizer is discarded) and wrap it with rank-4 LoRA.
# NOTE(review): unlike the untrained LoRA model later on, this one is not moved to
# `device` here — presumably train_lora handles device placement; confirm.
lr1_rank4_model,_ = load_qwen()
# Last expression of the cell: its return value (the model) is displayed.
apply_lora(lr1_rank4_model, r=4)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): LoRALinear(
            (original_linear): Linear(in_features=896, out_features=896, bias=True)
          )
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): LoRALinear(
            (original_linear): Linear(in_features=896, out_features=128, bias=True)
          )
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm(

In [15]:
# Start a new wandb run for LoRA training with default hyperparameters.
wandb.init(
    project="Qwen_3a_train_model",
    name="default_value_train_model",
    reinit=True,
)

0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.64547
avg_mse,0.20217


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Train model


In [17]:
# Fine-tune the LoRA-wrapped model on the training set for at most 700 steps.
# NOTE(review): the return value is presumably the recorded training losses — confirm in lora_skeleton.
losses_default_train=train_lora(lr1_rank4_model, train_ids, max_steps=700)

Training:   0%|          | 0/700 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training:   7%|▋         | 50/700 [01:06<12:48,  1.18s/it, loss=0.551]

Step 50: loss = 0.5513


Training:  14%|█▍        | 100/700 [02:05<11:30,  1.15s/it, loss=0.676]

Step 100: loss = 0.6756


Training:  21%|██▏       | 150/700 [03:03<10:40,  1.16s/it, loss=0.647]

Step 150: loss = 0.6474


Training:  29%|██▊       | 200/700 [04:05<10:20,  1.24s/it, loss=0.75] 

Step 200: loss = 0.7501


Training:  36%|███▌      | 250/700 [05:07<09:00,  1.20s/it, loss=0.703]

Step 250: loss = 0.7033


Training:  43%|████▎     | 300/700 [06:07<08:12,  1.23s/it, loss=0.56] 

Step 300: loss = 0.5595


Training:  50%|█████     | 350/700 [07:07<07:04,  1.21s/it, loss=0.512]

Step 350: loss = 0.5115


Training:  57%|█████▋    | 400/700 [08:06<06:04,  1.22s/it, loss=0.583]

Step 400: loss = 0.5833


Training:  64%|██████▍   | 450/700 [09:06<05:12,  1.25s/it, loss=0.576]

Step 450: loss = 0.5758


Training:  71%|███████▏  | 500/700 [10:05<04:17,  1.29s/it, loss=0.527]

Step 500: loss = 0.5275


Training:  79%|███████▊  | 550/700 [11:04<02:57,  1.18s/it, loss=0.558]

Step 550: loss = 0.5578


Training:  86%|████████▌ | 600/700 [12:06<02:01,  1.21s/it, loss=0.492]

Step 600: loss = 0.4921


Training:  93%|█████████▎| 650/700 [13:08<01:14,  1.48s/it, loss=0.593]

Step 650: loss = 0.5931


Training: 100%|██████████| 700/700 [14:09<00:00,  1.21s/it, loss=0.492]


Step 700: loss = 0.4924


### Evaluate the model on the validation dataset

In [18]:
# Start a new wandb run for evaluating the LoRA model trained above.
wandb.init(
    project="Qwen_3a_train_model",
    name="default_value_evaluation_model",
    reinit=True,
)

0,1
loss,▄██▅▃▅▄▃▃▄▅▃▂▃▄▁▂▄▂▂▂▃▃▂▃▂▂▁▁▃▂▂▄▃▂▃▂▂▃▂
step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇██

0,1
loss,0.49242
step,699.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# val_loss_1_4, val_mse_1_4 = evaluate_lora_mse(lr1_rank4_model, tokenizer,val_ids, context_ratio=0.5,max_gen_tokens=20)

# print(f"Validation Loss: {val_loss_1_4:.4f}")
# print(f"Validation MSE (forecast): {val_mse_1_4:.4f}")

In [None]:
# Score the trained LoRA model on the validation sequences, using the first half
# of each sequence's tokens as context.
val_loss_1_4, val_mse_1_4 = evaluation(
    model=lr1_rank4_model,
    tokenizer=tokenizer,
    tokenized_data=tokenized_val,
    context_ratio=0.5,
)






Validation Loss: 1.4635 Validation MSE (forecast): 0.0648


In [27]:
# Report both validation metrics of the trained LoRA model on a single line.
trained_summary = (
    f"Validation Loss: {val_loss_1_4:.4f} "
    f"Validation MSE (forecast): {val_mse_1_4:.4f}"
)
print(trained_summary)

Validation Loss: 1.4635 Validation MSE (forecast): 0.0648


### Reload the untrained Qwen model and apply LoRA to it

In [21]:
# Load another model instance (the returned tokenizer is discarded), wrap it with
# rank-4 LoRA and leave it untrained, to serve as the reference for the trained LoRA model.
untrained_model, _ = load_qwen()
apply_lora(untrained_model, r=4)
# Last expression of the cell: moves the model to `device` and displays its architecture.
untrained_model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): LoRALinear(
            (original_linear): Linear(in_features=896, out_features=896, bias=True)
          )
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): LoRALinear(
            (original_linear): Linear(in_features=896, out_features=128, bias=True)
          )
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm(

In [None]:
# Start a new wandb run for evaluating the LoRA-wrapped but untrained model.
wandb.init(
    project="Qwen_3a_train_model",
    name="LoRA_untrained_evaluation",
    reinit=True,
)

0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.46346
avg_mse,0.06484


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:

# Score the untrained LoRA model with the same context ratio as the trained one,
# so the two sets of metrics can be compared directly.
val_lora_untrained_loss, val_lora_untrained_mse = evaluation(
    model=untrained_model,
    tokenizer=tokenizer,
    tokenized_data=tokenized_val,
    context_ratio=0.5,
)
for metric_line in (
    f"Validation Loss: {val_lora_untrained_loss:.4f}",
    f"Validation MSE (forecast): {val_lora_untrained_mse:.4f}",
):
    print(metric_line)





Validation Loss: 1.5688
Validation MSE (forecast): 0.4736


In [24]:
# val_loss_untrained, val_mse_untrained = evaluate_lora_mse(untrained_model, tokenizer,val_ids, context_ratio=0.5,max_gen_tokens=20)

# print(f"Validation Loss: {val_loss_untrained:.4f}")
# print(f"Validation MSE (forecast): {val_mse_untrained:.4f}")

## 3(b): Tune hyperparameters and use validation metrics to select the best model.

### Implement a new method that evaluates the validation dataset using a DataLoader. Previously we predicted many tokens for each of only 10 sequences, which let us compare the performance of untrained/trained and LoRA/non-LoRA models. In this step we do not compare those models explicitly. Instead, we compute metrics over the whole validation dataset, but predict only 20–30 tokens for each sequence.

In [25]:
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm 
from accelerate import Accelerator   
def evaluate_lora_mse(model, tokenizer, val_ids, context_ratio=0.5, batch_size=4, max_gen_tokens=30):
    """
    Evaluate a model on the whole validation set with cross-entropy loss and forecast MSE.

    For every sequence two things are measured:
      1. The language-model loss returned by the model for the full sequence
         (``labels`` set to the input itself).
         NOTE(review): if val_ids is padded, pad positions are part of this loss —
         confirm against load_data.
      2. The MSE between the numbers decoded from a greedy continuation of the
         context and the numbers decoded from the true target tokens. A sequence
         only contributes to the MSE when both decoded lists are non-empty and of
         equal length; the number of contributing sequences is printed.

    Puts the model into eval mode and passes it through ``Accelerator.prepare``.

    Args:
        model: The Qwen2.5 model to evaluate (with or without LoRA applied).
        tokenizer: The tokenizer used for decoding.
        val_ids: A tensor of shape [num_sequences, seq_len].
        context_ratio: Fraction of each sequence's tokens used as context (e.g. 0.7).
        batch_size: Batch size of the DataLoader; sequences inside a batch are
            still processed one at a time.
        max_gen_tokens: Maximum number of target tokens to generate and compare
            per sequence.

    Returns:
        Tuple[float, float]: (avg_ce_loss, avg_mse) — the average cross-entropy loss
        over all sequences and the average MSE over the sequences that could be
        scored; each is ``inf`` when there was nothing to average.
    """
    model.eval()
    val_dataset = TensorDataset(val_ids)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    accelerator = Accelerator()
    model, val_loader = accelerator.prepare(model, val_loader)

    mses = []
    losses = []
    num_sequences = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            # TensorDataset yields 1-tuples; batch[0] is (batch_size, seq_len)
            input_seqs = batch[0]

            for seq in input_seqs:
                num_sequences += 1
                context_ids, target_ids = split_context_target(seq, context_ratio)
                target_ids = target_ids[:max_gen_tokens]

                # 1. Cross-Entropy Loss over context+target
                full_seq = seq.unsqueeze(0)
                output = model(full_seq, labels=full_seq)
                losses.append(output.loss.item())

                # 2. Generation & MSE calculation
                if len(context_ids) == 0 or len(target_ids) == 0:
                    # Nothing to condition on or nothing to forecast.
                    continue

                input_ids = context_ids.unsqueeze(0)
                generated = model.generate(
                    input_ids,
                    # All-ones mask is what generate falls back to when it cannot
                    # infer one; passing it explicitly avoids the warning.
                    attention_mask=torch.ones_like(input_ids),
                    max_new_tokens=len(target_ids),
                    do_sample=False
                )

                # generate returns context + continuation; keep the continuation only
                generated_ids = generated[0][len(context_ids):]
                pred_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
                true_text = tokenizer.decode(target_ids, skip_special_tokens=True)

                pred_numbers = decode_tokens_to_numbers(pred_text)
                true_numbers = decode_tokens_to_numbers(true_text)

                if len(pred_numbers) == len(true_numbers) and len(pred_numbers) > 0:
                    mse = np.mean((np.array(pred_numbers) - np.array(true_numbers)) ** 2)
                    mses.append(mse)

    avg_ce_loss = np.mean(losses) if len(losses) > 0 else float("inf")
    avg_mse = np.mean(mses) if len(mses) > 0 else float("inf")

    print("\nEvaluation Results:")
    print(f"  Average Cross-Entropy Loss: {avg_ce_loss:.4f}")
    print(f"  Average MSE (forecast):     {avg_mse:.4f}")
    # Make the silently excluded forecasts visible to the reader.
    print(f"  Sequences scored for MSE:   {len(mses)}/{num_sequences}")

    return avg_ce_loss, avg_mse

### Train the models

In [26]:
# val_loss_base, val_mse_base = evaluate_lora_mse(
#    original_model, tokenizer, val_ids, context_ratio=0.5,batch_size=4,max_gen_tokens=20
# )


# print("=== Evaluation on Validation Set (Untrained Qwen2.5) ===")
# print(f"Cross-Entropy Loss:  {val_loss_base:.4f}")
# print(f"Forecast MSE:        {val_mse_base:.4f}")