In [27]:
import h5py
import numpy as np
from transformers import AutoTokenizer

# Baseline


## (a) Preprocess and tokenize the dataset

In [28]:
from preprocessor import load_and_preprocess

### Use the preprocessor to preprocess the dataset, then tokenize it

In [30]:
file_path = "data/lotka_volterra_data.h5"

# Load and preprocess the dataset; the call's result is unpacked into the
# train / validation / test text collections, in that order.
train_texts, val_texts, test_texts = load_and_preprocess(
    file_path, decimal_places=2, max_target_value=10.0
)

# Tokenizer used to demonstrate tokenization with Qwen2.5.
# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally — confirm it is actually required for this checkpoint.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


def _encode(text):
    """Return the first row of the "input_ids" tensor for one text (no special tokens added)."""
    return tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0]


# One token-ID tensor per text, for each of the three splits.
tokenized_train = [_encode(text) for text in train_texts]
tokenized_val = [_encode(text) for text in val_texts]
tokenized_test = [_encode(text) for text in test_texts]





### Show examples of preprocessed data and tokenized data

In [35]:
# Index of the training sequence to display. It drives both the printed label
# and the lookups, so the label can never disagree with the data shown.
example_idx = 1

print(f"Example Preprocessed Sequences: Train Sequence {example_idx}:", train_texts[example_idx])


print(f"Example Tokenized Sequence: Train Sequence {example_idx}:", tokenized_train[example_idx].tolist())

Example Preprocessed Sequences: Train Sequence 1: 2.91,0.28;1.79,0.29;1.09,0.27;0.72,0.23;0.54,0.18;0.44,0.14;0.4,0.11;0.4,0.09;0.41,0.07;0.45,0.05;0.5,0.04;0.58,0.03;0.68,0.03;0.81,0.02;0.98,0.02;1.18,0.02;1.44,0.01;1.76,0.01;2.14,0.01;2.6,0.01;3.15,0.02;3.77,0.02;4.45,0.03;5.14,0.04;5.72,0.06;5.97,0.1;5.61,0.16;4.47,0.24;2.95,0.3;1.72,0.31;1.01,0.28;0.65,0.23;0.47,0.19;0.39,0.14;0.35,0.11;0.34,0.09;0.36,0.07;0.39,0.05;0.44,0.04;0.51,0.03;0.6,0.02;0.72,0.02;0.87,0.02;1.05,0.01;1.29,0.01;1.58,0.01;1.94,0.01;2.37,0.01;2.89,0.01;3.5,0.01;4.2,0.02;4.95,0.03;5.68,0.04;6.22,0.07;6.3,0.12;5.55,0.2;4.0,0.28;2.35,0.33;1.28,0.32;0.74,0.27;0.49,0.22;0.37,0.17;0.32,0.13;0.29,0.1;0.3,0.08;0.32,0.06;0.35,0.04;0.4,0.03;0.47,0.03;0.56,0.02;0.68,0.02;0.82,0.01;1.01,0.01;1.24,0.01;1.53,0.01;1.88,0.01;2.32,0.01;2.84,0.01;3.46,0.01;4.18,0.01;4.97,0.02;5.79,0.03;6.48,0.06;6.77,0.1;6.22,0.18;4.64,0.28;2.72,0.35;1.4,0.35;0.76,0.3;0.47,0.24;0.34,0.19;0.27,0.14;0.25,0.11;0.25,0.08;0.26,0.06;0.29,0.05;0.33,0.0

## (b) Evaluate the untrained Qwen2.5-Instruct model's forecasting ability on this tokenized dataset.

In [36]:
from qwen import load_qwen
# The return value is not bound to a name here: as the cell's last expression it
# is only echoed as output (a tuple whose first element is the model). The model
# and tokenizer are loaded again, and kept, in a later cell.
load_qwen()

(Qwen2ForCausalLM(
   (model): Qwen2Model(
     (embed_tokens): Embedding(151936, 896)
     (layers): ModuleList(
       (0-23): 24 x Qwen2DecoderLayer(
         (self_attn): Qwen2Attention(
           (q_proj): Linear(in_features=896, out_features=896, bias=True)
           (k_proj): Linear(in_features=896, out_features=128, bias=True)
           (v_proj): Linear(in_features=896, out_features=128, bias=True)
           (o_proj): Linear(in_features=896, out_features=896, bias=False)
         )
         (mlp): Qwen2MLP(
           (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
           (up_proj): Linear(in_features=896, out_features=4864, bias=False)
           (down_proj): Linear(in_features=4864, out_features=896, bias=False)
           (act_fn): SiLU()
         )
         (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
         (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
       )
     )
     (norm): Qwen2RMSNorm((896,), eps=1e-06)
     (rotar

In [40]:
import torch
# Pick the best available backend: CUDA first, then Apple MPS, then CPU.
# (Previously only MPS/CPU were considered, so CUDA machines ran on CPU.)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# load_qwen() is unpacked as (model, tokenizer); note this rebinds the
# `tokenizer` name that an earlier cell also assigned.
model, tokenizer = load_qwen()

# Move the model to the selected device; as the last expression, the returned
# module is echoed as the cell output.
model.to(device)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [41]:

def split_context_target(token_ids: torch.Tensor, context_ratio: float = 0.7):
    """
    Cut a 1D sequence of token IDs into a leading context and a trailing target.

    The cut position is ``int(len(token_ids) * context_ratio)``, i.e. the
    product is truncated toward zero, so the context never receives a
    fractional extra token.

    Args:
        token_ids (torch.Tensor): A 1D tensor of token IDs.
        context_ratio (float): Fraction of tokens assigned to the context.

    Returns:
        (context_ids, target_ids) (torch.Tensor, torch.Tensor): the tokens
        before the cut and the tokens from the cut onwards.
    """
    cut = int(len(token_ids) * context_ratio)
    return token_ids[:cut], token_ids[cut:]

def decode_tokens_to_numbers(text: str):
    """
    Decodes a LLMTIME-formatted string into a list of numeric values.
    Example format: "0.25,1.50;0.27,1.47;0.31,1.42"

    The text is split on semicolons (timesteps) and then on commas
    (variables); every fragment that parses as a finite float is kept, in
    order of appearance. Fragments that do not parse (empty strings, stray
    text) are skipped, and so are non-finite values: ``float()`` accepts
    spellings such as "nan" and "inf", which would otherwise turn any error
    metric computed from the result into NaN/inf.

    Args:
        text (str): The decoded text from the model's output.

    Returns:
        List[float]: A flat list of numeric values (prey, predator, prey, predator, ...).
    """
    numbers = []
    for step in text.split(";"):
        for part in step.split(","):
            try:
                val = float(part.strip())
            except ValueError:
                # Not a number (e.g. empty fragment) — skip it.
                continue
            if not np.isfinite(val):
                # "nan" / "inf" / "-inf" parse successfully but are not usable values.
                continue
            numbers.append(val)
    return numbers

###############################################################################
# Main Evaluation Function
###############################################################################

def evaluate_untrained_forecasting(model, tokenizer, tokenized_data, context_ratio=0.7, num_eval=10):
    """
    Evaluate the untrained Qwen2.5-Instruct model's forecasting ability.

    For each evaluated sequence:
      1) The token sequence is split into context and target with
         ``split_context_target`` (the split is by token position).
      2) A continuation is generated from the context with sampling disabled,
         capped so that context + continuation is at most as long as the
         original sequence.
      3) The language-modelling cross-entropy loss is computed over the
         entire sequence (context + target).
      4) The generated continuation and the true target are decoded to text,
         parsed to numbers, and compared with mean squared error. A sequence
         contributes to the MSE only when both sides parse to the same,
         non-zero, number of values; otherwise it is excluded and counted in
         a warning emitted at the end.

    Args:
        model: The untrained Qwen2.5-Instruct model from qwen.py.
        tokenizer: The Qwen2.5-Instruct tokenizer.
        tokenized_data (List[torch.Tensor]): List of tokenized sequences (1D tensors).
        context_ratio (float): Fraction of tokens to use as context.
        num_eval (int | None): Maximum number of sequences to evaluate, taken
            from the start of ``tokenized_data``. ``None`` evaluates all of
            them. Defaults to 10.

    Returns:
        (avg_loss, avg_perplexity, avg_mse): the mean cross-entropy loss, the
        perplexity computed as ``exp(avg_loss)``, and the mean of the
        per-sequence forecast MSEs. ``avg_loss``/``avg_perplexity`` are
        ``inf`` when no sequence was evaluated; ``avg_mse`` is ``inf`` when no
        sequence produced a comparable forecast.
    """
    import warnings  # local import keeps the function self-contained within the notebook

    # Use the device the model actually lives on instead of a notebook-global
    # `device`, so the function does not depend on another cell's state.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    if num_eval is None:
        num_eval = len(tokenized_data)
    num_eval = max(0, min(num_eval, len(tokenized_data)))

    losses = []
    mses = []
    skipped = 0  # sequences excluded from the MSE

    for i in range(num_eval):
        seq = tokenized_data[i].to(model_device)
        context_ids, target_ids = split_context_target(seq, context_ratio)

        with torch.no_grad():
            # max_length counts the prompt too, so this allows at most
            # len(target_ids) newly generated tokens.
            generated = model.generate(
                context_ids.unsqueeze(0),  # add batch dimension
                max_length=len(context_ids) + len(target_ids),
                do_sample=False,
            )

            # Passing the sequence as its own labels yields the language
            # modelling loss over the whole sequence.
            full_seq = seq.unsqueeze(0)
            output = model(full_seq, labels=full_seq)
        losses.append(output.loss.item())

        # Keep only the tokens produced after the context.
        predicted_target_ids = generated[0][len(context_ids):]

        pred_text = tokenizer.decode(predicted_target_ids, skip_special_tokens=True)
        true_text = tokenizer.decode(target_ids, skip_special_tokens=True)

        pred_numbers = decode_tokens_to_numbers(pred_text)
        true_numbers = decode_tokens_to_numbers(true_text)

        # Only forecasts with the same (non-zero) count of values are comparable.
        if len(pred_numbers) == len(true_numbers) and len(pred_numbers) > 0:
            mse = np.mean((np.array(pred_numbers) - np.array(true_numbers)) ** 2)
            mses.append(mse)
        else:
            skipped += 1

    if skipped:
        warnings.warn(
            f"{skipped} of {num_eval} sequences were excluded from the MSE because the "
            "predicted and true targets did not parse to the same non-zero number of values.",
            RuntimeWarning,
            stacklevel=2,
        )

    avg_loss = np.mean(losses) if len(losses) > 0 else float("inf")
    avg_perplexity = float(np.exp(avg_loss)) if avg_loss != float("inf") else float("inf")
    avg_mse = np.mean(mses) if len(mses) > 0 else float("inf")

    return avg_loss, avg_perplexity, avg_mse



In [42]:
# Switch the model to evaluation mode before running the baseline evaluation.
# As the cell's last expression, the returned module is echoed as output.
model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [None]:

# Run the baseline evaluation on the tokenized test sequences.
avg_loss, avg_perplexity, avg_mse = evaluate_untrained_forecasting(
    model,
    tokenizer,
    tokenized_test,
    context_ratio=0.7,
)

# Report the three aggregate metrics, one per line.
print(
    "\n".join(
        [
            "=== Baseline Evaluation on Untrained Qwen2.5-Instruct ===",
            f"Average Cross-Entropy Loss: {avg_loss:.4f}",
            f"Average Perplexity:       {avg_perplexity:.4f}",
            f"Average MSE (Forecast):  {avg_mse:.4f}",
        ]
    )
)



=== Baseline Evaluation on Untrained Qwen2.5-Instruct ===
Average Cross-Entropy Loss: 0.6168
Average Perplexity:       1.8530
Average MSE (Forecast):  1.0159


## (c) Map each operation to its FLOPs

In [45]:
from flops import flops_for_experiment

# Number of inference steps (forward passes) to account for — a single one here.
num_steps = 1

# Hypothetical model configuration used to illustrate the FLOPS calculation.
# NOTE: these values are illustrative; they differ from the Qwen2.5 model
# printed earlier in this notebook (embedding width 896, 24 decoder layers).
batch_size = 2      # number of samples per batch
seq_len = 128       # input sequence length
hidden_dim = 512    # model hidden dimension
num_layers = 12     # number of Transformer blocks
num_heads = 8       # number of attention heads
ffn_ratio = 4.0     # typical ratio for feed-forward layer size

# Total FLOPS for the experiment; training=False requests the inference-only count.
total_inference_flops = flops_for_experiment(
    num_steps=num_steps,
    batch_size=batch_size,
    seq_len=seq_len,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    num_heads=num_heads,
    ffn_ratio=ffn_ratio,
    training=False,
)

# Echo the configuration followed by the result.
report_lines = [
    "=== Inference FLOPS Calculation ===",
    f"Number of inference steps: {num_steps}",
    f"Batch size: {batch_size}",
    f"Sequence length: {seq_len}",
    f"Hidden dimension: {hidden_dim}",
    f"Transformer blocks: {num_layers}",
    f"Attention heads: {num_heads}",
    f"FFN ratio: {ffn_ratio}",
    "------------------------------------",
    f"Total FLOPS for inference: {total_inference_flops:.2e}",
]
print("\n".join(report_lines))




=== Inference FLOPS Calculation ===
Number of inference steps: 1
Batch size: 2
Sequence length: 128
Hidden dimension: 512
Transformer blocks: 12
Attention heads: 8
FFN ratio: 4.0
------------------------------------
Total FLOPS for inference: 3.33e+10
