In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')

# Work from the project folder on Drive: later cells import from the relative
# `src` package and open the data file by a relative path.
%cd /content/drive/MyDrive/m2cw

Mounted at /content/drive
/content/drive/MyDrive/m2cw


In [2]:
import h5py
import numpy as np
import torch
from transformers import AutoTokenizer
import wandb
import matplotlib.pyplot as plt
from src.preprocessor import load_and_preprocess
from src.lora_skeleton import apply_lora, train_lora,load_data,LoRALinear
from src.qwen import load_qwen
from src.evaluation import evaluation

# Choose the compute device in order of preference: CUDA GPU, then Apple MPS,
# then CPU. The conditions are evaluated lazily, in that order.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

# Authenticate with Weights & Biases before any run is created.
wandb.login()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mliuyihao649[0m ([33mliuyihao649-university-of-cambridge[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
# Seed every RNG this notebook relies on so that "Restart & Run All" is
# repeatable: NumPy for the preprocessing and torch for the training cells below.
np.random.seed(42)
torch.manual_seed(42)

file_path = "lotka_volterra_data.h5"

# Load the raw data and turn it into train/validation/test text sequences
# using the project's preprocessing helper.
train_texts, val_texts, test_texts = load_and_preprocess(
    file_path,
    decimal_places=2,
    max_target_value=9.99
)

# Load the tokenizer once. An earlier version first loaded the
# "Qwen/Qwen2.5-0.5B" tokenizer and immediately overwrote it with this one, so
# that first load was a redundant download with no effect.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


def _tokenize_all(texts):
    """Tokenize each text separately and return a list of token-id tensors.

    No special tokens are added; for every text the first row of the returned
    ``input_ids`` batch is kept.
    """
    return [
        tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0]
        for text in texts
    ]


tokenized_train = _tokenize_all(train_texts)
tokenized_val = _tokenize_all(val_texts)
tokenized_test = _tokenize_all(test_texts)



tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [4]:
# Build the train/validation/test id sets with the project's `load_data` helper
# at its default settings; `train_ids` is what the training cells below pass to
# `train_lora`.
train_ids, val_ids, test_ids = load_data(tokenizer)

In [5]:
# Save the weights for each model
# Only saving LoRA part
def save_lora_parameters(model, file_path):
    """
    Save only the LoRA weights (and a trainable LM-head bias) of a model.

    Every submodule that is a ``LoRALinear`` contributes its ``A`` and ``B``
    tensors under the keys ``"<module name>.A"`` and ``"<module name>.B"``.
    If the model has an ``lm_head`` whose ``bias`` exists and has
    ``requires_grad`` set, it is stored under ``"lm_head.bias"``. All tensors
    are detached and moved to CPU before being written with ``torch.save``.

    Args:
        model: The model containing LoRA modules; must provide ``named_modules()``.
        file_path (str): Destination handed straight to ``torch.save`` (a path,
            or any writable binary file object that ``torch.save`` accepts).
    """
    state_dict = {}
    for name, module in model.named_modules():
        # Only the low-rank adapter matrices are stored, not the frozen base weights.
        if isinstance(module, LoRALinear):
            state_dict[name + ".A"] = module.A.detach().cpu()
            state_dict[name + ".B"] = module.B.detach().cpu()

    # A linear head created with bias=False still *has* a `bias` attribute, but
    # its value is None, so test the value instead of the attribute's presence.
    lm_head = getattr(model, "lm_head", None)
    lm_head_bias = getattr(lm_head, "bias", None)
    if lm_head_bias is not None and lm_head_bias.requires_grad:
        state_dict["lm_head.bias"] = lm_head_bias.detach().cpu()

    torch.save(state_dict, file_path)


### 3(b): Grid search

In [None]:
# Grid search over LoRA rank and learning rate. Each configuration gets its own
# freshly loaded model, one W&B run for training and one for evaluation, and its
# validation metrics are kept so the best setting can be read off at the end.
grid_search_results = []

for rank in [2, 4, 8]:
    for lr in [1e-5, 5e-5, 1e-4]:
        # Training run for this configuration.
        wandb.init(
            project="Qwen-lora-grid-search",
            name=f"rank={rank}_lr={lr}_train",
            group="train_loss_curve",
            reinit=True
        )

        # Load the original model and apply LoRA to it, so no adapter weights
        # carry over from the previous configuration.
        base_model, _ = load_qwen()
        apply_lora(base_model, r=rank)

        # LoRA training; the return value is not needed here.
        train_lora(
            base_model,
            train_ids,
            learning_rate=lr,
            batch_size=4,
            max_steps=1000,
        )

        # Close the training run explicitly rather than relying on the next
        # wandb.init(reinit=True) to do it implicitly.
        wandb.finish()

        # Save the weights (only the LoRA part, no need to save the whole model).
        save_path = f"/content/drive/MyDrive/m2cw/lora_rank={rank}_lr={lr}.pt"
        save_lora_parameters(base_model, save_path)

        # Evaluation run on the validation set.
        wandb.init(
            project="Qwen-lora-grid-search",
            name=f"rank={rank}_lr={lr}_evaluation",
            reinit=True
        )
        val_loss, val_mse = evaluation(base_model, tokenizer, tokenized_val)

        grid_search_results.append(
            {"rank": rank, "lr": lr, "val_loss": val_loss, "val_mse": val_mse}
        )

        print(f"rank{rank},lr{lr},evalaution results:")
        print(f"Average Cross-Entropy Loss: {val_loss:.4f}")
        print(f"Average MSE (Forecast):  {val_mse:.4f}")

        wandb.finish()

# Report the configuration with the lowest validation forecast MSE.
best_config = min(grid_search_results, key=lambda result: result["val_mse"])
print(
    f"Best configuration by validation MSE: rank={best_config['rank']}, "
    f"lr={best_config['lr']} (MSE={best_config['val_mse']:.4f}, "
    f"CE loss={best_config['val_loss']:.4f})"
)



Training:   5%|▌         | 50/1000 [00:15<04:37,  3.42it/s, loss=0.71]

Step 50: loss = 0.7103


Training:  10%|█         | 100/1000 [00:30<04:23,  3.42it/s, loss=0.683]

Step 100: loss = 0.6827


Training:  15%|█▌        | 150/1000 [00:44<04:07,  3.43it/s, loss=0.813]

Step 150: loss = 0.8129


Training:  20%|██        | 200/1000 [00:59<03:53,  3.42it/s, loss=0.613]

Step 200: loss = 0.6125


Training:  25%|██▌       | 250/1000 [01:14<03:39,  3.42it/s, loss=0.63]

Step 250: loss = 0.6303


Training:  30%|███       | 300/1000 [01:28<03:24,  3.43it/s, loss=0.621]

Step 300: loss = 0.6205


Training:  35%|███▌      | 350/1000 [01:43<03:09,  3.42it/s, loss=0.642]

Step 350: loss = 0.6416


Training:  40%|████      | 400/1000 [01:58<02:55,  3.42it/s, loss=0.605]

Step 400: loss = 0.6046


Training:  45%|████▌     | 450/1000 [02:12<02:40,  3.42it/s, loss=0.607]

Step 450: loss = 0.6073


Training:  50%|█████     | 500/1000 [02:27<02:26,  3.42it/s, loss=0.57]

Step 500: loss = 0.5705


Training:  55%|█████▌    | 550/1000 [02:41<02:11,  3.42it/s, loss=0.571]

Step 550: loss = 0.5706


Training:  60%|██████    | 600/1000 [02:56<01:56,  3.43it/s, loss=0.621]

Step 600: loss = 0.6208


Training:  65%|██████▌   | 650/1000 [03:11<01:42,  3.42it/s, loss=0.556]

Step 650: loss = 0.5559


Training:  70%|███████   | 700/1000 [03:25<01:27,  3.42it/s, loss=0.575]

Step 700: loss = 0.5746


Training:  75%|███████▌  | 750/1000 [03:40<01:13,  3.41it/s, loss=0.678]

Step 750: loss = 0.6780


Training:  80%|████████  | 800/1000 [03:55<00:58,  3.41it/s, loss=0.72]

Step 800: loss = 0.7203


Training:  85%|████████▌ | 850/1000 [04:09<00:43,  3.41it/s, loss=0.613]

Step 850: loss = 0.6126


Training:  90%|█████████ | 900/1000 [04:24<00:29,  3.42it/s, loss=0.561]

Step 900: loss = 0.5606


Training:  95%|█████████▌| 950/1000 [04:38<00:14,  3.42it/s, loss=0.608]

Step 950: loss = 0.6076


Training: 100%|██████████| 1000/1000 [04:53<00:00,  3.41it/s, loss=0.505]


Step 1000: loss = 0.5053


0,1
loss,▇▅▄▄▄▃▇▅▆█▄█▅▄▅▇▄▅▄▄▄▇▁▃▄▃▄▃▅▄▅▆▃▅▃▃▄▃▂▄
step,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██

0,1
loss,0.50531
step,999.0


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


rank2,lr1e-05,evalaution results:
Average Cross-Entropy Loss: 1.7700
Average MSE (Forecast):  0.0561


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.77005
avg_mse,0.05613


Training:   5%|▌         | 50/1000 [00:14<04:38,  3.41it/s, loss=0.695]

Step 50: loss = 0.6949


Training:  10%|█         | 100/1000 [00:29<04:22,  3.42it/s, loss=0.553]

Step 100: loss = 0.5532


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.42it/s, loss=0.557]

Step 150: loss = 0.5568


Training:  20%|██        | 200/1000 [00:58<03:53,  3.43it/s, loss=0.513]

Step 200: loss = 0.5132


Training:  25%|██▌       | 250/1000 [01:13<03:39,  3.42it/s, loss=0.556]

Step 250: loss = 0.5559


Training:  30%|███       | 300/1000 [01:27<03:24,  3.42it/s, loss=0.456]

Step 300: loss = 0.4557


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.42it/s, loss=0.598]

Step 350: loss = 0.5977


Training:  40%|████      | 400/1000 [01:56<02:55,  3.42it/s, loss=0.462]

Step 400: loss = 0.4616


Training:  45%|████▌     | 450/1000 [02:11<02:40,  3.43it/s, loss=0.456]

Step 450: loss = 0.4560


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.42it/s, loss=0.461]

Step 500: loss = 0.4610


Training:  55%|█████▌    | 550/1000 [02:40<02:11,  3.42it/s, loss=0.453]

Step 550: loss = 0.4530


Training:  60%|██████    | 600/1000 [02:55<01:57,  3.41it/s, loss=0.415]

Step 600: loss = 0.4151


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.42it/s, loss=0.371]

Step 650: loss = 0.3711


Training:  70%|███████   | 700/1000 [03:24<01:27,  3.43it/s, loss=0.38]

Step 700: loss = 0.3804


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.472]

Step 750: loss = 0.4719


Training:  80%|████████  | 800/1000 [03:53<00:58,  3.42it/s, loss=0.437]

Step 800: loss = 0.4374


Training:  85%|████████▌ | 850/1000 [04:08<00:43,  3.41it/s, loss=0.455]

Step 850: loss = 0.4554


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.42it/s, loss=0.378]

Step 900: loss = 0.3781


Training:  95%|█████████▌| 950/1000 [04:37<00:14,  3.42it/s, loss=0.473]

Step 950: loss = 0.4732


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s, loss=0.507]


Step 1000: loss = 0.5075


0,1
loss,▃▇▇▅▇██▆▅▄▂▅▂▂▂▂▅▂▃▂▄▂▂▁▂▃▃▃▁▃▃▂▃▂▁▂▂▃▂▃
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇██

0,1
loss,0.50747
step,999.0


rank2,lr5e-05,evalaution results:
Average Cross-Entropy Loss: 1.4630
Average MSE (Forecast):  0.0100


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.46296
avg_mse,0.01002


Training:   5%|▌         | 50/1000 [00:14<04:37,  3.42it/s, loss=0.599]

Step 50: loss = 0.5987


Training:  10%|█         | 100/1000 [00:29<04:22,  3.42it/s, loss=0.478]

Step 100: loss = 0.4784


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.43it/s, loss=0.493]

Step 150: loss = 0.4929


Training:  20%|██        | 200/1000 [00:58<03:53,  3.42it/s, loss=0.448]

Step 200: loss = 0.4484


Training:  25%|██▌       | 250/1000 [01:13<03:39,  3.42it/s, loss=0.526]

Step 250: loss = 0.5257


Training:  30%|███       | 300/1000 [01:27<03:25,  3.41it/s, loss=0.402]

Step 300: loss = 0.4022


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.42it/s, loss=0.479]

Step 350: loss = 0.4792


Training:  40%|████      | 400/1000 [01:57<02:55,  3.42it/s, loss=0.428]

Step 400: loss = 0.4276


Training:  45%|████▌     | 450/1000 [02:11<02:40,  3.42it/s, loss=0.32]

Step 450: loss = 0.3196


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.42it/s, loss=0.461]

Step 500: loss = 0.4614


Training:  55%|█████▌    | 550/1000 [02:40<02:11,  3.42it/s, loss=0.411]

Step 550: loss = 0.4113


Training:  60%|██████    | 600/1000 [02:55<01:57,  3.41it/s, loss=0.363]

Step 600: loss = 0.3633


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.42it/s, loss=0.358]

Step 650: loss = 0.3582


Training:  70%|███████   | 700/1000 [03:24<01:27,  3.42it/s, loss=0.33]

Step 700: loss = 0.3305


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.282]

Step 750: loss = 0.2818


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.42it/s, loss=0.467]

Step 800: loss = 0.4670


Training:  85%|████████▌ | 850/1000 [04:08<00:43,  3.42it/s, loss=0.361]

Step 850: loss = 0.3607


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.41it/s, loss=0.408]

Step 900: loss = 0.4084


Training:  95%|█████████▌| 950/1000 [04:37<00:14,  3.42it/s, loss=0.325]

Step 950: loss = 0.3251


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s, loss=0.411]

Step 1000: loss = 0.4110





0,1
loss,█▇▆█▅▆▃▂▂▃▄▄▄▂▃▂▃▂▃▃▃▃▃▅▂▄▁▂▂▁▁▃▂▂▂▂▂▂▂▂
step,▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇█

0,1
loss,0.411
step,999.0


rank2,lr0.0001,evalaution results:
Average Cross-Entropy Loss: 1.6718
Average MSE (Forecast):  0.0148


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.67181
avg_mse,0.01481


Training:   5%|▌         | 50/1000 [00:14<04:37,  3.42it/s, loss=0.715]

Step 50: loss = 0.7151


Training:  10%|█         | 100/1000 [00:29<04:23,  3.42it/s, loss=0.804]

Step 100: loss = 0.8043


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.42it/s, loss=0.695]

Step 150: loss = 0.6954


Training:  20%|██        | 200/1000 [00:58<03:53,  3.43it/s, loss=0.571]

Step 200: loss = 0.5709


Training:  25%|██▌       | 250/1000 [01:13<03:38,  3.43it/s, loss=0.736]

Step 250: loss = 0.7362


Training:  30%|███       | 300/1000 [01:27<03:24,  3.41it/s, loss=0.556]

Step 300: loss = 0.5561


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.42it/s, loss=0.681]

Step 350: loss = 0.6809


Training:  40%|████      | 400/1000 [01:57<02:55,  3.42it/s, loss=0.639]

Step 400: loss = 0.6389


Training:  45%|████▌     | 450/1000 [02:11<02:40,  3.42it/s, loss=0.5]

Step 450: loss = 0.5005


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.41it/s, loss=0.523]

Step 500: loss = 0.5235


Training:  55%|█████▌    | 550/1000 [02:40<02:11,  3.42it/s, loss=0.554]

Step 550: loss = 0.5541


Training:  60%|██████    | 600/1000 [02:55<01:56,  3.43it/s, loss=0.627]

Step 600: loss = 0.6269


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.42it/s, loss=0.573]

Step 650: loss = 0.5730


Training:  70%|███████   | 700/1000 [03:24<01:27,  3.42it/s, loss=0.516]

Step 700: loss = 0.5160


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.471]

Step 750: loss = 0.4714


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.42it/s, loss=0.533]

Step 800: loss = 0.5335


Training:  85%|████████▌ | 850/1000 [04:08<00:43,  3.42it/s, loss=0.421]

Step 850: loss = 0.4210


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.42it/s, loss=0.512]

Step 900: loss = 0.5125


Training:  95%|█████████▌| 950/1000 [04:37<00:14,  3.42it/s, loss=0.548]

Step 950: loss = 0.5479


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s, loss=0.469]


Step 1000: loss = 0.4690


0,1
loss,▃▆▅▄█▄▆▄▇▆▄▄▆▄▆▅▄▃▆▂▁▃▄▄▄▁▄▄▂▁▄▃▂▃▂▃▁▂▃▂
step,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
loss,0.46902
step,999.0


rank4,lr1e-05,evalaution results:
Average Cross-Entropy Loss: 1.6040
Average MSE (Forecast):  0.0375


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.60404
avg_mse,0.03751


Training:   5%|▌         | 50/1000 [00:14<04:37,  3.42it/s, loss=0.758]

Step 50: loss = 0.7585


Training:  10%|█         | 100/1000 [00:29<04:22,  3.43it/s, loss=0.63]

Step 100: loss = 0.6299


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.42it/s, loss=0.52]

Step 150: loss = 0.5202


Training:  20%|██        | 200/1000 [00:58<03:54,  3.41it/s, loss=0.487]

Step 200: loss = 0.4866


Training:  25%|██▌       | 250/1000 [01:13<03:39,  3.42it/s, loss=0.526]

Step 250: loss = 0.5261


Training:  30%|███       | 300/1000 [01:27<03:24,  3.42it/s, loss=0.409]

Step 300: loss = 0.4093


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.42it/s, loss=0.413]

Step 350: loss = 0.4126


Training:  40%|████      | 400/1000 [01:57<02:55,  3.42it/s, loss=0.395]

Step 400: loss = 0.3947


Training:  45%|████▌     | 450/1000 [02:11<02:40,  3.42it/s, loss=0.491]

Step 450: loss = 0.4913


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.41it/s, loss=0.42]

Step 500: loss = 0.4202


Training:  55%|█████▌    | 550/1000 [02:41<02:11,  3.41it/s, loss=0.438]

Step 550: loss = 0.4381


Training:  60%|██████    | 600/1000 [02:55<01:57,  3.41it/s, loss=0.377]

Step 600: loss = 0.3766


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.42it/s, loss=0.476]

Step 650: loss = 0.4759


Training:  70%|███████   | 700/1000 [03:25<01:27,  3.42it/s, loss=0.374]

Step 700: loss = 0.3735


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.31]

Step 750: loss = 0.3096


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.42it/s, loss=0.5]

Step 800: loss = 0.5000


Training:  85%|████████▌ | 850/1000 [04:08<00:43,  3.41it/s, loss=0.456]

Step 850: loss = 0.4556


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.42it/s, loss=0.368]

Step 900: loss = 0.3684


Training:  95%|█████████▌| 950/1000 [04:38<00:14,  3.41it/s, loss=0.47]

Step 950: loss = 0.4699


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.41it/s, loss=0.337]


Step 1000: loss = 0.3369


0,1
loss,██▄▅▆▃▃▃▆▃▄▂▅▂▃▃▃▃▃▄▁▂▃▃▂▁▁▁▃▂▃▂▂▁▄▂▁▂▁▂
step,▁▁▁▁▁▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
loss,0.3369
step,999.0


rank4,lr5e-05,evalaution results:
Average Cross-Entropy Loss: 1.7418
Average MSE (Forecast):  0.0251


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.74181
avg_mse,0.02507


Training:   5%|▌         | 50/1000 [00:14<04:38,  3.41it/s, loss=0.492]

Step 50: loss = 0.4918


Training:  10%|█         | 100/1000 [00:29<04:23,  3.42it/s, loss=0.541]

Step 100: loss = 0.5414


Training:  15%|█▌        | 150/1000 [00:44<04:08,  3.41it/s, loss=0.528]

Step 150: loss = 0.5284


Training:  20%|██        | 200/1000 [00:58<03:54,  3.41it/s, loss=0.466]

Step 200: loss = 0.4663


Training:  25%|██▌       | 250/1000 [01:13<03:39,  3.41it/s, loss=0.469]

Step 250: loss = 0.4690


Training:  30%|███       | 300/1000 [01:27<03:24,  3.42it/s, loss=0.403]

Step 300: loss = 0.4032


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.41it/s, loss=0.384]

Step 350: loss = 0.3844


Training:  40%|████      | 400/1000 [01:57<02:55,  3.41it/s, loss=0.405]

Step 400: loss = 0.4047


Training:  45%|████▌     | 450/1000 [02:11<02:40,  3.42it/s, loss=0.418]

Step 450: loss = 0.4177


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.41it/s, loss=0.394]

Step 500: loss = 0.3944


Training:  55%|█████▌    | 550/1000 [02:41<02:11,  3.41it/s, loss=0.402]

Step 550: loss = 0.4017


Training:  60%|██████    | 600/1000 [02:55<01:57,  3.42it/s, loss=0.408]

Step 600: loss = 0.4080


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.41it/s, loss=0.403]

Step 650: loss = 0.4031


Training:  70%|███████   | 700/1000 [03:25<01:27,  3.41it/s, loss=0.476]

Step 700: loss = 0.4760


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.423]

Step 750: loss = 0.4234


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.41it/s, loss=0.361]

Step 800: loss = 0.3613


Training:  85%|████████▌ | 850/1000 [04:09<00:43,  3.42it/s, loss=0.375]

Step 850: loss = 0.3751


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.41it/s, loss=0.374]

Step 900: loss = 0.3738


Training:  95%|█████████▌| 950/1000 [04:38<00:14,  3.42it/s, loss=0.36]

Step 950: loss = 0.3597


Training: 100%|██████████| 1000/1000 [04:53<00:00,  3.41it/s, loss=0.361]


Step 1000: loss = 0.3606


0,1
loss,██▆▄▄▄▄▂▄▃▃▃▂▄▃▁▂▂▃▂▂▂▂▁▂▂▃▂▂▂▁▃▃▂▃▂▂▃▃▂
step,▁▁▁▂▃▃▃▃▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████

0,1
loss,0.36055
step,999.0


rank4,lr0.0001,evalaution results:
Average Cross-Entropy Loss: 1.8379
Average MSE (Forecast):  0.0133


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.83793
avg_mse,0.01329


Training:   5%|▌         | 50/1000 [00:14<04:55,  3.21it/s, loss=0.52]

Step 50: loss = 0.5198


Training:  10%|█         | 100/1000 [00:29<04:23,  3.42it/s, loss=0.707]

Step 100: loss = 0.7067


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.42it/s, loss=0.753]

Step 150: loss = 0.7530


Training:  20%|██        | 200/1000 [00:58<03:54,  3.42it/s, loss=0.643]

Step 200: loss = 0.6427


Training:  25%|██▌       | 250/1000 [01:13<03:39,  3.42it/s, loss=0.683]

Step 250: loss = 0.6834


Training:  30%|███       | 300/1000 [01:27<03:24,  3.42it/s, loss=0.615]

Step 300: loss = 0.6150


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.41it/s, loss=0.554]

Step 350: loss = 0.5536


Training:  40%|████      | 400/1000 [01:57<02:55,  3.42it/s, loss=0.591]

Step 400: loss = 0.5911


Training:  45%|████▌     | 450/1000 [02:11<02:41,  3.41it/s, loss=0.568]

Step 450: loss = 0.5677


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.42it/s, loss=0.497]

Step 500: loss = 0.4968


Training:  55%|█████▌    | 550/1000 [02:41<02:11,  3.41it/s, loss=0.446]

Step 550: loss = 0.4456


Training:  60%|██████    | 600/1000 [02:55<01:57,  3.42it/s, loss=0.516]

Step 600: loss = 0.5163


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.42it/s, loss=0.523]

Step 650: loss = 0.5228


Training:  70%|███████   | 700/1000 [03:24<01:27,  3.42it/s, loss=0.436]

Step 700: loss = 0.4362


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.417]

Step 750: loss = 0.4165


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.41it/s, loss=0.479]

Step 800: loss = 0.4791


Training:  85%|████████▌ | 850/1000 [04:08<00:43,  3.42it/s, loss=0.562]

Step 850: loss = 0.5617


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.42it/s, loss=0.339]

Step 900: loss = 0.3387


Training:  95%|█████████▌| 950/1000 [04:38<00:14,  3.42it/s, loss=0.583]

Step 950: loss = 0.5829


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s, loss=0.579]


Step 1000: loss = 0.5792


0,1
loss,█▅▆▅▅▄▄▅▅▃▄▃▆▃▂▂▃▃▃▃▄▃▃▃▃▃▂▃▃▂▃▃▃▂▂▂▁▂▁▂
step,▁▁▁▁▁▂▂▂▃▃▄▄▄▄▄▄▄▄▄▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇██████

0,1
loss,0.57924
step,999.0


rank8,lr1e-05,evalaution results:
Average Cross-Entropy Loss: 1.6141
Average MSE (Forecast):  0.0328


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.61407
avg_mse,0.03277


Training:   5%|▌         | 50/1000 [00:14<04:37,  3.42it/s, loss=0.626]

Step 50: loss = 0.6258


Training:  10%|█         | 100/1000 [00:29<04:23,  3.42it/s, loss=0.592]

Step 100: loss = 0.5920


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.42it/s, loss=0.432]

Step 150: loss = 0.4316


Training:  20%|██        | 200/1000 [00:58<03:54,  3.42it/s, loss=0.521]

Step 200: loss = 0.5212


Training:  25%|██▌       | 250/1000 [01:13<03:39,  3.42it/s, loss=0.371]

Step 250: loss = 0.3708


Training:  30%|███       | 300/1000 [01:27<03:24,  3.42it/s, loss=0.395]

Step 300: loss = 0.3946


Training:  35%|███▌      | 350/1000 [01:42<03:09,  3.42it/s, loss=0.493]

Step 350: loss = 0.4929


Training:  40%|████      | 400/1000 [01:57<02:55,  3.42it/s, loss=0.484]

Step 400: loss = 0.4840


Training:  45%|████▌     | 450/1000 [02:11<02:40,  3.42it/s, loss=0.419]

Step 450: loss = 0.4192


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.42it/s, loss=0.339]

Step 500: loss = 0.3387


Training:  55%|█████▌    | 550/1000 [02:40<02:11,  3.42it/s, loss=0.355]

Step 550: loss = 0.3550


Training:  60%|██████    | 600/1000 [02:55<01:57,  3.42it/s, loss=0.462]

Step 600: loss = 0.4616


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.42it/s, loss=0.373]

Step 650: loss = 0.3730


Training:  70%|███████   | 700/1000 [03:24<01:27,  3.42it/s, loss=0.436]

Step 700: loss = 0.4362


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.366]

Step 750: loss = 0.3664


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.42it/s, loss=0.396]

Step 800: loss = 0.3959


Training:  85%|████████▌ | 850/1000 [04:08<00:43,  3.41it/s, loss=0.353]

Step 850: loss = 0.3530


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.42it/s, loss=0.465]

Step 900: loss = 0.4645


Training:  95%|█████████▌| 950/1000 [04:38<00:14,  3.42it/s, loss=0.478]

Step 950: loss = 0.4776


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s, loss=0.262]


Step 1000: loss = 0.2619


0,1
loss,▆█▆▅▃▄▄▄▄▃▄▃▄▃▃▂▃▃▂▂▂▂▂▃▂▃▃▁▃▂▃▂▂▂▂▁▂▂▃▂
step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇████

0,1
loss,0.2619
step,999.0


rank8,lr5e-05,evalaution results:
Average Cross-Entropy Loss: 1.9905
Average MSE (Forecast):  0.0256


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.99047
avg_mse,0.02555


Training:   5%|▌         | 50/1000 [00:14<04:38,  3.41it/s, loss=0.487]

Step 50: loss = 0.4870


Training:  10%|█         | 100/1000 [00:29<04:23,  3.42it/s, loss=0.393]

Step 100: loss = 0.3927


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.42it/s, loss=0.407]

Step 150: loss = 0.4071


Training:  20%|██        | 200/1000 [00:58<03:53,  3.42it/s, loss=0.484]

Step 200: loss = 0.4844


Training:  25%|██▌       | 250/1000 [01:13<03:39,  3.42it/s, loss=0.369]

Step 250: loss = 0.3690


Training:  30%|███       | 300/1000 [01:27<03:24,  3.42it/s, loss=0.376]

Step 300: loss = 0.3763


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.42it/s, loss=0.444]

Step 350: loss = 0.4436


Training:  40%|████      | 400/1000 [01:57<02:55,  3.42it/s, loss=0.403]

Step 400: loss = 0.4035


Training:  45%|████▌     | 450/1000 [02:11<02:40,  3.42it/s, loss=0.411]

Step 450: loss = 0.4115


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.42it/s, loss=0.332]

Step 500: loss = 0.3316


Training:  55%|█████▌    | 550/1000 [02:41<02:11,  3.42it/s, loss=0.369]

Step 550: loss = 0.3692


Training:  60%|██████    | 600/1000 [02:55<01:56,  3.42it/s, loss=0.333]

Step 600: loss = 0.3330


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.42it/s, loss=0.405]

Step 650: loss = 0.4050


Training:  70%|███████   | 700/1000 [03:24<01:27,  3.42it/s, loss=0.362]

Step 700: loss = 0.3624


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.34]

Step 750: loss = 0.3396


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.42it/s, loss=0.379]

Step 800: loss = 0.3789


Training:  85%|████████▌ | 850/1000 [04:08<00:43,  3.42it/s, loss=0.473]

Step 850: loss = 0.4727


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.41it/s, loss=0.315]

Step 900: loss = 0.3151


Training:  95%|█████████▌| 950/1000 [04:38<00:14,  3.42it/s, loss=0.289]

Step 950: loss = 0.2891


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s, loss=0.323]


Step 1000: loss = 0.3227


0,1
loss,█▇▄▅▅▄▅▅▄▄▂▄▃▃▃▃▁▃▃▂▃▁▂▂▂▃▂▂▂▂▃▂▂▂▃▁▂▁▂▂
step,▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇███

0,1
loss,0.32271
step,999.0


rank8,lr0.0001,evalaution results:
Average Cross-Entropy Loss: 1.6746
Average MSE (Forecast):  0.0071


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.67464
avg_mse,0.00707


# Next, we retrain with the best hyperparameters from the grid search (rank 8, learning rate 1e-4, which gave the lowest validation MSE) while varying the maximum context length of the training sequences.

In [None]:
max_context_length=[128,512,768]

# Hyperparameters fixed for this experiment. They are defined here so the cell
# does not depend on loop variables left behind by the grid-search cell.
best_rank = 8
best_lr = 1e-4

for length in max_context_length:
    # Keep the length-limited training data under its own name so the
    # `train_ids` built earlier in the notebook is not silently replaced.
    ctx_train_ids, _, _ = load_data(tokenizer, max_ctx_length=length)

    # Training run for this context length.
    wandb.init(
        project="Qwen-lora-different-context-len",
        name=f"context-length={length}_train",
        group="train_loss_curve",
        reinit=True
    )
    base_model, _ = load_qwen()
    apply_lora(base_model, r=best_rank)
    train_lora(
        base_model,
        ctx_train_ids,
        learning_rate=best_lr,
        batch_size=4,
        max_steps=1000,
    )

    # Close the training run explicitly rather than relying on the next
    # wandb.init(reinit=True) to do it implicitly.
    wandb.finish()

    # Save the weights (only the LoRA part, no need to save the whole model).
    # NOTE(review): the file name says "rank=2", but the adapter saved here is
    # trained with rank 8. The name is kept unchanged because other code may load
    # these files by this exact name; rename it together with any such loaders.
    save_path = f"/content/drive/MyDrive/m2cw/Context_length={length}_rank=2_lr=1e-4.pt"
    save_lora_parameters(base_model, save_path)

    # Evaluation run on the validation set.
    wandb.init(
        project="Qwen-lora-different-context-len",
        name=f"context-length:{length}_evaluation",
        reinit=True
    )
    val_loss, val_mse = evaluation(base_model, tokenizer, tokenized_val)

    # State the context length in the header; it is the only thing that varies
    # between the result blocks printed by this cell.
    print(f"context_length{length},rank{best_rank},lr{best_lr},evalaution results:")
    print(f"Average Cross-Entropy Loss: {val_loss:.4f}")
    print(f"Average MSE (Forecast):  {val_mse:.4f}")

    wandb.finish()




Training:   5%|▌         | 51/1000 [00:05<01:44,  9.05it/s, loss=0.598]

Step 50: loss = 0.6213


Training:  10%|█         | 101/1000 [00:11<01:40,  8.93it/s, loss=0.555]

Step 100: loss = 0.5399


Training:  15%|█▌        | 151/1000 [00:16<01:34,  9.00it/s, loss=0.488]

Step 150: loss = 0.5025


Training:  20%|██        | 201/1000 [00:22<01:29,  8.97it/s, loss=0.545]

Step 200: loss = 0.4568


Training:  25%|██▌       | 251/1000 [00:27<01:22,  9.03it/s, loss=0.574]

Step 250: loss = 0.5830


Training:  30%|███       | 301/1000 [00:33<01:15,  9.23it/s, loss=0.488]

Step 300: loss = 0.5189


Training:  35%|███▌      | 351/1000 [00:39<01:12,  8.99it/s, loss=0.465]

Step 350: loss = 0.4573


Training:  40%|████      | 401/1000 [00:44<01:04,  9.31it/s, loss=0.47] 

Step 400: loss = 0.4124


Training:  45%|████▌     | 451/1000 [00:49<01:02,  8.73it/s, loss=0.482]

Step 450: loss = 0.5213


Training:  50%|█████     | 501/1000 [00:55<00:56,  8.90it/s, loss=0.457]

Step 500: loss = 0.3726


Training:  55%|█████▌    | 551/1000 [01:00<00:48,  9.16it/s, loss=0.481]

Step 550: loss = 0.4837


Training:  60%|██████    | 601/1000 [01:06<00:45,  8.85it/s, loss=0.455]

Step 600: loss = 0.4742


Training:  65%|██████▌   | 651/1000 [01:11<00:38,  9.17it/s, loss=0.474]

Step 650: loss = 0.4821


Training:  70%|███████   | 701/1000 [01:17<00:33,  8.82it/s, loss=0.442]

Step 700: loss = 0.4237


Training:  75%|███████▌  | 751/1000 [01:23<00:27,  9.10it/s, loss=0.37] 

Step 750: loss = 0.5074


Training:  80%|████████  | 801/1000 [01:28<00:21,  9.19it/s, loss=0.391]

Step 800: loss = 0.4474


Training:  85%|████████▌ | 851/1000 [01:34<00:16,  8.89it/s, loss=0.525]

Step 850: loss = 0.5755


Training:  90%|█████████ | 901/1000 [01:39<00:11,  8.80it/s, loss=0.533]

Step 900: loss = 0.4525


Training:  95%|█████████▌| 951/1000 [01:45<00:05,  8.82it/s, loss=0.504]

Step 950: loss = 0.4782


Training: 100%|██████████| 1000/1000 [01:50<00:00,  9.05it/s, loss=0.404]


Step 1000: loss = 0.4038


0,1
loss,██▆▆▆▂▆▄▄▂▄▃▂▃▅▅▄▂▂▃▂▂▂▃▃▃▁▂▃▂▄▂▃▃▃▄▃▃▁▁
step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███

0,1
loss,0.40382
step,999.0


rank8,lr0.0001,evalaution results:
Average Cross-Entropy Loss: 2.6090
Average MSE (Forecast):  0.1177


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,2.60895
avg_mse,0.11765


Training:   5%|▌         | 50/1000 [00:14<04:37,  3.42it/s, loss=0.579]

Step 50: loss = 0.5793


Training:  10%|█         | 100/1000 [00:29<04:22,  3.42it/s, loss=0.462]

Step 100: loss = 0.4624


Training:  15%|█▌        | 150/1000 [00:43<04:08,  3.41it/s, loss=0.468]

Step 150: loss = 0.4677


Training:  20%|██        | 200/1000 [00:58<03:53,  3.42it/s, loss=0.447]

Step 200: loss = 0.4470


Training:  25%|██▌       | 250/1000 [01:13<03:40,  3.41it/s, loss=0.406]

Step 250: loss = 0.4065


Training:  30%|███       | 300/1000 [01:27<03:24,  3.42it/s, loss=0.361]

Step 300: loss = 0.3606


Training:  35%|███▌      | 350/1000 [01:42<03:10,  3.42it/s, loss=0.436]

Step 350: loss = 0.4361


Training:  40%|████      | 400/1000 [01:57<02:56,  3.41it/s, loss=0.421]

Step 400: loss = 0.4215


Training:  45%|████▌     | 450/1000 [02:11<02:41,  3.42it/s, loss=0.487]

Step 450: loss = 0.4867


Training:  50%|█████     | 500/1000 [02:26<02:26,  3.41it/s, loss=0.382]

Step 500: loss = 0.3815


Training:  55%|█████▌    | 550/1000 [02:41<02:11,  3.42it/s, loss=0.375]

Step 550: loss = 0.3748


Training:  60%|██████    | 600/1000 [02:55<01:57,  3.42it/s, loss=0.371]

Step 600: loss = 0.3710


Training:  65%|██████▌   | 650/1000 [03:10<01:42,  3.41it/s, loss=0.432]

Step 650: loss = 0.4322


Training:  70%|███████   | 700/1000 [03:24<01:27,  3.42it/s, loss=0.385]

Step 700: loss = 0.3855


Training:  75%|███████▌  | 750/1000 [03:39<01:13,  3.42it/s, loss=0.332]

Step 750: loss = 0.3318


Training:  80%|████████  | 800/1000 [03:54<00:58,  3.42it/s, loss=0.331]

Step 800: loss = 0.3309


Training:  85%|████████▌ | 850/1000 [04:08<00:44,  3.41it/s, loss=0.388]

Step 850: loss = 0.3880


Training:  90%|█████████ | 900/1000 [04:23<00:29,  3.42it/s, loss=0.314]

Step 900: loss = 0.3140


Training:  95%|█████████▌| 950/1000 [04:38<00:14,  3.42it/s, loss=0.289]

Step 950: loss = 0.2887


Training: 100%|██████████| 1000/1000 [04:52<00:00,  3.42it/s, loss=0.336]


Step 1000: loss = 0.3364


0,1
loss,█▆▅▅▅▄▄▄▄▃▃▂▃▂▂▃▃▃▂▂▂▂▃▃▃▂▂▂▃▃▁▁▃▃▂▂▂▃▄▃
step,▁▁▂▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇█

0,1
loss,0.33635
step,999.0


rank8,lr0.0001,evalaution results:
Average Cross-Entropy Loss: 2.0401
Average MSE (Forecast):  0.0230


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,2.0401
avg_mse,0.02298


Training:   5%|▌         | 50/1000 [00:21<06:39,  2.38it/s, loss=0.564]

Step 50: loss = 0.5641


Training:  10%|█         | 100/1000 [00:42<06:18,  2.38it/s, loss=0.371]

Step 100: loss = 0.3715


Training:  15%|█▌        | 150/1000 [01:03<06:02,  2.35it/s, loss=0.503]

Step 150: loss = 0.5034


Training:  20%|██        | 200/1000 [01:24<05:36,  2.38it/s, loss=0.437]

Step 200: loss = 0.4371


Training:  25%|██▌       | 250/1000 [01:45<05:16,  2.37it/s, loss=0.433]

Step 250: loss = 0.4331


Training:  30%|███       | 300/1000 [02:06<04:54,  2.38it/s, loss=0.378]

Step 300: loss = 0.3782


Training:  35%|███▌      | 350/1000 [02:27<04:33,  2.37it/s, loss=0.534]

Step 350: loss = 0.5344


Training:  40%|████      | 400/1000 [02:48<04:12,  2.37it/s, loss=0.428]

Step 400: loss = 0.4277


Training:  45%|████▌     | 450/1000 [03:09<03:51,  2.38it/s, loss=0.381]

Step 450: loss = 0.3813


Training:  50%|█████     | 500/1000 [03:30<03:30,  2.37it/s, loss=0.389]

Step 500: loss = 0.3891


Training:  55%|█████▌    | 550/1000 [03:51<03:09,  2.37it/s, loss=0.346]

Step 550: loss = 0.3460


Training:  60%|██████    | 600/1000 [04:12<02:48,  2.37it/s, loss=0.392]

Step 600: loss = 0.3922


Training:  65%|██████▌   | 650/1000 [04:33<02:27,  2.37it/s, loss=0.395]

Step 650: loss = 0.3954


Training:  70%|███████   | 700/1000 [04:54<02:06,  2.37it/s, loss=0.415]

Step 700: loss = 0.4146


Training:  75%|███████▌  | 750/1000 [05:16<01:45,  2.37it/s, loss=0.337]

Step 750: loss = 0.3369


Training:  80%|████████  | 800/1000 [05:37<01:24,  2.37it/s, loss=0.291]

Step 800: loss = 0.2908


Training:  85%|████████▌ | 850/1000 [05:58<01:03,  2.37it/s, loss=0.315]

Step 850: loss = 0.3149


Training:  90%|█████████ | 900/1000 [06:19<00:42,  2.37it/s, loss=0.335]

Step 900: loss = 0.3348


Training:  95%|█████████▌| 950/1000 [06:40<00:21,  2.37it/s, loss=0.272]

Step 950: loss = 0.2720


Training: 100%|██████████| 1000/1000 [07:01<00:00,  2.37it/s, loss=0.274]


Step 1000: loss = 0.2743


0,1
loss,█▆▄▅▄▅▃▃▃▃▁▁▃▃▂▂▂▅▄▃▄▃▁▃▃▂▃▃▂▂▄▂▃▂▄▂▁▂▂▂
step,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▇▇███

0,1
loss,0.27428
step,999.0


rank8,lr0.0001,evalaution results:
Average Cross-Entropy Loss: 1.5362
Average MSE (Forecast):  0.0051


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.53623
avg_mse,0.00513


# 3(c) Using the selected context length (768) and hyperparameters (LoRA rank 8, learning rate 1e-4), we train the model again for 3000 steps.

In [6]:
# Final long run: apply LoRA (r=8) to Qwen, train for 3000 steps at
# lr=1e-4 / batch size 4 on data loaded with max_ctx_length=768, save the
# adapter weights, then evaluate on the validation and test sets.
# Training, validation and test are logged as three separate W&B runs that
# share one project name.
FINAL_PROJECT = "Qwen-lora-final-long-run-8-1e-4-768"

# --- Run 1: training ---------------------------------------------------------
wandb.init(project=FINAL_PROJECT, name="final_train", reinit=True)

# Only the first value returned by load_data is used in this cell; the other
# two are discarded.
train_ids, _, _ = load_data(tokenizer, max_ctx_length=768)

# Second return value of load_qwen is not needed here.
base_model, _ = load_qwen()

# The return value is ignored, so apply_lora presumably modifies base_model
# in place -- TODO confirm against src.lora_skeleton.
apply_lora(base_model, r=8)

_ = train_lora(
    base_model,
    train_ids,
    learning_rate=1e-4,
    batch_size=4,
    max_steps=3000,
)

# Save the weights (only the LoRA part, no need to save the whole model).
# NOTE(review): save_lora_parameters is not among the names imported in the
# notebook's import cell; it must be defined/imported by an earlier cell for a
# fresh Restart & Run All to succeed -- verify.
save_path = "/content/drive/MyDrive/m2cw/Final_context=768_rank=8_lr=1e-4.pt"
save_lora_parameters(base_model, save_path)

# --- Run 2: validation-set evaluation ----------------------------------------
# reinit=True allows a new run to be started while the previous one is still
# open (presumably closing the previous run first -- see wandb.init docs).
wandb.init(
    project=FINAL_PROJECT,
    name="final_evaluation_validation_set",
    reinit=True,
)
# NOTE(review): tokenized_val / tokenized_test are not created in this cell;
# they come from earlier notebook state. Confirm they were built with the same
# context length (768) as train_ids above.
val_loss, val_mse = evaluation(base_model, tokenizer, tokenized_val)

# --- Run 3: test-set evaluation ----------------------------------------------
wandb.init(
    project=FINAL_PROJECT,
    name="final_evaluation_test_set",
    reinit=True,
)
test_loss, test_mse = evaluation(base_model, tokenizer, tokenized_test)

# Report both result sets together once both evaluations have finished.
print("final evalaution (val) results:")
print(f"Average Cross-Entropy Loss: {val_loss:.4f}")
print(f"Average MSE (Forecast):  {val_mse:.4f}")

print("final evalaution (test) results:")
print(f"Average Cross-Entropy Loss: {test_loss:.4f}")
print(f"Average MSE (Forecast):  {test_mse:.4f}")

# Close the last (test) run explicitly.
wandb.finish()




Training:   2%|▏         | 50/3000 [00:22<20:38,  2.38it/s, loss=0.485]

Step 50: loss = 0.4854


Training:   3%|▎         | 100/3000 [00:43<20:17,  2.38it/s, loss=0.699]

Step 100: loss = 0.6991


Training:   5%|▌         | 150/3000 [01:04<19:55,  2.38it/s, loss=0.365]

Step 150: loss = 0.3646


Training:   7%|▋         | 200/3000 [01:25<19:34,  2.38it/s, loss=0.359]

Step 200: loss = 0.3590


Training:   8%|▊         | 250/3000 [01:46<19:14,  2.38it/s, loss=0.371]

Step 250: loss = 0.3712


Training:  10%|█         | 300/3000 [02:07<18:55,  2.38it/s, loss=0.341]

Step 300: loss = 0.3405


Training:  12%|█▏        | 350/3000 [02:28<18:32,  2.38it/s, loss=0.416]

Step 350: loss = 0.4161


Training:  13%|█▎        | 400/3000 [02:48<18:12,  2.38it/s, loss=0.392]

Step 400: loss = 0.3916


Training:  15%|█▌        | 450/3000 [03:09<17:50,  2.38it/s, loss=0.423]

Step 450: loss = 0.4227


Training:  17%|█▋        | 500/3000 [03:30<17:29,  2.38it/s, loss=0.335]

Step 500: loss = 0.3345


Training:  18%|█▊        | 550/3000 [03:51<17:09,  2.38it/s, loss=0.32]

Step 550: loss = 0.3201


Training:  20%|██        | 600/3000 [04:12<16:49,  2.38it/s, loss=0.343]

Step 600: loss = 0.3433


Training:  22%|██▏       | 650/3000 [04:33<16:27,  2.38it/s, loss=0.367]

Step 650: loss = 0.3673


Training:  23%|██▎       | 700/3000 [04:54<16:07,  2.38it/s, loss=0.433]

Step 700: loss = 0.4326


Training:  25%|██▌       | 750/3000 [05:16<15:45,  2.38it/s, loss=0.3]

Step 750: loss = 0.3004


Training:  27%|██▋       | 800/3000 [05:37<15:23,  2.38it/s, loss=0.337]

Step 800: loss = 0.3373


Training:  28%|██▊       | 850/3000 [05:58<15:03,  2.38it/s, loss=0.28]

Step 850: loss = 0.2795


Training:  30%|███       | 900/3000 [06:19<14:41,  2.38it/s, loss=0.289]

Step 900: loss = 0.2892


Training:  32%|███▏      | 950/3000 [06:40<14:22,  2.38it/s, loss=0.318]

Step 950: loss = 0.3176


Training:  33%|███▎      | 1000/3000 [07:01<14:00,  2.38it/s, loss=0.347]

Step 1000: loss = 0.3465


Training:  35%|███▌      | 1050/3000 [07:22<13:39,  2.38it/s, loss=0.254]

Step 1050: loss = 0.2539


Training:  37%|███▋      | 1100/3000 [07:43<13:17,  2.38it/s, loss=0.388]

Step 1100: loss = 0.3876


Training:  38%|███▊      | 1150/3000 [08:04<12:56,  2.38it/s, loss=0.325]

Step 1150: loss = 0.3251


Training:  40%|████      | 1200/3000 [08:25<12:35,  2.38it/s, loss=0.284]

Step 1200: loss = 0.2838


Training:  42%|████▏     | 1250/3000 [08:46<12:13,  2.39it/s, loss=0.287]

Step 1250: loss = 0.2872


Training:  43%|████▎     | 1300/3000 [09:07<11:53,  2.38it/s, loss=0.269]

Step 1300: loss = 0.2691


Training:  45%|████▌     | 1350/3000 [09:28<11:33,  2.38it/s, loss=0.316]

Step 1350: loss = 0.3157


Training:  47%|████▋     | 1400/3000 [09:49<11:12,  2.38it/s, loss=0.359]

Step 1400: loss = 0.3589


Training:  48%|████▊     | 1450/3000 [10:10<10:51,  2.38it/s, loss=0.358]

Step 1450: loss = 0.3576


Training:  50%|█████     | 1500/3000 [10:31<10:29,  2.38it/s, loss=0.327]

Step 1500: loss = 0.3265


Training:  52%|█████▏    | 1550/3000 [10:52<10:09,  2.38it/s, loss=0.257]

Step 1550: loss = 0.2571


Training:  53%|█████▎    | 1600/3000 [11:13<09:48,  2.38it/s, loss=0.323]

Step 1600: loss = 0.3231


Training:  55%|█████▌    | 1650/3000 [11:34<09:26,  2.38it/s, loss=0.262]

Step 1650: loss = 0.2618


Training:  57%|█████▋    | 1700/3000 [11:55<09:04,  2.39it/s, loss=0.244]

Step 1700: loss = 0.2437


Training:  58%|█████▊    | 1750/3000 [12:16<08:44,  2.38it/s, loss=0.314]

Step 1750: loss = 0.3143


Training:  60%|██████    | 1800/3000 [12:36<08:22,  2.39it/s, loss=0.332]

Step 1800: loss = 0.3315


Training:  62%|██████▏   | 1850/3000 [12:57<08:02,  2.38it/s, loss=0.273]

Step 1850: loss = 0.2727


Training:  63%|██████▎   | 1900/3000 [13:18<07:40,  2.39it/s, loss=0.303]

Step 1900: loss = 0.3031


Training:  65%|██████▌   | 1950/3000 [13:39<07:20,  2.38it/s, loss=0.242]

Step 1950: loss = 0.2420


Training:  67%|██████▋   | 2000/3000 [14:00<06:59,  2.39it/s, loss=0.276]

Step 2000: loss = 0.2760


Training:  68%|██████▊   | 2050/3000 [14:21<06:38,  2.39it/s, loss=0.26]

Step 2050: loss = 0.2603


Training:  70%|███████   | 2100/3000 [14:42<06:17,  2.38it/s, loss=0.269]

Step 2100: loss = 0.2686


Training:  72%|███████▏  | 2150/3000 [15:03<05:56,  2.38it/s, loss=0.275]

Step 2150: loss = 0.2748


Training:  73%|███████▎  | 2200/3000 [15:24<05:35,  2.38it/s, loss=0.276]

Step 2200: loss = 0.2756


Training:  75%|███████▌  | 2250/3000 [15:45<05:14,  2.39it/s, loss=0.254]

Step 2250: loss = 0.2540


Training:  77%|███████▋  | 2300/3000 [16:06<04:53,  2.38it/s, loss=0.281]

Step 2300: loss = 0.2806


Training:  78%|███████▊  | 2350/3000 [16:27<04:32,  2.38it/s, loss=0.3]

Step 2350: loss = 0.2996


Training:  80%|████████  | 2400/3000 [16:48<04:11,  2.38it/s, loss=0.274]

Step 2400: loss = 0.2739


Training:  82%|████████▏ | 2450/3000 [17:09<03:50,  2.38it/s, loss=0.247]

Step 2450: loss = 0.2474


Training:  83%|████████▎ | 2500/3000 [17:30<03:29,  2.38it/s, loss=0.317]

Step 2500: loss = 0.3174


Training:  85%|████████▌ | 2550/3000 [17:51<03:08,  2.38it/s, loss=0.333]

Step 2550: loss = 0.3326


Training:  87%|████████▋ | 2600/3000 [18:12<02:47,  2.38it/s, loss=0.286]

Step 2600: loss = 0.2856


Training:  88%|████████▊ | 2650/3000 [18:33<02:26,  2.38it/s, loss=0.388]

Step 2650: loss = 0.3883


Training:  90%|█████████ | 2700/3000 [18:54<02:05,  2.39it/s, loss=0.246]

Step 2700: loss = 0.2462


Training:  92%|█████████▏| 2750/3000 [19:15<01:44,  2.38it/s, loss=0.275]

Step 2750: loss = 0.2751


Training:  93%|█████████▎| 2800/3000 [19:36<01:23,  2.38it/s, loss=0.314]

Step 2800: loss = 0.3142


Training:  95%|█████████▌| 2850/3000 [19:57<01:02,  2.39it/s, loss=0.258]

Step 2850: loss = 0.2576


Training:  97%|█████████▋| 2900/3000 [20:18<00:41,  2.38it/s, loss=0.253]

Step 2900: loss = 0.2530


Training:  98%|█████████▊| 2950/3000 [20:39<00:20,  2.38it/s, loss=0.311]

Step 2950: loss = 0.3115


Training: 100%|██████████| 3000/3000 [21:00<00:00,  2.38it/s, loss=0.257]


Step 3000: loss = 0.2569


0,1
loss,█▅▇▅▆▄▆▅▅▄▅▆▄▅▆▃▆▁▃▄▂▃▄▄▅▃▄▃▄▅▁▃▃▄▅▃▃▂▃▂
step,▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇▇██

0,1
loss,0.25688
step,2999.0


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.21281
avg_mse,0.00209


final evalaution (val) results:
Average Cross-Entropy Loss: 1.2128
Average MSE (Forecast):  0.0021
final evalaution (test) results:
Average Cross-Entropy Loss: 1.9080
Average MSE (Forecast):  0.0071


0,1
avg_loss,▁
avg_mse,▁

0,1
avg_loss,1.90797
avg_mse,0.00715
