In [1]:
!pip install transformers
!pip install datasets
!pip install peft
!pip install -U "huggingface_hub[cli]"

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To updat

In [2]:
!huggingface-cli login --token $HF

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `transformer2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `transformer2`


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

# 1. Load the base model and tokenizer

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map=torch.device('cuda:0'))
tokenizer = AutoTokenizer.from_pretrained(model_name)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# 2. Configure LoRA with PEFT

In [6]:
lora_config = LoraConfig(
    r=8,                   # LoRA rank
    lora_alpha=32,         # Scaling factor for LoRA
    target_modules=['gate_proj', 'down_proj', 'up_proj', 'q_proj', 'v_proj', 'k_proj', 'o_proj'],  # Modify as needed based on the model architecture
    lora_dropout=0.1,      # Dropout rate for LoRA layers
    bias="none"
)
model = get_peft_model(model, lora_config)
print("LoRA configuration applied.")

LoRA configuration applied.


# 3. Load the VMLU dataset

In [7]:
dataset = load_dataset("vinhnq29/zalo_vmlu")
train_data = dataset["validation"]

# 4. Preprocess the dataset

In [10]:
from string import Template
#%% Define prompt template
# Define prompt template - keeping the Vietnamese prompt from your example
preamble = 'Chỉ đưa ra chữ cái đứng trước câu trả lời đúng (A, B, C, D hoặc E) của câu hỏi trắc nghiệm sau: '

# For Llama models, modify the template to use their chat format
chat_template = Template('''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant. Answer the user's question with a single letter: A, B, C, D, or E, corresponding to the correct answer to the multiple-choice question.

<|start_header_id|>user<|end_header_id|>

$preamble

$prompt

$a
$b
$c
$d
$e

Đáp án:$ans
''')

In [11]:
def preprocess_function(example):
    # Extract the question
    question = example["question"]
    choices = example["choices"]
    try:
        a = choices[0]
    except:
        a = ''
    try:
        b = choices[1]
    except:
        b = ''
    try:
        c = choices[2]
    except:
        c = ''
    try:
        d = choices[3]
    except:
        d = ''
    try:
        e = choices[4]
    except:
        e = ''

    answer = example['answer']
    
    prompt2 = chat_template.substitute(
        preamble=preamble, prompt=question, a=a, b=b, c=c, d=d, e=e, ans=answer)
    
    # Tokenize the prompt with truncation and padding
    tokenized = tokenizer(prompt2, truncation=True, max_length=512)

    # tokenized_answer = tokenizer(answer, truncation=True, max_length=512)
    
    # For causal language modeling, set labels equal to input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Apply the preprocessing function to the dataset
tokenized_dataset = train_data.map(preprocess_function)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/1047 [00:00<?, ? examples/s]

# 5. Set up training arguments

In [13]:
training_args = TrainingArguments(
    output_dir="./lora-finetuned-llama-vmlu",
    per_device_train_batch_size=1,          # Adjust based on your VRAM (A40 should handle larger sizes)
    gradient_accumulation_steps=8,            # Simulate a larger batch size
    num_train_epochs=5,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=500,
    fp16=True,                              # Enable mixed precision training
    eval_strategy="no",               # Change if you plan to evaluate during training
    remove_unused_columns=False
)

# Data collator for seq2seq training (adapt if needed)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 6. Initialize the Trainer

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


# 7. Start training

In [15]:
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
10,1.7439
20,0.9358
30,0.8387
40,0.7618
50,0.8059
60,0.811
70,0.7414
80,0.7358
90,0.728
100,0.7487


TrainOutput(global_step=650, training_loss=0.3867996177306542, metrics={'train_runtime': 603.1091, 'train_samples_per_second': 8.68, 'train_steps_per_second': 1.078, 'total_flos': 3.738400127800934e+16, 'train_loss': 0.3867996177306542, 'epoch': 4.962750716332378})

# 8. Save the fine-tuned model along with the LoRA adapters

In [16]:

model.save_pretrained("./lora-finetuned-llama-vmlu")
tokenizer.save_pretrained("./lora-finetuned-llama-vmlu")
print("Model fine-tuning complete and saved.")

Model fine-tuning complete and saved.


# 9. Upload the model to your Hugging Face Hub repository

In [17]:
# Replace "your-username/your-model-repo" with your HF username and desired repo name.
my_repo = 'JosephTn/meta-llama-3-8b-vlmu-instruct'
model.push_to_hub(my_repo, use_auth_token=True)
tokenizer.push_to_hub(my_repo, use_auth_token=True)
print("Model and tokenizer uploaded to Hugging Face Hub.")



adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Model and tokenizer uploaded to Hugging Face Hub.


In [18]:
prompt = (
    "Which factor will most likely cause a person to develop a fever?\n"
    "A. a leg muscle relaxing after exercise\n"
    "B. a bacterial population in the bloodstream\n"
    "C. several viral particles on the skin\n"
    "D. carbohydrates being digested in the stomach\n\n"
    "Answer:"
)

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate a response
outputs = model.generate(inputs.input_ids, max_length=256, do_sample=True, top_p=0.95, top_k=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Response:")
print(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Response:
Which factor will most likely cause a person to develop a fever?
A. a leg muscle relaxing after exercise
B. a bacterial population in the bloodstream
C. several viral particles on the skin
D. carbohydrates being digested in the stomach

Answer: B. a bacterial population in the bloodstream

Explanation: A fever is a response to the presence of a foreign substance, such as a bacterium, in the body. When a bacterium enters the body, it is engulfed by a phagocytic cell, which then breaks down the bacterium and presents its components to the immune system. This triggers an immune response, which includes the production of cytokines that cause the body's temperature to rise. Therefore, the presence of a bacterial population in the bloodstream (option B) is the most likely cause of a fever.

The other options are not correct because:

* A leg muscle relaxing after exercise (option A) would not cause a fever.
* Several viral particles on the skin (option C) would not cause 

In [20]:
test_data = dataset["test"]

In [27]:
example = test_data[100]

In [28]:
example

{'id': '28-0123',
 'question': 'Năm 1914, Công ty Henry Pho (thương hiệu về đồ may mặc) trả cho công nhân 5$/ngày, trong khi mức lương phổ biến trên thị trường đương thời là 2 – 3$/ngày. Công ty này đã:',
 'choices': ['A. Chịu sự áp chế của luật tiền lương tối thiểu',
  'B. Công đoàn và thương lượng tập thể trong công ty đã quyết định mức lương',
  'C. Công ty áp dụng lý thuyết tiền lương hiệu quả'],
 'answer': None}

In [None]:
from tqdm import tqdm
answers = []
ids = []
for example in tqdm(test_data):
# if True:
    # Extract the question
    ids.append(example["id"])
    question = example["question"]
    choices = example["choices"]
    try:
        a = choices[0]
    except:
        a = ''
    try:
        b = choices[1]
    except:
        b = ''
    try:
        c = choices[2]
    except:
        c = ''
    try:
        d = choices[3]
    except:
        d = ''
    try:
        e = choices[4]
    except:
        e = ''

    ans = ''
    
    prompt2 = chat_template.substitute(
        preamble=preamble, prompt=question, a=a, b=b, c=c, d=d, e=e, ans=ans)

    # print(prompt2)
    # Tokenize the prompt
    inputs = tokenizer(prompt2, return_tensors="pt").to(model.device)
    
    # Generate a response
    outputs = model.generate(
        **inputs, 
        max_new_tokens=5,  # We only need a few tokens for the answer letter
        temperature=0.0,   # More deterministic
        do_sample=False,    # Greedy decoding
        pad_token_id=tokenizer.eos_token_id
    )

    answer_decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Extract just the letter answer (A, B, C, D, or E)
    response = answer_decoded[0]
    # print(response)
    last_part = response.split("Đáp án:")[-1].strip()
    
    # Try to extract the single letter answer
    import re
    letter_match = re.search(r'[A-E]', last_part)
    
    if letter_match:
        answer = letter_match.group(0)
    else:
        # Fallback if no clear letter is found
        answer = last_part[:1]  # Take first character as best guess
    # print(answer)
    answers.append(answer)

 97%|█████████▋| 9573/9833 [27:47<00:45,  5.75it/s]

In [None]:
df = pd.DataFrame({ "id": ids, "answer": answers})
df[['id','answer']].to_csv(f"./logs/meta-llama-Meta-Llama-3-8B-Instruct-vmlu.csv", index=False)