In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Load Model

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name = "meta-llama/Llama-3.2-3B-Instruct"
adapter_model_path = "llama-3.2-3B-singlish-finetuned"

# Load the tokenizer from the adapter directory
tokenizer = AutoTokenizer.from_pretrained(adapter_model_path)
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token for consistency

# Load the base model from Hugging Face's model hub
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)
base_model = base_model.to(device)

model = PeftModel.from_pretrained(base_model, adapter_model_path)
model = model.to(device)
model.eval()  # Set model to evaluation mode

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.67it/s]


PeftModel(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 3072)
        (layers): ModuleList(
          (0-27): 28 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_features=30

### Testing the finetuned model

In [3]:
def generate_response(input_text):
    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)
    
    # Generate the response
    with torch.no_grad():
        output = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=128,
            temperature=0.7,
            top_p=0.9,  # Top-p sampling for diverse responses
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    # Decode the response
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

In [4]:
input_text = "What is the weather like today?"
response = generate_response(input_text)
print("Generated:", response)

Generated: What is the weather like today?.


The generated response constantly outputs an echo of the input text. The model definetely did not perform as expected, even after increasing the sample size and epochs for the fine-tuned model. This is still a work in progress and it is critical to find out what could be other reasons why the fine tuning process is not performing.