<a href="https://colab.research.google.com/github/vektor8891/llm/blob/main/projects/20_instruction_tuning/20_instruction_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install --user -qq datasets==2.20.0
# !pip install --user -qq evaluate==0.4.2
# !pip install --user -qq sacrebleu==2.4.2

### Define the device

In [2]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dataset description

In [3]:
!wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/WzOT_CwDALWedTtXjwH7bA/CodeAlpaca-20k.json

--2025-06-02 21:02:45--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/WzOT_CwDALWedTtXjwH7bA/CodeAlpaca-20k.json
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.45.118.108
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.45.118.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6957007 (6.6M) [application/json]
Saving to: ‘CodeAlpaca-20k.json.4’


2025-06-02 21:02:47 (9.49 MB/s) - ‘CodeAlpaca-20k.json.4’ saved [6957007/6957007]



In [4]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="CodeAlpaca-20k.json", split="train")
dataset

Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 20022
})

In [5]:
dataset[1000]

{'output': 's = "Hello world" \ns = s[::-1] \nprint(s)',
 'instruction': 'Reverse the string given in the input',
 'input': 'Hello world'}

In [6]:
dataset = dataset.filter(lambda example: example["input"] == '')

In [7]:
dataset = dataset.shuffle(seed=42)

In [8]:
dataset

Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 9764
})

In [9]:
# Select only the first 100 rows before splitting
dataset_first_100 = dataset.select(range(100))

dataset_split = dataset_first_100.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_split['train']
test_dataset = dataset_split['test']
dataset_first_100

Dataset({
    features: ['output', 'instruction', 'input'],
    num_rows: 100
})

In [10]:
# Select a small set of data for the resource limitation
# This dataset will be only used for evaluation parts, not for the training
tiny_test_dataset=test_dataset.select(range(10))
tiny_train_dataset=train_dataset.select(range(10))

# Model and tokenizer

In [11]:
from transformers import AutoModelForCausalLM

# Base model
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to(device)

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", padding_side='left')

In [13]:
tokenizer.eos_token

'</s>'

# Preprocessing the data

In [14]:
def formatting_prompts_func(mydataset):
    output_texts = []
    for i in range(len(mydataset['instruction'])):
        text = (
            f"### Instruction:\n{mydataset['instruction'][i]}"
            f"\n\n### Response:\n{mydataset['output'][i]}</s>"
        )
        output_texts.append(text)
    return output_texts

def formatting_prompts_func_no_response(mydataset):
    output_texts = []
    for i in range(len(mydataset['instruction'])):
        text = (
            f"### Instruction:\n{mydataset['instruction'][i]}"
            f"\n\n### Response:\n"
        )
        output_texts.append(text)
    return output_texts

In [15]:
from tqdm import tqdm

expected_outputs = []
instructions_with_responses = formatting_prompts_func(test_dataset)
instructions = formatting_prompts_func_no_response(test_dataset)
for i in tqdm(range(len(instructions_with_responses))):
    tokenized_instruction_with_response = tokenizer(instructions_with_responses[i], return_tensors="pt", max_length=1024, truncation=True, padding=False)
    tokenized_instruction = tokenizer(instructions[i], return_tensors="pt")
    expected_output = tokenizer.decode(tokenized_instruction_with_response['input_ids'][0][len(tokenized_instruction['input_ids'][0])-1:], skip_special_tokens=True)
    expected_outputs.append(expected_output)

100%|██████████| 20/20 [00:00<00:00, 605.81it/s]


In [16]:
print('############## instructions ##############\n' + instructions[0])
print('############## instructions_with_responses ##############\n' + instructions_with_responses[0])
print('\n############## expected_outputs ##############' + expected_outputs[0])

############## instructions ##############
### Instruction:
Design a regular expression to match strings that begin with the letter "a".

### Response:

############## instructions_with_responses ##############
### Instruction:
Design a regular expression to match strings that begin with the letter "a".

### Response:
^a.*</s>

############## expected_outputs ##############
^a.*


In [17]:
from torch.utils.data import Dataset

class ListDataset(Dataset):
    def __init__(self, original_list):
        self.original_list = original_list

    def __len__(self):
        return len(self.original_list)

    def __getitem__(self, i):
        return self.original_list[i]

instructions_torch = ListDataset(instructions)

In [18]:
instructions_torch[0]

'### Instruction:\nDesign a regular expression to match strings that begin with the letter "a".\n\n### Response:\n'

# Test the base model

In [19]:
from transformers import pipeline

gen_pipeline = pipeline("text-generation",
                        model=model,
                        tokenizer=tokenizer,
                        device=device,
                        batch_size=2,
                        max_length=50,
                        truncation=True,
                        padding=False,
                        return_full_text=False)

Device set to use cpu


In [20]:
tokenizer.padding_side = 'left'

with torch.no_grad():
    # Due to resource limitation, only apply the function on 3 records using "instructions_torch[:10]"
    pipeline_iterator= gen_pipeline(instructions_torch[:3],
                                    max_length=50, # this is set to 50 due to resource constraint, using a GPU, you can increase it to the length of your choice
                                    num_beams=5,
                                    early_stopping=True,)

generated_outputs_base = []
for text in pipeline_iterator:
    generated_outputs_base.append(text[0]["generated_text"])

In [21]:
from urllib.request import urlopen
import pickle
import io

# load generated responses from CUDA-enabled GPU machine
urlopened = urlopen('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/VvQRrSqS1P0_GobqtL-SKA/instruction-tuning-generated-outputs-base.pkl')
generated_outputs_base = pickle.load(io.BytesIO(urlopened.read()))

In [22]:
for i in range(3):
    print('@@@@@@@@@@@@@@@@@@@@')
    print('@@@@@ Instruction '+ str(i+1) +': ')
    print(instructions[i])
    print('\n\n')
    print('@@@@@ Expected response '+ str(i+1) +': ')
    print(expected_outputs[i])
    print('\n\n')
    print('@@@@@ Generated response '+ str(i+1) +': ')
    print(generated_outputs_base[i])
    print('\n\n')
    print('@@@@@@@@@@@@@@@@@@@@')

@@@@@@@@@@@@@@@@@@@@
@@@@@ Instruction 1: 
### Instruction:
Design a regular expression to match strings that begin with the letter "a".

### Response:




@@@@@ Expected response 1: 

^a.*



@@@@@ Generated response 1: 
Thank you for your question.

### Response:
Thank you for your question.

### Response:
Thank you for your question.

### Response:
Thank you for your question.

### Response:
Thank you for your question.

### Response:
Thank you for your question.



@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@
@@@@@ Instruction 2: 
### Instruction:
Suggest a solution for validating a password which should contain at least 1 uppercase character, 1 lowercase character and 1 digit.

### Response:




@@@@@ Expected response 2: 

We can use a regular expression to validate a password. A regular expression to validate a password with at least 1 uppercase character, 1 lowercase character and 1 digit should be:

^(?=.*[a-z])(?=.*[A-Z])(?=.*\d).+$

If the password matches the regular expressio

Issues of base model responses:

1. responses not up to mark
1. tendency to eextend and repeat the answers

-> Instruction-tuning can fix both issues!

## BLEU score

In [31]:
import sacrebleu

sacrebleu = evaluate.load("sacrebleu")
results_base = sacrebleu.compute(predictions=generated_outputs_base,
                                 references=expected_outputs)

print(list(results_base.keys()))
print(round(results_base["score"], 1))

FileNotFoundError: Couldn't find a module script at /content/sacrebleu/sacrebleu.py. Module 'sacrebleu' doesn't exist on the Hugging Face Hub either.

In [29]:
pip show sacrebleu

Name: sacrebleu
Version: 2.4.2
Summary: Hassle-free computation of shareable, comparable, and reproducible BLEU, chrF, and TER scores
Home-page: https://github.com/mjpost/sacrebleu
Author: Matt Post
Author-email: post@cs.jhu.edu
License: Apache License 2.0
Location: /root/.local/lib/python3.11/site-packages
Requires: colorama, lxml, numpy, portalocker, regex, tabulate
Required-by: 


## Perform instruction fine-tuning with LoRA

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

lora_config = LoraConfig(
    r=16,  # Low-rank dimension
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Modules to apply LoRA
    lora_dropout=0.1,  # Dropout rate
    task_type=TaskType.CAUSAL_LM  # Task type should be causal language model
)

model = get_peft_model(model, lora_config)

In [24]:
!rm -rf ~/.cache/huggingface/evaluate