Reference: https://deci.ai/blog/fine-tune-llama-2-with-lora-for-question-answering/


In [None]:
!pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Dataset
#data_name = "mlabonne/guanaco-llama2-1k"
#training_data = load_dataset(data_name, split="train")

In [None]:
import json
# Convert the raw prompts in train.json to the Llama-2 chat layout
# ("<s>[INST] question [/INST] answer </s>") and save the result for `load_dataset`.
# Files are opened as UTF-8 explicitly: the output is written with
# ensure_ascii=False, so non-ASCII characters must not depend on the
# platform's default encoding.
with open('/content/train.json', encoding='utf-8') as f_in:
  train = json.load(f_in)

for record in train:
  record['text'] = (
      record['text']
      .replace('Given the question delimited by triple backticks ```{', '<s>[INST] ')
      .replace('}```, what is the answer? Answer: {', ' [/INST] ')
      # NOTE: this rewrites every remaining '}', not only the closing one,
      # so a brace inside an answer would also become ' </s>'.
      .replace('}', ' </s>')
  )

with open('/content/train_llama.json', 'w', encoding='utf-8') as outfile:
    json.dump(train, outfile, ensure_ascii=False)

In [None]:
# Model and tokenizer names
# `base_model_name` is the checkpoint identifier passed to `from_pretrained`
# for both the tokenizer and the causal-LM model.
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
# `refined_model` is the path the trained adapter is saved to and later loaded from.
refined_model = "llama-2-7b-mlabonne-enhanced" #You can give it your own name

In [None]:
# Tokenizer
# Loaded from the same checkpoint name as the base model.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint — confirm it is actually required for this model.
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16


In [None]:
# Load the converted JSON file as the "train" split; the trainer reads each
# record's "text" field (see dataset_text_field="text" in the SFTTrainer call).
training_data = load_dataset("json", data_files="/content/train_llama.json",split="train")

In [None]:
# Display the first record to sanity-check the converted prompt format.
training_data[0]

In [None]:
# Quantization Config
# Load the weights in 4-bit with the "nf4" quantization type, use float16 as
# the compute dtype, and leave double quantization switched off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

In [None]:
# Model
# Load the base checkpoint using the 4-bit quantization config `quant_config`.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}  # presumably places the whole model on device 0 — TODO confirm
)
# Turn off the generation cache on the model config before training.
base_model.config.use_cache = False
# NOTE(review): pretraining_tp is forced to 1 here — confirm the intended effect
# against the model's configuration documentation.
base_model.config.pretraining_tp = 1

In [None]:
# LoRA Config
# Adapter hyper-parameters handed to the trainer through `peft_config`.
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
# Checkpoints and logs are emitted every 25 steps; max_steps=-1 leaves the run
# length to num_train_epochs. Both fp16 and bf16 mixed precision are disabled.
# NOTE(review): warmup_ratio is set together with lr_scheduler_type="constant" —
# confirm whether this scheduler applies any warmup at all.
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# Trainer
# Supervised fine-tuning on the "text" field of `training_data`, using the
# quantized base model, the LoRA config and the tokenizer defined earlier.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params
)

# Training
fine_tuning.train()

# Save Model
# Writes the trainer's model to the `refined_model` path.
fine_tuning.model.save_pretrained(refined_model)



Map:   0%|          | 0/76 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,2.0514
50,1.1587
75,0.8568
100,0.5841
125,0.3701
150,0.2463
175,0.1841




In [None]:
from peft import PeftConfig, PeftModel

# Load the adapter saved under `refined_model` on top of `base_model`.
# NOTE(review): `model` is not referenced by the later cells visible in this
# notebook (generation passes `base_model` to the pipeline) — confirm which
# object is meant to be used for inference.
model = PeftModel.from_pretrained(base_model, refined_model)


In [None]:
import json
# Convert the raw prompts in test.json to the Llama-2 chat layout
# ("<s>[INST] question [/INST] answer </s>") and save the result.
# Files are opened as UTF-8 explicitly: the output is written with
# ensure_ascii=False, so non-ASCII characters must not depend on the
# platform's default encoding.
with open('/content/test.json', encoding='utf-8') as f_in:
  test = json.load(f_in)

for record in test:
  record['text'] = (
      record['text']
      .replace('Given the question delimited by triple backticks ```{', '<s>[INST] ')
      .replace('}```, what is the answer? Answer: {', ' [/INST] ')
      # NOTE: this rewrites every remaining '}', not only the closing one,
      # so a brace inside an answer would also become ' </s>'.
      .replace('}', ' </s>')
  )

with open('/content/test_llama.json', 'w', encoding='utf-8') as outfile:
    json.dump(test, outfile, ensure_ascii=False)

In [None]:
# Recover each question from the converted record: skip the leading
# '<s>[INST] ' prefix and keep everything up to and including the first '?'.
question = [
    text[len('<s>[INST] '):text.index('?') + 1]
    for text in (sample['text'] for sample in test)
]

In [None]:
# Generate Text
# Build the pipeline once: it does not depend on the loop variable, so
# re-creating it for every sample only wastes time.
text_gen = pipeline(task="text-generation", model=base_model, tokenizer=llama_tokenizer, max_length=200)

pred_llama = []
# Prompt with the question text only. Each element of `test` is a dict whose
# 'text' holds question AND reference answer, so interpolating the record
# itself would put the dict repr (including the answer) into the prompt.
for i, question_text in enumerate(question):
  print(i)
  output = text_gen(f"<s>[INST] {question_text} [/INST]")
  pred_llama.append(output[0]['generated_text'])

In [None]:
# Keep only what follows the prompt: locate the first '? [/INST]' marker and
# skip it plus the single character after it.
pred_llama = [
    generated[generated.index('? [/INST]') + len('? [/INST]') + 1:]
    for generated in pred_llama
]

In [None]:
# Persist the llama predictions. UTF-8 is requested explicitly because
# ensure_ascii=False writes non-ASCII characters verbatim, which would
# otherwise depend on the platform's default encoding.
with open('/content/pred_llama.json', 'w', encoding='utf-8') as outfile:
    json.dump(pred_llama, outfile, ensure_ascii=False)

In [None]:
# Reload the converted test records (list of {'text': ...} dicts).
# JSON text is read as UTF-8 explicitly instead of relying on the platform default.
with open('/content/test_llama.json', encoding='utf-8') as f_in:
  test = json.load(f_in)

In [None]:
# Reduce each record to its reference answer: the text after the first
# '? [/INST]' marker (plus one character), without the trailing ' </s>'.
test = [
    text[text.index('? [/INST]') + len('? [/INST]') + 1:-len(' </s>')]
    for text in (sample['text'] for sample in test)
]

In [None]:
import nltk
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
nltk.download('punkt')
def bleu_score(ref,pred):
  """Return the smoothed sentence-level BLEU score of `pred` against `ref`.

  Both strings are lower-cased and tokenized with `nltk.word_tokenize`.

  Args:
    ref: Reference answer text.
    pred: Generated answer text to evaluate.

  Returns:
    The value returned by `sentence_bleu` using `SmoothingFunction().method1`.
  """
  pred_tokens = nltk.word_tokenize(pred.lower())
  ref_tokens = nltk.word_tokenize(ref.lower())

  # `sentence_bleu` takes a LIST of references, each one a list of tokens.
  # Passing the bare token list would make every word its own "reference"
  # (compared character by character) and collapse the score towards zero.
  return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)

# BLEU of every llama prediction against the reference at the same position,
# followed by the mean over the test set.
score = [bleu_score(reference, pred_llama[idx]) for idx, reference in enumerate(test)]

print(np.mean(score))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


0.001766395086111293


In [None]:
import json
# Load the second set of predictions to compare against the references.
# JSON text is read as UTF-8 explicitly instead of relying on the platform default.
with open('/content/pred2.json', encoding='utf-8') as f_in:
  pred = json.load(f_in)

In [None]:
# BLEU of every entry of `pred` against the reference at the same position,
# followed by the mean over the test set.
score = [bleu_score(reference, pred[idx]) for idx, reference in enumerate(test)]

print(np.mean(score))

0.00272857343512289


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')

def cosine_distance(reference_answer,generated_answer ):
  """Return the cosine similarity between two answers' bag-of-words vectors.

  Despite its name, the function returns a similarity, not a distance.
  Each answer is lower-cased, tokenized with `nltk.word_tokenize`, stripped
  of English stop words, and counted with `CountVectorizer`.

  Args:
    reference_answer: Reference answer text.
    generated_answer: Generated answer text to compare.

  Returns:
    The similarity of the two count vectors, or 0.0 when neither text
    contributes any countable token.
  """
  stop_words = set(stopwords.words('english'))

  def _content_text(answer):
    # Lower-case, tokenize and drop stop words, then re-join for the vectorizer.
    tokens = nltk.word_tokenize(answer.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

  reference_text = _content_text(reference_answer)
  generated_text = _content_text(generated_answer)

  try:
    counts = CountVectorizer().fit_transform([reference_text, generated_text])
  except ValueError:
    # Raised when the vocabulary is empty (both texts reduced to nothing);
    # treat that as "no overlap" instead of aborting the evaluation loop.
    return 0.0

  vectors = counts.toarray()
  cosine_sim = cosine_similarity([vectors[0]], [vectors[1]])

  return cosine_sim[0][0]

# Similarity of every entry of `pred` to the reference at the same position,
# followed by the mean over the test set.
cosine_results = [cosine_distance(reference, pred[idx]) for idx, reference in enumerate(test)]

print(np.mean(cosine_results))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.4711368925945207


In [None]:
# Similarity of every llama prediction to the reference at the same position,
# followed by the mean over the test set.
cosine_results = [cosine_distance(reference, pred_llama[idx]) for idx, reference in enumerate(test)]

print(np.mean(cosine_results))

0.4366043670856687


In [None]:
def print_responses(question_num,question,test,pred):
    """Print one question together with the two stored answers for it.

    Args:
        question_num: 1-based position of the question to show.
        question: Sequence of question strings.
        test: Sequence of answers printed under the "Model's Response" label.
        pred: Sequence of answers printed under the "ChatGPT's Response" label.

    Raises:
        IndexError: If `question_num` is not between 1 and `len(question)`.
    """
    # Imported here so the function does not depend on an import made
    # elsewhere in the notebook.
    import textwrap

    # Reject 0 and negatives explicitly: `question_num - 1` would otherwise
    # silently index from the end of the lists.
    if not 1 <= question_num <= len(question):
        raise IndexError(
            f"question_num must be between 1 and {len(question)}, got {question_num}"
        )

    # Retrieve the responses corresponding to the given question number
    index = question_num - 1
    question_text = question[index]
    model_response = test[index]
    chatgpt_response = pred[index]

    screen_width = 120
    print(textwrap.fill('Question:'+ question_text, width=screen_width)+ '\n')
    print(textwrap.fill("Model's Response:"+ model_response , width=screen_width)+ '\n')
    print(textwrap.fill("ChatGPT's Response:"+ chatgpt_response, width=screen_width))


In [None]:
# Example
# Ask for a question number (1-based, since print_responses subtracts 1 before
# indexing) and print that question together with both stored answers.
question_number = int(input("Enter the question number: "))
print_responses(question_number,question,test,pred)

Enter the question number: 33
Question:Can you explain how speech recognition systems are trained?

Model's Response:Training a speech recognition system involves teaching the system to recognize patterns in speech data
and associate them with corresponding textual transcriptions. The training process for speech recognition systems
include: Data Collection, Data Preprocessing, Annotation, Model Selection, Model Training, Evaluation, Optimization and
Tuning.

ChatGPT's Response:Speech recognition systems, also known as speech-to-text or voice recognition systems, are trained
using a large dataset of speech recordings with corresponding transcriptions or translations. They learn to recognize
patterns in speech sounds, syntax, and grammar to convert spoken language into text. The training process typically
involves the following steps: Data Collection, Data Preprocessing, Model Selection, Training, Evaluation, Fine-Tuning
and Deployment. Data collection involves gathering a vast amount of