Reference: https://github.com/dvianna/LegalQA-bloomz-560m/blob/main/test_finetuned_model.ipynb

In [None]:
import pandas as pd
import json

In [None]:
# Remove every pandas display limit so long question/answer strings are shown untruncated.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Load the question/answer sheet; the preview below shows "question" and "answer" columns
# plus an unnamed index column carried over from the CSV.
dataset = pd.read_csv("/content/VA-Questions-Answers - Sheet2.csv")

In [None]:
# DataFrame.dropna() returns a new frame instead of modifying `dataset` in place,
# so the result must be assigned back for incomplete rows to actually be removed.
dataset = dataset.dropna()
# Show the remaining (rows, columns) as a quick sanity check.
dataset.shape

In [None]:
# Preview the first rows of the question/answer table.
dataset.head()

Unnamed: 0,question,answer
0,What is Artificial Intelligence?,"Artificial Intelligence (AI) refers to the development of computer systems capable of performing tasks that typically require human intelligence. These tasks include understanding natural language, recognizing patterns, learning from experience, reasoning, problem-solving, and adapting to new situations."
1,What is Machine Learning?,"Machine Learning (ML) is a subset of artificial intelligence (AI) that focuses on the development of algorithms and models that enable computers to learn from and make predictions or decisions based on data, without being explicitly programmed to perform specific tasks."
2,What is Deep Learning?,"Deep learning is a subset of machine learning that focuses on the development and training of artificial neural networks with many layers (hence the term ""deep""). These neural networks are composed of interconnected layers of nodes (neurons) that process and transform input data to generate output predictions or decisions."
3,What are the types of Machine Learning?,"Machine learning can be broadly categorized into three main types, based on the nature of the learning process and the availability of labeled data: Supervised Learning, Unsupervised Learning and Reinforcement Learning."
4,What is supervised Machine Learning?,"Supervised machine learning is a type of machine learning where the algorithm learns from labeled data, meaning that each training example consists of both input data and the corresponding correct output (or target). The goal of supervised learning is to learn a mapping from input features to output labels based on the provided examples."


In [None]:
def buildprompts(data):
    """Turn one question/answer record into a fine-tuning prompt.

    Reads ``data['question']`` and ``data['answer']``, converts both to ``str``
    and returns a dict with a single ``'text'`` key holding the instruction,
    the question wrapped in triple backticks and braces, and the answer
    wrapped in braces after ``Answer:``.
    """
    pieces = [
        "Given the question delimited by triple backticks ```{",
        str(data['question']),
        "}```, what is the answer? Answer: {",
        str(data['answer']),
        "}",
    ]
    return {'text': "".join(pieces)}

In [None]:
# Build one prompt dict ({'text': ...}) per row and keep it in a new "prompt" column.
dataset['prompt'] = dataset.apply(buildprompts, axis=1)

In [None]:
# Plain Python list of the prompt dicts, ready to be split and dumped as JSON.
result = dataset['prompt'].to_list()

In [None]:
from sklearn.model_selection import train_test_split
# train_test_split needs the data to split as a positional argument; without it the call
# raises ValueError. `result` is the list of prompt dicts built from the dataset.
# The fixed random_state keeps the 67/33 split reproducible.
train, test = train_test_split(result, test_size=0.33, random_state=42)

In [None]:
# Persist both splits as JSON arrays of {"text": ...} records.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding is pinned
# to UTF-8 instead of depending on the platform default.
for split_path, split_records in (('/content/train.json', train), ('/content/test.json', test)):
    with open(split_path, 'w', encoding='utf-8') as outfile:
        json.dump(split_records, outfile, ensure_ascii=False)

In [None]:
!pip install accelerate -U
!pip install transformers==4.30
!pip install datasets

In [None]:
import pandas as pd
import torch
import json
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from accelerate import Accelerator

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the pretrained BLOOMZ-560m tokenizer and causal-LM weights.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
# Place the model on the previously selected `device` rather than a hard-coded "cuda",
# which would raise on machines without a GPU and left `device` unused.
model1 = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m").to(device)

In [None]:
# Load the training prompts written to /content/train.json; the records are accessed
# below through the 'train' key of the returned object.
dataset_train = load_dataset("json", data_files="/content/train.json")

In [None]:
# Display the loaded dataset object for inspection.
dataset_train

In [None]:
def prepare_train_data(data):
    """Tokenize a batch of prompts and attach causal-LM labels.

    Args:
        data: a batch from ``Dataset.map(..., batched=True)``; ``data['text']``
            holds the prompt strings (question plus answer in one text).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an extra
        ``labels`` entry: a copy of ``input_ids`` in which padding positions
        are replaced by -100.
    """
    text_input = data['text']
    # Pad every sequence in the batch to the same length so they stack into one tensor.
    tokenized_input = tokenizer(text_input, return_tensors='pt', padding=True)
    # Generative models: labels are the same as the inputs. Work on a copy so that
    # editing the labels never alters input_ids.
    labels = tokenized_input['input_ids'].clone()
    # -100 is the index the loss ignores; without this the model is also trained
    # to predict the padding tokens added above.
    labels[tokenized_input['attention_mask'] == 0] = -100
    tokenized_input['labels'] = labels
    return tokenized_input

In [None]:
# Tokenize the training split in batches and drop the raw "text" column, leaving only
# the fields returned by prepare_train_data.
train_dataset = dataset_train['train'].map(prepare_train_data, batched=True, remove_columns=["text"])

In [None]:
# Display the tokenized training dataset for inspection.
train_dataset

In [None]:
import pandas as pd
import numpy as np
import torch
import json
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer
from datasets import load_dataset
from accelerate import Accelerator

In [None]:
# setting arguments to be used during training
training_arguments = TrainingArguments(
    'LegalQA-bloom-560m',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    optim="adafactor",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True
)

In [None]:
# Wire the base model, the training arguments and the tokenized training set together.
# No evaluation dataset is passed, so only the training loss is available from this Trainer.
trainer = Trainer(
    model = model1,
    args = training_arguments,
    train_dataset = train_dataset
)

In [None]:
# Run fine-tuning; the returned TrainOutput (shown below) reports the step count and training loss.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


TrainOutput(global_step=90, training_loss=6.580828179253472, metrics={'train_runtime': 118.7965, 'train_samples_per_second': 6.397, 'train_steps_per_second': 0.758, 'total_flos': 222019013836800.0, 'train_loss': 6.580828179253472, 'epoch': 9.47})

In [None]:
# Persist the fine-tuned model. No path is passed, so the Trainer chooses the location
# (presumably the 'LegalQA-bloom-560m' output directory it is loaded from later; confirm).
trainer.save_model()

In [None]:
import torch
from transformers import pipeline
from transformers import BloomTokenizerFast, BloomForCausalLM, TrainingArguments, Trainer

In [None]:
# Reload the original tokenizer and the fine-tuned weights from "LegalQA-bloom-560m",
# placing the model on the CPU for inference.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
model = BloomForCausalLM.from_pretrained("LegalQA-bloom-560m",low_cpu_mem_usage=False).to("cpu")
# Disabled single-prompt smoke test, kept for reference:
#prompt = 'Given the question delimited by triple backticks ```{What is Machine Learning?}```, what is the answer? Answer:'
#generator = pipeline('text-generation', model=model, tokenizer=tokenizer,do_sample=False)
#result = generator(prompt, max_length=128)
#print(result)

In [None]:
import json
# Read the held-out prompts back as a list of {"text": ...} dicts.
# The file was written with ensure_ascii=False, so it may contain raw non-ASCII
# characters; decode it explicitly as UTF-8 rather than with the platform default.
with open('/content/test.json', encoding='utf-8') as f_in:
  dataset_test= json.load(f_in)

In [None]:
# Optionally reuse predictions saved by an earlier session.
# pred.json is only written further down in this notebook, so on a fresh run it does
# not exist yet; fall back to an empty list instead of failing with FileNotFoundError.
import os

pred_test = []
if os.path.exists('/content/pred.json'):
  with open('/content/pred.json', encoding='utf-8') as f_in:
    pred_test= json.load(f_in)

In [None]:
# Generate an answer with the fine-tuned model for every test prompt.
# The pipeline does not depend on the loop variable, so build it once instead of per item.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, do_sample=False)
pred_test = []
for i, x in enumerate(dataset_test, start=1):
  print(i)  # progress counter
  # Keep the prompt up to and including 'Answer:' (7 characters) and cut off the gold answer.
  prompt = x['text'][:x['text'].index('Answer:')+7]
  result = generator(prompt, max_length=128)
  # The pipeline returns a list of dicts; store the generated string itself so the
  # later 'Answer:' slicing operates on text rather than on a list.
  pred_test.append(result[0]['generated_text'])

In [None]:
# Gold answers: the prompt template ends with "Answer: {<answer>}", so skip 'Answer:'
# (7 chars) plus the following space and '{' (+9) and drop the closing '}' (-1).
test = [x['text'][x['text'].index('Answer:')+9:-1] for x in dataset_test]
# Apply the same slicing to the predictions.
# NOTE(review): this assumes every element of pred_test is a string in the same
# "Answer: {...}" shape; if the model omits the braces, real characters are cut off. Confirm.
pred_test = [x[x.index('Answer:')+9:-1] for x in pred_test]

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
nltk.download('punkt')
def bleu_score(ref,pred):
  """Return the smoothed sentence-level BLEU score of ``pred`` against ``ref``.

  Args:
    ref: reference (gold) answer as a string.
    pred: generated answer as a string.

  Returns:
    BLEU score as a float; both texts are lower-cased and word-tokenized first.
  """
  pred_tokens = nltk.word_tokenize(pred.lower())
  ref_tokens = nltk.word_tokenize(ref.lower())

  # sentence_bleu takes a LIST of references, each one a list of tokens. Passing the
  # flat token list makes every token a separate "reference" made of characters,
  # which yields near-zero scores, so wrap the single reference in a list.
  return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=SmoothingFunction().method1)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# BLEU of each fine-tuned prediction against the gold answer at the same position.
score = [bleu_score(test[idx], pred_test[idx]) for idx in range(len(test))]

In [None]:
# Average BLEU of the fine-tuned model over the test set.
np.mean(score)

0.0047915106707577156

In [None]:
# Cache the extracted predictions as a JSON array.
# ensure_ascii=False writes non-ASCII characters verbatim, so pin the file encoding
# to UTF-8 instead of relying on the platform default.
with open('/content/pred.json', 'w', encoding='utf-8') as outfile:
    json.dump(pred_test, outfile, ensure_ascii=False)

In [None]:
# Baseline: load the original, un-fine-tuned bloomz-560m on the CPU.
# This rebinds `tokenizer` and `model`, so cells below now run against the baseline.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloomz-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloomz-560m",low_cpu_mem_usage=False).to("cpu")

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
# Display the held-out prompt records.
dataset_test

In [None]:
# Generate an answer with the baseline model for every test prompt.
# The pipeline does not depend on the loop variable, so build it once instead of per item.
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, do_sample=False)
pred_test_bloom = []
for i, x in enumerate(dataset_test, start=1):
  print(i)  # progress counter
  # Keep the prompt up to and including 'Answer:' (7 characters) and cut off the gold answer.
  prompt = x['text'][:x['text'].index('Answer:')+7]
  result = generator(prompt, max_length=128)
  # Raw pipeline output is stored as-is; the generated text is extracted in a later cell.
  pred_test_bloom.append(result)

In [None]:
# Each stored result is indexed as result[0]['generated_text']; keep only that string.
pred_test_bloom = [x[0]['generated_text'] for x in pred_test_bloom]

In [None]:
# Drop everything up to and including 'Answer:' (7 chars). Unlike the fine-tuned path,
# no braces are stripped and the tail of the text is kept.
pred_test_bloom = [x[x.index('Answer:')+7:] for x in pred_test_bloom]

In [None]:
# BLEU of each baseline prediction against the gold answer at the same position.
score_bloom = [bleu_score(test[idx], pred_test_bloom[idx]) for idx in range(len(test))]

In [None]:
# Average BLEU of the baseline model over the test set.
np.mean(score_bloom)

0.008655345601838156

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')

def cosine_distance(reference_answer,generated_answer ):
  """Return the bag-of-words cosine similarity between two answers.

  Despite the name, the returned value is a similarity, not a distance: it is
  the cosine between the word-count vectors of the two texts after lower-casing,
  tokenizing and removing English stop words.

  Args:
    reference_answer: gold answer text.
    generated_answer: model-generated answer text.

  Returns:
    The cosine similarity as a float, or 0.0 when neither text contains a
    countable word.
  """
  stop_words = set(stopwords.words('english'))

  def _content_text(text):
    # Lower-case, tokenize, drop stop words and re-join for the vectorizer.
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

  reference_text = _content_text(reference_answer)
  generated_text = _content_text(generated_answer)

  try:
    # Row 0 holds the reference counts, row 1 the generated counts.
    vectors = CountVectorizer().fit_transform([reference_text, generated_text]).toarray()
  except ValueError:
    # CountVectorizer raises ValueError ("empty vocabulary") when no countable word
    # is left in either text; report no similarity instead of aborting the evaluation.
    return 0.0

  cosine_sim = cosine_similarity([vectors[0]], [vectors[1]])

  return cosine_sim[0][0]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Cosine similarity of each baseline prediction to the gold answer at the same position.
cosine_results_bloom = [cosine_distance(test[idx], pred_test_bloom[idx]) for idx in range(len(test))]

In [None]:
# Average cosine similarity of the baseline model over the test set.
np.mean(cosine_results_bloom)

0.2515916378289134

In [None]:
# Cosine similarity of each fine-tuned prediction to the gold answer at the same position.
cosine_results = [cosine_distance(test[idx], pred_test[idx]) for idx in range(len(test))]

In [None]:
# Average cosine similarity of the fine-tuned model over the test set.
np.mean(cosine_results)

0.3317349616484712