In [None]:
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 \
bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12

In [4]:
from torch import cuda, bfloat16
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from time import time
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma


### Initialize model, tokenizer, query pipeline

In [5]:
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [6]:
# Set quantization configuration to load large model with less GPU memory
bnb_config = transformers.BitsAndBytesConfig(
    # enables loading the model in 4-bit precision rather than the usual 16-bit or 32-bit.
    # Reducing the precision of the model helps save GPU memory, which is especially useful
    # for very large models.
    load_in_4bit = True,
    # This sets the quantization type to "nf4" (Normalized 4-bit Floating Point).
    # NF4 is a specific quantization method that can offer better accuracy in
    # 4-bit precision compared to basic fixed-point quantization.
    bnb_4biit_quant_type = "nf4",
    # This enables "double quantization," which applies an additional layer of quantization
    # to the model parameters. This technique can further reduce the memory usage while maintaining
    # model accuracy.
    bnb_4bit_use_double_quant = True,
    # This specifies the data type used for computations during model execution. Here,
    # it's set to bfloat16 (Brain Floating Point 16), a lower precision type compared
    # to float32, which also helps reduce memory usage while providing a good balance
    # between performance and accuracy.
    bnb_4bit_compute_dtype = bfloat16
)

In [7]:
import accelerate
import bitsandbytes as bnb

In [8]:
time_1 = time()

model_id = "openai-community/gpt2-xl"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code = True,
    quantization_config = bnb_config,
    device_map = 'auto',
)

tokenizer = AutoTokenizer.from_pretrained(
    model_id
)

time_2 = time()

print(f"Prepare model, tokenizer {round(time_2-time_1, 3)} sec.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Prepare model, tokenizer 90.892 sec.


In [9]:
time_1 = time()

query_pipeline = transformers.pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    device_map = "auto"
)

time_2 = time()
print(f"Prepare query pipeline {round(time_2-time_1, 3)} sec.")

Prepare query pipeline 1.549 sec.


In [10]:
def test_model(tokenizer, pipeline, prompt_to_test):
  time_1 = time()
  sequences = pipeline(
      prompt_to_test,
      do_sample = True,
      # top_k: Controls how many of the most probable tokens to consider at each step
      top_k = 10,
      num_return_sequences = 1,
      eos_token_id = tokenizer.eos_token_id,
      max_length = 300,)

  time_2 = time()
  print(f"Test inference: {round(time_2-time_1, 3)} sec.")

  for seq in sequences:
    print(f"Result: {seq['generated_text']}")

# Beam Search
# def test_model(tokenizer, pipeline, prompt_to_test):
#   time_1 = time()
#   sequences = pipeline(
#       prompt_to_test,
#       do_sample = False,
#       # Stop beams when EOS is reached
#       num_beams=10,
#       early_stopping=True,
#       # Number of sequences to return
#       num_return_sequences = 1,
#       eos_token_id = tokenizer.eos_token_id,
#       max_length = 300,)

#   time_2 = time()
#   print(f"Test inference: {round(time_2-time_1, 3)} sec.")

#   for seq in sequences:
#     print(f"Result: {seq['generated_text']}")

In [11]:
test_model(
    tokenizer,
    query_pipeline,
    "What is Artificial Intelligence?"
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Test inference: 30.843 sec.
Result: What is Artificial Intelligence? Artificial Intelligence is a set of software programs designed to understand, plan, communicate and interact with natural and social phenomena.

What is Natural Language Processor (NLP)? NLP is a set of programs designed to extract information from text and interpret it.

What is Natural Language Understanding? Natural Language Understanding (NLA) is an advanced technology used in Artificial Intelligence (AI). It is used to understand natural language text.

What is Artificial Neural Network (ANN)? Artificial Neural Network or ANN is an Artificial Intelligence based Neural Network which can learn from data and make recommendations.

What is Artificial Intelligence (AI)? Artificial Intelligence means a machine that is able to mimic the human brain and learn from data. Artificial Intelligence will help us to make more intelligent decisions. This will help us to solve some of the toughest problems in society.

How is Art

### Retrieval Augmented Generation

### Check the model with a HuggingFace pipeline

In [12]:
llm = HuggingFacePipeline(
    pipeline = query_pipeline
)

llm(prompt = "What is Artificial Intelligence?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'\n\nFor a quick explanation: An "Intelligent Unit" (also called a "Program" in computer science) is a digital program which can perform calculations and make predictions on its own.\n\nThis definition is in'

In [13]:
loader = TextLoader("/content/test.txt",
                    encoding = "utf-8")
documents = loader.load()

### Split data in chunks

In [14]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 150,
    chunk_overlap = 20
)

all_splits = text_splitter.split_documents(
    documents
)

### Create Embeddings and Storing in Vector Store

In [15]:
model_name = "openai-community/gpt2-xl"
model_kwargs = {'device': device}

embeddings = HuggingFaceEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
)



.gitattributes:   0%|          | 0.00/445 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/12.0k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

(…)neration_config_for_text_generation.json:   0%|          | 0.00/165 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]



In [None]:
vectordb = Chroma.from_documents(
    documents = all_splits,
    embedding = embeddings,
    persist_directory = "chroma_db"
)

# tokenizer = AutoTokenizer.from_pretrained(pretrained_weights)
# if tokenizer.pad_token is None:
#     tokenizer.add_special_tokens({'pad_token': '[PAD]'})

### Initialize chain

In [None]:
retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = retriever,
    return_source_documents = True,
    verbose = True
)

### Test the RAG

In [None]:
def test_rag(qa, query):
  print(f"Query: {query}")
  time_1 = time()
  result = qa.run(query)
  time_2 = time()
  print(f"Inference time: {round(time_2-time_1, 3)} sec.")
  print("\nResult: ", result)

In [None]:
query = "PE?"

test_rag(qa, query)