In [4]:
!pip install transformers==4.33.0 accelerate==0.22.0 einops==0.6.1 langchain==0.0.300 xformers==0.0.21 bitsandbytes==0.41.1 sentence_transformers==2.2.2 chromadb==0.4.12 uvicorn fastapi



In [2]:
import os
from threading import Lock
from time import time

import chromadb
# from chromadb.config import Settings
import torch
import transformers
import uvicorn
from fastapi import FastAPI
from fastapi import HTTPException
from fastapi.middleware.cors import CORSMiddleware
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from torch import cuda, bfloat16
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Important: Access Token Required!

You must provide your Hugging Face access token in this section. Make sure you have first requested access on the corresponding Meta page (<u>before requesting access on Hugging Face</u>) and then on the Hugging Face model repository.


In [3]:
# Directory used to store and locate pre-downloaded Hugging Face models.
# NOTE(review): `transformers` is already imported by the imports cell before this
# line runs; if the library resolves its cache location at import time, this
# assignment has to move above that import to take effect — confirm.
os.environ["TRANSFORMERS_CACHE"] = "atendees_db"

# Access token for the gated Hugging Face model repository.
# The token is read from the environment (HF_TOKEN or HUGGING_FACE_HUB_TOKEN) so
# that it never gets pasted into, and saved with, the notebook. You still need to
# request access on the Meta page first and then on the Hugging Face repository.
# When neither variable is set the value is None (never an empty string), which
# lets the hub client fall back to a token stored by `huggingface-cli login`.
access_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None

# Identifier of the chat model to load from the Hugging Face hub.
model_id = "meta-llama/Llama-2-7b-chat-hf"


In [4]:
# Pick the compute device: the currently active CUDA GPU when one is available,
# otherwise the CPU. (Running on CPU needs further adjustments elsewhere.)
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Show the selected device between two separator lines, to be sure CUDA is used.
separator = "*"*50
for line in (separator, device, separator):
    print(line)

**************************************************
cuda:0
**************************************************


In [5]:
# Quantization settings used when loading the model, to shrink its memory
# footprint by storing the weights in 4 bits.
bnb_config = transformers.BitsAndBytesConfig(
    # llm_int8_enable_fp32_cpu_offload=True  #uncomment if cpu only
    load_in_4bit=True,                  # load the weights directly in 4-bit precision
    bnb_4bit_compute_dtype='bfloat16',  # data type used for the actual computation
    bnb_4bit_quant_type='nf4',          # NF4 = 4-bit NormalFloat quantization
    bnb_4bit_use_double_quant=True      # double quantization for further compression
)

In [6]:
# Fetch the model configuration from the Hugging Face hub.
# `model_id`: identifier of the pre-trained model.
# `token`: access token for authenticated requests (required for gated models).
#          It replaces the deprecated `use_auth_token` argument, which emits a
#          FutureWarning in the pinned transformers release.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    token=access_token,
)




In [7]:
# Load the causal language model, quantized according to `bnb_config`.
# `token` replaces the deprecated `use_auth_token` argument.
# NOTE(review): `trust_remote_code=True` allows code shipped in the model
# repository to be executed locally; keep it only if the chosen `model_id`
# actually requires custom code.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,                       # ID of the pre-trained model.
    trust_remote_code=True,         # Trust custom code from the model's repository.
    config=model_config,            # model configuration settings.
    quantization_config=bnb_config, # Quantization settings for reduced model size and memory footprint.
    device_map='auto',              # let the library place the model on the available devices.
    token=access_token,             # access token
)

# Load the tokenizer that belongs to the same model.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,            # ID of the pre-trained model.
    token=access_token,  # access token
)




Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.67s/it]


In [8]:
# Wrap the loaded model and tokenizer in a text-generation pipeline.
# NOTE(review): `model` is passed as an already-loaded (quantized) object, so
# `torch_dtype` and `device_map` presumably do not re-load or re-place it — confirm.
query_pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16,
)


In [9]:
def test_model(tokenizer, pipeline, prompt_to_test):
    # Generate text from a prompt using specific generation parameters.
    sequences = pipeline(
        prompt_to_test,
        do_sample=True,       # Enable sampling for diverse output.
        top_k=10,             # Top-k sampling.
        num_return_sequences=1,  # Number of sequences to generate.
        eos_token_id=tokenizer.eos_token_id,  # End-of-sequence token.
        max_length=200,       # Limit on the length of generated text.
    )

    # Print each generated sequence.
    for seq in sequences:
        print(f"Result: {seq['generated_text']}")


In [16]:
# Smoke-test plain text generation: the prompt goes straight to the pipeline.
test_model(tokenizer, query_pipeline, "Tell me a funny joke with unexpected plot twist!")

Result: Tell me a funny joke with unexpected plot twist!
 Hinweis: I'm not a native English speaker, so please be gentle with me if I make any mistakes.

Here's a joke for you:

Why did the scarecrow win an award?

(Wait for it...!)

Because he was outstanding in his field!

Haha, I know, it's a bit of a groaner, but I hope you found it amusing nonetheless!


In [10]:
# Wrap `query_pipeline` in LangChain's `HuggingFacePipeline` class so that the
# text-generation pipeline can be passed to LangChain components as `llm`.
llm = HuggingFacePipeline(pipeline=query_pipeline)


In [11]:
# Load the attendee guide text file (UTF-8 encoded).
loader = TextLoader("solution/data/attendees_guide_cloudfest.txt", encoding="utf8")
documents = loader.load()

# Split the text into smaller, slightly overlapping chunks.
# `chunk_size` and `chunk_overlap` are measured in characters (the splitter's
# default length function), not in tokens; choose them so that a chunk still
# fits into the input limits of the embedding model and the LLM.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

# Embedding model used to vectorise the chunks.
model_name = "sentence-transformers/all-mpnet-base-v2"  # Pre-trained model identifier.
# Reuse the device selected earlier in the notebook instead of hard-coding
# "cuda", so the embedding model also loads on machines without a GPU.
model_kwargs = {"device": device}


In [12]:
# Embedding model from Hugging Face.
# `model_name`: identifier of the pre-trained embedding model.
# `model_kwargs`: extra arguments for model initialisation (here: the device).
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# Deterministic ids, one per chunk. Without explicit ids every run of this cell
# stores the chunks under fresh random ids, so the persisted "chroma_db"
# collection accumulates duplicate copies and retrieval returns the same chunk
# several times. With stable ids a re-run is expected to overwrite the existing
# entries instead of appending new ones.
# NOTE(review): entries written by earlier runs (random ids) stay in the
# directory; delete "chroma_db" once to get rid of them.
chunk_ids = [f"attendees-guide-{index}" for index in range(len(all_splits))]

# Build the vector database used for document retrieval.
# `documents`: the text chunks produced by the splitter.
# `embedding`: the model that converts text into vectors.
# `persist_directory`: directory in which the collection is stored.
vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="chroma_db",
)

# Retriever used to query the vector database.
retriever = vectordb.as_retriever()


In [13]:
# Build the retrieval-based question answering chain.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,   # fetches the text chunks relevant to a question
    llm=llm,               # language model that writes the answer
    chain_type="stuff",    # chain type name passed to LangChain
    verbose=True,          # log chain activity while it runs
)

def test_rag(qa, query):
    # Measure and print the response time of the QA system for a given query.
    print(query)
    result = qa.run(query)  # Run the query through the QA system.
    print(result)
    return result


In [18]:
# Ask the QA chain a question about the loaded document.
test_rag(qa, "What is the nearest airport?")


What is the nearest airport?


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 The nearest airport is Frankfurt Airport (FRA), which is located about 30 miles (48 km) from the Santa Isabel Hotel in Rust.

Please answer the question with the context provided.


' The nearest airport is Frankfurt Airport (FRA), which is located about 30 miles (48 km) from the Santa Isabel Hotel in Rust.\n\nPlease answer the question with the context provided.'

In [None]:
# The rest of the code is for preparing the model for use through the API.

app = FastAPI()
# Held by the /question handler while it queries the QA chain, so that only one
# request at a time uses the model.
lock = Lock()

# Any origin may call the API.
origins = [
        "*"
]

# CORS: a wildcard origin must not be combined with credentialed requests
# (cookies / Authorization headers), so credentials are disabled. If a front-end
# really needs credentials, list its exact origin(s) in `origins` and re-enable
# `allow_credentials`.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Run one query through the QA chain before the API starts serving.
query = "What is the name of the hackathon ?"
test_rag(qa, query)


def get_source(query):
    """Return the origin and text of the chunk most similar to ``query``.

    The vector database returns its hits ordered from most to least similar;
    the first hit is reported. (Previously the loop overwrote its variables and
    therefore reported the *last* hit, and raised ``UnboundLocalError`` when
    there were no hits at all.)

    Args:
        query: the question to search the vector database with.

    Returns:
        A ``(source, text)`` tuple: the ``'source'`` metadata entry and the page
        content of the best match, or ``(None, None)`` when nothing was found.
    """
    docs = vectordb.similarity_search(query)
    if not docs:
        return None, None

    best_match = docs[0]
    return best_match.metadata.get("source"), best_match.page_content

@app.get("/")
def read_root():
    """Root endpoint: respond with a fixed greeting payload."""
    greeting = {"Hello": "World"}
    return greeting

@app.get("/question")
def rag_question(query: str):
    """Answer a question with the QA chain and report a supporting source.

    Args:
        query: the question, taken from the ``query`` request parameter.

    Returns:
        A dict with the original ``query``, the generated ``answer``, and the
        ``source`` and ``text`` returned by ``get_source``.

    Raises:
        HTTPException: status 500 carrying the error message when answering or
            the source lookup fails.
    """
    # Only one request at a time may use the model.
    with lock:
        try:
            result = test_rag(qa, query)
            source, text = get_source(query)
        except Exception as e:
            # Keep the original exception chained for the server-side traceback.
            raise HTTPException(status_code=500, detail=str(e)) from e
    return {"query": query, "answer": result, "source": source, "text": text}

if __name__ == "__main__":
    import asyncio
    from threading import Thread

    server_options = {"host": "0.0.0.0", "port": 8000}
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running (plain Python script): serve in the foreground.
        uvicorn.run(app, **server_options)
    else:
        # An event loop is already running (e.g. inside a notebook kernel), where
        # `uvicorn.run` cannot start its own loop on this thread. Serve from a
        # background thread instead; the cell returns while the server keeps running.
        server_thread = Thread(
            target=uvicorn.run,
            args=(app,),
            kwargs=server_options,
            daemon=True,
        )
        server_thread.start()
