This notebook builds a retrieval-augmented generation (RAG) pipeline with a basic prompt and deploys it as a chat interface.

In [None]:
from pinecone import Pinecone, ServerlessSpec
import getpass
import os
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import pinecone
import torch
from pinecone import Index
from langchain_pinecone import PineconeVectorStore
from transformers import BitsAndBytesConfig
import bitsandbytes
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from tqdm import tqdm
from langchain.prompts import PromptTemplate
import gradio as gr

In [None]:
# Read the Pinecone API key from the environment, prompting for it only when it
# is not already set, so the key is never hard-coded in the notebook.
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# The bare name `Pinecone` is imported twice in the import cell: first the
# client class from `pinecone`, then the vector-store class from
# `langchain.vectorstores`, which rebinds the name. Go through the `pinecone`
# module so the client class is the one that gets instantiated.
pc = pinecone.Pinecone(api_key=pinecone_api_key)

Enter your Pinecone API key: ··········


In [None]:
# Embedding model: the GTE-base checkpoint, loaded through HuggingFaceEmbeddings.
model_name = "thenlper/gte-base"

# Run the encoder on the GPU when one is visible to torch, otherwise on the CPU.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
model_kwargs = dict(device=embedding_device)

# Embeddings are requested without normalisation.
encode_kwargs = dict(normalize_embeddings=False)

hf_embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

  hf_embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Create the index if it doesn't exist, then open a handle to it.
INDEX_NAME = "rag-llm-gte-base"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
if INDEX_NAME not in existing_indexes:
    pc.create_index(
        name=INDEX_NAME,
        dimension=768,  # must match the embedding size reported by the index stats
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Index creation is asynchronous; poll until the index reports ready.
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        time.sleep(1)
    # A freshly created index holds no vectors, so retrieval returns nothing
    # until documents are upserted into it.
    print(f"Created empty index '{INDEX_NAME}'; populate it before querying.")

pc_index = pc.Index(INDEX_NAME)

In [None]:
from huggingface_hub import notebook_login
# Prompt for Hugging Face Hub credentials inside the notebook.
# NOTE(review): presumably required for the gated Llama 3 checkpoint loaded
# further down — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Vector store: bound to the Pinecone index by name, with `hf_embeddings`
# supplied as its embedding function.
vectorstore = PineconeVectorStore(embedding=hf_embeddings, index_name="rag-llm-gte-base")

In [None]:
# Sanity check: fetch and show the index statistics (dimension, vector counts).
index_stats = pc_index.describe_index_stats()
print(f"Index Stats: {index_stats}")

Index Stats: {'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2644}},
 'total_vector_count': 2644}


In [None]:
# Similarity-search retriever that asks the vector store for 5 matches per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 5},
    search_type="similarity",
)

In [None]:
!pip install -U bitsandbytes



In [None]:
# Hub id shared by the model weights and the tokenizer.
llm_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Quantization settings: load weights in 4-bit, compute in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

llm_model = AutoModelForCausalLM.from_pretrained(
    llm_model_id,
    quantization_config=bnb_config,
)
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_id)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [None]:
# Reuse the end-of-sequence token id as the padding token id on the tokenizer.
# NOTE(review): presumably this checkpoint's tokenizer has no pad token of its
# own — confirm.
llm_tokenizer.pad_token_id = llm_tokenizer.eos_token_id

In [None]:
# Sampling, length and output-shape settings forwarded to the pipeline.
pipeline_settings = {
    "do_sample": True,
    "temperature": 0.2,
    "top_p": 0.9,
    "top_k": 50,
    "repetition_penalty": 1.1,
    "max_new_tokens": 100,
    # Return only the newly generated text, not the prompt that preceded it.
    "return_full_text": False,
    "eos_token_id": llm_tokenizer.eos_token_id,
}

# Text-generation pipeline around the quantized model and its tokenizer.
llm_pipeline = pipeline(
    task="text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    **pipeline_settings,
)

In [None]:
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline in HuggingFacePipeline so it can be handed to
# the retrieval chain as its `llm`.
llm_final_model = HuggingFacePipeline(pipeline=llm_pipeline)

  llm_final_model = HuggingFacePipeline(pipeline=llm_pipeline)


In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders filled in at query time. This is a regular (non-raw) string, so
# each `\n` escape is a real newline in addition to the physical line break
# that follows it.
template = """
You are a compassionate and knowledgeable mental health assistant that answers questions related to mental health.\n
Use the following pieces of retrieved context to provide a helpful and empathetic response to the user's question.\n
Use only the context provided and not any prior knowledge.\n
If you are unsure of the answer, tell that you do not know the answer.\n
Stick to the question and just answer the question in a short manner.\n
Avoid any additional greetings or elaborations.\n

Context: \n
------------------------------------------------------------------------------\n
{context}
------------------------------------------------------------------------------\n
Given the context and without any prior knowledge, answer the below question.\n
Question: {question}
Answer:
"""

# Declare the placeholder names explicitly alongside the template text.
prompt = PromptTemplate(input_variables=["context", "question"], template=template)

In [None]:
# Retrieval QA chain: documents from `retriever` are fed to `llm_final_model`
# through the custom `prompt`; only the answer is returned, not the sources.
qa_chain = RetrievalQA.from_chain_type(
    chain_type_kwargs=dict(prompt=prompt),
    return_source_documents=False,
    retriever=retriever,
    llm=llm_final_model,
)

In [None]:
def predict(input, history):
    """Answer a single user message with the retrieval QA chain.

    Args:
        input: The user's message text.
        history: Prior chat turns. Accepted so the function matches the
            two-argument callback shape used by the chat UI; it is not read.

    Returns:
        The answer string produced by ``qa_chain``, or a short prompt asking
        for a question when ``input`` is empty or whitespace-only.
    """
    # Skip retrieval and generation entirely for blank messages.
    if not input or not input.strip():
        return "Please enter a question."
    # `Chain.run` is deprecated; `invoke` takes the chain's input key
    # ("query" for RetrievalQA) and returns a dict whose "result" entry holds
    # the generated answer.
    return qa_chain.invoke({"query": input})["result"]

In [None]:
# Smoke test with a question the indexed documents are expected to cover.
# A dedicated name is used instead of `input` so the builtin `input()` is not
# shadowed for the rest of the kernel session.
in_context_question = "What is PTSD?"
print(predict(in_context_question, []))

PTSD is a real disorder that develops when a person has experienced or witnessed a scary, shocking, terrifying, or dangerous event. These stressful or traumatic events usually involve a situation where someone’s life has been threatened or severe injury has occurred. Children and adults with PTSD may feel anxious or stressed even when they are not in present danger.


Asking a question that is not covered by the retrieved context:

In [None]:
# Smoke test with a question outside the indexed material; the prompt instructs
# the model to say it does not know. A dedicated name is used instead of
# `input` so the builtin `input()` is not shadowed.
out_of_context_question = "What is LangChain?"
print(predict(out_of_context_question, []))

I don't know the answer to that question. The provided context does not mention LangChain. It appears to be unrelated to the topic of suicide prevention and mental health. If you're looking for information on LangChain, I'd suggest searching elsewhere as it's not mentioned in the given context.


Deployment using Gradio

In [None]:
# Serve `predict` through a Gradio chat UI; share=True asks for a public link.
chat_ui = gr.ChatInterface(fn=predict)
chat_ui.launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fc21833ee0277c84e2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


