In [17]:
!pip install -q torch transformers accelerate bitsandbytes transformers sentence-transformers faiss-gpu

In [18]:
# If running in Google Colab, you may need to run this cell to make sure you're using UTF-8 locale to install LangChain
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [19]:
!pip install -q langchain

In [None]:
! pip install chromadb==0.4.15

In [6]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [7]:
loader = CSVLoader(file_path='/content/test_vectordb.csv')

data = loader.load()

In [9]:
data[0]

Document(page_content='\ufeffCompany: Pfizer\nAcquired: Seagen\nDate(dd-mm-yyyy): 13-01-2024\nValue (in billions USD): 43', metadata={'source': '/content/test_vectordb.csv', 'row': 0})

In [None]:
# Define the path to the pre-trained model you want to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

In [11]:
vectordb = Chroma.from_documents(data,embeddings)

In [13]:
query="which company acquired Abceutics"

In [14]:
retriver=vectordb.as_retriever(search_type="mmr",search_kwargs={"k":1})

In [15]:
retriver.get_relevant_documents(query)



[Document(page_content='\ufeffCompany: Merck\nAcquired: Abceutics\nDate(dd-mm-yyyy): 05-04-2024\nValue (in billions USD): 0.2', metadata={'row': 5, 'source': '/content/test_vectordb.csv'})]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = 'HuggingFaceH4/zephyr-7b-beta'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Setup the LLM chain

In [None]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

prompt_template = """
<|system|>
Answer the question based on your knowledge. Use the following context to help:

{context}

</s>
<|user|>
{question}
</s>
<|assistant|>

 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

llm_chain = prompt | llm | StrOutputParser()

In [None]:
from langchain_core.runnables import RunnablePassthrough

# retriever = vectordb.as_retriever(search_kwargs={"k":5})
retriever = vectordb.as_retriever(search_type="mmr")

rag_chain = (
 {"context": retriever, "question": RunnablePassthrough()}
    | llm_chain
)


In [None]:
question = "which company acquired Abceutics?"

First, let's see what kind of answer we can get with just the model itself, no context added:

In [None]:
llm_chain.invoke({"context":"", "question": question})

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n\n\n</s>\n<|user|>\nwhich company acquired Abceutics?\n</s>\n<|assistant|>\n\n  According to the given context, there is no information provided about which company acquired Abceutics. The question seems to be incomplete or incorrect as it does not match with the available context. Please provide more details or clarify the question for me to answer accurately.'

In [None]:
rag_chain.invoke(question)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(page_content='\\ufeffCompany: Merck\\nAcquired: Abceutics\\nDate: 05-04-2024\\nValue (in billions USD): 0.2', metadata={'row': 5, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Bristol Myers Squibb\\nAcquired: RayzeBio\\nDate: 05-01-2024\\nValue (in billions USD): 4.1', metadata={'row': 2, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Lonza\\nAcquired: Roche biologics manufacturing plant\\nDate: 19-03-2023\\nValue (in billions USD): Undisclosed', metadata={'row': 8, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Oruka Therapeutics\\nAcquired: ARCA Biopharma\\nDate: 03-04-2024\\nValue (in billions USD): Undisclosed', metadata={'row': 7, 'source': '/content/test_vectordb.csv'})]\n\n</s>\n<|user|>\nwhich company acquired Abceutics?\n</s>\n<|assistant|>\n\n  The company that acquire

In [None]:
question = "list all the companies which got acquired in march 2024?"

In [None]:
rag_chain.invoke(question)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(page_content='\\ufeffCompany: Genmab\\nAcquired: ProfoundBio\\nDate: 03-04-2024\\nValue (in billions USD): 1.8', metadata={'row': 6, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Merck\\nAcquired: Abceutics\\nDate: 05-04-2024\\nValue (in billions USD): 0.2', metadata={'row': 5, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Exxon Mobil\\nAcquired: Pioneer Natural Resources\\nDate: 01-10-2023\\nValue (in billions USD): 59.5', metadata={'row': 9, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: AstraZeneca\\nAcquired: Fusion\\nDate: 19-03-2023\\nValue (in billions USD): 2.4', metadata={'row': 4, 'source': '/content/test_vectordb.csv'})]\n\n</s>\n<|user|>\nlist all the companies which got acquired in march 2024?\n</s>\n<|assistant|>\n\n  Based on the provided context, there are no co

In [None]:
question = "which acquisition has highest valuation?"

In [None]:
rag_chain.invoke(question)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"\n<|system|>\nAnswer the question based on your knowledge. Use the following context to help:\n\n[Document(page_content='\\ufeffCompany: Genmab\\nAcquired: ProfoundBio\\nDate: 03-04-2024\\nValue (in billions USD): 1.8', metadata={'row': 6, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Genmab\\nAcquired: ProfoundBio\\nDate: 03-04-2024\\nValue (in billions USD): 1.8', metadata={'row': 6, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Merck\\nAcquired: Abceutics\\nDate: 05-04-2024\\nValue (in billions USD): 0.2', metadata={'row': 5, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Merck\\nAcquired: Abceutics\\nDate: 05-04-2024\\nValue (in billions USD): 0.2', metadata={'row': 5, 'source': '/content/test_vectordb.csv'}), Document(page_content='\\ufeffCompany: Exxon Mobil\\nAcquired: Pioneer Natural Resources\\nDate: 01-10-2023\\nValue (in billions USD): 59.5', metadata={'row': 9, 'source':