In [1]:
!pip install -q torch transformers accelerate bitsandbytes transformers sentence-transformers faiss-gpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [4]:
%%capture
!pip install -q langchain
!pip install -U langchain-community

In [None]:
#Module to load the Markdown Documents, not used in this file
# !pip install "unstructured[md]"

# Indexing

1. Loading the Document using Langchain Document Loaders (i.e TextLoader here)
2. First we initialize the Loader then load the documents

In [5]:
from langchain_community.document_loaders import TextLoader
loader = TextLoader("/content/Ameer_week3alumni.txt")
docs = loader.load()

Langchain document loaders load the documents as Langchain Document Object
This object has 2 attributes:
1. Page_content
2. Metadata

In [19]:
print(type(docs[0]))
print(docs[0].metadata)
print(docs[0].page_content)

<class 'langchain_core.documents.base.Document'>
{'source': '/content/Ameer_week3alumni.txt'}
Prof. Chattopadhyay received his bachelor's degree in Chemistry from St. Xavier's College, Kolkata, and a master's in the same subject from IIT Kanpur. He obtained his Ph.D. from the State University of New York (SUNY) at Stony Brook and was a postdoctoral fellow at the University of California, Davis. Subsequently, he joined the Centre for Cellular and Molecular Biology (CCMB) in Hyderabad. Currently, he is an Adjunct Professor, Department of Biosciences and Bioengineering, Indian Institute of Technology, Guwahati. Currently, he is an Emeritus Professor, Biological Sciences, Academy of Scientific and Innovative Research and J.C. Bose Fellow, Centre for Cellular & Molecular Biology. He has recently been awarded 'The World Academy of Sciences' (TWAS) prize 2016.


Prof. Chattopadhyay's work is focused on monitoring organization, dynamics, and function of biological membranes in healthy and dise

In [20]:
# As it was a .txt file so all of it is composed in a single document
len(docs)

1

## Chunking

Using the Langchain RecursiveCharacterTextSplitter:
1. Chunk_size = 512
2. Chunk_overlap = 30

You can play around these values!



In [21]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=30)

chunked_docs = splitter.split_documents(docs)

In [8]:
len(chunked_docs)

85

## VectorDB

Using the FAISS vectorDB to create the index, and a HuggingFaceEmbedding Model

1. We create db using the .from_documents function and pass in our chunked documents and embedding models, this automatically sets the embedding dimensions
2. Currently using the model `BAAI/bge-base-en-v1.5`


In [23]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings


model = "BAAI/bge-base-en-v1.5" #@param["BAAI/bge-base-en-v1.5","sentence-transformers/all-mpnet-base-v2"," "]
db = FAISS.from_documents(chunked_docs, HuggingFaceEmbeddings
                         (model_name=model))



# Retrieval

1. Setting up the retriever
2. With top-k = 4

In [10]:
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

# Generation

Loading the quantized LLM Model and tokenizer for the model

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "HuggingFaceH4/zephyr-7b-beta" # @param = ["HuggingFaceH4/zephyr-7b-beta",""]

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

Setting up the LLM to use it, by keeping all the things in text generation pipline and setting the controllable parameters of the LLM

1. **Temperature** = "Creativity of your ans", higher creativity can sometimes be computationally expensize and also diviate from the topic.
2. **Repetition Penalty** =  Penalizes or reduces the probability of generating tokens that have recently appeared in the generated text. It encourages the model to generate more diverse and non-repetitive output.
3. **Max new tokens** = Determines the maximum length of the generated output. It allows you to limit the number of tokens generated to avoid excessively long responses.
4. **Do Sample** = Selects the next token from the probability distribution over the entire vocabulary with various strategy-specific adjustments.


In [26]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain_core.output_parsers import StrOutputParser

text_generation_pipeline = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    do_sample=True,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=400,
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


  warn_deprecated(


## Setting Up the Prompt

Every Chatmodel has its own prompt setting and we have to input our context and query into that prompt which will then be fed to the llm in the subsequent stages.


A prompt is a set of instructions or inputs to guide the model’s response. The output from a prompt can be answers, sentence completions, or conversation responses. A well-constructed prompt template has the following sections:

1. Instructions: Define the model’s response/behaviour.
2. Context: Provides additional information, sometimes with examples. (here {context})
3. User Input: The actual question or input from the user. (here {question})
4. Output Indicator: Marks the beginning of the model’s response. (here <|assistant|>)

LangChain provides PromptTemplate to help create parametrized prompts for language models

A PromptTemplate allows creating a template string with placeholders, like {adjective} or {content} that can be formatted with input values to create the final prompt string.

In [27]:
prompt_template = """
<|system|>
Always start your response with "Good Morning Udbhav!"
Answer the question based on your knowledge. Use the following context to help:
{context}
</s>
<|user|>
{question}
</s>
<|assistant|>
 """

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)


## Setting Up the LLM Chain

A typical LLM chain consists of a series of interconnected components that work together to process user input and generate responses.

We make the chain using LCEL (Langchain Expression Language)

1. Prompt -> Gets the Input
2. LLM -> Processes the Input
3. StrOutputParser -> Taking the output of an LLM and transforming it to a more suitable format.

Here the llm_chain doesn't involve the Retrive, this just our base model

In [37]:
llm_chain = prompt | llm | StrOutputParser()


RunnablePassthrough on its own allows you to pass inputs unchanged.

## Setting up the RAG Chain

So Passing the Context, along with the query in the above given LLM_chain gives up our RAG chain

In [38]:
from langchain_core.runnables import RunnablePassthrough

retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | llm_chain


In [39]:
question = "What is the proffesion of Prof. Choudhuri "

In [40]:
print(llm_chain.invoke({"context": "", "question": question}))


<|system|>
Always start your response with "Good Morning Udbhav!"
Answer the question based on your knowledge. Use the following context to help:

</s>
<|user|>
What is the proffesion of Prof. Choudhuri 
</s>
<|assistant|>
  Good Morning Udbhav!

The profession of Prof. Choudhuri is that of a professor. She holds an academic position at a university or college and is responsible for teaching, researching, and mentoring students in her field of expertise. Her role may also include administrative duties such as curriculum development, program management, and faculty supervision. The specific discipline or subject area in which she specializes will depend on her educational background and professional training. Some common fields of study for professors include engineering, medicine, law, business, humanities, social sciences, and natural sciences.


In [43]:
answer = rag_chain.invoke(question)

In [45]:
print(answer)


<|system|>
Always start your response with "Good Morning Udbhav!"
Answer the question based on your knowledge. Use the following context to help:
[Document(page_content='Prof. Choudhuri is a theoretical astrophysicist who primarily works on MHD problems related to the Sun though he has worked on other astrophysical systems such as AGNs, jets, accretion, pulsars as well. He is one of the originators of the flux transport dynamo model, which he used to make the first successful prediction of a solar cycle from a theoretical model. Most of his papers are written on the generation of solar magnetic fields by the dynamo process and the formation of sunspots by the buoyant rise', metadata={'source': '/content/Ameer_week3alumni.txt'}), Document(page_content="scientist at the High Altitude Observatory, National Center for Atmospheric Research, Boulder, U.S.A. In 1987, he joined the Indian Institute of Science, Bangalore as a lecturer and continued to work there as a Professor with the Departm

In [46]:
start = answer.find("<|assistant|>")
print(answer[start:])

<|assistant|>
  Good Morning Udbhav!

Prof. Choudhuri is a theoretical astrophysicist. His primary area of expertise is in magnetohydrodynamics (MHD) problems related to the Sun, but he has also worked on other astrophysical systems such as active galactic nuclei (AGNs), jets, accretion, and pulsars. Some of his research focuses on the generation of solar magnetic fields by the dynamo process and the formation of sunspots by the buoyant rise.
