# Build your own RAG application

## Install dependencies

In [14]:
# Install the `ollama` package (imported further down) using the %pip magic.
%pip install ollama

Note: you may need to restart the kernel to use updated packages.


## Load dataset into memory from file

In [21]:
# Read the knowledge base: one fact per line, each line becoming one chunk.
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform; without it open() falls back to a locale-dependent default, and
# the facts contain non-ASCII characters such as curly apostrophes.
with open('data/cat-facts.txt', 'r', encoding='utf-8') as file:
    # Drop the trailing newline that readlines() would keep and skip blank
    # lines, so neither ends up in the embeddings or in the prompt.
    dataset = [line.strip() for line in file if line.strip()]

print(f" Loaded {len(dataset)} entries")

 Loaded 150 entries


## Create the embeddings for the dataset

- Uses the BAAI `bge-base-en-v1.5` text embedding model, called through `ollama`
- Chunking: each entry line in the dataset is treated as a chunk
- Convert each chunk into an embedding vector 
- Store the chunk and its corresponding vector into a list

In [22]:
import ollama

# Model identifiers handed to ollama: `em` is the model passed to
# ollama.embed, `lm` is the model passed to ollama.chat.
em = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
lm = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

# In-memory store filled by add_chunk_to_database.
# Each element in the vector_db will be a tuple (chunk, embedding)
# The embedding is a list of floats, for example: [0.1, 0.04, -0.34, 0.21, ...]
vector_db = []

def add_chunk_to_database(chunk):
  """Embed ``chunk`` with the ``em`` model and append ``(chunk, embedding)`` to ``vector_db``."""
  response = ollama.embed(model=em, input=chunk)
  # A single input is sent, so the first entry of 'embeddings' is taken.
  vector = response['embeddings'][0]
  vector_db.append((chunk, vector))


In [23]:
# Start from an empty store so that re-running this cell rebuilds the index
# instead of appending a second copy of every chunk to vector_db.
vector_db.clear()

# Embed every dataset entry and report progress after each one.
for i, chunk in enumerate(dataset):
  add_chunk_to_database(chunk)
  print(f'Added chunk {i+1}/{len(dataset)} to the database')

Added chunk 1/150 to the database
Added chunk 2/150 to the database
Added chunk 3/150 to the database
Added chunk 4/150 to the database
Added chunk 5/150 to the database
Added chunk 6/150 to the database
Added chunk 7/150 to the database
Added chunk 8/150 to the database
Added chunk 9/150 to the database
Added chunk 10/150 to the database
Added chunk 11/150 to the database
Added chunk 12/150 to the database
Added chunk 13/150 to the database
Added chunk 14/150 to the database
Added chunk 15/150 to the database
Added chunk 16/150 to the database
Added chunk 17/150 to the database
Added chunk 18/150 to the database
Added chunk 19/150 to the database
Added chunk 20/150 to the database
Added chunk 21/150 to the database
Added chunk 22/150 to the database
Added chunk 23/150 to the database
Added chunk 24/150 to the database
Added chunk 25/150 to the database
Added chunk 26/150 to the database
Added chunk 27/150 to the database
Added chunk 28/150 to the database
Added chunk 29/150 to the dat

## Retrieval/Knowledge Extraction

- **Input:** the user's query
- **Output:** the top N most relevant chunks

"Relevant" chunks are chosen by cosine similarity.
The higher the cosine similarity between two embedding vectors, the closer they are in meaning.

In [24]:
def cosine_similarity(a, b):
  """Return the cosine similarity of two numeric vectors.

  Args:
    a: First vector, an iterable of numbers.
    b: Second vector, an iterable of numbers. Expected to have the same
      length as ``a``; zip() pairs the elements and stops at the shorter one.

  Returns:
    dot(a, b) / (|a| * |b|). When either vector has zero magnitude
    (including an empty vector) the similarity is undefined, so 0.0 is
    returned instead of raising ZeroDivisionError.
  """
  # Generator expressions avoid building throwaway lists just to sum them.
  dot_product = sum(x * y for x, y in zip(a, b))
  norm_a = sum(x ** 2 for x in a) ** 0.5
  norm_b = sum(x ** 2 for x in b) ** 0.5
  denominator = norm_a * norm_b
  if denominator == 0:
    return 0.0
  return dot_product / denominator

In [27]:
def retrieve(query, top_n=3):
  """Return the ``top_n`` stored chunks most similar to ``query``.

  The query is embedded with the ``em`` model and compared against every
  embedding in ``vector_db`` using ``cosine_similarity``.

  Returns:
    A list of ``(chunk, similarity)`` tuples ordered from highest to lowest
    similarity, cut to the first ``top_n`` entries.
  """
  query_vector = ollama.embed(model=em, input=query)['embeddings'][0]
  # Score every stored chunk against the query.
  scored = [
    (text, cosine_similarity(query_vector, vector))
    for text, vector in vector_db
  ]
  # Highest similarity first: a larger score means a more relevant chunk.
  ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
  return ranked[:top_n]


## Parse the input Query and Retrieve relevant chunks

In [29]:
# Ask for a question and look up the best-matching chunks.
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve(input_query)

print('Retrieved knowledge:')
for chunk, similarity in retrieved_knowledge:
  print(f' - (similarity: {similarity:.2f}) {chunk}')

# Turn the retrieved chunks into one bullet per line; the similarity score is
# not part of the prompt.
new_line = '\n'
context_block = new_line.join(f' - {chunk}' for chunk, _ in retrieved_knowledge)

# System message: restricts the model to the retrieved context.
instruction_prompt = (
  'You are a helpful chatbot.\n'
  "Use only the following pieces of context to answer the question. Don't make up any new information:\n"
  f'{context_block}\n'
)

Retrieved knowledge:
 - (similarity: 0.82) The most popular pedigreed cat is the Persian cat, followed by the Main Coon cat and the Siamese cat.

 - (similarity: 0.76) Today there are about 100 distinct breeds of the domestic cat.

 - (similarity: 0.76) Cats are North America’s most popular pets: there are 73 million cats compared to 63 million dogs. Over 30% of households in North America own a cat.



## Generate a response based on retrieved knowledge 
Use `ollama` to generate the response, passing `instruction_prompt` as the system message and the user's question as the user message.

In [30]:
# Conversation sent to the model: the retrieved context goes in as the system
# message, the question typed by the user as the user message.
chat_messages = [
  {'role': 'system', 'content': instruction_prompt},
  {'role': 'user', 'content': input_query},
]
stream = ollama.chat(model=lm, messages=chat_messages, stream=True)

# Echo each streamed piece as soon as it arrives, without adding line breaks.
print('Chatbot response:')
for part in stream:
  print(part['message']['content'], end='', flush=True)


Chatbot response:
According to the two pieces of context, we can infer that the popular cat breeds in North America are:

1. Persian
2. Main Coon
3. Siamese