#**Building a Gen AI RAG Chatbot from Scratch**

---




In [1]:
!pip install langchain langchain_community sentence_transformers chromadb pypdf langchain-groq

Collecting langchain
  Downloading langchain-0.3.1-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting chromadb
  Downloading chromadb-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Collecting pypdf
  Downloading pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-groq
  Downloading langchain_groq-0.2.0-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-core<0.4.0,>=0.3.6 (from langchain)
  Downloading langchain_core-0.3.7-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.129-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from

In [2]:
#importing required libraries

import os
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
# Set your Groq API key
os.environ["GROQ_API_KEY"] = ""
#loading the document
PERSIST_DIRECTORY = "/content/chroma_db"
folder_path = r"/content/aml"
DOCUMENT_PATH = r"/content/aml/2307.pdf"

In [4]:
def load_or_create_vector_store(texts, embeddings):
    # Check if the vector store already exists
    if os.path.exists(PERSIST_DIRECTORY):
        print("Loading existing vector store...")
        return Chroma(persist_directory=PERSIST_DIRECTORY, embedding_function=embeddings)

    # If not, create a new one
    print("Creating new vector store...")
    return Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY)

In [5]:
# Load and preprocess documents
def load_documents(document_path):
    all_texts = []  # To store the texts from all PDFs
    # Iterate over each file in the folder
    for filename in os.listdir(folder_path):
        # Only process files that end with .pdf
        if filename.endswith(".pdf"):
            file_path = os.path.join(folder_path, filename)
            loader = PyPDFLoader(file_path)  # Load the individual PDF
            documents = loader.load()  # Load documents from the PDF

            # Split the loaded documents into smaller chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len,
                separators=["\n\n", "\n", " ", ""]
            )
            texts = text_splitter.split_documents(documents)
            all_texts.extend(texts)
    return all_texts

In [6]:
texts = load_documents(folder_path)
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-distilbert-base-v4")
db = Chroma.from_documents(texts, embeddings)
# db.add_documents(texts)
db.persist()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/msmarco-distilbert-base-v4")
  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/545 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  db.persist()


In [7]:
# Set up the language model
llm = ChatGroq(model="llama-3.2-3b-preview") # Choose an appropriate open-source model

In [8]:
# Create a custom prompt template. You can give instructions to the model. This Defines a prompt template for a RAG model to format context and question for answer generation.

template = """Use the following pieces of context to answer the question at the end.

{context}

Question: {question}
Answer: """
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)




In [9]:
# Set up the RAG pipeline
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [10]:
# Chatbot interface. Chnage it on your own preference
print("Welcome to the RAG Chatbot! Type 'exit' to end the conversation.")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        print("Goodbye!")
        break

    response = qa_chain.invoke({"query": user_input})

    print("Chatbot:", response['result'])

Welcome to the RAG Chatbot! Type 'exit' to end the conversation.
You: summarize the auto grader project
Chatbot: The Auto Grader project is a research initiative focused on developing an advanced system for retrieving and analyzing student assignments. The primary objective is to create a comprehensive technology stack that can effectively assess student submissions, retrieve relevant assignments, compare responses to correct solutions, analyze code quality, and dynamically grade submissions.

The project involves several key components, including:

1. Building a Retriever: This involves structuring student assignments in a standardized format and implementing a retrieval system using LangChain for semantic search capabilities.
2. Data Structuring: Student assignments are stored in a Chroma DB with metadata, questions, and answers.
3. Dynamic Grading: The system will utilize advanced technology to dynamically grade submissions.

While the main focus of the project is on developing a ro