Importing the required libraries

In [5]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

2. Convert the text into the embeddings

In [6]:
# Load environment variables
load_dotenv()

# Set up Groq API key
# Create a .env file with: GROQ_API_KEY=your_api_key_here
if not os.getenv("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = "GROQ_API_KEY"  # Replace with actual key

print("✅ Environment setup complete!")

✅ Environment setup complete!


In [7]:
loader = PyPDFLoader('resume.pdf')
docs = loader.load()


In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300, 
    chunk_overlap = 50,
    length_function = len,
    separators=['\n\n', '\n', " ", '']
)

chunks = text_splitter.split_documents(docs)


In [10]:
for i, chunk in enumerate(chunks[:3]):  # Show first 3 chunks
        print(f"\nChunk {i+1}:")
        print(f"Content: {chunk.page_content}")
        print(f"Metadata: {chunk.metadata}")
        print(f"Length: {len(chunk.page_content)} characters")
        print("-" * 50)


Chunk 1:
Content: Vinay Sharma
♂¶ap-¶arker-altU.P. /envel⌢pevinayiet435@gmail.com ♂phone-alt+91 7379771194 /linkedin-invinayiet /githubvinayiet
Experience
Technical Trainer (Machine Learning)
W3Grads
Oct 2024 – Present
Metadata: {'producer': 'pdfTeX-1.40.26', 'creator': 'LaTeX with RenderCV', 'creationdate': '2025-07-22T03:44:43+00:00', 'author': 'Vinay Sharma', 'keywords': '', 'moddate': '2025-07-22T03:44:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'subject': '', 'title': "Vinay Sharma's CV", 'trapped': '/False', 'source': 'resume.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1'}
Length: 200 characters
--------------------------------------------------

Chunk 2:
Content: W3Grads
Oct 2024 – Present
◦ Trained over 270 students in core Python, OOP, SQL, file I/O, and foundational concepts in Generative AI
including RAG and prompt engineering
Data Scientist Associate
Blackcoffer
Sep 2024 – Dec 2024
Metadata: 

In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings


embeddings_model = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        model_kwargs={'device': 'cpu'},  # Use 'cuda' if you have GPU
        encode_kwargs={'normalize_embeddings': True}  # Normalize for better similarity search
)

In [13]:
texts = [chunk.page_content for chunk in chunks]
embeded_text = embeddings_model.embed_documents(texts)

embeded_text

[[-0.0957011878490448,
  -0.06105615571141243,
  0.0176278967410326,
  0.009033359587192535,
  0.010548359714448452,
  -0.045628927648067474,
  0.03161485120654106,
  0.009130197577178478,
  -0.034222811460494995,
  -0.016317401081323624,
  -0.02262955904006958,
  -0.01234795805066824,
  0.04239741712808609,
  -0.01592278480529785,
  -0.04341617226600647,
  0.04183420538902283,
  -0.05733775720000267,
  -0.022756723687052727,
  -0.004433637950569391,
  -0.10369845479726791,
  -0.050694920122623444,
  -0.0015198428882285953,
  0.001047707861289382,
  0.014933772385120392,
  0.07167676836252213,
  -0.06772848218679428,
  -0.024205470457673073,
  0.0034096953459084034,
  0.027405980974435806,
  0.016414722427725792,
  -0.03729056566953659,
  0.04482639208436012,
  -0.05493323877453804,
  0.012573091313242912,
  -0.05555598437786102,
  0.09491294622421265,
  -0.05272412300109863,
  0.029858097434043884,
  0.006093174684792757,
  0.05606578662991524,
  -0.03257739171385765,
  -0.11551091820

In [14]:
# Creating vector store 
from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings_model,
    persist_directory=".new_db",
    collection_name="rag_collection"
)


Searching the query from the vector databases

In [None]:
query = "what is this document type?"

similar_docs = vectorstore.similarity_search(
    query= query,
    k=2
)

similar_docs


# what is your name ? Vinay sharma -> My name is name Vinay sharma 


[Document(metadata={'creationdate': '2025-07-22T03:44:43+00:00', 'keywords': '', 'source': 'resume.pdf', 'page_label': '1', 'moddate': '2025-07-22T03:44:43+00:00', 'author': 'Vinay Sharma', 'trapped': '/False', 'title': "Vinay Sharma's CV", 'subject': '', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0', 'producer': 'pdfTeX-1.40.26', 'page': 0, 'total_pages': 2, 'creator': 'LaTeX with RenderCV'}, page_content='for fixes and improvements\n◦ Automated extraction and analysis of files using the GitHub API, allowing fast auditing and understanding\nof open-source repositories\nRAG Chatbot for Documents Streamlit, HuggingFace,\nPinecone\n◦ Developed a PDF-based RAG chatbot capable of answering context-rich queries'),
 Document(metadata={'subject': '', 'author': 'Vinay Sharma', 'moddate': '2025-07-22T03:44:43+00:00', 'total_pages': 2, 'page': 0, 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpath

In [18]:
# from langchain_community.vectorstores import Chroma
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Initialize the LLM from Groq
codellm = ChatGroq(
    model_name="llama3-8b-8192",  # ✅ Note: Correct model name (without hyphen after "llama")
    temperature=0.1,
    max_tokens=1024,
    timeout=60,
    max_retries=2
)

# Create your custom prompt template
prompt_template = """You are a helpful AI assistant. Use the following context to answer the question accurately and comprehensively.

Context Information:
{context}

Question: {question}

Instructions:
1. Answer based primarily on the provided context
2. If the context doesn't contain enough information, clearly state this
3. Be concise but thorough in your response
4. Don't make up information not present in the context

Answer:"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# 🧠 Assume these are retrieved from your vector database
retrieved_context = similar_docs
question = "What is this document based on?"

# ✅ Format the prompt with actual values
formatted_prompt = prompt.format(context=retrieved_context, question=question)

# ✅ Invoke LLM with formatted string
answer = codellm.invoke(formatted_prompt)
print("Generated Answer:\n", answer.content)


Generated Answer:
 Based on the provided context, this document appears to be a resume or CV (Curriculum Vitae) of Vinay Sharma. The document is a PDF file created using LaTeX with RenderCV, and it contains information about Vinay's experience, skills, and contact details.
