In [8]:
from langchain_community.document_loaders import PyPDFLoader
#loading the pdf file
loader = PyPDFLoader("agency.pdf")
pages = loader.load_and_split()
print(pages[0].page_content)

Ignoring wrong pointing object 16 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)


agency
agency_id agency_name specialization skills description
1 Nexus Digital Digital Marketing SEO, PPC, Content Creation Nexus Digital is a results-driven digital marketing agency. We leverage data and creativity to build powerful online presences for our clients, ensuring maximum ROI through targeted strategies.
2 Apex Communications Public Relations Media Relations, Crisis Management, Press Releases At Apex Communications, we shape public perception. Our team of seasoned PR professionals excels at crafting compelling narratives, managing reputations, and securing high-impact media coverage.
3 Stellar Strategy Group Brand Strategy Market Research, Brand Positioning, Identity Design Stellar Strategy Group helps brands discover their true north. We combine in-depth market research with creative insights to build timeless brand identities and eﬀective market-entry strategies.
4 Quantum Creative Creative Advertising Video Production, Copywriting, Graphic Design Quantum Creative is an a

In [9]:
#Splitting the pdf file into chunks
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
text=text_splitter.split_documents(pages)
# This prints only the text content
print(text[0].page_content)

agency
agency_id agency_name specialization skills description
1 Nexus Digital Digital Marketing SEO, PPC, Content Creation Nexus Digital is a results-driven digital marketing agency. We leverage data and creativity to build powerful online presences for our clients, ensuring maximum ROI through targeted strategies.
2 Apex Communications Public Relations Media Relations, Crisis Management, Press Releases At Apex Communications, we shape public perception. Our team of seasoned PR professionals excels at crafting compelling narratives, managing reputations, and securing high-impact media coverage.
3 Stellar Strategy Group Brand Strategy Market Research, Brand Positioning, Identity Design Stellar Strategy Group helps brands discover their true north. We combine in-depth market research with creative insights to build timeless brand identities and eﬀective market-entry strategies.
4 Quantum Creative Creative Advertising Video Production, Copywriting, Graphic Design Quantum Creative is an a

In [10]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(pages)
print(f"Splitted into {len(documents)} chunks of text")

Splitted into 15 chunks of text


In [11]:
from dotenv import load_dotenv
load_dotenv()

True

In [12]:
#Embedding the text chunks
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
API_KEY = os.getenv("Google_API_KEY")

embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=API_KEY
)
    
embedded_query = embeddings.embed_query("what is agency?")
print(f"Embedding length: {len(embedded_query)}")
print(f"Embedding: {embedded_query}")

E0000 00:00:1760680692.272377 1981370 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Embedding length: 768
Embedding: [0.04665990173816681, 0.017178256064653397, 0.019394660368561745, 0.013999304734170437, -0.06340420246124268, 0.05293895676732063, -0.022750448435544968, 0.06483644992113113, 0.05790889635682106, 0.02519158460199833, -0.033616263419389725, -0.010672819800674915, 0.10925289988517761, -0.09094943851232529, 0.004616389516741037, -0.0612606555223465, 0.011472431011497974, 0.0018287760904058814, -0.09828978031873703, -0.01710587553679943, -0.004942489322274923, -0.014795134775340557, -0.043945878744125366, -0.05189420282840729, -0.002531003439798951, -0.028303995728492737, 0.043373238295316696, 0.044013720005750656, -0.018479924649000168, 0.0016347082564607263, 0.03966779634356499, 0.0004910776042379439, 0.02800567075610161, -0.0329865887761116, 0.0331537090241909, -0.09375783801078796, -0.039138518273830414, 0.044918932020664215, 0.0015835576923564076, -0.07691213488578796, -0.06633520871400833, 0.02662636712193489, -0.07450801134109497, -0.0031149345450103

In [13]:
from pinecone import Pinecone
import os

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("agency")

In [14]:
from pinecone import ServerlessSpec

cloud = 'aws'
region = 'us-east-1'

spec = ServerlessSpec(
    cloud=cloud,
    region=region,
)

In [15]:
index_name = "agency"

In [16]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(text, embeddings, index_name=index_name)



For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [17]:
docsearch.as_retriever()

VectorStoreRetriever(tags=['PineconeVectorStore', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x112911e80>, search_kwargs={})

In [18]:
# import google.generativeai as genai
# import os

# # Make sure your API key is set
# genai.configure(api_key=os.environ["Google_API_KEY"])

# print("Available models that support 'generateContent':")

# # List all available models
# for model in genai.list_models():
#   # Check if the model supports the 'generateContent' method used by chat models
#   if 'generateContent' in model.supported_generation_methods:
#     print(f"- {model.name}")

In [19]:
from langchain.chains import ConversationalRetrievalChain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.vectorstores import FAISS
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT

load_dotenv()
API_KEY = os.getenv("Google_API_KEY")
llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro", temoperature=0.5, google_api_key=API_KEY)
qa = ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=docsearch.as_retriever(),
                                             condense_question_prompt=CONDENSE_QUESTION_PROMPT,
                                             return_source_documents=True,
                                             verbose=False
                                             )

chat_history = []
query = "which agency specializes in content creation?"
result = qa({"question": query, "chat_history": chat_history})
print("Answer:", result['answer'])

  result = qa({"question": query, "chat_history": chat_history})
E0000 00:00:1760680702.338828 1981370 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Answer: Based on the context provided, two agencies list "Content Creation" as a direct skill:

*   **Nexus Digital** (agency_id 1): A Digital Marketing agency with skills in SEO, PPC, and **Content Creation**.
*   **The Daily Post** (agency_id 146): A Social Media Management agency with skills in **Content Creation**, Copywriting, and Scheduling.

Other agencies specialize in specific types of content creation, such as:

*   **Quantum Creative** (agency_id 4): Specializes in Video Production, Copywriting, and Graphic Design.
*   **Focal Point Creative** (agency_id 179): Specializes in Photography, Videography, and Product Shoots.
*   **Storyboard Studios** (agency_id 123): Specializes in Animated Explainers, Whiteboard Videos, and Character Animation.


In [20]:
# # Let's see what Python is actually loading.
# api_key_from_env = os.getenv("PINECONE_API_KEY")

# # Use the loaded key
# pc = Pinecone(api_key=api_key_from_env)
# index = pc.Index(host="agency-xa1d4pq.svc.aped-4627-b74a.pinecone.io")

# # Initialize the embedding model
# API_KEY = os.getenv("Google_API_KEY")

# embeddings = GoogleGenerativeAIEmbeddings(
#     model="models/text-embedding-004",
#     google_api_key=API_KEY
# )

# # --- Step 3: Create a Vector from Your Query Text ---
# query_text = "which agency specializes in content creation?"
# query_vector = embeddings.embed_query(query_text)

# # --- Step 4: Search Pinecone Using the Vector ---
# results = index.query(
#     vector=query_vector,   # Use the keyword argument 'vector'
#     namespace="",
#     top_k=2,
#     include_metadata=True
# )


# print("\nSearch Results:")
# print(results)

In [21]:
# from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
# # --- Step 4: Extract the Text from the Metadata ---
# context = ""
# for match in results['matches']:
#     if 'text' in match['metadata']:
#         context += match['metadata']['text'] + "\n---\n"




# # --- Step 5: Augment and Generate the Final Answer ---
# # Initialize the chat model
# llm = ChatGoogleGenerativeAI(model="gemini-1.0-pro", google_api_key=os.getenv("Google_API_KEY"))

# # Create a prompt template
# prompt = f"""
# Answer the following question based ONLY on the context provided below.
# If the context doesn't contain the answer, say "I don't have enough information to answer."

# Context:
# {context}

# Question:
# {query}
# """

# # Ask the model to generate the answer
# final_answer = llm.invoke(prompt)

# print("--- Final Answer ---")
# print(final_answer.content)