In [1]:
%pwd

'C:\\Users\\vinod\\OneDrive\\Desktop\\Coding\\AIWorld\\chatbot\\Bird_chatbot\\birds_agent\\research'

In [2]:
import os
os.chdir("../")

In [3]:
%pwd

'C:\\Users\\vinod\\OneDrive\\Desktop\\Coding\\AIWorld\\chatbot\\Bird_chatbot\\birds_agent'

In [5]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader

In [6]:
#Extract PDF from PDF file

def load_pdf_file(data):
    loader = DirectoryLoader(data,
                            glob="*.pdf",
                            loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

In [7]:
extracted_data = load_pdf_file(data="Data/")

In [8]:
len(extracted_data)

152

In [9]:
# extracted_data

In [10]:
#Split the DATA into text Chunks perform chunk

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunk = text_splitter.split_documents(extracted_data)
    return text_chunk

In [11]:
text_chunks = text_split(extracted_data)
print("Length of text chunks", len(text_chunks))

Length of text chunks 167


In [12]:
# text_chunks

In [13]:
#Download the Embeddings from Hugging face

from langchain_huggingface import HuggingFaceEmbeddings

In [14]:
def downlaod_hugging_face_embedding():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [15]:
embeddings = downlaod_hugging_face_embedding()

In [16]:
query_result = embeddings.embed_query('Hello world')
print("Length: ",len(query_result))# This is a sentence-transformers model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.

Length:  384


In [17]:
# query_result

In [18]:
# create a index in pinecone
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

load_dotenv(".env", override=True)

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "birdbot"

pc.create_index(
    name= index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-01', 'x-cloud-trace-context': '9ff3db5420cd49863ca5dbbb9f6211a1', 'date': 'Sun, 01 Jun 2025 15:45:57 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [19]:


from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [20]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x22ea9512240>

In [21]:
# Load Existing index

from langchain_pinecone import PineconeVectorStore

#embed each chunk and upsert the embeddings into your Pincone index
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)


In [22]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x22eab083e30>

In [32]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":1})

retrieved_docs = retriever.invoke("what is the size of Egyptian vulture")

retrieved_docs

[Document(id='e0ca279e-a5e2-46c3-b78a-938e9c159dae', metadata={'author': 'SONY', 'creationdate': '2015-09-30T18:58:42+05:30', 'creator': 'CorelDRAW X7', 'moddate': '2015-09-30T18:58:42+05:30', 'page': 66.0, 'page_label': '67', 'producer': 'Corel PDF Engine Version 17.0.0.491', 'source': 'Data\\Birds-of-Indian-Subcontinent.pdf', 'title': 'BIRD BOOK.cdr', 'total_pages': 152.0}, page_content='v Size-85cm\nv Red colour head and neck \nwith black bill\nv White puﬀ in the neck\nv Black tail.\nv Reddish legs and feet.\nv While ﬂight-Dark black colour \n(wings) coverts with light grey \nprimary and secondaries\nRED-HEADED VULTURE\n55\nTour Guide on Avifauna IGNFA')]

In [34]:
retrieved_docs[0].page_content

'v Size-85cm\nv Red colour head and neck \nwith black bill\nv White puﬀ in the neck\nv Black tail.\nv Reddish legs and feet.\nv While ﬂight-Dark black colour \n(wings) coverts with light grey \nprimary and secondaries\nRED-HEADED VULTURE\n55\nTour Guide on Avifauna IGNFA'

In [35]:
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")

In [36]:
import os
os.environ["MISTRAL_API_KEY"] = MISTRAL_API_KEY

In [40]:
# ! pip install langchain_mistralai

In [41]:
from langchain_mistralai import ChatMistralAI
llm = ChatMistralAI(temperature=0.4, max_tokens=500)

In [42]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Chain

In [43]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [46]:
response = rag_chain.invoke({"input": "tell me about great barbet"})
print(response["answer"])

The Great Barbet is a bird species with a size of 33cm. It has a blackish blue head and a yellow beak. Its wings are pale brown and green, and its tail is green. The underparts are olive streaked yellowish, and it has red under tail-coverts.
