In [17]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader

def load_pdf(data, fn):
    """Load PDF files from a directory into LangChain documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``.
        fn: Glob pattern (in this notebook, a single file name) selecting
            which files inside ``data`` are read.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        file being parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob=fn, loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [18]:
# Read the sample PDF that lives in the local data/ directory.
file_name = "sample2.pdf"
extracted_data = load_pdf("data/", fn=file_name)

In [19]:
# Number of documents returned by the loader (presumably one per PDF page — TODO confirm).
print(len(extracted_data))

637


In [20]:
#Create text chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter #to convert to chunks

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split, e.g. the output of ``load_pdf``.
        chunk_size: Maximum chunk size passed to the splitter. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the value that used to be hard-coded.

    Returns:
        The chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)


In [21]:
# Break the loaded documents into chunks using the splitter configured in text_split.
text_chunks = text_split(extracted_data)


In [22]:
# Embedding model: all-MiniLM-l6-v2 called through the Hugging Face Inference
# API. (A local alternative is langchain.embeddings.HuggingFaceEmbeddings with
# the same model name.)
import getpass
import os

from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings

# Never store the token in the notebook: read it from the environment, or
# prompt for it interactively when the variable is not set.
hf_api_key = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass.getpass(
    "Hugging Face API token: "
)

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=hf_api_key,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
)

In [23]:
# Display the configured embeddings object; the recorded output shows api_key masked as a SecretStr.
embeddings

HuggingFaceInferenceAPIEmbeddings(api_key=SecretStr('**********'), model_name='sentence-transformers/all-MiniLM-l6-v2', api_url=None, additional_headers={})

In [24]:
# Smoke-test the embedding endpoint with a short string and report the vector length.
# The recorded run printed 384, the same value used for `dimension` when the Pinecone index is created.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [25]:
# from pinecone.grpc import PineconeGRPC as Pinecone
import getpass
import json
import os
import time

from pinecone import ServerlessSpec, Pinecone
from langchain.vectorstores import Pinecone as PineconeStore
from langchain_pinecone import PineconeVectorStore, PineconeEmbeddings

# Initialize a client. The key comes from the environment (or an interactive
# prompt) instead of being hard-coded in the notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass(
    "Pinecone API key: "
)
pc = Pinecone(api_key=pinecone_api_key)

# Define index name and check/create it
index_name = "pdf-chatbot"

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the length of the embedding vectors
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until the new index reports ready. This loop needs `time`, which
    # the cell previously never imported (NameError on the first not-ready poll).
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

# Extract page content from text chunks (plain attribute access; no JSON
# serialize-and-parse round-trip is needed).
texts = [chunk.page_content for chunk in text_chunks]

vector_store = PineconeVectorStore(index=index, embedding=embeddings)
# embeddings = PineconeEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")

# index_name = "my-index"
# vectorstore = PineconeVectorStore.from_texts(
#     texts,
#     index_name=index_name,
#     embedding=embeddings,
# )

# Create docsearch object using from_texts method
# docsearch = PineconeStore.from_texts(texts, embeddings, index_name=index_name)

# vector_store can now be used for further operations such as similarity search.


In [10]:
from uuid import uuid4

# Index the chunks produced by text_split, not the raw loader output:
# otherwise the chunking step is never used and whole pages are embedded.
# NOTE: ids are random, so re-running this cell stores the same content again
# under new ids.
uuids = [str(uuid4()) for _ in text_chunks]

vector_store.add_documents(documents=text_chunks, ids=uuids)

KeyboardInterrupt: 

In [11]:
# If we already have an index we can load it like this
# docsearch=PineconeStore.from_existing_index(index_name, embeddings)

query = "summarize the whole pdf"

# Fetch the three stored documents most similar to the query.
docs = vector_store.similarity_search(query, k=3)

# print("Result", len(docs))
for doc in docs:
    # Read the text straight from the document instead of serializing it to
    # JSON and parsing it back.
    print(doc.page_content)
    print('\n\n\n')
  

In [26]:
# Prompt text for the QA chain. {context} and {question} are the two
# placeholders declared as input variables when the PromptTemplate is built
# from this string.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Return a helpful answer and quote the reference from the pdf itself.
Helpful answer:
"""

In [27]:
from langchain import PromptTemplate
# Wrap the raw template text in a PromptTemplate and expose it to the chain
# through the "prompt" entry of chain_type_kwargs.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=PROMPT)

In [28]:
# from langchain.llms import CTransformers #quantised model
# llm=CTransformers(model="TheBloke/Llama-2-7B-Chat-GGUF",
#                   model_type="llama",
#                   config={'max_new_tokens':512,
#                           'temperature':0.8})
import getpass
import os

# Ask for the key interactively when it is not already exported. A literal key
# in the notebook is leaked as soon as the file is shared or committed.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model used as the answer generator of the QA chain.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

In [29]:
from langchain.chains import RetrievalQA
# Retrieval QA chain: the retriever pulls the 2 best-matching documents from
# the vector store, and the custom prompt is passed in via chain_type_kwargs.
# Source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
    chain_type="stuff",
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [None]:
# Interactive question loop. Type "exit" or "quit" (or interrupt the kernel /
# send EOF) to leave it; previously the loop had no way to terminate.
while True:
    try:
        user_input = input("Input Prompt:").strip()
    except (EOFError, KeyboardInterrupt):
        break

    if user_input.lower() in ("exit", "quit"):
        break
    if not user_input:
        # Nothing typed: ask again instead of sending an empty query.
        continue

    # `invoke` is the supported entry point; calling the chain object directly
    # is deprecated.
    result = qa.invoke({"query": user_input})
    print("Response : ", result["result"])

Input Prompt: summerize the pdf


Response :  This excerpt from the Gale Encyclopedia of Medicine, 2nd Edition, provides information about acoustic neuromas. 

"An acoustic neuroma is a benign tumor involving cells of the myelin sheath that surrounds the vestibulocochlear nerve (eighth cranial nerve)." 



Input Prompt: what is allergy


In [6]:
import getpass
import os

from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# NOTE: this cell rebinds `Pinecone`, `pc` and `index_name`, which the PDF
# pipeline cells in this notebook also use.

# Initialize a client. The key comes from the environment (or an interactive
# prompt) instead of being hard-coded in the notebook.
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: ")
)

# Create a serverless index
index_name = "example-index"
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=2,  # toy index for 2-component vectors
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [7]:
import time  # needed by the polling loop below; it was never imported before

# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

# The same three ids are written to two separate namespaces with different values.
index.upsert(
    vectors=[
        {"id": "vec1", "values": [1.0, 1.5]},
        {"id": "vec2", "values": [2.0, 1.0]},
        {"id": "vec3", "values": [0.1, 3.0]},
    ],
    namespace="example-namespace1"
)

index.upsert(
    vectors=[
        {"id": "vec1", "values": [1.0, -2.5]},
        {"id": "vec2", "values": [3.0, -2.0]},
        {"id": "vec3", "values": [0.5, -1.5]},
    ],
    namespace="example-namespace2"
)


upserted_count: 3

In [8]:
# Index-wide statistics first, then one top-3 query per namespace.
print(index.describe_index_stats())

query_results1 = index.query(
    vector=[1.0, 1.5],
    namespace="example-namespace1",
    top_k=3,
    include_values=True,
)
query_results2 = index.query(
    vector=[1.0, -2.5],
    namespace="example-namespace2",
    top_k=3,
    include_values=True,
)

print(query_results1)
print(query_results2)

{'dimension': 2,
 'index_fullness': 0.0,
 'namespaces': {'example-namespace1': {'vector_count': 3},
                'example-namespace2': {'vector_count': 3}},
 'total_vector_count': 6}
{'matches': [{'id': 'vec1',
              'metadata': None,
              'score': 1.0,
              'sparse_values': {'indices': [], 'values': []},
              'values': [1.0, 1.5]},
             {'id': 'vec2',
              'metadata': None,
              'score': 0.86824316,
              'sparse_values': {'indices': [], 'values': []},
              'values': [2.0, 1.0]},
             {'id': 'vec3',
              'metadata': None,
              'score': 0.85006815,
              'sparse_values': {'indices': [], 'values': []},
              'values': [0.1, 3.0]}],
 'namespace': 'example-namespace1',
 'usage': {'read_units': 6}}
{'matches': [{'id': 'vec1',
              'metadata': None,
              'score': 1.0,
              'sparse_values': {'indices': [], 'values': []},
              'values':

In [9]:
# Remove the index named by `index_name`.
# NOTE(review): `index_name` is assigned both "pdf-chatbot" and "example-index" in this notebook;
# confirm which value is live before running, since deletion presumably cannot be undone.
pc.delete_index(index_name)