In [1]:
# Run this cell on G-Colab

# !pip install langchain
# !pip install pinecone
# !pip install pypdf
# !pip install sentence_transformers
# !pip install langchain_pinecone

In [3]:
import os
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import pandas as pd

In [4]:
# Get the api from pinecone website
# Create a new index on pinecone site - [name: "healthcare-chatbot", dimension: 384(as per our embedding model "all-MiniLM-L6-V2")]
# write the api key in .env file and add .env to .gitignore
from dotenv import load_dotenv
load_dotenv()

API_KEY = os.getenv("PINECONE_API_KEY")

In [5]:
# PINECONE_API_KEY

In [6]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)

    documents = loader.load()

    return documents

In [7]:
# create a "data" dir and move the pdf file inside
# extract data from the PDF

current_dir = os.getcwd()
pdf_path = os.path.join(current_dir, '..', 'data')
print(pdf_path)

c:\Users\Personal\Desktop\DSMP2023-2024\Self_Researched_Projects\Proj-11\Llama2-Medical-Chatbot\experiments\..\data


In [8]:
extracted_data = load_pdf(pdf_path)

In [9]:
# Create text chunks

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [10]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


In [15]:
text_chunks[5]

Document(page_content='mation presented in this publication, the Gale Group neither guaranteesthe accuracy of the data contained herein nor assumes any responsibili-ty for errors, omissions or discrepancies. The Gale Group accepts nopayment for listing, and inclusion in the publication of any organiza-tion, agency, institution, publication, service, or individual does notimply endorsement of the editor or publisher. Errors brought to theattention of the publisher and verified to the satisfaction of the publish-er', metadata={'source': 'c:\\Users\\Personal\\Desktop\\DSMP2023-2024\\Self_Researched_Projects\\Proj-11\\Llama2-Medical-Chatbot\\experiments\\..\\data\\Medical_book.pdf', 'page': 3})

In [16]:
text_chunks[5].page_content

'mation presented in this publication, the Gale Group neither guaranteesthe accuracy of the data contained herein nor assumes any responsibili-ty for errors, omissions or discrepancies. The Gale Group accepts nopayment for listing, and inclusion in the publication of any organiza-tion, agency, institution, publication, service, or individual does notimply endorsement of the editor or publisher. Errors brought to theattention of the publisher and verified to the satisfaction of the publish-er'

In [18]:
# Len will be lesser than 500
len(text_chunks[10].page_content)

490

In [19]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [20]:
embeddings = download_hugging_face_embeddings()

In [21]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [22]:
# post embedding, we get 384 dimensions

query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [23]:
# Performing embedding for entire document(text_chunks)
embeds = [embeddings.embed_query(t.page_content) for t in text_chunks]

In [24]:
print(len(embeds[0]))
print(len(embeds))

384
7020


In [25]:
df = pd.DataFrame({'id': map(str, range(1, len(embeds)+1)), 'vectors': embeds})

In [26]:
df.shape

(7020, 2)

In [27]:
df.head()

Unnamed: 0,id,vectors
0,1,"[0.0017461641691625118, -0.03350285068154335, ..."
1,2,"[0.0029233540408313274, -0.014550319872796535,..."
2,3,"[0.011354200541973114, -0.04454658553004265, -..."
3,4,"[0.03532915189862251, -0.04230417683720589, 0...."
4,5,"[-0.09936312586069107, 0.027253154665231705, 0..."


In [46]:
import random
import itertools
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=API_KEY)
index_name = "healthcare-chatbot2"

pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)


In [47]:
# Verify the index exists
indexes = pc.list_indexes()
if index_name not in indexes.names():
    # Create index if it doesn't exist (optional)
    pc.create_index(
        name=index_name,
        dimension=384,
        metric='cosine',
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Connect to the index
index = pc.Index(index_name)

# Convert DataFrame to list of (id, vector) tuples
data_tuples = list(df.itertuples(index=False, name=None))

# Helper function to break an iterable into chunks of size batch_size
def chunks(iterable, batch_size=100):
    it = iter(iterable)
    chunk = tuple(itertools.islice(it, batch_size))
    while chunk:
        yield chunk
        chunk = tuple(itertools.islice(it, batch_size))

# Upsert data with 100 vectors per upsert request
for ids_vectors_chunk in chunks(data_tuples, batch_size=100):
    index.upsert(vectors=ids_vectors_chunk)

In [51]:
# Printing index related info
print(pc.list_indexes())
print("*******************************************************")
print(index.name)
print("*******************************************************")
for index in pc.list_indexes():
    print(index['name'])

{'indexes': [{'dimension': 384,
              'host': 'healthcare-chatbot2-gn0x07u.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'healthcare-chatbot2',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 384,
              'host': 'healthcare-chatbot-gn0x07u.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'healthcare-chatbot',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'dimension': 384,
              'host': 'docs-quickstart-new-gn0x07u.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'docs-quickstart-new',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [39]:
from langchain_pinecone import PineconeVectorStore

# Initialize environment variables with your API keys
os.environ['PINECONE_API_KEY'] = API_KEY

# Define the Pinecone index name and embeddings
index_name = index_name

# Initialize the PineconeVectorStore with the existing index
vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

In [40]:
query = "What are Allergies"
query_vector = embeddings.embed_query(query)

In [41]:
# Assuming you have already initialized the Pinecone client and connected to your index:
top_k = 3  # Number of top results to retrieve

# Perform a similarity search
query_response = index.query(
    vector=query_vector,
    top_k=top_k,
    include_metadata=True
)

# Process the query response
for match in query_response['matches']:
    print(f"ID: {match['id']}, Score: {match['score']}, Metadata: {match.get('metadata', 'No metadata')}")

ID: 1373, Score: 0.682266593, Metadata: No metadata
ID: 1356, Score: 0.678240061, Metadata: No metadata
ID: 1306, Score: 0.676807523, Metadata: No metadata


In [42]:
index.query(
    namespace="ns1",
    vector=query_vector,
    top_k=2,
    include_values=True,
    include_metadata=True,
    filter={"genre": {"$eq": "action"}}
)

{'matches': [], 'namespace': 'ns1', 'usage': {'read_units': 1}}