In [1]:
import os
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


In [2]:
# Get the API key from the Pinecone website.
# Create a new index on the Pinecone site - [name: "healthcare-chatbot", dimension: 384 (as per our embedding model "all-MiniLM-L6-v2")]
# Write the key in a .env file as PINECONE_API_KEY=<your key> and add .env to .gitignore.
# Never paste the literal key into this notebook (not even in a comment): it ends up
# in version control. Any key that was previously committed should be rotated.

from dotenv import load_dotenv
load_dotenv()

API_KEY = os.getenv("PINECONE_API_KEY")

# Fail here with a clear message instead of letting a missing key surface
# later as an opaque client error.
if not API_KEY:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to your .env file")

In [3]:
# Confirm that a key was loaded without echoing the secret into the saved cell output.
API_KEY is not None

True

In [4]:
def load_pdf(data):
    """Load the PDF documents found in a directory.

    Args:
        data: Path of the directory to scan; passed straight to
            ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for files matching
        ``*.pdf``, each parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [5]:
# Put the source PDF file(s) in a "data" directory that sits one level above
# the current working directory; the path below is built relative to it.

current_dir = os.getcwd()
path_parts = (current_dir, '..', 'data')
pdf_path = os.path.join(*path_parts)
print(pdf_path)

c:\Users\Personal\Desktop\DSMP2023-2024\Self_Researched_Projects\Proj-11\Llama2-Medical-Chatbot\experiments\..\data


In [6]:
# Load every PDF under pdf_path into a list of documents for the splitting step below.
extracted_data = load_pdf(pdf_path)

In [7]:
# Create text chunks

def text_split(extracted_data):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        extracted_data: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by a splitter configured with
        ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return splitter.split_documents(extracted_data)

In [8]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
chunk_count = len(text_chunks)
print("length of my chunk:", chunk_count)

length of my chunk: 7020


In [9]:
#download embedding model
def download_hugging_face_embeddings():
    """Build the embedding model used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [10]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embeddings()

In [11]:
# Display the embeddings object's repr to inspect the loaded model configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [12]:
# Sanity check: embed a short string and print the vector length, which must
# match the dimension (384) the Pinecone index was created with.

query_result = embeddings.embed_query("Hello world")
vector_length = len(query_result)
print("Length", vector_length)

Length 384


In [13]:
# Execute once from terminal: pip install --upgrade langchain-pinecone
# NOTE(review): this import rebinds the name `Pinecone`, which the first cell
# imported from langchain.vectorstores; from here on `Pinecone` refers to the
# client class from the `pinecone` package.
from pinecone import Pinecone, PodSpec

# Create the client with the key read from the environment, then get a handle
# to the "healthcare-chatbot" index.
pc = Pinecone(api_key=API_KEY)
index = pc.Index("healthcare-chatbot")

In [49]:
# Print the name of every index returned by the client.
# NOTE(review): the loop variable reuses the name `index`, so after this cell
# runs `index` is the last listed entry rather than the handle returned by
# pc.Index(...). A later cell reads index['name'] and relies on this rebinding.
for index in pc.list_indexes():
    print(index['name'])

healthcare-chatbot


In [50]:
# Inspect `index`. Per the recorded output it holds an index description
# (dimension, host, metric, name, spec, status) left over from the listing loop.
index

{'dimension': 384,
 'host': 'healthcare-chatbot-gn0x07u.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'healthcare-chatbot',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}

In [45]:
# Show the full response of the index listing call.
pc.list_indexes()

{'indexes': [{'dimension': 384,
              'host': 'healthcare-chatbot-gn0x07u.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'healthcare-chatbot',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [52]:
# Ensure the Pinecone client is properly initialized and the target index exists.
# The name is checked and printed from a local constant instead of reading
# index['name'], which only worked when `index` had been rebound by an earlier
# listing loop (hidden notebook state).
target_index = 'healthcare-chatbot'
if target_index not in pc.list_indexes().names():
    # Handle the case where the index does not exist
    raise ValueError("Index 'healthcare-chatbot' does not exist")
print(target_index)

healthcare-chatbot


In [65]:
# The LangChain vector-store wrapper is aliased to PineconeStore so it does not
# collide with the Pinecone client class imported from the `pinecone` package.
from langchain.vectorstores import Pinecone as PineconeStore
from pinecone import Pinecone

# Initialize the Pinecone client with the key loaded from the environment.
# A literal key must never be written into the notebook source.
pc = Pinecone(api_key=API_KEY)

index_name = "healthcare-chatbot"

# Verify the index exists
indexes = pc.list_indexes()
if index_name not in indexes.names():
    raise ValueError(f"Index {index_name} does not exist")

In [66]:
# Use PineconeStore to create the document search
# NOTE(review): per the traceback recorded below this cell, this call fails with
# AttributeError ("list_indexes is no longer a top-level attribute of the
# pinecone package"). Presumably the legacy langchain.vectorstores wrapper still
# calls the removed module-level pinecone.list_indexes - confirm. The
# langchain-pinecone package named in the install hint earlier in this notebook
# is the likely replacement.
docsearch = PineconeStore.from_texts(
    [t.page_content for t in text_chunks],  # raw text of each chunk
    embeddings,
    index_name=index_name
)

AttributeError: list_indexes is no longer a top-level attribute of the pinecone package.

To use list_indexes, please create a client instance and call the method there instead.

Example:

    from pinecone import Pinecone
    
    pc = Pinecone(api_key='YOUR_API_KEY')

    index_name = "quickstart" # or your index name

    if index_name not in pc.list_indexes().names():
        # do something

