In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain.vectorstores import Pinecone
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
from pinecone import Pinecone as PineconeClient, ServerlessSpec

In [2]:
PINECONE_API_KEY="pcsk_B7UAM_K49AHaagksdc1uEKvYNEhC6CJ3r66UUrpzAcHLuu9Qw6JFvuRXmsuioXvD2X6fe"
PINECONE_ENV="https://medical-chatbot-347w7a5.svc.aped-4627-b74a.pinecone.io"

In [3]:
#Extract Data From the PDF File
def load_pdf_file(data):
    loader= DirectoryLoader(data,
                            glob="*.pdf",
                            loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [4]:
# %pip install pypdf

extracted_data= load_pdf_file(data='data/')

In [5]:
# Create a Text Splitter
def text_splitter(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    split_documents = text_splitter.split_documents(extracted_data)
    return split_documents

In [6]:
text_chunks = text_splitter(extracted_data)

In [7]:
print(f"Total number of text chunks: {len(text_chunks)}")

Total number of text chunks: 5859


In [8]:
# Download the embedding model 
def download_embedding_model():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [9]:
embeddings=download_embedding_model()

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [10]:
# %pip install --upgrade sentence-transformers

In [11]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [12]:
query_result = embeddings.embed_query("What is the treatment for diabetes?")
print(f"length of query result: {len(query_result)}")

length of query result: 384


In [None]:
# 📋 COMPLETE PINECONE INITIALIZATION GUIDE FOR v7.x

print("🔧 PINECONE VECTOR DATABASE INITIALIZATION")
print("=" * 50)

# Step 1: Import Pinecone components
from pinecone import Pinecone, ServerlessSpec

# Step 2: Initialize client
pc = Pinecone(api_key=PINECONE_API_KEY)
print("✅ Pinecone client initialized")

# Step 3: Index configuration
index_name = "medical-chatbot"
dimension = 384  # sentence-transformers/all-MiniLM-L6-v2 dimension (CORRECTED)

# Step 4: Check existing indexes
indexes = pc.list_indexes()
print(f"📁 Existing indexes: {indexes.names()}")

# Step 5: Create or connect to index
if index_name not in indexes.names():
    print(f"🚀 Creating new index: {index_name}")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    print("✅ Index created!")
else:
    print(f"🔗 Connecting to existing index: {index_name}")

# Step 6: Get index object
index = pc.Index(index_name)

# Step 7: Verify connection
try:
    stats = index.describe_index_stats()
    print(f"\n📊 INDEX STATS:")
    print(f"   Vectors stored: {stats.get('total_vector_count', 0)}")
    print(f"   Dimension: {stats.get('dimension', 'N/A')}")
    print(f"   Fullness: {stats.get('index_fullness', 0):.2%}")
    print("✅ Connection verified!")
except Exception as e:
    print(f"❌ Connection error: {e}")

print(f"\n🎯 READY TO USE:")
print(f"   Client variable: pc")
print(f"   Index variable: index")
print(f"   Index name: {index_name}")

🔧 PINECONE VECTOR DATABASE INITIALIZATION
✅ Pinecone client initialized
📁 Existing indexes: ['medical-chatbot']
🔗 Connecting to existing index: medical-chatbot
📁 Existing indexes: ['medical-chatbot']
🔗 Connecting to existing index: medical-chatbot

📊 INDEX STATS:
   Vectors stored: 0
   Dimension: 1024
   Fullness: 0.00%
✅ Connection verified!

🎯 READY TO USE:
   Client variable: pc
   Index variable: index
   Index name: medical-chatbot

📊 INDEX STATS:
   Vectors stored: 0
   Dimension: 1024
   Fullness: 0.00%
✅ Connection verified!

🎯 READY TO USE:
   Client variable: pc
   Index variable: index
   Index name: medical-chatbot


In [23]:
# 🗄️ WORKING WITH PINECONE VECTOR DATABASE

print("🔍 PINECONE DATABASE OPERATIONS")
print("=" * 40)

# Function to store documents in Pinecone
def store_documents_in_pinecone(documents, embeddings_model, pinecone_index):
    """Store document embeddings in Pinecone"""
    vectors = []
    
    for i, doc in enumerate(documents):
        # Generate embedding
        embedding = embeddings_model.embed_query(doc.page_content)
        
        # Create vector with metadata
        vector = {
            "id": f"doc_{i}",
            "values": embedding,
            "metadata": {
                "text": doc.page_content,
                "source": getattr(doc, 'metadata', {}).get('source', 'unknown')
            }
        }
        vectors.append(vector)
    
    # Upsert in batches
    batch_size = 100
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i+batch_size]
        pinecone_index.upsert(vectors=batch)
    
    return len(vectors)

# Function to query Pinecone
def query_pinecone_db(query_text, embeddings_model, pinecone_index, top_k=3):
    """Query Pinecone database"""
    # Generate query embedding
    query_embedding = embeddings_model.embed_query(query_text)
    
    # Search Pinecone
    results = pinecone_index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )
    
    return results

# Store your medical documents (if you have text_chunks available)
if 'text_chunks' in globals() and len(text_chunks) > 0:
    print(f"📤 Storing {len(text_chunks)} documents...")
    stored_count = store_documents_in_pinecone(text_chunks, embeddings, index)
    print(f"✅ Stored {stored_count} documents in Pinecone")
    
    # Test query
    print(f"\n🔍 Testing query...")
    test_query = "What are the symptoms of diabetes?"
    results = query_pinecone_db(test_query, embeddings, index)
    
    print(f"Query: '{test_query}'")
    print(f"Found {len(results['matches'])} results:")
    
    for i, match in enumerate(results['matches']):
        score = match['score']
        text = match['metadata']['text'][:100] + "..."
        print(f"  {i+1}. Score: {score:.3f}")
        print(f"     Text: {text}")
        print()
        
else:
    print("⚠️ No text_chunks found. Run document processing cells first.")
    
print("🎯 Pinecone is ready for your medical chatbot!")

🔍 PINECONE DATABASE OPERATIONS
📤 Storing 5859 documents...


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Thu, 10 Jul 2025 16:50:45 GMT', 'Content-Type': 'application/json', 'Content-Length': '103', 'Connection': 'keep-alive', 'x-pinecone-request-latency-ms': '1224', 'x-pinecone-request-id': '7306594509999880394', 'x-envoy-upstream-service-time': '92', 'server': 'envoy'})
HTTP response body: {"code":3,"message":"Vector dimension 384 does not match the dimension of the index 1024","details":[]}
