In [4]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import  ctransformers
import os
import numpy as np
import pickle

## **Extract data from the PDF**

In [5]:
def load_pdf(data):
    loader=DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [6]:
extracted_data = load_pdf("/media/supunlakshan/Learning Hub/AI & Machine Learning/LLM,OpenAI/End-to-End-Medical-Chatbot/data")

In [None]:
extracted_data

## **Text preprocessing**

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

# Download necessary resources
nltk.download('stopwords')
nltk.download('punkt')

# Define preprocessing functions
def clean_text(text):
    # Remove non-alphanumeric characters but keep URLs intact
    text = re.sub(r'[^\w\s\./-]', '', text)
    text = ' '.join(text.split())
    return text

def lowercase_text(text):
    # Convert text to lowercase
    return text.lower()

def remove_stopwords(text):
    # Remove stopwords from the text (excluding URLs)
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words and not re.match(r'^https?://', token)]
    return ' '.join(filtered_tokens)

# Apply preprocessing to each document
preprocessed_documents = []
for document in extracted_data:
    content = document.page_content
    # Clean text (keeping URLs)
    cleaned_content = clean_text(content)
    # Lowercase text
    lowercase_content = lowercase_text(cleaned_content)
    # Remove stopwords (excluding URLs)
    filtered_content = remove_stopwords(lowercase_content)
    # Update the 'page_content' attribute of the document with preprocessed text
    document.page_content = filtered_content
    # Append the preprocessed document to the list
    preprocessed_documents.append(document)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/supunlakshan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/supunlakshan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
len(preprocessed_documents)

637

In [None]:
preprocessed_documents

## **Create text chunks**

In [11]:
def text_split(preprocessed_documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(preprocessed_documents)  # Wrap extracted_data in a list
    return text_chunks

text_chunks=text_split(list(preprocessed_documents))
print("length of text_chunks:",len(text_chunks))

length of text_chunks: 4564


In [None]:
# text_chunks

In [13]:

print("Length of text_chunks:", len(text_chunks))
print(type(text_chunks))

Length of text_chunks: 4564
<class 'list'>


## **Embedding**

In [15]:

from sentence_transformers import SentenceTransformer

# Extract text from Document objects
text_list = [t.page_content for t in text_chunks]

# Initialize SentenceTransformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode the text chunks
embeddings = model.encode(text_list)

# Print the embeddings
print(embeddings)



[[ 0.00174608 -0.03350287 -0.03290391 ... -0.00555645  0.10660913
   0.05099721]
 [-0.00524566 -0.01224833 -0.03722573 ...  0.0075348   0.08098163
   0.05183331]
 [ 0.02046325 -0.0224895  -0.00601478 ... -0.0070935  -0.00974825
   0.04680153]
 ...
 [-0.09775922  0.06158298 -0.04184233 ...  0.05067963 -0.04601221
  -0.07389956]
 [ 0.03905462  0.01553802 -0.0352585  ... -0.0279488  -0.05134996
   0.12449328]
 [ 0.01676182  0.05766533 -0.09481081 ... -0.04915264  0.03086303
   0.09051288]]


In [16]:
# Get the dimensions of the embeddings
embedding_dimensions = embeddings.shape

# Print the dimensions
print("Embedding dimensions:", embedding_dimensions)

print(type(embeddings))
print(len(embeddings))

Embedding dimensions: (4564, 384)
<class 'numpy.ndarray'>
4564


In [17]:
from pinecone import Pinecone

pc = Pinecone(api_key="afedfe83-ec30-4c3c-b51c-2e565474af4a")
index = pc.Index("mymchatbot")

In [18]:
# Specify your namespace
namespace = "book1"

In [None]:
import uuid
ids = [str(uuid.uuid4()) for _ in range(len(embeddings))]

print(len(ids))
print(ids)

In [20]:
vectors_to_upsert = [
    {
        "id": str(ids[i]),  # Ensure each ID is a string
        "values": embeddings[i],  # The embedding vector for the text chunk
        "metadata": {"page_content": str(text_chunks[i])}  # Storing page content as metadata
    } for i in range(len(embeddings))
]

In [None]:
print(type(vectors_to_upsert))
print(vectors_to_upsert[1000])

In [22]:
# Proceed with the upsert
def upsert_in_batches(index, vectors, namespace, batch_size=100):
    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i+batch_size]
        index.upsert(vectors=batch, namespace=namespace)

# Usage
upsert_in_batches(index, vectors_to_upsert, namespace, batch_size=100)

In [23]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'book1': {'vector_count': 4564}},
 'total_vector_count': 4564}

## **If we already have an index we can load it like this**

In [24]:
query=("what is the Autism?")

In [25]:
query_vector = model.encode(query)

In [None]:
print(query_vector.shape)
print(len(query_vector))
query_vector

In [27]:
# Assuming `query_vector` is a numpy array
query_vector_list = query_vector.tolist()  # Convert numpy array to a list

query_result = index.query(
    vector=query_vector_list,  # Your query vector
    namespace="book1",
    top_k=5,  # Number of top similar vectors to retrieve
    include_metadata=True  # Ensure metadata is included in the response
)

In [28]:
# for match in query_result['matches']:
#     print(f"ID: {match['id']}, Score: {match['score']}, Page Content: {match['metadata'].get('page_content')}")

for match in query_result['matches']:
    print((f"{match['metadata'].get('page_content')}"))


page_content='autism research naar. http//www. naar.org. national information center children youth dis- abilities. http//www.nichcy.org/transitn.htm. carol a. turkington autograft seeskin grafting gale encyclopedia medicine 2 421autismgem - 0001 0432 - 10/22/03 143 pm page 421' metadata={'source': '/media/supunlakshan/Learning Hub/AI & Machine Learning/LLM,OpenAI/End-to-End-Medical-Chatbot/data/Medical_book.pdf', 'page': 434}
page_content='times often boys usual-ly first-born occurs around world racesand social backgrounds. autism usually evident thefirst three years life although children itshard tell problem develops. sometimes thecondition isnt diagnosed child enters school. person autism symptoms ranging mild severe 10 extraor-dinary ability one area mathematics mem-ory music art. children known autisticsavants formerly known idiot savants.. causes symptoms autism brain disorder affects way brain uses transmits information.' metadata={'source': '/media/supunlakshan/Learning Hub/AI

In [29]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [30]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}

In [None]:
llm = ctransformers.CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                                  model_type="llama",
                                  config={'max_new_tokens': 512,
                                          'temperature': 0.8}
)

In [32]:
from langchain_pinecone import PineconeVectorStore

# Assuming this is your sentence transformer model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Create the PineconeVectorStore instance (assuming you have the index details)
vectorstore = PineconeVectorStore(index, model)



In [33]:
from langchain.retrievers import SelfQueryRetriever

document_content_description = "page_content"
metadata_field_info = {}  # Define the metadata_field_info variable (if needed)

retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
    enable_limit=True,
    verbose=True,
)


In [34]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Replace "stuff" with your actual chain type name
    retriever=retriever,
)


In [None]:
import nltk  # Import for potential text truncation

def get_user_input():
  """Prompts the user for input and handles potential truncation."""
  max_tokens = 512  # Adjust this limit as needed
  while True:
    user_input = input(f"Input Prompt:")
    if len(nltk.word_tokenize(user_input)) <= max_tokens:  # Check token length
      return user_input
    else:
      print(f"Warning: Input exceeds maximum length ({max_tokens} tokens). Please try again with a shorter prompt.")

while True:
  user_input = get_user_input()  # Get user input with truncation handling

  # Embed the user query using encode
  query_vector = model.encode(user_input)

  try:
    # Use Langchain for retrieval and question answering (using invoke)
    result = qa.invoke({"query": query_vector.tolist()})
    print("Response :", result["result"])
  except Exception as e:
    print(f"An error occurred: {e}")
    print("Please try again with a different prompt.")
