In [11]:
%pwd

'c:\\'

In [13]:
import os

# Switch to the project root so the relative paths used further down
# ("Data/", ".env") resolve. Set the CHATBOT_PROJECT_DIR environment variable
# to run the notebook from another checkout; when it is unset or empty, the
# original hard-coded location is used.
os.chdir(
    os.environ.get("CHATBOT_PROJECT_DIR")
    or "C:/Users/vinod/OneDrive/Desktop/Coding/AIWorld/chatbot"
)

In [14]:
%pwd

'C:\\Users\\vinod\\OneDrive\\Desktop\\Coding\\AIWorld\\chatbot'

In [17]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader 

In [None]:
# Load every PDF file found in a directory

def load_pdf_file(data):
    """Load the files in a directory that match ``*.pdf``.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when it is configured
        with the ``*.pdf`` glob and ``PyPDFLoader`` as the loader class.
    """
    pdf_loader_options = {"glob": "*.pdf", "loader_cls": PyPDFLoader}
    return DirectoryLoader(data, **pdf_loader_options).load()
  


In [20]:
# Load the PDFs from the "Data/" directory (a path relative to the current working directory).
extracted_data = load_pdf_file(data="Data/")

In [22]:
# extracted_data

In [23]:
# Split the extracted documents into text chunks

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)



In [26]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of text chunks {len(text_chunks)}")

Length of text chunks 5860


In [28]:
# text_chunks

In [37]:
#Download the Embeddings from Hugging face

from langchain_huggingface import HuggingFaceEmbeddings


In [38]:
def downlaod_hugging_face_embedding(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create a ``HuggingFaceEmbeddings`` instance.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the previously hard-coded
            ``sentence-transformers/all-MiniLM-L6-v2``.
            NOTE(review): the Pinecone index in this notebook is created with
            dimension=384; presumably a model with a different output size
            would need a matching index. Confirm before switching models.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` object.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Correctly spelled alias; the original (misspelled) name is kept so existing
# calls keep working.
download_hugging_face_embedding = downlaod_hugging_face_embedding

In [39]:
# Instantiate the embedding object that the later cells pass as `embedding=`.
embeddings = downlaod_hugging_face_embedding()

In [43]:
# Sanity check: embed a short string and report the length of the vector.
# This is a sentence-transformers model: it maps sentences & paragraphs to a
# 384-dimensional dense vector space and can be used for tasks like clustering
# or semantic search.
query_result = embeddings.embed_query('Hello world')
print(f"Length:  {len(query_result)}")

Length:  384


In [44]:
#query_result

In [46]:
# Create the Pinecone index (only when it does not exist yet)
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# override=True lets values from .env replace variables that are already set
# in the process environment.
load_dotenv(".env", override=True)

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
if not PINECONE_API_KEY:
    # Fail early with a clear message instead of an opaque client error.
    raise ValueError("PINECONE_API_KEY is not set; define it in .env or the environment")

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medibot"

# Creating an index that already exists fails, which made this cell
# impossible to re-run. Only create the index when it is missing.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # matches the embedding length printed earlier in the notebook
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Show the index description as the cell output, whether or not the index
# was created by this run.
pc.describe_index(index_name)

{
    "name": "medibot",
    "metric": "cosine",
    "host": "medibot-sgn6oe0.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [47]:
import os

# os.environ only accepts strings: assigning None raises an unhelpful
# TypeError, so fail with an explicit message when the key is missing.
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY is not set; define it in .env or the environment")

# Export the key through the process environment (presumably so that
# langchain_pinecone, used in the following cells, can pick it up).
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [48]:
# Embed each chunk and insert the embeddings into the Pinecone index.
# NOTE(review): presumably re-running this cell uploads the chunks again;
# confirm before re-executing against a populated index.

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [51]:
# Load the existing index

from langchain_pinecone import PineconeVectorStore

# Attach to the index by name. No documents are passed to this call, so it
# only needs the embedding object and the index name.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)


In [53]:
docsearch # shows the object's repr, e.g. <langchain_pinecone.vectorstores.PineconeVectorStore at 0x...>

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x255b87547a0>

In [54]:
# Build a retriever over the vector store using similarity search with k=3.
retriever = docsearch.as_retriever(
    search_kwargs={"k": 3},
    search_type="similarity",
)

# Try the retriever on a sample question and display what comes back.
retrieved_docs = retriever.invoke("what is Acne?")
retrieved_docs

[Document(id='c6c52184-e604-473f-9b82-ae3c6e4aa557', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='9ba4dc72-2115-419b-bb6a-82a4e5a226f0', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [55]:
# Read the Mistral API key from the environment (None when it is not set).
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

In [57]:
import os

# os.environ only accepts strings: assigning None raises an unhelpful
# TypeError, so fail with an explicit message when the key is missing.
if not MISTRAL_API_KEY:
    raise ValueError("MISTRAL_API_KEY is not set; define it in .env or the environment")

# Export the key through the process environment (presumably so that
# ChatMistralAI, created in the next cell, can pick it up).
os.environ["MISTRAL_API_KEY"] = MISTRAL_API_KEY

In [59]:
from langchain_mistralai import ChatMistralAI

# No model name is passed here, so ChatMistralAI uses its own default.
llm = ChatMistralAI(
    max_tokens=500,
    temperature=0.4,
)

In [61]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the question-answering prompt. "{context}" is a template
# placeholder (presumably filled with the retrieved documents by the chain).
system_prompt = (
    "You are an assistant for question-answering tasks. Use the following "
    "pieces of retrieved context to answer the question. If you don't know "
    "the answer, say that you don't know. Use three sentences maximum and "
    "keep the answer concise."
    "\n\n{context}"
)


# Chat template: the system instructions first, then the human message, whose
# "{input}" placeholder carries the question.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [62]:

# Combine the LLM and the prompt into a document-answering chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Put the retriever in front of that chain to form the full retrieval pipeline.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [63]:
# Ask a sample question; the generated text is read from the "answer" entry of the result.
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when pores become clogged with oil, dead skin cells, and bacteria. Acne vulgaris, its medical term, affects approximately 17 million people in the United States.
