## Installing Required Libraries

In [None]:
%pip install -qU pypdf
%pip install -qU langchain
%pip install -qU langchain-community
%pip install -qU langchain-huggingface sentence_transformers
%pip install -qU python-dotenv
%pip install -qU pinecone
%pip install -qU pinecone[grpc]
%pip install -qU langchain-pinecone
%pip install -qU "langchain[Groq]"

## Importing Libraries for Document Loading

To begin, import the necessary libraries to handle document loading efficiently.

In [None]:
from langchain.document_loaders import DirectoryLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

## Loading the Document

After importing the necessary libraries, load every PDF in the `Doc` directory. `DirectoryLoader` finds the files matching `*.pdf` and reads each one with `PyPDFLoader`.

In [None]:
def load_pdf(data):
    """Load the PDF files of a directory into LangChain documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the files matching the
        ``*.pdf`` glob, each one read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Forward slashes are used because they are accepted as path separators on
# Windows, Linux and macOS alike, whereas backslashes only work on Windows.
extracted_data = load_pdf("../Doc/")

## Splitting the Document into Chunks

Once the document is loaded, it needs to be split into smaller chunks for efficient processing. We use **LangChain's RecursiveCharacterTextSplitter** for this purpose.

In [None]:
def text_split(extracted_data):
    """Break loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf``.

    Returns:
        The output of ``RecursiveCharacterTextSplitter.split_documents`` for a
        splitter configured with ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return splitter.split_documents(extracted_data)
    
# Split the loaded PDFs into chunks; `text_chunk` is what gets embedded and
# written to the Pinecone index later in the notebook.
text_chunk = text_split(extracted_data)

## Importing Embeddings

To convert text chunks into vector representations, we use embeddings. LangChain supports multiple embedding models, including OpenAI, Hugging Face, and SentenceTransformers. This notebook uses the Hugging Face `sentence-transformers/all-mpnet-base-v2` model.

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model shared by ingestion and retrieval. The Pinecone index created
# later in this notebook is declared with dimension 768 to hold its vectors.
embeddings = HuggingFaceEmbeddings(model_name= "sentence-transformers/all-mpnet-base-v2")

In [None]:
import os
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file into the process environment; the
# PINECONE_API_KEY and GROQ_API_KEY lookups further down read from os.environ.
load_dotenv()

In [None]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone

## Creating an Index on Pinecone

In [None]:
# Read the API key that load_dotenv() / the shell placed in the environment and
# fail early with a readable message when it is missing (assigning None back
# into os.environ would otherwise raise an opaque TypeError).
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medibot"

# Create the index only when it does not exist yet, so this cell can be re-run
# (Restart Kernel -> Run All) without failing on an "already exists" error.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=768,  # must equal the vector size produced by `embeddings`
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [None]:
from langchain_pinecone import PineconeVectorStore

In [None]:
# Embed every chunk with `embeddings` and store the vectors in the Pinecone
# index named by `index_name`.
# NOTE(review): re-running this cell presumably inserts the same chunks again —
# confirm whether duplicates matter before executing it more than once.
vectorstore_from_docs = PineconeVectorStore.from_documents(
        text_chunk,
        index_name=index_name,
        embedding=embeddings,
    )

In [None]:
# Open a vector-store handle on the existing index without ingesting anything;
# the retriever below is built from this handle.
docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings,
)

In [None]:
# Similarity search that hands the 3 closest chunks (k=3) to the RAG chain.
retriever = docsearch.as_retriever(search_type = "similarity", search_kwargs = {"k":3})

In [None]:
from langchain.chat_models import init_chat_model

In [None]:
# Read the Groq key from the environment and fail early with a readable message
# when it is missing (assigning None back into os.environ would otherwise raise
# an opaque TypeError).
GROQ_API_KEY = os.environ.get('GROQ_API_KEY')
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set; add it to your .env file or environment."
    )

# The key is not passed explicitly here, so the Groq integration is expected
# to pick it up from the GROQ_API_KEY environment variable.
model = init_chat_model("gemma2-9b-It", model_provider="groq")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core .prompts import ChatPromptTemplate

In [None]:
# System instructions for the answering model. Adjacent string literals are
# concatenated verbatim, so every fragment that continues a sentence ends with
# exactly one space. "{context}" is the template variable left for the
# retrieved document text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise. Analyze each question and give response to it"
    "\n\n"
    "{context}"
)

In [None]:
# Two-message chat template: the system instructions (with their {context}
# slot) followed by the user's question supplied as {input}.
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [None]:
# Chain that feeds documents plus the prompt to the chat model.
question_answer_chain = create_stuff_documents_chain(model, prompt)
# Full RAG pipeline: the retriever fetches documents for the "input" question
# and passes them to the question-answer chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
from pprint import pprint

In [None]:
# Ask one (deliberately left as-is, misspelled) question and show the answer.
user_query = " aviod  meicine fors acne?"
response = rag_chain.invoke({"input": user_query})
# Only the "answer" entry of the chain's result is printed.
res = response["answer"]
pprint(res)