In [2]:
!pip install -q langchain langchain-community langchain-groq pypdf sentence_transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.4/2.4 MB[0m [31m110.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.5/409.5 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [14]:
!pip install -qU langchain-community faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [56]:
!pip install jq



In [39]:
import os
import json
import glob
from typing import List

import torch
import numpy as np
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

In [80]:
def load_json_files(folder_path):
    """
    Load every ``*.json`` file found directly inside a folder.

    Files are visited in sorted path order so that repeated runs yield the
    documents in the same order. Each file is read with ``JSONLoader``
    (``jq_schema='.'``, ``text_content=False``) and every resulting document
    gets its ``metadata['source']`` set to the file path it came from.

    Loading is best-effort: a file that raises while loading is reported on
    stdout and skipped, and the remaining files are still processed.

    Args:
        folder_path (str): Path to the folder containing JSON files.

    Returns:
        list: The documents loaded from all files that could be read. Empty
        when the folder holds no ``*.json`` files or none of them loaded.
    """
    # glob.glob returns matches in arbitrary order; sort for reproducibility.
    json_files = sorted(glob.glob(os.path.join(folder_path, '*.json')))
    all_documents = []

    for file_path in json_files:
        try:
            loader = JSONLoader(file_path=file_path, jq_schema='.', text_content=False)
            documents = loader.load()
            for doc in documents:
                doc.metadata['source'] = file_path
            all_documents.extend(documents)
            print(f"Loaded documents from {file_path}")
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the batch.
            print(f"Error loading {file_path}: {e}")

    return all_documents

In [81]:
def prepare_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: Documents to split (as returned by ``load_json_files``).
        chunk_size (int): ``chunk_size`` passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000.
        chunk_overlap (int): ``chunk_overlap`` passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 200.

    Returns:
        list: The chunked documents produced by ``split_documents``; an empty
        list when ``documents`` is empty or ``None``.
    """
    # Nothing to split: skip building the splitter altogether.
    if not documents:
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents)

In [83]:
def create_vector_store(documents, model_name="all-MiniLM-L6-v2"):
    """Create a FAISS vector store for similarity search.

    Args:
        documents: Documents (typically the chunks from ``prepare_documents``)
            to embed and index.
        model_name (str): Name of the HuggingFace embedding model. Defaults to
            ``"all-MiniLM-L6-v2"``.

    Returns:
        The vector store returned by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``documents`` is empty, since there is nothing to index.
    """
    # Fail early with an explicit message instead of handing FAISS an empty list.
    if not documents:
        raise ValueError("Cannot build a vector store from an empty document list.")

    # Run the embedding model on the GPU when one is available.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device}
    )
    return FAISS.from_documents(documents, embeddings)

In [103]:
def setup_multi_query_retriever(vectorstore, k=5, model_name="llama-3.1-8b-instant"):
    """Set up a multi-query retriever backed by a Groq-hosted LLM.

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable so
    that no credential is stored in the notebook. Any key that was previously
    committed in this file should be treated as compromised and rotated.

    Args:
        vectorstore: Vector store whose ``as_retriever`` provides the base
            retriever.
        k (int): ``k`` passed in the base retriever's ``search_kwargs``.
            Defaults to 5.
        model_name (str): Groq model used by the multi-query retriever.
            Defaults to ``"llama-3.1-8b-instant"``.

    Returns:
        The retriever built by ``MultiQueryRetriever.from_llm``.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is unset or empty.
    """
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set. Define it in the environment "
            "(e.g. os.environ['GROQ_API_KEY'] = getpass.getpass()) "
            "before building the retriever."
        )

    # temperature=0 is the setting the original notebook used for this LLM.
    llm = ChatGroq(
        temperature=0,
        model_name=model_name,
        groq_api_key=groq_api_key
    )
    retriever = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(search_kwargs={'k': k}),
        llm=llm
    )
    return retriever

In [112]:
def create_rag_chain(retriever):
    """
    Create a ``RetrievalQA`` chain that answers strictly from retrieved context.

    The prompt instructs the model to use only the provided context and to
    reply with a fixed "not present" sentence when the answer cannot be
    derived from it. Source documents are returned alongside the answer.

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable so
    that no credential is stored in the notebook. Any key that was previously
    committed in this file should be treated as compromised and rotated.

    Args:
        retriever: Retriever that supplies the context documents.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with
        ``chain_type="stuff"`` and ``return_source_documents=True``.

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is unset or empty.
    """
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set. Define it in the environment "
            "(e.g. os.environ['GROQ_API_KEY'] = getpass.getpass()) "
            "before building the QA chain."
        )

    llm = ChatGroq(
        temperature=0.2,
        model_name="llama-3.1-8b-instant",
        groq_api_key=groq_api_key
    )

    # The template text is sent to the model verbatim; keep it unchanged.
    prompt_template = """You are a medical assistant with access to knowledge about medicines.
    Use only the provided context to answer the question. If the answer cannot
    be derived from the provided context, respond with: "This information is not present in the provided documents."

    Context: {context}

    Question: {question}

    Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    return qa_chain


In [113]:
# Build the pipeline: load JSON -> split into chunks -> index -> retriever -> QA chain.
json_folder = '/content/cleaned_data'
documents = load_json_files(json_folder)

# load_json_files reports per-file failures and keeps going, so an empty result
# would otherwise only surface later while building the index. Stop here instead.
if not documents:
    raise FileNotFoundError(f"No JSON documents could be loaded from {json_folder!r}")

split_docs = prepare_documents(documents)
vectorstore = create_vector_store(split_docs)
retriever = setup_multi_query_retriever(vectorstore)
qa_chain = create_rag_chain(retriever)

Loaded documents from /content/cleaned_data/cleaned_combined_dataset.json


In [118]:
# Questions to send through the QA chain; append more strings to ask several.
queries = ["Can I take Ibuprofen if I have a history of stomach ulcers?"]

In [119]:
# Ask each question and show the answer together with the retrieved sources.
for query in queries:
    print("\n" + "="*50)
    # Calling the chain object directly is deprecated in LangChain; invoke()
    # is the supported entry point and returns the same result dict.
    result = qa_chain.invoke({"query": query})

    print("Question:", query)
    print("\nAnswer:", result['result'])

    print("\nSource Documents:")
    for doc in result['source_documents']:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        # Show only the first 300 characters of each retrieved chunk.
        print(doc.page_content[:300] + "...\n")


Question: Can I take Ibuprofen if I have a history of stomach ulcers?

Answer: This information is not present in the provided documents.

Source Documents:
Source: /content/cleaned_data/cleaned_combined_dataset.json
CAPSULE Size 15mm Flavor  Imprint Code                                                         ML;5;2;5                          Contains  Product Characteristics Color green Score no score Shape CAPSULE Size 15mm Flavor  Imprint Code                                                         ML;5;2;5 ...

Source: /content/cleaned_data/cleaned_combined_dataset.json
06/01/2018  4 NDC:42571-144-29 6  in 1 CARTON 06/01/2018  4 NDC:42571-144-32 10  in 1 BLISTER PACK; Type 0: Not a Combination Product   5 NDC:42571-144-01 100  in 1 BOTTLE; Type 0: Not a Combination Product 06/01/2018  Marketing Information Marketing CategoryApplication Number or Monograph CitationM...

Source: /content/cleaned_data/cleaned_combined_dataset.json
green Score no score Shape CAPSULE Size 15mm Flavor 