In [2]:
!pip install -q langchain langchain-community langchain-groq pypdf sentence_transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.4/2.4 MB[0m [31m110.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.5/409.5 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [14]:
!pip install -qU langchain-community faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [56]:
!pip install jq



In [39]:
import os
import json
import glob
from typing import List

import torch
import numpy as np
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

In [40]:
def load_json_files(folder_path):
    """
    Load every ``*.json`` file found directly inside a folder.

    Files are visited in sorted path order so the order of the returned
    documents is reproducible between runs; ``glob.glob`` on its own yields
    matches in arbitrary filesystem order.

    Loading is best-effort: a file that fails to load is reported on stdout
    and skipped, and the remaining files are still processed.

    Args:
        folder_path (str): Path to the folder containing JSON files

    Returns:
        List of loaded documents. Empty when no file matched or every file
        failed to load.
    """
    # Sort the matches so downstream chunking/indexing sees a stable order.
    json_files = sorted(glob.glob(os.path.join(folder_path, '*.json')))

    # Make the "wrong folder" case visible instead of silently returning [].
    if not json_files:
        print(f"No JSON files found in {folder_path}")

    all_documents = []

    for file_path in json_files:
        try:
            loader = JSONLoader(
                file_path=file_path,
                jq_schema='.',  # Whole file as one record; adjust to your JSON structure
                text_content=False  # Content selected by jq_schema is not required to be a string
            )

            # Load documents from the file
            documents = loader.load()

            # Tag every document with the file it came from
            for doc in documents:
                doc.metadata['source'] = file_path

            all_documents.extend(documents)
            print(f"Loaded documents from {file_path}")

        except Exception as e:
            # Deliberately best-effort: report the failure and keep going.
            print(f"Error loading {file_path}: {e}")

    return all_documents

In [41]:
def prepare_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Prepare documents by splitting them into overlapping chunks

    Args:
        documents: Documents to split (as returned by ``load_json_files``)
        chunk_size (int): Maximum chunk length handed to the splitter.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap (int): Overlap between consecutive chunks handed to
            the splitter. Defaults to 200, the value previously hard-coded.

    Returns:
        The chunked documents produced by the splitter's ``split_documents``
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    # split_documents works on whole document objects rather than raw strings
    return text_splitter.split_documents(documents)

In [42]:
def create_vector_store(documents, model_name="all-MiniLM-L6-v2"):
    """Create a FAISS vector store for similarity search

    Args:
        documents: Chunked documents to embed and index
        model_name (str): Sentence-transformer model used for the embeddings.
            Defaults to "all-MiniLM-L6-v2", the value previously hard-coded.

    Returns:
        The FAISS vector store built from ``documents``

    Raises:
        ValueError: If ``documents`` is empty
    """
    # The loader is best-effort and can hand back an empty list; fail here
    # with a clear message rather than somewhere inside index construction.
    if not documents:
        raise ValueError("create_vector_store() needs at least one document to index")

    # Run the embedding model on the GPU when one is visible, otherwise on CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device}
    )

    # Embed the documents and build the index in one step
    vectorstore = FAISS.from_documents(documents, embeddings)
    return vectorstore

In [61]:
def setup_multi_query_retriever(vectorstore):
    """Setup multi-query retriever backed by a Groq-hosted LLM

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable.
    Set it before calling, e.g.
    ``os.environ["GROQ_API_KEY"] = getpass.getpass()``.

    Args:
        vectorstore: Vector store whose ``as_retriever`` supplies the base
            retriever (top 5 results per generated query)

    Returns:
        A ``MultiQueryRetriever`` wrapping the vector store's retriever

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is unset or empty
    """
    # SECURITY: the key must never live in the notebook source. A key that
    # was previously committed here should be treated as leaked and revoked.
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set; export it or assign os.environ['GROQ_API_KEY'] "
            "before building the retriever."
        )

    # temperature=0 for the query-rewriting LLM
    llm = ChatGroq(
        temperature=0,
        model_name="mixtral-8x7b-32768",
        groq_api_key=groq_api_key
    )

    # Create multi-query retriever on top of the plain similarity retriever
    retriever = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(search_kwargs={'k': 5}),
        llm=llm
    )

    return retriever

In [62]:
def create_rag_chain(retriever):
    """Create RAG chain for question answering

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable.
    Set it before calling, e.g.
    ``os.environ["GROQ_API_KEY"] = getpass.getpass()``.

    Args:
        retriever: Retriever that supplies context documents for each query

    Returns:
        A ``RetrievalQA`` chain ("stuff" type) configured to also return the
        source documents it used

    Raises:
        RuntimeError: If ``GROQ_API_KEY`` is unset or empty
    """
    # SECURITY: the key must never live in the notebook source. A key that
    # was previously committed here should be treated as leaked and revoked.
    groq_api_key = os.environ.get("GROQ_API_KEY")
    if not groq_api_key:
        raise RuntimeError(
            "GROQ_API_KEY is not set; export it or assign os.environ['GROQ_API_KEY'] "
            "before building the RAG chain."
        )

    # Initialize GROQ LLM for generation
    llm = ChatGroq(
        temperature=0.2,
        model_name="mixtral-8x7b-32768",
        groq_api_key=groq_api_key
    )

    # Custom prompt template; {context} and {question} are filled by the chain
    prompt_template = """Use the following context from multiple documents to answer the question.
    If the answer is not in the context, admit that you don't know.

    Context: {context}

    Question: {question}

    Helpful Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    return qa_chain

In [73]:
# Folder scanned for *.json input files.
# NOTE(review): absolute path, presumably a Colab session directory — adjust when running elsewhere.
json_folder = '/content/cleaned_data'
# Load every JSON file in the folder; files that fail to load are reported and skipped.
documents = load_json_files(json_folder)

Loaded documents from /content/cleaned_data/cleaned_combined_dataset.json


In [74]:
# Break the loaded documents into overlapping chunks (1000 chars, 200 overlap) for embedding.
split_docs = prepare_documents(documents)

In [75]:
# Embed the chunks with all-MiniLM-L6-v2 and index them in a FAISS vector store.
vectorstore = create_vector_store(split_docs)

In [76]:
# Wrap the vector store's top-5 retriever in an LLM-driven multi-query retriever.
retriever = setup_multi_query_retriever(vectorstore)

In [77]:
# Build the question-answering chain that feeds retrieved chunks into the prompt.
qa_chain = create_rag_chain(retriever)

In [78]:
# Sample questions used to exercise the RAG chain end to end.
queries = [
    "What is the composition and primary use of Paracetamol?",
    "Can I take Ibuprofen if I have a history of stomach ulcers?",
    "Summarize the details of Amoxicillin.",
]

In [79]:
# Run each sample question through the RAG chain and show the answer along
# with a preview of every source chunk the chain returned.
for query in queries:
    print("\n" + "="*50)
    # Use .invoke(): calling the chain object directly is deprecated in LangChain.
    result = qa_chain.invoke({"query": query})

    print("Question:", query)
    print("\nAnswer:", result['result'])

    print("\nSource Documents:")
    for doc in result['source_documents']:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        # Preview only the first 300 characters to keep the cell output readable
        print(doc.page_content[:300] + "...\n")


Question: What is the composition and primary use of Paracetamol?

Answer: Based on the provided context, there is no information about a product containing Paracetamol (also known as Acetaminophen) in the given text. Therefore, I cannot provide the composition and primary use of Paracetamol. Paracetamol is a common over-the-counter medication used for relieving pain and reducing fever. Its typical composition includes Paracetamol as the active ingredient. However, without the specific context mentioning Paracetamol, it is not possible to give a more detailed answer.

Source Documents:
Source: /content/cleaned_data/cleaned_combined_dataset.json

Source: /content/cleaned_data/cleaned_combined_dataset.json
100 mL in 1 BOTTLE; Type 0: Not a Combination Product 07/01/2021  Marketing Information Marketing CategoryApplication Number or Monograph CitationMarketing Start DateMarketing End Date ANDAANDA20518707/01/2021  Marketing Information Marketing Category Application Number or Monograph C