In [1]:
!pip install -q langchain langchain-community langchain-groq pypdf sentence_transformers


[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install -qU langchain-community faiss-cpu


[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
!pip install jq




[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import os
import json
import glob
from typing import List

import torch
import numpy as np
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document

In [5]:
def load_json_files(folder_path):
    """
    Load all JSON files found directly inside a folder.

    Files are processed in sorted path order so the returned list is the
    same on every run (glob alone gives no ordering guarantee). A file that
    fails to load is reported on stdout and skipped; it does not abort the
    remaining files.

    Args:
        folder_path (str): Path to the folder containing JSON files

    Returns:
        List of loaded documents, each with ``metadata['source']`` set to the
        path of the file it came from. Empty when nothing could be loaded.
    """
    # Sort for a reproducible document order across runs and platforms
    json_files = sorted(glob.glob(os.path.join(folder_path, '*.json')))

    if not json_files:
        # Make a wrong or empty folder visible instead of silently returning []
        print(f"No JSON files found in {folder_path}")

    all_documents = []

    for file_path in json_files:
        try:
            # Load JSON file using JSONLoader for more flexible parsing
            loader = JSONLoader(
                file_path=file_path,
                jq_schema='.',  # Adjust based on your JSON structure
                text_content=False  # Set to True if you want entire JSON content
            )

            # Load documents from the file
            documents = loader.load()

            # Add source metadata to each document
            for doc in documents:
                doc.metadata['source'] = file_path

            all_documents.extend(documents)
            print(f"Loaded documents from {file_path}")

        except Exception as e:
            # Best effort: report the failing file and continue with the rest
            print(f"Error loading {file_path}: {e}")

    return all_documents

In [6]:
def prepare_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Prepare documents by splitting them into chunks

    Args:
        documents: Documents to split (as returned by a document loader).
        chunk_size (int): Maximum size of each chunk, in the splitter's
            length units (characters with the default length function).
        chunk_overlap (int): Amount of content shared between consecutive
            chunks, in the same units.

    Returns:
        List of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    # split_documents works on Document objects, so metadata travels with each chunk
    return text_splitter.split_documents(documents)

In [7]:
def create_vector_store(documents, model_name="all-MiniLM-L6-v2", device=None):
    """Create vector store for similarity search

    Args:
        documents: Chunked documents to embed and index.
        model_name (str): Sentence-transformer model used for embeddings.
        device (str | None): Device passed to the embedding model. When
            ``None``, 'cuda' is used if torch reports it available,
            otherwise 'cpu'.

    Returns:
        FAISS vector store built from ``documents``.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Use a sentence transformer embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device}
    )

    # Create FAISS vector store
    return FAISS.from_documents(documents, embeddings)

In [8]:
def setup_multi_query_retriever(vectorstore, groq_api_key=None):
    """Setup multi-query retriever with custom LLM

    Args:
        vectorstore: Vector store exposing ``as_retriever``; it is queried
            with ``k=5`` results per search.
        groq_api_key (str | None): Groq API key. When omitted, the
            ``GROQ_API_KEY`` environment variable is used.

    Returns:
        MultiQueryRetriever wrapping the vector store retriever.

    Raises:
        ValueError: If no API key is passed and ``GROQ_API_KEY`` is not set.
    """
    # Credentials must never be committed in notebook source; read them from
    # the caller or the environment instead.
    api_key = groq_api_key or os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError(
            "Groq API key not found: set the GROQ_API_KEY environment "
            "variable or pass groq_api_key explicitly."
        )

    # temperature=0 for the LLM that drives the multi-query retriever
    llm = ChatGroq(
        temperature=0,
        model_name="mixtral-8x7b-32768",
        groq_api_key=api_key
    )

    # Create multi-query retriever
    retriever = MultiQueryRetriever.from_llm(
        retriever=vectorstore.as_retriever(search_kwargs={'k': 5}),
        llm=llm
    )

    return retriever

In [None]:
def create_rag_chain(retriever, groq_api_key=None):
    """Create RAG chain for question answering

    Args:
        retriever: Retriever that supplies context documents to the chain.
        groq_api_key (str | None): Groq API key. When omitted, the
            ``GROQ_API_KEY`` environment variable is used.

    Returns:
        RetrievalQA chain of type "stuff" that returns the generated answer
        together with the source documents.

    Raises:
        ValueError: If no API key is passed and ``GROQ_API_KEY`` is not set.
    """
    # Credentials must never be committed in notebook source; read them from
    # the caller or the environment instead.
    api_key = groq_api_key or os.environ.get("GROQ_API_KEY")
    if not api_key:
        raise ValueError(
            "Groq API key not found: set the GROQ_API_KEY environment "
            "variable or pass groq_api_key explicitly."
        )

    # Initialize GROQ LLM for generation
    llm = ChatGroq(
        temperature=0.2,
        model_name="mixtral-8x7b-32768",
        groq_api_key=api_key
    )

    # Custom prompt template
    prompt_template = """
    Use the following context from multiple documents to give a summary of the item mentioned in the query. 
    The item may be directly mentioned or may be put forward as a sentence in the query. 
    Identify the apt item in the query and then generate the summary of it.
    Your task is to generate a summary only and not to handle any other task.
    If the item in the query is not in the context, admit that it is not there in context.

    Context: {context}

    Query: {question}

    Helpful Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # Create QA chain
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    return qa_chain

In [47]:
# Raw string so the backslashes of the Windows path are never parsed as escape sequences.
json_folder = r'E:\miscE\ml\LLM_Hackathon\pharmaceutical_database'
documents = load_json_files(json_folder)

Loaded documents from E:\miscE\ml\LLM_Hackathon\pharmaceutical_database\cleaned_combined_dataset.json


In [48]:
# Split the loaded documents into chunks; the result feeds the vector store below.
split_docs = prepare_documents(documents)

In [49]:
# Fail fast when there is nothing to index: merely printing would leave
# `vectorstore` undefined (or stale from an earlier run) for the cells below.
if not split_docs:
    raise ValueError("split_docs is empty. Please check the document preparation step.")

vectorstore = create_vector_store(split_docs)

In [50]:
# Build the multi-query retriever on top of the vector store created above.
retriever = setup_multi_query_retriever(vectorstore)

In [51]:
# Assemble the question-answering chain around the retriever.
qa_chain = create_rag_chain(retriever)

In [None]:
# Sample inputs: a bare item name, a question mentioning an item, and an explicit summary request.
queries = [
    "Acetazolamide Extended-Release Capsules",
    "Can I take Ibuprofen if I have a history of stomach ulcers?",
    "Summarize the details of Amoxicillin.",
]

In [53]:
# Run every query through the chain and show the answer with its sources.
for query in queries:
    print("\n" + "="*50)
    # Calling the chain object directly is deprecated in LangChain; invoke() is the supported entry point.
    result = qa_chain.invoke({"query": query})

    print("Question:", query)
    print("\nAnswer:", result['result'])

    print("\nSource Documents:")
    for doc in result['source_documents']:
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        # Show only the first 300 characters of each retrieved chunk
        print(doc.page_content[:300] + "...\n")


Question: What is the composition and primary use of Paracetamol?

Answer: The item mentioned in the query, Paracetamol, is not present in the provided context. Therefore, I cannot give a summary of its composition and primary use. Paracetamol is also known as acetaminophen, and it is a common over-the-counter medication used for relieving pain and reducing fever. Its typical composition includes acetaminophen as the active ingredient, along with various inactive ingredients that help form the tablet, capsule, or other dosage forms.

Source Documents:
Source: E:\miscE\ml\LLM_Hackathon\pharmaceutical_database\cleaned_combined_dataset.json

Source: E:\miscE\ml\LLM_Hackathon\pharmaceutical_database\cleaned_combined_dataset.json
100 mL in 1 BOTTLE; Type 0: Not a Combination Product 07/01/2021  Marketing Information Marketing CategoryApplication Number or Monograph CitationMarketing Start DateMarketing End Date ANDAANDA20518707/01/2021  Marketing Information Marketing Category Application 