In [5]:
import os

# Point the Google client libraries at the service-account key file, but only when
# the variable is not already configured (e.g. by the shell or a deployment
# secret), so an externally supplied credential is never overwritten by this
# machine-specific fallback path.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/Users/ksumit/Documents/secrets/vertex-ai-im-6882a1563a11.json",
)

In [25]:
from langchain_google_vertexai import ChatVertexAI

# Shared Gemini chat client; the other cells of this notebook reach the model
# through this single `llm` object.
llm = ChatVertexAI(
    # Target project / region.
    project="vertex-ai-im",
    location="asia-south1",
    # Model selection and sampling temperature.
    model="gemini-1.5-pro",
    temperature=1,
    # No explicit output-token cap or stop sequences; up to 6 retries.
    max_tokens=None,
    stop=None,
    max_retries=6,
)

In [10]:
from langchain_core.documents import Document

# Sample input: the text of a Ministry of Health and Family Welfare notification
# (G.S.R. 52(E), New Delhi, 27th January, 2020) concerning Oxytocin imports.
# The literal is kept exactly as pasted, including its irregular spacing.
text = """
MINISTRY OF HEALTH AND FAMILY WELFARE (Department of Health and Family Welfare) NOTIFICATION 
New Delhi, the 27th January, 2020 
            G.S.R. 52(E).—Whereas  the  Central Government, on being satisfied that the  use of the  drug 
Oxytocin and its formulation in any name or manner is likely to involve certain risk to human beings 
and  animals,  prohibited  the  import  of  the  said  drugs  in  public  interest  by  amending  the  notification 
number G.S.R. 577(E), dated  the  23rd July, 1983  vide  notification of the Government of India  in the 
Ministry of Health and Family Welfare number G.S.R. 390(E), dated the 24th April, 2018;  And whereas, subsequent to issuance of the said notification number G.S.R. 390(E), dated the 
24th  April,  2018  for  prohibition  of  import  of  drug  Oxytocin  and  its  formulation  in  any  name  or 
manner, the  Central Government received  representations  from various  stakeholders  to allow import 
of Oxytocin reference standards for the purpose of examination, test or analysis;  And whereas, the Central Government is satisfied that import of Oxytocin reference standards 
is necessary exclusively for the purpose of examination, test or analysis before carrying out 
commercial manufacturing of the said drug;  Now,  therefore,  in  exercise  of  the  powers  conferred  by  section  10A  of  the  Drugs  and 
Cosmetics Act,  1940 (23 of 1940), the  Central Government hereby makes the following amendment 
in the notification of the Government of India in the Ministry of Health and Family Welfare number 
G.S.R. 577(E), dated the 23rd July, 1983, namely:―  In the  said notification,  in the Table,  for serial number 12 and  the  entry relating thereto, the 
following serial number and entry shall be inserted, namely:― "12.  Oxytocin  and  its  formulation  in  any  name  or  manner  except  Oxytocin  reference  standards 
imported exclusively for the purpose of test and analysis.”
"""
# Wrap the raw text in a single-element list of LangChain Documents, the form
# passed to the graph transformer's convert_to_graph_documents call.
documents = [Document(page_content=text)]

In [18]:
from langchain_experimental.graph_transformers import LLMGraphTransformer

# Relationship whitelist for graph extraction. Each entry is a 3-tuple; judging by
# the printed Relationship objects it reads (source node type, relationship type,
# target node type) — confirm against the LLMGraphTransformer documentation.
allowed_relationships = [
    ("FDC", "CONTAINS", "Drug"),
    ("Drug", "MENTIONED_IN", "Notification No"),
    ("FDC", "MENTIONED_IN", "Notification No"),
    ("FDC", "STATUS", "Banned"),
    ("FDC", "STATUS", "Approved"),
    ("FDC", "EFFECTIVE_FROM", "Date"),
]

# Graph transformer backed by the shared `llm`, restricted to the node types and
# relationship tuples listed here; node_properties is passed as an empty list.
llm_transformer_tuple = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=["FDC", "Drug", "Notification No", "Date", "Banned", "Approved"],
    allowed_relationships=allowed_relationships,
    node_properties=[]
)
# Convert `documents` and show the nodes / relationships of the first result only.
graph_documents_filtered = llm_transformer_tuple.convert_to_graph_documents(documents)
print(f"Nodes:{graph_documents_filtered[0].nodes}")
print(f"Relationships:{graph_documents_filtered[0].relationships}")

Key '$defs' is not supported in schema, ignoring


Nodes:[Node(id='S.O. 180(E)', type='Notification no', properties={}), Node(id='28Th November, 2007', type='Date', properties={}), Node(id='16Th February, 2015', type='Date', properties={}), Node(id='15Th December, 2017', type='Date', properties={}), Node(id='5-Bromosalicyl-4-Chloranilide + Salicylic Acid', type='Fdc', properties={}), Node(id='5-Bromosalicyl-4-Chloranilide + Salicylic Acid', type='Drug', properties={}), Node(id='Banned', type='Banned', properties={})]
Relationships:[Relationship(source=Node(id='5-Bromosalicyl-4-Chloranilide + Salicylic Acid', type='Fdc', properties={}), target=Node(id='5-Bromosalicyl-4-Chloranilide + Salicylic Acid', type='Drug', properties={}), type='CONTAINS', properties={}), Relationship(source=Node(id='5-Bromosalicyl-4-Chloranilide + Salicylic Acid', type='Fdc', properties={}), target=Node(id='Banned', type='Banned', properties={}), type='STATUS', properties={}), Relationship(source=Node(id='5-Bromosalicyl-4-Chloranilide + Salicylic Acid', type='Fdc

In [19]:
import re
from PyPDF2 import PdfReader

def extract_english_text(pdf_path):
    """Return the text of every page of a PDF with disallowed characters removed.

    Each page is read through ``PdfReader``; every character that is not an ASCII
    letter, a digit, whitespace, or one of ``. , ! ? ' " -`` is deleted, and a
    newline is appended after each page's cleaned text.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.

    Returns:
        str: The cleaned pages concatenated in page order (empty string when the
        document has no pages).
    """
    # Anything outside the allowed character set is stripped.
    disallowed = re.compile(r'[^a-zA-Z0-9\s.,!?\'"-]')
    cleaned_pages = (
        disallowed.sub('', page.extract_text()) + "\n"
        for page in PdfReader(pdf_path).pages
    )
    return "".join(cleaned_pages)

# Specify the path to your PDF (relative to the notebook's working directory)
pdf_path = "pdfs/218928.pdf"
# Extract the cleaned text of every page via extract_english_text.
english_text = extract_english_text(pdf_path)

# Save the extracted English text to a file (UTF-8, overwriting any existing file)
with open("english_text.txt", "w", encoding="utf-8") as file:
    file.write(english_text)

# Confirmation message so the cell shows that the write completed.
print("English text extracted and saved to english_text.txt")

English text extracted and saved to english_text.txt


In [77]:
from pydantic import BaseModel, Field

# Structured fields requested from the LLM for one notification text.
# (Documented with comments rather than a class docstring so the schema handed to
# the model is not altered by this documentation.)
class NotificationOutput(BaseModel):
    # Required: notification identifier and date, as plain strings.
    notification_no: str = Field(..., title="Notification No")
    notification_date: str = Field(..., title="Notification Date")
    # Optional: falls back to None when the model does not return it.
    # NOTE(review): annotated `str` although the default is None — confirm whether
    # an Optional annotation is accepted by the Vertex AI schema conversion.
    fdc_name: str = Field(None, title="Drug Combination Mentioned in the Notification")
    # Optional: defaults to a fresh empty list (not None) so code that iterates
    # `drugs_list` keeps working when the model returns no drugs.
    drugs_list: list[str] = Field(default_factory=list, title="List of Drugs in the FDC")
    # Required: plain-English summary of the notification.
    description: str = Field(..., title="Summary of the Notification in simple english")

# Wrapper around `llm` whose responses are returned as NotificationOutput objects.
notification_llm = llm.with_structured_output(NotificationOutput)

In [78]:
# Smoke-test the structured extractor on the first chunk of `data`.
# NOTE(review): `data` is loaded by a cell that appears further down in this
# notebook; on a fresh kernel that cell has to run before this one.
notification_llm.invoke(data[0])

NotificationOutput(notification_no='S.O. 180(E)', notification_date='11th January, 2019', fdc_name='5-bromosalicyl-4-chloranilide + Salicylic acid', drugs_list=['5-bromosalicyl-4-chloranilide + Salicylic acid'], description='The Central Government hereby prohibits in the public interest the manufacture for sale, sale and distribution of the following drug with immediate effect:- “Fixed dose combination of 5-bromosalicyl-4-chloranilide + Salicylic acid for human use”.')

In [1]:
# Read data.txt and break the text into chunks separated by an empty line
# (presumably one notification per chunk — verify against the file).
# The file is decoded explicitly as UTF-8 rather than with the platform default,
# and whitespace-only chunks are dropped so no empty text is later sent to the LLM.
with open("/Users/ksumit/Development/Personal/langchain-development/microsoft-graph-rag-approach/ragtest/input/data.txt", "r", encoding="utf-8") as file:
    data = [chunk for chunk in file.read().split("\n\n") if chunk.strip()]

In [3]:
# Number of text chunks loaded into `data`.
len(data)

80

In [12]:
from tqdm import tqdm

# Run the structured extractor over every chunk of `data`, keeping the results in
# input order; tqdm shows a progress bar while the calls are made.
notifications = []
for desc in tqdm(data):
    notifications.append(notification_llm.invoke(desc))

100%|██████████| 80/80 [04:01<00:00,  3.02s/it]


In [35]:
# Print the drugs_list of every extracted notification, one list per line.
for notification in notifications:
    print(notification.drugs_list)

['5-bromosalicyl-4-chloranilide', 'Salicylic acid']
['Activated charcoal', 'Fungal diastase', 'Lactic acid']
['Allobarbitone', 'Phospho-dimethyl-isopropyl-pyrazolone']
['Artesunate', 'Arteether', 'Artemether']
['Atorvastatin', 'Acetyl Salicylic acid', 'Caffeine']
['Chlormezanone', 'Paracetamol', 'Diclofenac sodium']
['Chlormezanone', 'Paracetamol', 'Ibuprofen']
['Chlorzoxazone', 'Ibuprofen', 'Paracetamol', 'Diclofenac', 'Oxyphenbutazone', 'Magnesium Hydroxide']
['Chlorzoxazone', 'Paracetamol', 'Ibuprofen', 'Diclofenac sodium']
['Ciprofloxacin', 'Tinidazole', 'Dicyclomine']
['Dicyclomine', 'Dextromethorphan', 'Paracetamol']
['Dicyclomine', 'Paracetamol', 'Chlordiazepoxide']
['Dicyclomine', 'Paracetamol', 'Phenylisopropyl Pyrazolone']
['Dicyclomine', 'Serratiopeptidase']
['Ibuprofen', 'Colchicine']
['Ibuprofen', 'Dextropropoxyphene', 'Paracetamol']
['Ibuprofen', 'Paracetamol', 'Colchicine']
['Ibuprofen', 'Paracetamol', 'Magnesium Trisilicate']
['Mebeverine', 'Alprazolam']
['Mecobalamin',

In [33]:
def modify_drugs_list(drugs_list):
    """Split "+"-joined combination entries into individual drug names.

    Every entry is split on "+", each piece is stripped of surrounding
    whitespace, and the pieces are returned as one flat list in their original
    order. Pieces that are empty after stripping (e.g. from "A + ") are dropped.

    Args:
        drugs_list: Iterable of strings such as ``["A + B", "C"]``, or ``None``.

    Returns:
        list[str]: The individual names; an empty list when ``drugs_list`` is
        ``None`` or empty.
    """
    # A notification may carry no drug list at all; treat that as "no drugs".
    if not drugs_list:
        return []
    return [
        name
        for entry in drugs_list
        for name in (part.strip() for part in entry.split("+"))
        if name
    ]

In [34]:
# Replace each notification's drugs_list, in place, with the result of
# modify_drugs_list (entries split on "+" into individual names).
for notification in notifications:
    notification.drugs_list = modify_drugs_list(notification.drugs_list)

In [37]:
# Vocabulary of distinct, lower-cased drug names across all notifications.
# The names are sorted so the list order is identical on every run (a bare set has
# no defined order), which keeps prompts built from this list reproducible.
# A missing drugs_list (None) is treated as empty.
drugs = sorted(
    {
        drug.lower()
        for notification in notifications
        for drug in (notification.drugs_list or [])
    }
)

In [39]:
# Number of distinct (lower-cased) drug names collected in `drugs`.
len(drugs)

107

In [28]:
# Extraction prompt for a single notification. This is an f-string, so `data[0]`
# is interpolated when the cell runs, and the doubled braces `{{ }}` produce the
# literal braces of the JSON template shown to the model.
prompt = f"""
Extract the following information from the given notification text:

Notification Number – The alphanumeric code following the term "NOTIFICATION" or "S.O."

Drugs List – The names of chemical compounds mentioned in the context of approval, prohibition, or any regulatory action.

Action Type – Determine whether the notification approves, prohibits, or modifies the usage of the mentioned drugs.

Return the extracted data in the following structured format:
{{
  "Notification_No": "S.O. XXX(E)",
  "Action_Type": "Approved/Prohibited/Modified",
  "Drugs_List": ["Drug_Name_1", "Drug_Name_2", ...]
}}

Ensure accuracy by capturing only the drugs explicitly mentioned in the regulatory decision.

Notification Text:
{data[0]}
"""

In [31]:
# Chat request as (role, content) pairs: a system instruction followed by the
# extraction prompt as the human turn.
# NOTE(review): the system message describes a classification task, while the
# human prompt asks for field extraction — confirm the mismatch is intended.
messages = [
    (
        "system",
        "You are a legal and regulatory document classifier. Your task is to analyze a given text and determine whether it is a government notification related to the prohibition, restriction, or regulation of a pharmaceutical drug or its formulation in India.",
    ),
    ("human", prompt),
]
# Send the conversation to the model and print the raw text of its reply.
ai_msg = llm.invoke(messages)
print(ai_msg.content)

{
  "Notification_No": "S.O. 180(E)",
  "Action_Type": "Prohibited",
  "Drugs_List": ["5-bromosalicyl-4-chloranilide + Salicylic acid"]
} 



In [53]:
import json

def extract_json_data(json_str):
    """Extract the JSON object embedded in a model response.

    Parsing starts at the first "{" and stops at the end of that JSON object, so
    surrounding text (markdown fences, explanations, even stray "}" characters
    after the object) does not prevent extraction.

    Args:
        json_str: Response text expected to contain one JSON object. Values that
            are not strings are treated as "no JSON found".

    Returns:
        dict: The decoded object, or an empty dict when no object is found or it
        cannot be parsed (a message is printed in both failure cases).
    """
    json_start = json_str.find("{") if isinstance(json_str, str) else -1
    if json_start == -1:
        print(f"Warning: JSON object not found in response: {json_str}")
        return {}

    try:
        # raw_decode parses exactly one JSON value beginning at json_start and
        # ignores whatever follows it.
        data_dict, _ = json.JSONDecoder().raw_decode(json_str, json_start)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return {}
    return data_dict

In [80]:
# Example product description used by the following cells (drug extraction and
# status check).
product_description = "Nimesulide 100mg + Paracetamol 500mg Tablet"

def extract_drugs(product_description, drugs):
  """Ask the LLM which of the known drugs occur in a product description.

  The prompt contains the product description, the expected JSON reply format
  and one "- name" bullet per candidate drug.

  Args:
    product_description: Free-text product description to analyse.
    drugs: Iterable of candidate drug names listed in the prompt.

  Returns:
    dict: The JSON object parsed from the model reply. The "drugs" key is
    always present; it is an empty list when the reply could not be parsed or
    did not contain that key.
  """
  drug_extractor_prompt = f"""
  Extract drugs present in following product description:

  Product Description:
  {product_description}

  Return the extracted drugs in a following json format:
  {{
    "drugs": ["drug_Name_1", "drug_Name_2", ...]
  }}

  Possible drugs in the product description:
  """

  # Append the candidate names as a bullet list, one per line.
  drug_extractor_prompt += "".join(f"- {drug}\n" for drug in drugs)

  content = llm.invoke(drug_extractor_prompt).content

  result = extract_json_data(content)
  # extract_json_data returns {} on failure; guarantee the key callers index.
  result.setdefault("drugs", [])
  return result

def filter_notification_having_drugs(notifications, drugs):
  """Return the notifications that mention at least one of the given drugs.

  Names are compared case-insensitively and must match exactly.

  Args:
    notifications: Iterable of objects exposing a ``drugs_list`` attribute
      (a list of names, or None).
    drugs: Iterable of drug names to look for.

  Returns:
    list: Matching notifications in their original order; notifications whose
    ``drugs_list`` is None or empty never match.
  """
  # Lower-case the wanted names once instead of once per notification.
  wanted = {drug.lower() for drug in drugs}
  return [
    notification
    for notification in notifications
    if any(name.lower() in wanted for name in (notification.drugs_list or []))
  ]

In [81]:
# Ask the LLM which known drugs appear in the product description. extract_drugs
# hands back {} when the reply cannot be parsed, so read the key defensively and
# fall back to an empty list instead of raising KeyError.
extracted_drugs = extract_drugs(product_description, drugs).get('drugs') or []
# Keep only the notifications that mention at least one of the extracted drugs.
filtered_notifications = filter_notification_having_drugs(notifications, extracted_drugs)

In [82]:
# Status-check prompt: instructions, the product description and the JSON reply
# template (doubled braces `{{ }}` are literal braces in the f-string).
check_status_prompt = f"""
Check the status of the following drugs in the given notification text:
Also consider some of the drugs may be part of a fixed-dose combination (FDC) and may not be mentioned individually.
Given product is only banned when present all drugs in fdcs are banned otherwise it is approved.

{product_description}

Return the status of the drugs in the following structured format:
{{
  "status": "Banned/Approved",
  "reason": "Reason for the status"
}}

Notification Text:
"""

# Append the summary (`description`) of every matching notification as the
# "Notification Text" section of the prompt.
for notification in filtered_notifications:
    check_status_prompt += f"\n{notification.description}\n"

In [83]:
# Send the status-check prompt to the model, parse the JSON object out of its
# reply with extract_json_data, and print the resulting dict.
print(extract_json_data(llm.invoke(check_status_prompt).content))

{'status': 'Approved', 'reason': 'While the notification bans various FDCs containing Nimesulide and Paracetamol, it does not specifically ban the FDC of Nimesulide 100mg + Paracetamol 500mg.'}
