MAIN CODE

In [1]:
# Install necessary libraries
!pip install -U langchain faiss-cpu openai langchain-community langchain-openai tiktoken

Collecting langchain
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting openai
  Downloading openai-1.55.3-py3-none-any.whl.metadata (24 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.2.10-py3-none-any.whl.metadata (2.6 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain)
  Downloading langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading d

In [2]:
# Import required libraries
import pandas as pd
import faiss
import pickle
import numpy as np
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.docstore.in_memory import InMemoryDocstore
from langchain.docstore.document import Document
import tiktoken

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the uploaded CSV file
file_path = "/content/drive/My Drive/Dataset/Drug2.csv"
medicine_data = pd.read_csv(file_path)

In [4]:
# Display the first few rows to understand the structure
medicine_data.head()

Unnamed: 0,disease_name,disease_symptoms,medicine_name,disease_description
0,Acne,"(hives, difficult breathing, swelling in your ...","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",Acne Other names: Acne Vulgaris; Blackheads; B...
1,Acne,hives ; difficulty breathing; swelling of your...,"Aldactone, CaroSpir",Acne Other names: Acne Vulgaris; Blackheads; B...
2,Acne,"skin rash, fever, swollen glands, flu-like sym...","Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",Acne Other names: Acne Vulgaris; Blackheads; B...
3,Acne,problems with your vision or hearing; muscle o...,,Acne Other names: Acne Vulgaris; Blackheads; B...
4,Acne,hives ; difficult breathing; swelling of your ...,"Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",Acne Other names: Acne Vulgaris; Blackheads; B...


In [5]:
# Drop rows with missing values in required fields
medicine_data = medicine_data.dropna(subset=['medicine_name', 'disease_name', 'disease_symptoms', 'disease_description'])

In [6]:
# Combine relevant fields for embedding
medicine_data['combined_info'] = medicine_data.apply(
    lambda row: f"Medicine: {row['medicine_name']}. Disease: {row['disease_name']} - {row['disease_description']}."
                f" Symptoms: {row['disease_symptoms']}.", axis=1
)

# Display the first few rows to understand the structure
medicine_data.head()

Unnamed: 0,disease_name,disease_symptoms,medicine_name,disease_description,combined_info
0,Acne,"(hives, difficult breathing, swelling in your ...","Acticlate, Adoxa CK, Adoxa Pak, Adoxa TT, Alod...",Acne Other names: Acne Vulgaris; Blackheads; B...,"Medicine: Acticlate, Adoxa CK, Adoxa Pak, Adox..."
1,Acne,hives ; difficulty breathing; swelling of your...,"Aldactone, CaroSpir",Acne Other names: Acne Vulgaris; Blackheads; B...,"Medicine: Aldactone, CaroSpir. Disease: Acne -..."
2,Acne,"skin rash, fever, swollen glands, flu-like sym...","Dynacin, Minocin, Minolira, Solodyn, Ximino, V...",Acne Other names: Acne Vulgaris; Blackheads; B...,"Medicine: Dynacin, Minocin, Minolira, Solodyn,..."
4,Acne,hives ; difficult breathing; swelling of your ...,"Cleocin T, Clindacin ETZ, Clindacin P, Clindag...",Acne Other names: Acne Vulgaris; Blackheads; B...,"Medicine: Cleocin T, Clindacin ETZ, Clindacin ..."
5,Acne,hives ; difficulty breathing; swelling of your...,CaroSpir,Acne Other names: Acne Vulgaris; Blackheads; B...,Medicine: CaroSpir. Disease: Acne - Acne Other...


In [7]:
# Set embedding parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"
max_tokens = 8000

encoding = tiktoken.get_encoding(embedding_encoding)
medicine_data["n_tokens"] = medicine_data['combined_info'].apply(lambda x: len(encoding.encode(x)))
medicine_data = medicine_data[medicine_data['n_tokens'] <= max_tokens]

In [None]:
# Define embedding function
from openai import OpenAI
openai_api_key = ""  # Replace with your OpenAI API key
client = OpenAI(api_key=openai_api_key)

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

In [9]:
# Generate embeddings
medicine_data["embedding"] = medicine_data['combined_info'].apply(lambda x: get_embedding(x, model=embedding_model))

In [10]:
# Save embeddings and text for FAISS
embeddings = list(medicine_data["embedding"])
texts = list(medicine_data["combined_info"])

In [11]:
# Convert embeddings to a NumPy array of type float32
embeddings_array = np.array(embeddings, dtype='float32')

# Initialize FAISS index
dimension = embeddings_array.shape[1]  # Get the dimensionality of embeddings
index = faiss.IndexFlatL2(dimension)  # L2 distance-based index

# Add embeddings to the index
index.add(embeddings_array)

In [12]:
# Save FAISS index and metadata for later use
faiss_index_path = "/content/drive/My Drive/Dataset/faiss_index"
metadata_path = "/content/drive/My Drive/Dataset/faiss_metadata.pkl"

faiss.write_index(index, faiss_index_path)
with open(metadata_path, "wb") as f:
    pickle.dump(texts, f)

In [13]:
# Reload FAISS index and metadata
index = faiss.read_index(faiss_index_path)
with open(metadata_path, "rb") as f:
    texts = pickle.load(f)

In [14]:
# Ensure FAISS index and document mapping are aligned
docstore = InMemoryDocstore(
    {str(i): Document(page_content=texts[i]) for i in range(len(texts))}  # Wrap texts in Document objects
)
index_to_docstore_id = {i: str(i) for i in range(len(texts))}  # Map FAISS index IDs to docstore IDs

# Create the FAISS vectorstore
embedding_function = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=openai_api_key
)

vectorstore = FAISS(
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
    embedding_function=embedding_function
)

  embedding_function = OpenAIEmbeddings(


In [15]:
# Define the LLM and prompt
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, api_key=openai_api_key)

template = """You are a medicine recommendation Virtual Doctor. Based on the given context, suggest most relevant 3 medicines.
Include the medicine name, Disease name , and tell the symptoms of that particular disease, if you wish you can tell the user about the disease description.
If the question doesn't match any data, say you don't know.

{context}

Question: {question}
Your response:"""

PROMPT = PromptTemplate(template=template, input_variables=["context", "question"])

chain_type_kwargs = {"prompt": PROMPT}

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs
)

In [16]:
# Prompt the user for a query
query = input("Enter your query: ")

# Pass the query to the QA chain
result = qa_chain.invoke({"query": query})  # Use the `invoke` method per the LangChain update

# Display the result
print("Response:", result['result'])

Enter your query: i have diabetes and im kind of feeling dizzy
Response: Based on your symptoms of feeling dizzy and having diabetes, it is important to monitor your blood sugar levels closely. In addition to following your prescribed diabetes management plan, you may consider taking one of the following medicines:

1. Diabeta (glyburide) - This medication helps lower blood sugar levels in people with type 2 diabetes by stimulating the pancreas to release more insulin. It can help manage symptoms of diabetes and prevent complications associated with high blood sugar levels.

2. Amaryl (glimepiride) - Amaryl is another medication that helps lower blood sugar levels in people with type 2 diabetes by stimulating the release of insulin from the pancreas. It can help control symptoms of diabetes and reduce the risk of complications.

3. Glucovance (glyburide and metformin) - This combination medication contains two different drugs that work together to lower blood sugar levels in people wit

In [17]:
# Prompt the user for a query
query = input("Enter your query: ")

# Pass the query to the QA chain
result = qa_chain.invoke({"query": query})  # Use the `invoke` method per the LangChain update

# Display the result
print("Response:", result['result'])

Enter your query: i have severe cough, slight fever and runny nose can you suggest me some medications
Response: Based on your symptoms of severe cough, slight fever, and runny nose, I recommend the following medications for you:

1. Avedana Medicated Chest Rub or Vicks VapoRub for cold symptoms such as congestion of the nasal mucous membrane and watery nasal rhinorrhea.

2. Clear Cough PM Multi-Symptom for cold symptoms and cough relief.

3. Altarussin, Bidex-400, Fenesin IR, Mucinex, Mucus Relief, Robafen, Scot-Tussin, Siltussin SA, Tussin Expectorant, Xpect for bronchitis symptoms such as a nagging productive cough, increased mucus production, fatigue, chest tightness, and shortness of breath.

Please consult with a healthcare professional before starting any new medications to ensure they are appropriate for your specific condition.


In [18]:
# Prompt the user for a query
query = input("Enter your query: ")

# Pass the query to the QA chain
result = qa_chain.invoke({"query": query})  # Use the `invoke` method per the LangChain update

# Display the result
print("Response:", result['result'])

Enter your query: i have flu
Response: Based on the symptoms you have described, I recommend the following medicines for flu:
1. Tamiflu
2. Relenza
3. Rapivab

These medicines are commonly used to treat flu symptoms and can help alleviate fever, lethargy, coughing, and other symptoms associated with the flu. Please consult with a healthcare professional before starting any medication.


EVALUVATION METRICS

In [32]:
import time
import re
import pandas as pd

# Load the dataset
file_path = "/content/drive/My Drive/Dataset/Drug2.csv"
medicine_data = pd.read_csv(file_path)

# Fill missing values
medicine_data['disease_symptoms'] = medicine_data['disease_symptoms'].fillna("")
medicine_data['medicine_name'] = medicine_data['medicine_name'].fillna("")

# Refined test queries aligned with dataset
test_data_aligned = [
    {"query": "Suggest medications for acne and its related symptoms?",
     "expected": {"bactrim", "septra", "sulfatrim"}},
    {"query": "What are the medicines for ADHD and its symptoms?",
     "expected": {"concerta", "adderall", "ritalin"}},
    {"query": "Can you suggest medications for AIDS/HIV treatment?",
     "expected": {"atripla", "complera", "truvada"}}
]

# Simulate QA chain (replace this with actual QA system)
def mock_qa_chain(query):
    for _, row in medicine_data.iterrows():
        if any(keyword in query.lower() for keyword in row['disease_symptoms'].lower().split(", ")):
            return {"result": row['medicine_name']}
    return {"result": "I don't know."}

# Helper function to extract medications
def extract_medication_names_fixed(response_text):
    return [med.strip().lower() for med in response_text.split(",")]

# Metrics evaluation
top_k = 3
true_positives, false_positives, false_negatives, correct_top_k = 0, 0, 0, 0
latencies = []

for test in test_data_aligned:
    query = test["query"]
    expected = {med.lower().strip() for med in test["expected"]}

    # Measure latency
    start_time = time.time()
    result = mock_qa_chain(query)
    end_time = time.time()
    latencies.append(end_time - start_time)

    # Extract medications
    response_text = result["result"]
    retrieved = set(extract_medication_names_fixed(response_text))

    # Debug prints
    print(f"Query: {query}")
    print(f"Expected: {expected}")
    print(f"Response: {response_text}")
    print(f"Retrieved: {retrieved}")

    # Calculate metrics
    true_positives += len(retrieved & expected)
    false_positives += len(retrieved - expected)
    false_negatives += len(expected - retrieved)
    retrieved_top_k = list(retrieved)[:top_k]
    if any(item in expected for item in retrieved_top_k):
        correct_top_k += 1

# Calculate evaluation metrics
precision = true_positives / (true_positives + false_positives) if true_positives + false_positives > 0 else 0
recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
average_latency = sum(latencies) / len(latencies)
top_k_accuracy = correct_top_k / len(test_data_aligned)

# Print metrics
print("\nEvaluation Metrics:")
print("-------------------")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1_score:.2f}")
print(f"Top-K Accuracy (K={top_k}): {top_k_accuracy:.2f}")
print(f"Average Query Latency: {average_latency:.2f} seconds")

Query: Suggest medications for acne and its related symptoms?
Expected: {'bactrim', 'sulfatrim', 'septra'}
Response: Bactrim, Bactrim DS, Septra, Septra DS, Sulfatrim
Retrieved: {'sulfatrim', 'bactrim', 'septra ds', 'bactrim ds', 'septra'}
Query: What are the medicines for ADHD and its symptoms?
Expected: {'adderall', 'concerta', 'ritalin'}
Response: Bactrim, Bactrim DS, Septra, Septra DS, Sulfatrim
Retrieved: {'sulfatrim', 'bactrim', 'septra ds', 'bactrim ds', 'septra'}
Query: Can you suggest medications for AIDS/HIV treatment?
Expected: {'atripla', 'truvada', 'complera'}
Response: Bactrim, Bactrim DS, Septra, Septra DS, Sulfatrim
Retrieved: {'sulfatrim', 'bactrim', 'septra ds', 'bactrim ds', 'septra'}

Evaluation Metrics:
-------------------
Precision: 0.20
Recall: 0.33
F1-Score: 0.25
Top-K Accuracy (K=3): 0.33
Average Query Latency: 0.01 seconds
