In [2]:
import pandas as pd
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
import pandas as pd

In [3]:
# Load the corpus that backs retrieval (tab-separated text file).
# NOTE(review): read_csv treats the first line as a header by default, so that
# line never becomes a document -- confirm the file really starts with a header
# row (otherwise pass header=None).
df = pd.read_csv("fine_tuning_diseases.txt", sep="\t")

# Convert rows to LangChain Documents.
# The first column is selected by position with .iloc: the previous `row[0]`
# relied on the deprecated positional fallback of Series.__getitem__ (integer
# key on a string-labelled Series), which emits a FutureWarning on every row.
documents = [
    Document(page_content=text)
    for text in df.iloc[:, 0]
]

# Use BioBERT for embeddings
bio_bert_model = "dmis-lab/biobert-base-cased-v1.1"
embeddings = HuggingFaceEmbeddings(model_name=bio_bert_model)

# Index the documents using FAISS
knowledge_base = FAISS.from_documents(documents, embeddings)

  Document(page_content=row[0])
  embeddings = HuggingFaceEmbeddings(model_name=bio_bert_model)
  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name dmis-lab/biobert-base-cased-v1.1. Creating a new one with mean pooling.


In [4]:
# Function to retrieve context using symptoms
def retrieve_context(symptoms, k=1):
    """Fetch the knowledge-base passages most similar to the given symptoms.

    Args:
        symptoms: Free-text symptom description; it is wrapped as
            ``"Symptoms: <symptoms>"`` before being sent to the index.
        k: How many nearest documents to request from ``knowledge_base``.

    Returns:
        The ``page_content`` of every hit, joined with single spaces
        (an empty string when the search returns nothing).
    """
    matches = knowledge_base.similarity_search(f"Symptoms: {symptoms}", k=k)
    passages = (match.page_content for match in matches)
    return " ".join(passages)


In [5]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the fine-tuned GPT-Neo model and tokenizer
# The same path is handed to both from_pretrained calls, so weights and
# vocabulary are read from one location.
# NOTE(review): looks like a directory relative to the notebook's working
# directory -- confirm it exists before running this cell.
gpt_neo_model_path = "GPT-Neo-Disease-Symptoms"
model = GPTNeoForCausalLM.from_pretrained(gpt_neo_model_path)
tokenizer = GPT2Tokenizer.from_pretrained(gpt_neo_model_path)
# Reuse the end-of-sequence token for padding: the tokenizer is later called
# with padding=True, and generate() is given pad_token_id=eos_token_id.
# (Presumably the tokenizer has no pad token of its own -- verify.)
tokenizer.pad_token = tokenizer.eos_token


In [10]:
def identify_disease_with_biobert_rag(symptoms, max_length=100):
    """Predict a disease name for ``symptoms`` using retrieval + GPT-Neo.

    The single closest knowledge-base passage is fetched with
    ``retrieve_context`` and placed in front of the symptoms to form the
    prompt; the model's sampled output is then scanned line by line for the
    phrase "this could be the disease:" (any letter case).

    Args:
        symptoms: Free-text symptom description.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The text following the last marker on the first line that contains
        one, stripped of surrounding whitespace and trailing periods, or
        ``"I don't know the answer."`` when no line yields a non-empty name.
    """
    import re

    # Retrieve relevant context using BioBERT embeddings
    context = retrieve_context(symptoms, k=1)

    # Create input text for GPT-Neo
    input_text = f"Context: {context}\nSymptoms: {symptoms}. Possible diagnosis:"

    # Tokenize input with attention mask
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding=True,
        truncation=True
    )

    # Generate response.
    # NOTE(review): per the transformers docs `max_length` bounds prompt +
    # generated tokens, so a long retrieved context may leave little or no room
    # for an answer -- consider `max_new_tokens`. Sampling is also unseeded, so
    # repeated calls can differ.
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],  # Explicitly pass attention mask
        max_length=max_length,
        top_k=50,
        top_p=0.95,
        do_sample=True,  # Enable sampling for top-p to work
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the full sequence.
    # NOTE(review): presumably this includes the echoed prompt, so a marker
    # inside the retrieved context would match before the generated text --
    # confirm that is intended.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Detect AND split on the marker case-insensitively. Previously detection
    # lower-cased the line but the split was case-sensitive, so a line such as
    # "This could be the disease: X" was returned whole instead of just "X".
    marker = re.compile(re.escape("this could be the disease:"), re.IGNORECASE)
    for line in response.strip().split("\n"):
        if not marker.search(line):
            continue
        # [-1] keeps the text after the last marker on the line.
        disease_name = marker.split(line)[-1].strip().rstrip(".")
        if disease_name:
            return disease_name
        # A marker followed by nothing is not an answer; keep scanning.

    return "I don't know the answer."  # Default response if no disease is found


In [12]:
# Example usage
# Smoke test: run one hand-written symptom description through the full
# retrieve-then-generate pipeline and print the extracted disease name.
# Generation samples (do_sample=True), so the printed value may vary per run.
user_symptoms = "Deep, constant pain in the belly area and back pain"
diagnosis = identify_disease_with_biobert_rag(user_symptoms)
print(f"Predicted Disease: {diagnosis}")


Predicted Disease: AAA


In [13]:
# Evaluation function
def evaluate_model_with_biobert(test_df):
    """Score ``identify_disease_with_biobert_rag`` against labelled examples.

    Args:
        test_df: DataFrame with a ``"symptoms"`` column (model input) and a
            ``"disease"`` column (expected answer, as a string).

    Returns:
        A ``(accuracy, mismatched_cases)`` tuple. ``accuracy`` is the fraction
        of rows whose prediction equals the expected disease, ignoring letter
        case and trailing periods; it is ``0.0`` for an empty frame.
        ``mismatched_cases`` is a list of dicts with the keys ``"Symptoms"``,
        ``"Actual Disease"`` and ``"Predicted Disease"``, one per wrong row.
    """
    total_predictions = len(test_df)
    if total_predictions == 0:
        # Nothing to evaluate: avoid ZeroDivisionError in the accuracy ratio.
        return 0.0, []

    correct_predictions = 0
    mismatched_cases = []

    for _, row in test_df.iterrows():
        symptoms = row["symptoms"]
        actual_disease = row["disease"].rstrip(".")  # Remove trailing period for comparison
        predicted_disease = identify_disease_with_biobert_rag(symptoms).rstrip(".")  # Ensure no trailing period

        if predicted_disease.lower() == actual_disease.lower():
            correct_predictions += 1
        else:
            mismatched_cases.append(
                {
                    "Symptoms": symptoms,
                    "Actual Disease": actual_disease,
                    "Predicted Disease": predicted_disease,
                }
            )

    accuracy = correct_predictions / total_predictions
    return accuracy, mismatched_cases

# Run the evaluation over the held-out test set
test_data_path = "symptom_diseases_test.csv"
test_df = pd.read_csv(test_data_path)

accuracy, mismatched_cases = evaluate_model_with_biobert(test_df)

# Report the headline accuracy, then each miss as a record closed by a rule
print(f"Accuracy: {accuracy * 100:.2f}%")
if mismatched_cases:
    print("\nMismatched cases:")
    for case in mismatched_cases:
        print(
            f"Symptoms: {case['Symptoms']}",
            f"Actual Disease: {case['Actual Disease']}",
            f"Predicted Disease: {case['Predicted Disease']}",
            "-" * 50,
            sep="\n",
        )


Accuracy: 65.22%

Mismatched cases:
Symptoms: I have deep, constant pain in my belly and back, and I feel a pulse near my bellybutton. What could this be?
Actual Disease: Abdominal aortic aneurysm
Predicted Disease: Bell's palsy
--------------------------------------------------
Symptoms: I’m having trouble swallowing, and sometimes it feels like food is stuck in my throat. I’ve also lost some weight. What might be going on?
Actual Disease: Achalasia
Predicted Disease: Cushing's syndrome
--------------------------------------------------
Symptoms: I feel severe pain in my upper right belly that spreads to my shoulder, and I’ve been feeling nauseous and feverish. What could this mean?
Actual Disease: Acute cholecystitis
Predicted Disease: Cushing's syndrome
--------------------------------------------------
Symptoms: I’ve been losing hearing on one side, and there’s ringing in my ear. Sometimes I feel dizzy and off balance. What might this be?
Actual Disease: Acoustic neuroma (vestibula