In [1]:
# import pandas as pd

# # Load the dataset
# data_path = 'disease_symptoms.csv'  
# df = pd.read_csv(data_path)

# print(df.head())

                                             disease  \
0  AAA screening, see Abdominal aortic aneurysm (...   
1                 AAA, see Abdominal aortic aneurysm   
2                          Abdominal aortic aneurysm   
3          Abdominal aortic aneurysm (AAA) screening   
4                                           Abortion   

                                            symptoms  
0  Deep, constant pain in the belly area or side ...  
1  Deep, constant pain in the belly area or side ...  
2  Deep, constant pain in the belly area or side ...  
3  Deep, constant pain in the belly area or side ...  
4  Bleeding from the vagina with or without pain,...  


In [2]:
# df.columns = df.columns.str.strip()

In [3]:
# # Combine Disease and Symptoms into the required [Q]/[A] format
# data = []
# for _, row in df.iterrows():
#     symptoms = row['symptoms']
#     disease = row['disease']
#     data.append(f"For symptoms {symptoms}, these could be the disease: {disease}.")

# # Save to a file for fine-tuning
# output_file = 'fine_tuning_diseases.txt'
# with open(output_file, "w") as f:
#     f.write("\n".join(data))


In [3]:
output_file = 'fine_tuning_diseases.txt'

In [2]:
from transformers import TextDataset, DataCollatorForLanguageModeling, GPT2Tokenizer, GPTNeoForCausalLM, Trainer, TrainingArguments

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer.pad_token = tokenizer.eos_token

# Prepare the dataset
def load_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )

# Prepare the data collator
def load_data_collator(tokenizer, mlm=False):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )

# Load the dataset
train_dataset = load_dataset(output_file, tokenizer)
data_collator = load_data_collator(tokenizer)

# Load the GPT-Neo model
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")


  from .autonotebook import tqdm as notebook_tqdm


In [1]:
output_dir = "GPT-Neo-Disease-Symptoms"

In [7]:
# Define training arguments

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=5, 
    per_device_train_batch_size=4,  
    save_steps=500,
    save_total_limit=2,
    logging_dir=f'{output_dir}/logs',
    logging_steps=100,
    learning_rate=5e-5,
)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)


 14%|█▍        | 100/705 [01:15<08:34,  1.18it/s]

{'loss': 1.3017, 'grad_norm': 7.4578752517700195, 'learning_rate': 4.2907801418439716e-05, 'epoch': 0.71}


 28%|██▊       | 200/705 [02:31<06:12,  1.36it/s]

{'loss': 1.0796, 'grad_norm': 5.425893783569336, 'learning_rate': 3.5815602836879437e-05, 'epoch': 1.42}


 43%|████▎     | 300/705 [03:50<05:17,  1.27it/s]

{'loss': 0.9044, 'grad_norm': 7.193349361419678, 'learning_rate': 2.8723404255319154e-05, 'epoch': 2.13}


 57%|█████▋    | 400/705 [05:06<03:47,  1.34it/s]

{'loss': 0.706, 'grad_norm': 5.844755172729492, 'learning_rate': 2.1631205673758867e-05, 'epoch': 2.84}


 71%|███████   | 500/705 [06:25<02:45,  1.24it/s]

{'loss': 0.5765, 'grad_norm': 5.316499710083008, 'learning_rate': 1.4539007092198581e-05, 'epoch': 3.55}


 85%|████████▌ | 600/705 [07:43<01:17,  1.36it/s]

{'loss': 0.4847, 'grad_norm': 6.853212356567383, 'learning_rate': 7.446808510638298e-06, 'epoch': 4.26}


 99%|█████████▉| 700/705 [08:57<00:03,  1.38it/s]

{'loss': 0.4579, 'grad_norm': 4.617097854614258, 'learning_rate': 3.546099290780142e-07, 'epoch': 4.96}


100%|██████████| 705/705 [09:03<00:00,  1.30it/s]


{'train_runtime': 543.7701, 'train_samples_per_second': 5.177, 'train_steps_per_second': 1.297, 'train_loss': 0.7850412253792405, 'epoch': 5.0}


('GPT-Neo-Disease-Symptoms/tokenizer_config.json',
 'GPT-Neo-Disease-Symptoms/special_tokens_map.json',
 'GPT-Neo-Disease-Symptoms/vocab.json',
 'GPT-Neo-Disease-Symptoms/merges.txt',
 'GPT-Neo-Disease-Symptoms/added_tokens.json')

In [33]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import os

# Step 1: Preprocess and Load Data for FAISS
def preprocess_data(file_path):
    with open(file_path, "r") as f:
        lines = f.readlines()
    # Split data into individual entries for FAISS
    documents = [{"text": line.strip()} for line in lines]
    return documents

# Load the dataset for FAISS
retriever_data = preprocess_data(output_file)



# Step 2: Build FAISS Index
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Initialize FAISS retriever
vectorstore = FAISS.from_texts(
    texts=[doc["text"] for doc in retriever_data],
    embedding=embedding_model
)

# Step 3: Enhance the LLM with a Generator
def load_gpt_neo_model(model_path="EleutherAI/gpt-neo-125M"):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)
    pipeline_model = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return HuggingFacePipeline(pipeline=pipeline_model)

# Load the fine-tuned GPT-Neo model
gpt_neo_model = load_gpt_neo_model(output_dir)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [30]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_path = "GPT-Neo-Disease-Symptoms"
model = GPTNeoForCausalLM.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

def identify_disease(symptoms, max_length=500):
    """
    Predicts a disease based on symptoms and retrieved context.

    Args:
        symptoms (str): Input symptoms provided by the user.
        vectorstore (FAISS): FAISS vector store for context retrieval.
        max_length (int): Maximum length for the generated response.
        k (int): Number of top documents to retrieve for context.

    Returns:
        str: Predicted disease or an appropriate message.
    """
    # Retrieve context from the vector store
    query = f"Symptoms: {symptoms}"
    results = vectorstore.similarity_search(query, k=2)
    context = " ".join([doc.page_content for doc in results]) if results else "No relevant context available."

    # Create input text with retrieved context
    input_text = (
        "You are a medical assistant trained to predict diseases based on symptoms.\n\n"
        f"Context: {context}\n"
        f"Symptoms: {symptoms}\n\n"
        "Based on the context and symptoms, provide the name of the disease. "
        "If the context does not contain the answer, respond with: 'I don't know the answer.'"
    )
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=min(len(input_ids[0]) + 50, tokenizer.model_max_length),
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract the answer
    #  Process the response to extract the disease name
    response_lines = response.strip().split("\n")
    for line in response_lines:
        if "this could be the disease:" in line:
            # Extract the disease name after the phrase
            disease_name = line.split("this could be the disease:")[-1].strip()
            if disease_name:
                # Remove any trailing period
                disease_name = disease_name.rstrip(".")
                return disease_name  # Return the extracted disease name

    # Default response if no valid disease name is found
    return "I don't know the answer."




In [31]:
# Example usage
user_symptoms = "I have deep, constant pain in my belly and back, and I feel a pulse near my bellybutton. What could this be?"
diagnosis = identify_disease(user_symptoms)

print(f"Predicted Disease: {diagnosis}")



Predicted Disease: Abdominal aortic aneurysm


In [32]:
import pandas as pd

# Load the test dataset
test_data_path = "symptom_diseases_test.csv"
test_df = pd.read_csv(test_data_path)

# Function to evaluate the model
def evaluate_model(test_df):
    correct_predictions = 0
    total_predictions = len(test_df)
    mismatched_cases = []

    for _, row in test_df.iterrows():
        symptoms = row["symptoms"]
        actual_disease = row["disease"]
        actual_disease = actual_disease.rstrip(".")
        predicted_disease = identify_disease(symptoms)
        
        if predicted_disease.lower() == actual_disease.lower():
            correct_predictions += 1
        else:
            mismatched_cases.append(
                {
                    "Symptoms": symptoms,
                    "Actual Disease": actual_disease,
                    "Predicted Disease": predicted_disease,
                }
            )
    
    accuracy = correct_predictions / total_predictions
    return accuracy, mismatched_cases

# Run the evaluation
accuracy, mismatched_cases = evaluate_model(test_df)

# Display the results
print(f"Accuracy: {accuracy * 100:.2f}%")
if mismatched_cases:
    print("\nMismatched cases:")
    for case in mismatched_cases:
        print(f"Symptoms: {case['Symptoms']}")
        print(f"Actual Disease: {case['Actual Disease']}")
        print(f"Predicted Disease: {case['Predicted Disease']}")
        print("-" * 50)




Accuracy: 45.65%

Mismatched cases:
Symptoms: I’m having trouble swallowing, and sometimes it feels like food is stuck in my throat. I’ve also lost some weight. What might be going on?
Actual Disease: Achalasia
Predicted Disease: Dysphagia (swallowing problems)
--------------------------------------------------
Symptoms: I’m always tired, losing weight, and I’ve noticed dark patches on my skin. What could this indicate?
Actual Disease: Addison's disease
Predicted Disease: Cancer
--------------------------------------------------
Symptoms: I have severe back pain that spreads down my legs and causes numbness and tingling. What could it be?
Actual Disease: Back pain due to nerve compression or another serious cause
Predicted Disease: Back pain
--------------------------------------------------
Symptoms: I noticed swelling behind my knee and it feels stiff and painful. What could this be?
Actual Disease: Baker's cyst
Predicted Disease: Knee pain
-------------------------------------------