# Retrieval-Augmented Generation for Mental Disorder Diagnosis (DSM-5)

Import required libraries

In [9]:
from unstructured.partition.pdf import partition_pdf
import openai
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import chromadb
import base64
import os
import pandas as pd
import re

In [10]:
# Read the OpenAI API key from the environment instead of embedding it in the
# notebook: a key stored in the file is exposed to everyone who can read it.
# NOTE(review): the key that used to be hard-coded here should be revoked.
API_KEY = os.environ.get("OPENAI_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")
from openai import OpenAI
# Shared chat-completions client used by the summarization, generation and judge cells
client = OpenAI(api_key=API_KEY)

# Data Extraction

In [11]:
# Function to Extract elements from a PDF
def extract_pdf_elements(path, fname):
    """
    Extract images, tables, and text from a PDF file.

    Parameters:
    path: str - Directory that contains the PDF; it is also passed to
        ``partition_pdf`` as ``image_output_dir_path``.
    fname: str - File name of the PDF inside ``path``.

    Returns:
    Whatever ``partition_pdf`` returns for the file (the extracted elements).
    """
    # NOTE(review): a later cell reads images from "figures/", not from `path` --
    # confirm which directory the installed unstructured version writes to.
    return partition_pdf(
        # os.path.join works with or without a trailing separator on `path`;
        # plain `path + fname` silently built e.g. "datafile.pdf".
        filename=os.path.join(path, fname),
        extract_images_in_pdf=True,
        infer_table_structure=True,
        chunking_strategy="by_title",
        max_characters=4000,
        new_after_n_chars=3800,
        combine_text_under_n_chars=2000,
        image_output_dir_path=path,
    )


In [12]:
# Categorize elements by type
def categorize_elements(raw_pdf_elements):
    """
    Sort extracted PDF elements into texts, tables and images.

    The decision is made on the string form of each element's type: the first
    of the Table / Image / CompositeElement markers found in it wins, and
    elements matching none of them are dropped.

    Parameters:
    raw_pdf_elements: iterable of extracted elements.

    Returns:
    A tuple ``(texts, tables, images)``. Texts and tables hold ``str(element)``;
    images hold the element objects themselves.
    """
    texts, tables, images = [], [], []

    for item in raw_pdf_elements:
        class_repr = str(type(item))

        if "unstructured.documents.elements.Table" in class_repr:
            tables.append(str(item))
            continue
        if "unstructured.documents.elements.Image" in class_repr:
            # Images are kept as objects, not converted to text
            images.append(item)
            continue
        if "unstructured.documents.elements.CompositeElement" in class_repr:
            texts.append(str(item))

    return texts, tables, images

In [13]:
# Directory that holds the source PDF, and the PDF's file name
path = "data/"
fname = "Diagnostic_statistical_manual_of_mental_disorders.pdf"

# Run the extraction on that file
pdf_elements = extract_pdf_elements(path, fname)

# Split the extracted elements into texts, tables and images
texts, tables, images = categorize_elements(pdf_elements)

# Data Summarization

Tables and texts

In [14]:
# Define a function to summarize text or table elements

def summarize_text_or_tables(text_or_tables,type_):
    '''
    Ask the chat model for a retrieval-oriented summary of every element.

    text_or_tables = list of texts or tables to be summarized
    type_ = 'text' for texts, anything else for tables; the same prompt is
            used either way, so the flag does not change the returned list

    Returns a list with one summary string per input element, in input order.
    '''
    # Generation settings used for every request
    request_options = {"model" : "gpt-4o-mini", "temperature": 0.4 , "max_tokens" : 800}

    summaries = []
    for item in text_or_tables:
        # Single user turn carrying the instructions plus the raw element
        messages = [{'role' : 'user', 'content' : f"""You are an assistant tasked with summarizing tables, text and images for retrieval. 
        These summaries will be embedded and used to retrieve the raw text or table or image elements. 
        Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {item} """}]

        completion = client.chat.completions.create( messages = messages,**request_options)

        # Keep only the first choice's text
        summaries.append(completion.choices[0].message.content)

    return summaries
    
    

In [15]:
# Summarize texts (one chat-completion request per text element)
summarized_texts = summarize_text_or_tables(texts,type_="text")

# Summarize tables (one chat-completion request per table element)
summarized_tables = summarize_text_or_tables(tables,type_="table")

Images

In [16]:
# Function to encode the image
def encode_image(image_path):
    """Read the file at ``image_path`` in binary mode and return its Base64 text."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

# Function to summarize all images in a specified directory
def summarize_images(image_directory):
    """
    Describe every image file found directly inside ``image_directory``.

    Only regular files with an image extension are sent to the model; hidden
    files, other file types and sub-directories are skipped. Files are
    processed in sorted name order so the result order is reproducible.

    Parameters:
    image_directory: str - Directory to scan (not recursive).

    Returns:
    A list with one description string per image, in sorted file-name order.
    """
    import mimetypes  # local import: only this function needs it

    image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')

    # List to store summarized images
    summarized_images = []

    for filename in sorted(os.listdir(image_directory)):
        image_path = os.path.join(image_directory, filename)

        # Skip anything that is not an image file (e.g. .DS_Store, folders)
        if not filename.lower().endswith(image_extensions) or not os.path.isfile(image_path):
            continue

        # Label the data URL with the file's real type; fall back to jpeg
        mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"

        # Encode the image
        base64_image = encode_image(image_path)

        # Get the summary from the OpenAI API
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "user", "content": [
                    {"type": "text", "text": "What is in this image?"},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                ]},
            ],
        )

        # Append the summary to the list
        summarized_images.append(response.choices[0].message.content)

    return summarized_images



In [17]:
# Summarize all images in directory
# NOTE(review): the extraction cell passes path="data/" as the image output
# directory, while images are read from "figures/" here -- confirm they match.
image_directory = "figures/"
summarized_images = summarize_images(image_directory)


In [18]:
# Create a single list with all summarizations, in this order: texts, then tables, then images
combined_summarizations = summarized_texts + summarized_tables +  summarized_images


# Text Chunking

In [19]:

# Join all summarizations into a single string, one summarization per line break
combined_text = "\n".join(combined_summarizations)


In [20]:
# Split the text into smaller chunks

# Initialize our text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap = 100,
    separators = ["\n\n", "\n", " ", ""]      # list of potential separators
    )

# Split the combined text into chunks
chunks = text_splitter.split_text(combined_text)


In [21]:
# Inspect the chunks: print how many pieces the splitter produced
print(len(chunks))

12550


# Data cleaning

Clean the chunks:
- Remove punctuation and special characters (without substituting accented characters)
- Replace multiple spaces with a single space
- Remove English stopwords (not implemented in the cell below)
- Convert to lowercase

In [22]:
# Clean the chunks (in place, so `chunks` stays the same list object)

for idx, raw_chunk in enumerate(chunks):
    # Anything that is not an ASCII letter, digit or whitespace becomes a space,
    # then everything is lowercased
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', " ", raw_chunk).lower()

    # Newlines become spaces and both ends are trimmed
    cleaned = cleaned.replace('\n', ' ').strip()

    # Runs of whitespace shrink to a single space
    chunks[idx] = re.sub(r'\s+', ' ', cleaned)

# Inspect the chunks
print(chunks)



# Embedding Generation

In [23]:


# Create an instance of OpenAIEmbeddings for the model text-embedding-3-large,
# authenticated with the same API_KEY used for the chat client
embeddings = OpenAIEmbeddings(model="text-embedding-3-large",api_key=API_KEY)

In [24]:
# Create a function to embed a string
def embed_text(text):
    """
    Return the embedding of a single string.

    Thin wrapper around the module-level ``embeddings`` object's
    ``embed_query`` method.

    Args:
        text (str): The text to be embedded.

    Returns:
        list: The embedding produced for ``text``.
    """
    return embeddings.embed_query(text)




In [25]:
# Test the embed_text function
# Example text to embed
example_text = "This is an example sentence to embed."

# Calling the function to get embeddings
embedding_result = embed_text(example_text)

# Output the result (the full embedding vector)
print(embedding_result)


[0.010525023564696312, 0.005371965002268553, -0.03108472004532814, 0.003703897586092353, 0.019981782883405685, -0.0032704626210033894, -0.0029596155509352684, 0.041504666209220886, -0.035340260714292526, -0.025463201105594635, 0.007337744813412428, 0.04612797126173973, -0.0023838914930820465, -0.04395642131567001, 0.011750899255275726, 0.01521837804466486, -0.036811310797929764, 0.017521273344755173, -0.01879093237221241, -0.005336939822882414, 0.03824733942747116, -0.044972144067287445, -0.02702181413769722, 0.012267518788576126, 0.05716085806488991, -0.01880844496190548, -0.004566389136016369, 0.03238064423203468, -0.015752509236335754, 0.009150290861725807, 0.02157542109489441, -0.005836046766489744, 0.008791283704340458, 0.005017336457967758, 0.02457006275653839, -0.0033689707051962614, 0.03887778893113136, 0.014859371818602085, 0.006886797491461039, -0.01807291805744171, 0.06143391132354736, -0.024079712107777596, -0.01737241819500923, 0.02225841023027897, -0.00864680577069521, 0.

In [26]:
# Embed chunks
# embed_documents sends the chunks in batches instead of issuing one API
# request per chunk, which matters with thousands of chunks.
# Result: one embedding (a list of floats) per chunk, in chunk order.
embeded_chunks = embeddings.embed_documents(chunks)

In [27]:
# Inspect the embedded chunks: length of the first chunk's embedding vector
len(embeded_chunks[0]) 

3072

# Vector Database Creation

In [28]:
# Build the records that will be loaded into Chroma
## each record pairs a cleaned chunk with its embedding
## and carries the chunk's position (as a string) as its id

documents = [
    {"id" : str(position),          # unique identifier of the chunk
     "embedding" : vector,
     "text" : chunks[position]}
    for position, vector in enumerate(embeded_chunks)
]

# Peek at the text of the first record
documents[0]['text']

'summary of dsm 5 overview'

In [29]:
# Create a Chroma Database
import chromadb
from chromadb import Client


# Path for the database storage.
# The default is the original author's local directory; set the CHROMA_DB_DIR
# environment variable to store the database elsewhere on another machine.
db_directory = os.environ.get('CHROMA_DB_DIR', '/Users/tiagovhp/Ironhack/Week_8/Project_RAG/chroma_db')

# Initialize Chroma Client with persistent storage (local directory)
chroma_client = chromadb.PersistentClient(path=db_directory)


# Create the collection, or reuse it if it already exists in the store
collection_name = "my_documents"
collection = chroma_client.get_or_create_collection(name=collection_name)



In [30]:
# Ingest the documents into the collection database
# Records are added in batches rather than one `add` call per record; the batch
# is kept small so a single call never carries the whole data set.
batch_size = 1000
for start in range(0, len(documents), batch_size):
    batch = documents[start:start + batch_size]
    collection.add(ids = [record["id"] for record in batch],
                   embeddings = [record["embedding"] for record in batch],
                   documents = [record["text"] for record in batch])
    

# Querying the Database

In [31]:
# Evaluation questions about the DSM-5; each one is embedded, used for
# retrieval, answered by the generator and scored by the judge further down
queries_text = [
    "What are the diagnostic criteria for Major Depressive Disorder, and how do they differ from criteria for Persistent Depressive Disorder (Dysthymia)?",
    "How does the DSM-5 categorize and define different types of Anxiety Disorders, and what are the key symptoms used to differentiate them?",
    "What updates or changes were made in the DSM-5 compared to the DSM-IV regarding the classification of Autism Spectrum Disorder?",
    "What are the defining features of Post-Traumatic Stress Disorder (PTSD) in the DSM-5, and what are the specific criteria for diagnosis?",
    "How does the DSM-5 address Substance Use Disorders, and what are the factors that differentiate mild, moderate, and severe levels of substance-related disorders?",
    "What criteria are used in the DSM-5 to diagnose Schizophrenia, and how are the symptoms categorized into positive, negative, and cognitive symptoms?",
    "What are the diagnostic features of Attention-Deficit/Hyperactivity Disorder (ADHD), and how does the DSM-5 distinguish between the inattentive and hyperactive-impulsive presentations?",
    "How does the DSM-5 define and classify Personality Disorders, and what are some examples of criteria for diagnosing Borderline Personality Disorder?",
    "What considerations does the DSM-5 provide for diagnosing mental disorders in children and adolescents, and how do these differ from adult diagnoses?",
    "What is the DSM-5's approach to understanding and diagnosing Bipolar Disorder, and how does it differentiate between Bipolar I, Bipolar II, and Cyclothymic Disorder?"
]

In [32]:
# List of queries embeddings (one vector per entry of queries_text, same order)
# Built with a comprehension so no loop variable named `queries` is left
# behind; that name belongs to the DataFrame created in the next cell.
queries_embeddings = [embeddings.embed_query(query_text) for query_text in queries_text]

In [33]:
# Convert to pandas DataFrame: one row per query, with its text and its embedding

queries = pd.DataFrame({'query_text' : queries_text, 'query_embeddings' :queries_embeddings })
queries.head()

Unnamed: 0,query_text,query_embeddings
0,What are the diagnostic criteria for Major Dep...,"[0.0006456910632550716, -0.014854440465569496,..."
1,How does the DSM-5 categorize and define diffe...,"[-0.0021462230943143368, 0.002572633558884263,..."
2,What updates or changes were made in the DSM-5...,"[0.0011631035013124347, 0.01922764629125595, -..."
3,What are the defining features of Post-Traumat...,"[-0.013604141771793365, -0.011891506612300873,..."
4,How does the DSM-5 address Substance Use Disor...,"[0.03340025618672371, -0.010398311540484428, -..."


In [34]:
# Query the database collection
query_results = []
for query_vector in queries['query_embeddings']:
    # Ask the collection for the 5 closest stored chunks
    hits = collection.query(query_embeddings=[query_vector],n_results=5)

    # Only one query vector was sent, so take the first list of documents
    # and glue the chunks together with line breaks
    query_results.append("\n".join(hits['documents'][0]))

# Store the retrieved context in the 'query_result' column
queries['query_result'] = query_results

    

# Connecting to LLM

In [35]:

# Generation settings for the answering model
model_params = {"model" : "gpt-4o-mini", "temperature": 0.4 , "max_tokens" : 800}

answers = []
for user_question, retrieved_context in zip(queries['query_text'], queries['query_result']):
    # The retrieved chunks go into the system turn, the question into the user turn
    system_turn = {'role' : 'system', 'content' : f'You are a chatbot designed to help answers specific questions about mental disorders diagnosis. Here is the context of the context: {retrieved_context}'}
    user_turn = {'role' : 'user', 'content' : f'{user_question}'}

    completion = client.chat.completions.create( messages = [system_turn, user_turn],**model_params)

    # Keep the text of the first choice
    answers.append(completion.choices[0].message.content)

# Store the generated answers in the 'Generator_Response' column
queries['Generator_Response'] = answers


# Evaluation : LLM as a judge

In [36]:
# Generation settings for the judge model
model_params = {"model" : "gpt-4o-mini", "temperature": 0.4 , "max_tokens" : 400}

# Initialize empty lists for feedback and scores
scores = []
feedbacks = []

for question, answer in zip(queries['query_text'], queries['Generator_Response']):
    # Ask the judge for a score line followed by free-text feedback
    messages = [{'role' : 'user', 'content' : f'Evaluate the following response for the query regarding mental disorders diagnosis. User Question: {question}. Response: {answer}. Please provide a score out of 5 for relevance and accuracy as a number only, followed by your feedback.'}]

    llm_response = client.chat.completions.create( messages = messages,**model_params)

    # Trim the reply before splitting: a reply that starts with a blank line
    # would otherwise give an empty score and push the score into the feedback.
    # `or ""` guards against a reply with no text content.
    response_content = (llm_response.choices[0].message.content or "").strip()

    # First line is the score, everything after it is the feedback
    score_line, _, remainder = response_content.partition('\n')
    score = score_line.strip()
    feedback = remainder.strip()

    scores.append(score)
    feedbacks.append(feedback)

# Add score and feedback to dataframe
queries['Judge_Score'] = scores
queries['Judge_Feedback'] = feedbacks


In [37]:
# Print query - answer - feedback - score for the first five rows
# (fewer if the frame has fewer rows). Values are unpacked into plain names
# first: reusing the same quote character inside an f-string replacement field
# is a SyntaxError before Python 3.12.
preview = queries.head(5)
for question, answer, feedback, score in zip(preview['query_text'],
                                             preview['Generator_Response'],
                                             preview['Judge_Feedback'],
                                             preview['Judge_Score']):
    print(f'Question : {question}')
    print(f'Answer : {answer}')
    print(f'Feedback : {feedback}')
    print(f'Score : {score}')
    print('-' * 80)


Question : What are the diagnostic criteria for Major Depressive Disorder, and how do they differ from criteria for Persistent Depressive Disorder (Dysthymia)?
Answer : The diagnostic criteria for Major Depressive Disorder (MDD) and Persistent Depressive Disorder (Dysthymia) differ primarily in the duration and severity of symptoms.

### Major Depressive Disorder (MDD) Diagnostic Criteria:
According to the DSM-5, to be diagnosed with MDD, a person must experience at least five of the following symptoms during the same two-week period, and at least one of the symptoms must be either depressed mood or loss of interest or pleasure:

1. **Depressed mood** most of the day, nearly every day.
2. **Loss of interest or pleasure** in most activities.
3. **Significant weight loss** when not dieting, weight gain, or decrease/increase in appetite.
4. **Insomnia or hypersomnia** nearly every day.
5. **Psychomotor agitation or retardation** (restlessness or being slowed down).
6. **Fatigue or loss of

In [38]:
# Inspect the dataframe: first rows with retrieval, answer and judge columns
queries.head()

Unnamed: 0,query_text,query_embeddings,query_result,Generator_Response,Judge_Score,Judge_Feedback
0,What are the diagnostic criteria for Major Dep...,"[0.0006456910632550716, -0.014854440465569496,...",summary of major depressive disorder diagnosti...,The diagnostic criteria for Major Depressive D...,5,The response provides a clear and accurate ove...
1,How does the DSM-5 categorize and define diffe...,"[-0.0021462230943143368, 0.002572633558884263,...",this summary encapsulates the diagnostic crite...,The DSM-5 categorizes anxiety disorders into s...,5,The response is highly relevant and accurate. ...
2,What updates or changes were made in the DSM-5...,"[0.0011631035013124347, 0.01922764629125595, -...",summary of changes from dsm iv to dsm 5\nthis ...,"In the transition from DSM-IV to DSM-5, signif...",5,"The response is highly relevant and accurate, ..."
3,What are the defining features of Post-Traumat...,"[-0.013604141771793365, -0.011891506612300873,...",summary of posttraumatic stress disorder ptsd ...,The defining features of Post-Traumatic Stress...,5,The response accurately outlines the defining ...
4,How does the DSM-5 address Substance Use Disor...,"[0.03340025618672371, -0.010398311540484428, -...",summary severity and specifiers of substance u...,The DSM-5 (Diagnostic and Statistical Manual o...,4,The response accurately summarizes how the DSM...


# Export results

In [39]:
# Export as csv file only with relevant columns:
# the raw embedding vectors are dropped before writing
for_export = queries.drop(columns=['query_embeddings'])

# Semicolon-separated, UTF-8, with a header row and without the index column
for_export.to_csv('Multi-Modal-RAG-Results.csv', sep=';', index=False, header=True, encoding='utf-8')