RAG Configuration in this notebook:

Embedding model: thenlper/gte-large

Chunk size: 1000

Chunk overlap: 100

Generation Model: llama-3-8B-Instruct

Retriever: Pinecone vector store (similarity search, top 5 chunks)

Embedding Size: 1024

This notebook also includes RAG response generation and evaluation.

In [None]:
import pickle
from pinecone import Pinecone, ServerlessSpec
import getpass
import os
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import pinecone
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Index
from langchain_pinecone import PineconeVectorStore
from transformers import BitsAndBytesConfig
import bitsandbytes
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from tqdm import tqdm
from langchain.prompts import PromptTemplate
import pandas as pd

In [None]:
# Prompt for the Pinecone API key only when it is not already present in the
# environment, so the key never has to be written into the notebook.
if not os.getenv("PINECONE_API_KEY"):
    os.environ["PINECONE_API_KEY"] = getpass.getpass("Enter your Pinecone API key: ")

pinecone_api_key = os.environ.get("PINECONE_API_KEY")

# Use the module-qualified client class: the bare name `Pinecone` is rebound by
# `from langchain.vectorstores import Pinecone` in the import cell, so calling
# `Pinecone(api_key=...)` would reach the LangChain vector-store class instead
# of the Pinecone SDK client.
pc = pinecone.Pinecone(api_key=pinecone_api_key)

Enter your Pinecone API key: ··········


In [None]:
# Embedding model: run on the GPU when one is visible to torch, otherwise on CPU.
model_name = "thenlper/gte-large"
model_kwargs = {"device": "cpu"}
if torch.cuda.is_available():
    model_kwargs["device"] = "cuda"

# Embeddings are passed through without normalization.
encode_kwargs = dict(normalize_embeddings=False)

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  hf_embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# This cell only opens a handle to an index that is expected to exist already;
# it does NOT create one. The commented-out call below shows the creation
# settings (1024 dimensions, cosine metric, AWS us-east-1 serverless) and can be
# uncommented for a one-time setup if the index is missing.
#pc.create_index(name="rag-llm-gte-large",
#                  dimension=1024,metric="cosine",
#                   spec=ServerlessSpec(
#                    cloud="aws",
#                    region="us-east-1"))

pc_index = pc.Index("rag-llm-gte-large")

In [None]:
# Interactive Hugging Face Hub login (renders a widget in the notebook).
# NOTE(review): presumably required so the Meta-Llama-3 checkpoint loaded later
# can be downloaded -- confirm whether that repository needs authentication.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Vector store: LangChain wrapper over the "rag-llm-gte-large" Pinecone index,
# using the same embedding model for query encoding.
# NOTE(review): no API key is passed here; presumably it is picked up from the
# PINECONE_API_KEY environment variable set earlier -- confirm.
vectorstore = PineconeVectorStore(
    index_name="rag-llm-gte-large",
    embedding=hf_embeddings,
)

In [None]:
# Sanity check: fetch and print the index statistics (dimension, vector counts)
# before querying it.
index_stats = pc_index.describe_index_stats()
print("Index Stats:", index_stats)

Index Stats: {'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2644}},
 'total_vector_count': 2644}


In [None]:
# Similarity-search retriever that returns the 5 closest chunks per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [None]:
!pip install -U bitsandbytes



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import bitsandbytes

# Single source of truth for the generation checkpoint used by model and tokenizer.
LLM_MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load the weights in 4-bit and run the compute in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    quantization_config=bnb_config,
)
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably because this tokenizer ships without a pad token -- confirm.
llm_tokenizer.pad_token_id = llm_tokenizer.eos_token_id

In [None]:
# Decoding settings forwarded to the text-generation pipeline: low-temperature
# sampling, a mild repetition penalty, and at most 100 newly generated tokens.
# return_full_text=False keeps the prompt out of the returned text.
generation_settings = dict(
    do_sample=True,
    temperature=0.2,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
    max_new_tokens=100,
    return_full_text=False,
    eos_token_id=llm_tokenizer.eos_token_id,
)

llm_pipeline = pipeline(
    "text-generation",
    model=llm_model,
    tokenizer=llm_tokenizer,
    **generation_settings,
)

In [None]:
from langchain.llms import HuggingFacePipeline

# Wrap the transformers pipeline so LangChain chains can use it as an LLM.
llm_final_model = HuggingFacePipeline(pipeline=llm_pipeline)

  llm_final_model = HuggingFacePipeline(pipeline=llm_pipeline)


In [None]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Prompt template for the RAG chain.
# - {context} and {question} are the two placeholders filled in at query time.
# - This is a regular (non-raw) string, so each trailing "\n" escape adds a
#   newline on top of the physical line break, i.e. the rendered prompt has a
#   blank line after those lines.
template = """
You are a compassionate and knowledgeable mental health assistant that answers questions related to mental health.\n
Use the following pieces of retrieved context to provide a helpful and empathetic response to the user's question.\n
Use only the context provided and not any prior knowledge.\n
If you are unsure of the answer, tell that you do not know the answer.\n
Stick to the question and just answer the question in a short manner.\n
Avoid any additional greetings or elaborations.\n

Context: \n
------------------------------------------------------------------------------\n
{context}
------------------------------------------------------------------------------\n
Given the context and without any prior knowledge, answer the below question.\n
Question: {question}
Answer:
"""

# Declare the placeholders explicitly so they match the template above.
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [None]:
# Retrieval-augmented QA chain: the retriever supplies context chunks and the
# LLM answers using the custom prompt. return_source_documents=True asks the
# chain to also return the retrieved documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm_final_model,
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [None]:
# Smoke test: run a single question through the chain before the full batch.
query = "What is the Four-Fold Breath technique?"

result= qa_chain.invoke(query)

In [None]:
# The generated answer is stored under the 'result' key of the chain output.
answer = result['result']
print("Answer:", answer)

Answer: The Four-Fold Breath technique involves breathing in slowly to the count of 4 and breathing out even slower to the count of 6. This exercise should be practiced for about 5 minutes, three times a day, focusing solely on the breath. With regular practice, this technique can help individuals manage anger and anxiety by training the mind to stay focused on the present moment.


Response generation

In [None]:
# Load the evaluation set. The generation loop reads the Question, Ground_Truth,
# Source, Category and contexts columns from it.
df = pd.read_csv("U_eval_dataset_100q_with_context.csv")

In [None]:
# Preview the first rows to confirm the expected columns were loaded.
df.head()

Unnamed: 0,Question,Ground_Truth,Source,Category,contexts
0,What triggers Seasonal Affective Disorder (SAD)?,"SAD is triggered by seasonal changes, especial...",https://www.samhsa.gov/mental-health/seasonal-...,Mental Illness,"['Seasonal affective disorder, or SAD, is a co..."
1,What are the health risks of bulimia nervosa?,"Risks include dental decay, dehydration, and s...",https://www.samhsa.gov/mental-health/eating-di...,Mental Illness,['It is common that binge eating will lead to ...
2,How can someone with mental health problems ma...,By finding trustworthy individuals who provide...,https://www.samhsa.gov/mental-health/how-to-ta...,Mental Health,"[""Anyone can experience mental health problems..."
3,What are the risk factors for schizophrenia?,Risk factors include genetic predisposition an...,https://www.samhsa.gov/mental-health/schizophr...,Mental Illness,['Schizophrenia is a serious brain disorder th...
4,How does schizophrenia impact daily functioning?,"It affects work, socialization, and completing...",https://www.samhsa.gov/mental-health/schizophr...,Mental Illness,['Schizophrenia is a serious brain disorder th...


In [None]:
results = []

# Run every evaluation question through the RAG chain and keep the generated
# answer next to the row's reference fields.
for _, sample in tqdm(df.iterrows(), total=len(df), desc="Generating Responses"):
    # Read all reference fields first so a missing column fails before any
    # generation work is done for the row.
    record = {
        "Question": sample['Question'],
        "Ground_Truth": sample['Ground_Truth'],
        "Source": sample['Source'],
        "Category": sample['Category'],
        "Context": sample['contexts'],
    }

    chain_output = qa_chain.invoke(record["Question"])
    record["rag_response"] = chain_output['result']

    results.append(record)

# Build the frame once from the collected records.
results_df = pd.DataFrame(results)

In [None]:
# Display the generated responses next to their questions and references.
results_df

Unnamed: 0,Question,Ground_Truth,Source,Category,Context,rag_response
0,What triggers Seasonal Affective Disorder (SAD)?,"SAD is triggered by seasonal changes, especial...",https://www.samhsa.gov/mental-health/seasonal-...,Mental Illness,"['Seasonal affective disorder, or SAD, is a co...","According to the context, Seasonal Affective D..."
1,What are the health risks of bulimia nervosa?,"Risks include dental decay, dehydration, and s...",https://www.samhsa.gov/mental-health/eating-di...,Mental Illness,['It is common that binge eating will lead to ...,The health risks of bulimia nervosa include ch...
2,How can someone with mental health problems ma...,By finding trustworthy individuals who provide...,https://www.samhsa.gov/mental-health/how-to-ta...,Mental Health,"[""Anyone can experience mental health problems...","Find someone—such as a parent, family member, ..."
3,What are the risk factors for schizophrenia?,Risk factors include genetic predisposition an...,https://www.samhsa.gov/mental-health/schizophr...,Mental Illness,['Schizophrenia is a serious brain disorder th...,"According to the provided context, risk factor..."
4,How does schizophrenia impact daily functioning?,"It affects work, socialization, and completing...",https://www.samhsa.gov/mental-health/schizophr...,Mental Illness,['Schizophrenia is a serious brain disorder th...,Schizophrenia can be extremely disruptive to a...
...,...,...,...,...,...,...
95,How does untreated OCD impact daily life?,It can lead to compulsions that dominate daily...,https://www.samhsa.gov/mental-health/anxiety-d...,Anxiety,"[""counseling can be very effective for ocd . i...","Left untreated, OCD can take over your life."
96,How can parents identify mental health concern...,"By observing signs like extreme sadness, withd...",https://www.samhsa.gov/mental-health/how-to-ta...,Mental Health,"[""As a parent or caregiver, you want the best ...","As a parent, it's essential to recognize poten..."
97,Why do people develop social anxiety disorder?,It often develops due to genetics or negative ...,https://www.samhsa.gov/mental-health/anxiety-d...,Anxiety,['Signs & Symptoms of Social Anxiety Disorder\...,It's not fully known what are the specific cau...
98,What are signs a child may need mental health ...,"Signs include drastic mood swings, extreme fea...",https://www.samhsa.gov/mental-health/how-to-ta...,Mental Health,"[""As a parent or caregiver, you want the best ...","According to the context, some signs a child m..."


In [None]:
# Persist the generated responses for the evaluation step.
# index=False keeps the positional row index out of the file; otherwise it is
# written as an unnamed first column and comes back as "Unnamed: 0" on reload.
results_df.to_csv("gte-large-1000-100-responses.csv", index=False)

In [None]:
import pandas as pd

In [None]:
# Reload the saved responses from disk so the evaluation can run from the file
# written by the generation step.
eval_df = pd.read_csv("gte-large-1000-100-responses.csv")

In [None]:
# Convert the evaluation frame into a list of per-row dicts for the scoring loop.
# The isinstance guard keeps this cell re-runnable: once `eval_df` already holds
# the list, calling .to_dict() on it again would raise AttributeError.
if isinstance(eval_df, pd.DataFrame):
    eval_df = eval_df.to_dict(orient='records')

In [None]:
# Evaluation prompt for the LLM judge.
# - {Question}, {rag_response} and {Ground_Truth} are filled in with str.format.
# - Doubled braces ({{...}}) are escaped, so they are emitted as literal
#   {feedback} / {integer between 1 and 10} markers in the output-format section.
# - The final "RAG Overall Score: <n>" line is what the score parser looks for,
#   so keep its wording in sync with the regex in evaluate_responses.
evaluation_prompt = """
### Task Description:
You are provided with an instruction (or query), a response to evaluate, a reference answer that represents an ideal response, and evaluation criteria. Additionally, you are given responses from a RAG model: a RAG response. Your task is to evaluate and compare the RAG response with the reference answer based on the provided criteria.

1. Assess the quality of the response strictly based on the given rubric and criteria (relevance, semantic similarity, accuracy, factual correctness, completeness, clarity, and conciseness), not general impressions.
2. Write short feedback for each response based on the rubric, highlighting strengths and weaknesses for each criterion.
3. Provide feedback in the specified format.
4. Finally, assign an overall score out of 10 for RAG response.


### The instruction to evaluate:
{Question}

### RAG Response:
{rag_response}

### Reference Answer:
{Ground_Truth}

### Rubric for Individual Criteria:
- **Relevance**: How effectively does the response address the specific requirements or intent of the query?
- **Semantic Similarity**: To what extent does the response capture the meaning or intent of the reference answer, even if the phrasing differs?
- **Accuracy**: Does the response provide correct and logically sound information in relation to the query?
- **Factual Correctness**: Are the facts presented in the response verified and consistent with reliable sources or the reference answer?
- **Completeness**: Does the response thoroughly cover all essential aspects of the query without omitting critical information?
- **Clarity**: Is the response expressed in a manner that is straightforward, unambiguous, and easy to comprehend?
- **Conciseness**: Does the response convey the necessary information succinctly, avoiding extraneous details or redundancy?

### Output Format:
Feedback:
- RAG Response:
    - Relevance: {{feedback}}
    - Semantic Similarity: {{feedback}}
    - Accuracy: {{feedback}}
    - Factual Correctness: {{feedback}}
    - Completeness: {{feedback}}
    - Clarity: {{feedback}}
    - Conciseness: {{feedback}}

### Overall Scores:
- RAG Overall Score: {{integer between 1 and 10}}
"""

In [None]:
import openai

In [None]:
!pip install openai



In [None]:
!pip install httpx==0.27.2



In [None]:
from openai import OpenAI

# Never hard-code the key in the notebook: read it from the environment and
# prompt for it only when it is missing (same pattern as the Pinecone key).
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
import re

# Matches the final score line. Tolerates optional Markdown bold markers and
# flexible spacing around the colon, e.g. "RAG Overall Score: 9",
# "**RAG Overall Score:** 9" or "RAG Overall Score: **9**".
_OVERALL_SCORE_RE = re.compile(r'RAG Overall Score\**\s*:\s*\**\s*(\d+)', re.IGNORECASE)


def _parse_overall_score(text):
    """Return the integer that follows "RAG Overall Score:" in `text`, or raise ValueError."""
    match = _OVERALL_SCORE_RE.search(text)
    if match is None:
        # Happens when the reply is truncated or ignores the output format.
        raise ValueError(
            "Could not find 'RAG Overall Score: <n>' in the evaluator output; "
            f"output ends with: {text[-200:]!r}"
        )
    return int(match.group(1))


def evaluate_responses(instruction, rag_response, reference_answer):
    """Score one RAG response against its reference answer with the judge model.

    Fills `evaluation_prompt` with the three inputs, sends it to the chat
    completion endpoint through the module-level `client`, prints the raw
    feedback, and extracts the overall score from it.

    Args:
        instruction: The question that was asked.
        rag_response: The answer produced by the RAG chain.
        reference_answer: The ground-truth answer to compare against.

    Returns:
        The overall score reported by the judge, as an int.

    Raises:
        ValueError: If the judge output contains no parseable
            "RAG Overall Score: <n>" line.
    """
    prompt = evaluation_prompt.format(
        Question=instruction,
        rag_response=rag_response,
        Ground_Truth=reference_answer
    )

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant evaluating model responses."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000,
        temperature=0.5
    )

    # `content` can be None (e.g. an empty completion); normalise to a string
    # so both the print and the parser receive text.
    s = response.choices[0].message.content or ""
    print("Evaluation Output:")
    print(s)

    return _parse_overall_score(s)

evaluations = []

# Score every saved response with the judge model and keep the score next to
# the question, reference answer and generated answer.
for sample in tqdm(eval_df, desc="Evaluating responses", unit="sample"):
    question = sample["Question"]
    candidate = sample["rag_response"]
    reference = sample["Ground_Truth"]

    evaluations.append({
        "Question": question,
        "Ground_Truth": reference,
        "rag_response": candidate,
        "RAG_Score": evaluate_responses(question, candidate, reference),
    })

Evaluating responses:   1%|          | 1/100 [00:02<04:52,  2.95s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about what triggers Seasonal Affective Disorder (SAD) by mentioning seasonal changes, which is the core aspect of the question.
    - Semantic Similarity: The meaning of the RAG response aligns well with the reference answer, capturing the essential idea that SAD is related to seasonal changes and sunlight exposure.
    - Accuracy: The information provided is accurate, correctly linking SAD to seasonal variations and the associated decrease in sunlight.
    - Factual Correctness: The response is factually correct, as it accurately reflects the known triggers of SAD, consistent with established knowledge on the topic.
    - Completeness: While the response covers the main trigger of SAD, it could be slightly more complete by briefly mentioning the psychological aspect of depression in relation to these seasonal changes.
    - Clarity: The response is clear and straightforward, maki

Evaluating responses:   2%|▏         | 2/100 [00:04<03:43,  2.28s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the health risks associated with bulimia nervosa.
    - Semantic Similarity: The response captures the intent of the reference answer but expands on it with additional details, which is somewhat different in focus.
    - Accuracy: The information provided is accurate and aligns with known health risks associated with bulimia nervosa.
    - Factual Correctness: The facts presented are consistent with reliable sources regarding the health risks of bulimia nervosa.
    - Completeness: The response is more complete than the reference answer, listing multiple health risks rather than just a few.
    - Clarity: The response is clear and easy to understand, effectively conveying the information without ambiguity.
    - Conciseness: While the response is thorough, it could be considered slightly less concise due to the additional details provided.

### Overall Scores:
- RAG Overall S

Evaluating responses:   3%|▎         | 3/100 [00:07<04:23,  2.71s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the importance of finding supportive individuals to maintain a support system for mental health. However, it could be more focused on the broader concept of maintaining a support system rather than just finding one individual.
    - Semantic Similarity: The response captures the essence of the reference answer by emphasizing trust and support, but it includes additional details that diverge slightly from the concise nature of the reference.
    - Accuracy: The information presented is accurate in terms of identifying qualities of a supportive individual, which aligns with the intent of the query.
    - Factual Correctness: The facts are consistent with general knowledge about support systems in mental health and do not present any inaccuracies.
    - Completeness: The response covers essential aspects, such as trust, advice, and space for growth, but it could mention other ways to mai

Evaluating responses:   4%|▍         | 4/100 [00:10<03:57,  2.48s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is not relevant to the query as it fails to provide any information about the risk factors for schizophrenia, which is the main focus of the instruction.
    - Semantic Similarity: There is little to no semantic similarity to the reference answer since the RAG response does not address the topic at all.
    - Accuracy: The response does not provide any accurate information regarding the risk factors for schizophrenia, as it does not attempt to answer the question.
    - Factual Correctness: Since the response does not present any facts related to the query, it cannot be assessed for factual correctness.
    - Completeness: The response is incomplete as it does not cover the essential aspects of the query, which is to identify the risk factors for schizophrenia.
    - Clarity: The response is clear in stating that it does not know the answer, but this clarity does not contribute to addressing the question.
    - 

Evaluating responses:   5%|▌         | 5/100 [00:12<04:02,  2.56s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it directly addresses how schizophrenia impacts daily functioning by mentioning specific areas affected, such as work, socialization, and daily tasks.
    - Semantic Similarity: The response captures the essence of the reference answer, though it provides more detail. The core meaning aligns well, despite differences in phrasing.
    - Accuracy: The information presented is accurate regarding the effects of schizophrenia on daily functioning and the importance of treatment.
    - Factual Correctness: The facts are consistent with what is known about schizophrenia and its impact on daily life, aligning with reliable sources.
    - Completeness: The response is complete, covering multiple aspects of daily functioning affected by schizophrenia and mentioning treatment options, which adds depth.
    - Clarity: The response is clear and easy to understand, effectively communicating the impact of

Evaluating responses:   6%|▌         | 6/100 [00:19<05:59,  3.82s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses how to manage symptoms of social anxiety disorder, aligning with the query's intent.
    - Semantic Similarity: The response captures the essence of the reference answer, though it elaborates more on various methods rather than directly mirroring the phrasing.
    - Accuracy: The information provided about treatment options, such as therapy, medication, and techniques like deep breathing, is accurate and appropriate for managing social anxiety disorder.
    - Factual Correctness: The facts presented are consistent with common knowledge about social anxiety disorder treatments and align with the reference answer.
    - Completeness: The response is complete as it covers a variety of management strategies, including therapy, medication, support groups, and practical techniques, which enhances its thoroughness.
    - Clarity: The response is clear and easy to understand, with a logical f

Evaluating responses:   7%|▋         | 7/100 [00:22<05:29,  3.54s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response addresses the query well by listing symptoms of anxiety disorders, aligning with the intent of the instruction.
    - Semantic Similarity: The response captures many of the symptoms mentioned in the reference answer but includes additional details. The core symptoms are similar, but the RAG response is more extensive.
    - Accuracy: The symptoms listed in the RAG response are accurate and relevant to anxiety disorders, reflecting a correct understanding of the condition.
    - Factual Correctness: The information provided is factually correct and consistent with what is known about anxiety disorders.
    - Completeness: The response is quite comprehensive, covering a wide range of symptoms, although it may include some that are less common or not universally recognized as symptoms of anxiety disorders.
    - Clarity: The response is generally clear, but the long list of symptoms may overwhelm the reader, making

Evaluating responses:   8%|▊         | 8/100 [00:24<04:57,  3.24s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses how mental health impacts decision-making, aligning well with the query's intent.
    - Semantic Similarity: The response captures the essence of the reference answer but expands on it significantly, which diverges from the brevity of the reference. However, it maintains the core idea of mental health influencing decision-making.
    - Accuracy: The information provided is accurate in describing the effects of mental health on decision-making processes.
    - Factual Correctness: The response is factually correct and logically sound, detailing how mental health issues can complicate decision-making.
    - Completeness: The response is quite comprehensive, covering various aspects of decision-making affected by mental health, which goes beyond the reference answer's scope.
    - Clarity: The response is mostly clear, though it could benefit from more straightforward language i

Evaluating responses:   9%|▉         | 9/100 [00:26<04:19,  2.85s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about ADHD by defining the condition and its symptoms, making it relevant to the question asked.
    - Semantic Similarity: The response captures the essence of the reference answer, although it uses slightly different phrasing. It conveys similar meanings regarding the symptoms of ADHD.
    - Accuracy: The information presented is accurate in describing ADHD and its symptoms, aligning well with established definitions.
    - Factual Correctness: The facts provided are consistent with reliable sources regarding ADHD, confirming the validity of the symptoms mentioned.
    - Completeness: The response covers the main symptoms of ADHD but could be improved by mentioning impulsivity explicitly, which is a critical aspect of the disorder.
    - Clarity: The response is clear and easy to understand, effectively communicating the main points without ambiguity.
    - Conciseness: The resp

Evaluating responses:  10%|█         | 10/100 [00:29<04:18,  2.87s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about treating SAD, mentioning relevant treatment options. However, it could have directly stated that these treatments are specifically for SAD to improve relevance further.
    - Semantic Similarity: The meaning of the RAG response is similar to the reference answer, as both mention light therapy, antidepressants, talk therapy, and vitamin D supplements. The phrasing differs slightly, but the intent is captured.
    - Accuracy: The information presented is accurate regarding the treatment options for SAD, aligning with established treatment methods.
    - Factual Correctness: The facts are consistent with reliable sources regarding SAD treatment, confirming the effectiveness of the mentioned therapies.
    - Completeness: The response covers the main treatment options for SAD but could have emphasized that these methods are typically used in combination for better context.
    -

Evaluating responses:  11%|█         | 11/100 [00:32<04:08,  2.80s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the importance of mental health throughout life, aligning with the query's intent.
    - Semantic Similarity: The response captures the essence of the reference answer by discussing mental health's role in life stages, though it lacks the specific mention of "emotional, psychological, and social well-being."
    - Accuracy: The information presented is accurate, indicating that mental health affects thinking, mood, and behavior, which is a valid point.
    - Factual Correctness: The facts are consistent with general knowledge about mental health and its impact across life stages.
    - Completeness: The response is somewhat incomplete; it could expand on the significance of mental health in emotional, psychological, and social contexts, which the reference answer emphasizes.
    - Clarity: The response is clear and easy to understand, conveying its message effectively.
    - Concisene

Evaluating responses:  12%|█▏        | 12/100 [00:35<04:11,  2.86s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Semantic Similarity: The response captures the intent of the reference answer well, although it provides additional details that are not present in the reference.
    - Conciseness: While the response is somewhat longer than the reference, it remains concise and avoids unnecessary elaboration.

### Overall Scores:
- RAG Overall Score: 9


Evaluating responses:  13%|█▎        | 13/100 [00:38<04:17,  2.96s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the reasons why people engage in self-harm, aligning with the query.
    - Semantic Similarity: The response captures the essence of the reference answer, particularly the idea of self-harm providing relief from emotional distress, although it expands slightly on the reasons.
    - Accuracy: The information presented is accurate, as it correctly identifies emotional relief as a primary reason for self-harm.
    - Factual Correctness: The response is factually correct and consistent with what is generally understood about self-harm.
    - Completeness: The response is somewhat complete but could have included additional aspects of self-harm, such as its potential as a maladaptive coping mechanism or the underlying mental health issues.
    - Clarity: The response is clear and easy to understand, with straightforward language.
    - Conciseness: The response is concise but could be slig

Evaluating responses:  14%|█▍        | 14/100 [00:41<04:12,  2.93s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about the role of communication in preventing suicide, emphasizing the importance of open conversations and support. 
    - Semantic Similarity: The response captures the essence of the reference answer by highlighting support and the reduction of stigma, although it expands on these points more comprehensively.
    - Accuracy: The information provided is accurate and logically sound, discussing the role of communication in a way that aligns with established understanding of suicide prevention.
    - Factual Correctness: The facts presented are consistent with reliable sources regarding suicide prevention and the importance of communication.
    - Clarity: The response is clear and straightforward, making it easy to understand the key points being made about communication's role in suicide prevention.
    - Conciseness: While the response is comprehensive, it could be slightly mor

Evaluating responses:  15%|█▌        | 15/100 [00:44<04:12,  2.98s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the query about how therapy can help with self-harm by mentioning a specific therapeutic approach (DBT). However, it could have included more general insights about therapy's role in addressing emotional pain and coping mechanisms, similar to the reference answer.
    - Semantic Similarity: The response captures some aspects of the reference answer but does not directly convey the idea of providing healthier coping mechanisms. It focuses on DBT specifically rather than therapy in general.
    - Accuracy: The information about DBT being effective for self-harm is accurate, but the response could have benefited from mentioning other types of therapy that may also help.
    - Factual Correctness: The claims made about DBT are factually correct and supported by evidence in the field of psychology. However, the response lacks a broader context about therapy as a whole.
    - Compl

Evaluating responses:  16%|█▌        | 16/100 [00:47<04:09,  2.97s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by identifying emotional triggers linked to self-harm, aligning well with the intent of the instruction.
    - Semantic Similarity: The response captures the essence of the reference answer, mentioning the same emotional triggers (loneliness, anger, hopelessness) although with slightly different phrasing.
    - Accuracy: The information presented is accurate, as it correctly identifies common emotional triggers for self-harm.
    - Factual Correctness: The response is factually correct and aligns with the reference answer. The emotional triggers mentioned are widely recognized in discussions about self-harm.
    - Completeness: While the response covers the main emotional triggers, it could be considered slightly less complete as it does not explicitly state that these feelings are the only triggers or that they can vary among individuals.
    - Clarity: The response is clear and 

Evaluating responses:  17%|█▋        | 17/100 [00:52<05:07,  3.71s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the signs of suicide and how to intervene, which is aligned with the query. However, it does not directly mention asking about suicidal thoughts, which is a critical aspect of intervention.
    - Semantic Similarity: The response captures some intent similar to the reference answer but lacks the directness of asking about suicidal thoughts, which is the core of the reference answer.
    - Accuracy: The information provided is generally accurate regarding intervention strategies but misses a crucial element of directly asking the individual about their feelings.
    - Factual Correctness: The facts presented are consistent with general guidelines for suicide intervention, but the omission of directly asking about suicidal thoughts reduces the overall correctness.
    - Completeness: The response is somewhat complete in terms of intervention strategies but lacks the essential step of di

Evaluating responses:  18%|█▊        | 18/100 [00:59<06:28,  4.73s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it addresses the challenges faced by individuals with schizophrenia, directly aligning with the query.
    - Semantic Similarity: The response captures a broader range of challenges compared to the reference answer, which focuses on socialization, employment, and daily routines. While it does not match the reference answer exactly, it conveys a similar intent.
    - Accuracy: The information provided is accurate and reflects common challenges associated with schizophrenia.
    - Factual Correctness: The facts presented are consistent with known issues faced by those with schizophrenia, such as difficulties in communication and social relationships.
    - Completeness: The response is comprehensive, covering various challenges beyond those mentioned in the reference answer, thus providing a fuller picture of the difficulties faced.
    - Clarity: The response is clear and easy to understand,

Evaluating responses:  19%|█▉        | 19/100 [01:03<06:07,  4.54s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the differences between bulimia nervosa and binge eating disorder, which is the core of the query.
    - Semantic Similarity: The response captures the intent of the reference answer well, highlighting the key difference of purging behaviors in bulimia nervosa versus binge eating disorder.
    - Accuracy: The information presented is accurate; it correctly describes the behaviors associated with both disorders.
    - Factual Correctness: The facts are consistent with established definitions of both disorders, thus maintaining factual correctness.
    - Completeness: While the response covers the primary distinction, it could include more detail about the implications of these behaviors or the psychological aspects involved for greater completeness.
    - Clarity: The response is clear and easy to understand, with no ambiguous language or complex phrasing.
    - Conciseness: T

Evaluating responses:  20%|██        | 20/100 [01:06<05:08,  3.85s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it directly addresses the query about the physical symptoms linked to panic disorder.
    - Semantic Similarity: The response captures the essence of the reference answer but goes beyond it by providing additional symptoms, thus maintaining semantic similarity.
    - Accuracy: The symptoms listed are accurate and commonly associated with panic disorder.
    - Factual Correctness: The information presented is factually correct and aligns with known symptoms of panic disorder.
    - Completeness: The response is complete as it lists multiple symptoms, providing a broader understanding than the reference answer.
    - Clarity: The response is clear and easy to understand, listing symptoms in a straightforward manner.
    - Conciseness: While the response is longer than the reference, it remains concise without unnecessary details.

### Overall Scores:
- RAG Overall Score: 9


Evaluating responses:  21%|██        | 21/100 [01:08<04:37,  3.52s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about why panic attacks are frightening by discussing the emotional and physical sensations involved. However, it could be more focused on the core reasons rather than listing symptoms.
    - Semantic Similarity: The response captures the essence of the reference answer by mentioning intense fear and physical symptoms. However, it elaborates more than the reference, which is less direct.
    - Accuracy: The information provided is accurate regarding the nature of panic attacks and their effects on individuals.
    - Factual Correctness: The response includes correct facts about panic attacks, aligning well with reliable information on the topic.
    - Completeness: The response is thorough, covering a wide range of symptoms and feelings associated with panic attacks, which adds depth but may stray from the simplicity of the reference answer.
    - Clarity: The response is clear an

Evaluating responses:  22%|██▏       | 22/100 [01:11<04:15,  3.28s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is partially relevant as it addresses the population targeted by the Zero Suicide framework, but it is too narrow as it only mentions adults 18 and older without acknowledging other populations that may be included.
    - Semantic Similarity: The response captures some of the intent of the reference answer but lacks broader context. The reference answer indicates a focus on individuals across healthcare systems, which is not mentioned in the RAG response.
    - Accuracy: The response is accurate in stating that the framework targets adults, but it is incomplete regarding the broader scope of the target population.
    - Factual Correctness: The information provided is factually correct but does not encompass the full range of the Zero Suicide framework’s target populations.
    - Completeness: The response lacks completeness as it omits critical information about other populations and the overarching goal of the

Evaluating responses:  23%|██▎       | 23/100 [01:14<03:57,  3.09s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the relationship between insomnia, depression, and feelings of worthlessness, aligning well with the query.
    - Semantic Similarity: The response captures the essence of the reference answer by discussing how depression affects sleep and emotional states, though it does not explicitly mention racing thoughts.
    - Accuracy: The information presented is accurate, noting the bidirectional relationship between insomnia and depression, which is a well-established concept in psychology.
    - Factual Correctness: The facts are consistent with established knowledge about depression and insomnia, and the mention of feelings of worthlessness is appropriate.
    - Completeness: The response covers the main aspects of the query, including the interplay between insomnia, depression, and feelings of worthlessness, but could benefit from a more explicit connection to racing thoughts.
 

Evaluating responses:  24%|██▍       | 24/100 [01:16<03:41,  2.91s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Factual Correctness: The facts are consistent with established knowledge on the topic, making the response factually correct.
    - Conciseness: While the response is informative, it is less concise than the reference answer due to the length of the list, which could be streamlined.

### Overall Scores:
- RAG Overall Score: 8


Evaluating responses:  25%|██▌       | 25/100 [01:19<03:34,  2.86s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by listing physical symptoms associated with bulimia nervosa. It directly answers the question, making it relevant.
    - Semantic Similarity: The response captures the essence of the reference answer but includes additional details. While it differs in phrasing, the core symptoms mentioned are consistent with the reference.
    - Accuracy: The information provided is accurate regarding the symptoms of bulimia nervosa, including those related to oral health and dehydration.
    - Factual Correctness: The facts presented are consistent with known medical information about bulimia nervosa and its physical symptoms, confirming the response's reliability.
    - Completeness: The response is more comprehensive than the reference answer, providing additional symptoms and potential health complications, which enhances its completeness.
    - Clarity: The response is clearly articulated, 

Evaluating responses:  26%|██▌       | 26/100 [01:22<03:28,  2.82s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is somewhat relevant as it addresses the concern of parents regarding their child's mental health, but it focuses too heavily on specific behavioral indicators rather than providing a broader range of resources available to parents.
    - Semantic Similarity: The response does not capture the intent of the reference answer effectively. It diverges significantly by emphasizing specific behaviors rather than listing resources.
    - Accuracy: The information provided about behaviors is accurate but does not directly address the query about resources for parents.
    - Factual Correctness: The response is factually correct regarding the behaviors mentioned, but it fails to present the intended resources clearly.
    - Completeness: The response lacks completeness as it does not mention any specific resources like crisis helplines or healthcare providers, which are essential for addressing the query.
    - Clarity: 

Evaluating responses:  27%|██▋       | 27/100 [01:24<03:12,  2.64s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response directly addresses the query about the symptoms of Generalized Anxiety Disorder, listing multiple relevant symptoms. This is a strength.
    - Semantic Similarity: The response captures the essence of the reference answer but provides more symptoms. While it maintains the meaning, it does not closely mirror the phrasing of the reference.
    - Accuracy: The symptoms listed are accurate and align with recognized criteria for Generalized Anxiety Disorder.
    - Factual Correctness: All symptoms mentioned are factually correct and consistent with established medical knowledge about Generalized Anxiety Disorder.
    - Completeness: The response is thorough, covering a wide range of symptoms, which adds to its completeness compared to the reference answer.
    - Clarity: The response is clear and easy to understand, presenting the symptoms in a straightforward manner.
    - Conciseness: While the list is comprehensiv

Evaluating responses:  28%|██▊       | 28/100 [01:26<03:02,  2.54s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it discusses the treatment of schizophrenia, focusing on antipsychotic medications and other therapeutic options.
    - Semantic Similarity: The response captures some of the key ideas from the reference answer, such as the use of antipsychotic medications, but it lacks mention of individual therapy and family support.
    - Accuracy: The information provided about antipsychotic medications and Electroconvulsive therapy (ECT) is accurate and reflects current treatment practices.
    - Factual Correctness: The facts presented are consistent with reliable sources regarding the treatment of schizophrenia, particularly regarding medication and ECT.
    - Completeness: The response is incomplete as it does not mention individual therapy or family support, which are important components of effective treatment.
    - Clarity: The response is clear and easy to understand, effectively communicating the tre

Evaluating responses:  29%|██▉       | 29/100 [01:28<02:49,  2.39s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by distinguishing between panic disorder and occasional panic attacks, which is the core of the question.
    - Semantic Similarity: The response captures the essence of the reference answer, although it uses slightly different phrasing and expands on the concept of avoidance.
    - Accuracy: The information presented is accurate, clearly stating that panic disorder involves repeated panic attacks while occasional panic attacks do not necessarily lead to a disorder.
    - Factual Correctness: The facts are consistent with established definitions of panic disorder and occasional panic attacks, aligning well with the reference answer.
    - Completeness: The response is mostly complete, as it explains the relationship between panic disorder and occasional panic attacks, but it could briefly mention the situational nature of occasional panic attacks as noted in the reference answer.


Evaluating responses:  30%|███       | 30/100 [01:31<03:04,  2.63s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is somewhat relevant as it discusses behavioral analysis and Dialectical Behavior Therapy (DBT) in the context of psychotherapy for eating disorders. However, it does not directly address the broader aspects of how psychotherapy aids recovery, which makes it less relevant to the query.
    - Semantic Similarity: The response captures some elements related to emotional issues and symptom management, but it does not align closely with the succinctness of the reference answer, which emphasizes these aspects more clearly.
    - Accuracy: The mention of behavioral analysis and DBT is accurate in the context of psychotherapy for eating disorders. However, the response could benefit from a clearer connection to how these methods specifically aid recovery.
    - Factual Correctness: The information about DBT is factually correct and supported by evidence, which is a strength of the response. However, it lacks a direct c

Evaluating responses:  31%|███       | 31/100 [01:36<03:32,  3.08s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the query about treatments available for anxiety disorders, listing appropriate options.
    - Semantic Similarity: The response captures the overall intent of the reference answer but includes more specific treatments, which slightly diverges from the broader phrasing of the reference.
    - Accuracy: The information provided about treatments is accurate and aligns with common knowledge regarding anxiety disorder treatments.
    - Factual Correctness: The treatments mentioned are factually correct, though the inclusion of PTSD treatments (Cognitive Processing Therapy and EMDR) may be seen as tangential.
    - Completeness: The response is fairly complete, listing multiple treatment options, but could include additional stress management techniques mentioned in the reference answer for thoroughness.
    - Clarity: The response is clear and easy to understand, with a logical structure 

Evaluating responses:  32%|███▏      | 32/100 [01:38<03:23,  2.99s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the role of CBT in managing depression and self-esteem issues, highlighting the impact of thought processes on beliefs and behaviors.
    - Semantic Similarity: The response captures the essence of the reference answer, though it uses different phrasing and adds some details that are not present in the reference.
    - Accuracy: The information provided is accurate regarding the principles of CBT and its application to depression and self-esteem.
    - Factual Correctness: The facts presented align well with established knowledge about CBT and its functions, making it factually correct.
    - Completeness: The response is fairly complete, covering the main aspects of CBT's role but could benefit from a more explicit mention of its effectiveness in treating depression and self-esteem issues.
    - Clarity: The response is generally clear, but some sentences are a bit convoluted, which 

Evaluating responses:  33%|███▎      | 33/100 [01:43<03:54,  3.50s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response partially addresses the query by mentioning suicide risk, which is related to self-harm. However, it lacks broader coverage of other health risks associated with self-harm, such as infections or injuries, making it only somewhat relevant.
    - Semantic Similarity: The response captures some aspects of the reference answer's intent, focusing on severe consequences of self-harm. However, it does not align closely with the reference answer's mention of infections and scars, which are also significant risks.
    - Accuracy: The information about the risk of suicide is accurate; however, it does not encompass the full range of health risks associated with self-harm as indicated in the reference answer.
    - Factual Correctness: The statement regarding the risk of suicide is factually correct. However, it fails to mention other important health risks that are also factual.
    - Completeness: The response is incompl

Evaluating responses:  34%|███▍      | 34/100 [01:45<03:22,  3.07s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the causes of social anxiety disorder, which is the main focus of the query.
    - Semantic Similarity: The response captures the essence of the reference answer but lacks some of the precise phrasing and structure, leading to a slightly lower semantic similarity.
    - Accuracy: The response is accurate in identifying genetics and learned behavior as potential causes of social anxiety disorder.
    - Factual Correctness: The information provided is factually correct and aligns with common understandings of the disorder's causes.
    - Completeness: The response is somewhat complete but could benefit from a more comprehensive explanation of the causes, as it only mentions two factors.
    - Clarity: The response is clear and easy to understand, though the phrasing could be more polished.
    - Conciseness: The response is concise, but the phrase "It’s not fully known what are the spec

Evaluating responses:  35%|███▌      | 35/100 [01:51<04:22,  4.04s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it describes the Four-Fold Breath technique in relation to the query. However, it could be more directly tied to the relaxation and sleep benefits mentioned in the reference.
    - Semantic Similarity: The response captures some aspects of the reference answer but lacks the specific mention of its use for relaxation and sleep, which is a key point in the reference.
    - Accuracy: The information about the technique's breathing counts and practice recommendations is accurate. However, it misses the broader context of its purpose.
    - Factual Correctness: The details provided about the breathing technique are factually correct, but the omission of its primary purpose (relaxation and aiding sleep) detracts from its overall correctness in context.
    - Completeness: The response provides a good description of how to perform the technique but omits critical information about its intended benefits, 

Evaluating responses:  36%|███▌      | 36/100 [01:54<03:58,  3.73s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the influence of family history on mental health, aligning with the query's intent.
    - Semantic Similarity: The response captures the essence of the reference answer but does not convey the same succinctness and directness, making it somewhat less similar in meaning.
    - Accuracy: The information presented is accurate; it correctly identifies the link between family history and mental health conditions.
    - Factual Correctness: The response is factually correct, as it reflects a well-established understanding of how family history can impact mental health.
    - Completeness: The response is somewhat complete but lacks depth; it could benefit from further elaboration on how family history influences mental health.
    - Clarity: The response is clear and understandable, though it could be more concise.
    - Conciseness: The response is slightly verbose; it could be more succin

Evaluating responses:  37%|███▋      | 37/100 [01:57<03:41,  3.51s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it directly addresses how phobias affect daily life, providing specific examples that illustrate the impact of phobias on routines and social interactions.
    - Semantic Similarity: The response captures the essence of the reference answer, expanding on the idea of avoidance but maintaining the core meaning regarding the limitations phobias impose on daily activities.
    - Accuracy: The information presented is accurate, reflecting a correct understanding of how phobias influence behavior and lifestyle choices.
    - Factual Correctness: The examples given are logically sound and consistent with common knowledge about phobias and their effects on individuals.
    - Completeness: The response is complete, covering various aspects of how phobias can affect different areas of life, including social interactions and quality of life.
    - Clarity: The response is clear and easy to understand,

Evaluating responses:  38%|███▊      | 38/100 [02:00<03:24,  3.30s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses how NCCA temperament therapy can benefit individuals, aligning with the query's intent.
    - Semantic Similarity: The response captures the essence of the reference answer but is more detailed. It conveys a similar meaning regarding self-acceptance and personal growth.
    - Accuracy: The information presented about NCCA temperament therapy aligns with general understanding, but it could be more explicitly connected to specific benefits.
    - Factual Correctness: The claims made in the response appear to be factually correct, as they align with common interpretations of temperament therapy.
    - Completeness: The response provides a good overview of the benefits but could include more specific examples or outcomes to enhance completeness.
    - Clarity: The response is clear and easy to understand, effectively communicating the main ideas without confusion.
    - Concisene

Evaluating responses:  39%|███▉      | 39/100 [02:03<03:17,  3.24s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it discusses the connection between physical health and depression, addressing the query effectively.
    - Semantic Similarity: The response captures some aspects of the reference answer, particularly the idea that physical health can influence depression, but it lacks the specific mention of "physical structures" not functioning well.
    - Accuracy: The information provided is generally accurate, noting that trauma, life changes, stress, and certain illnesses can affect mental health.
    - Factual Correctness: The response correctly identifies factors that may contribute to depression, aligning with established understanding in psychology and medicine.
    - Completeness: While the response covers several factors linking physical health to depression, it could be more complete by discussing how addressing physical health can alleviate symptoms, as mentioned in the reference answer.
    - Clari

Evaluating responses:  40%|████      | 40/100 [02:06<02:58,  2.98s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses how lack of sunlight contributes to Seasonal Affective Disorder (SAD).
    - Semantic Similarity: The response captures the essence of the reference answer but does not explicitly mention melatonin, which is a key component in the reference.
    - Accuracy: The information about circadian rhythms and serotonin is accurate and relevant to the question about SAD.
    - Factual Correctness: The facts presented are consistent with established knowledge about the effects of sunlight on mood and sleep.
    - Completeness: The response lacks completeness as it does not mention melatonin, which is an important aspect of the relationship between sunlight and mood.
    - Clarity: The response is clear and easy to understand, though it could be slightly more precise in its wording.
    - Conciseness: The response is concise and gets to the point without unnecessary details.

### Overall Scores:


Evaluating responses:  41%|████      | 41/100 [02:08<02:47,  2.84s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses social withdrawal and its implications for youth, but it goes beyond the specific concern of why it is a concern, which could be more directly stated.
    - Semantic Similarity: The response captures some of the meaning of the reference answer, particularly the emotional aspects of social withdrawal, but it does not directly reflect the idea of hopelessness or distress.
    - Accuracy: The information provided is accurate in describing social withdrawal as a coping mechanism and the potential underlying causes, aligning well with psychological understanding.
    - Factual Correctness: The facts presented are consistent with psychological principles regarding social withdrawal, making the response factually correct.
    - Completeness: The response is relatively complete, addressing various aspects of social withdrawal, but it could be more focused on why it is a concern rather than ju

Evaluating responses:  42%|████▏     | 42/100 [02:10<02:32,  2.63s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the treatment of ADHD, which is the focus of the query.
    - Semantic Similarity: The response captures the intent of the reference answer but lacks some specific terms like "behavior management strategies," which are important in the context of ADHD treatment.
    - Accuracy: The information provided is accurate, as it lists valid treatment options for ADHD.
    - Factual Correctness: The facts presented align with common knowledge about ADHD treatment options, ensuring consistency with reliable sources.
    - Completeness: The response is somewhat incomplete; while it mentions various treatment methods, it does not detail behavior management strategies, which are significant in ADHD treatment.
    - Clarity: The response is clear and understandable, making it easy for the reader to grasp the treatment options mentioned.
    - Conciseness: The response is concise, providing necessar

Evaluating responses:  43%|████▎     | 43/100 [02:12<02:13,  2.34s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the importance of early treatment for schizophrenia directly.
    - Semantic Similarity: The meaning is similar to the reference answer, though it could be more closely aligned in terms of phrasing and structure.
    - Accuracy: The information provided is accurate regarding the benefits of early treatment for schizophrenia.
    - Factual Correctness: The facts presented are consistent with general knowledge about schizophrenia and early intervention.
    - Completeness: The response is somewhat complete but lacks specific details on how early treatment improves recovery and quality of life.
    - Clarity: The response is clear and easy to understand, effectively communicating the main idea.
    - Conciseness: The response is concise, but it could be slightly more detailed without losing brevity.

### Overall Scores:
- RAG Overall Score: 7


Evaluating responses:  44%|████▍     | 44/100 [02:14<02:10,  2.33s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by listing signs of binge eating disorder, which is directly relevant to the question asked.
    - Semantic Similarity: The response captures the meaning of the reference answer well, though it includes additional details that are not present in the reference.
    - Accuracy: The information provided about binge eating disorder is accurate and aligns with established understanding of the disorder.
    - Factual Correctness: The facts presented are consistent with reliable sources regarding binge eating disorder and accurately describe its signs.
    - Completeness: The response is complete as it covers several signs of binge eating disorder, but it could have been more concise by omitting some repetitive elements.
    - Clarity: The response is clear and easy to understand, with no ambiguous language.
    - Conciseness: The response is somewhat lengthy and could be more concise; s

Evaluating responses:  45%|████▌     | 45/100 [02:17<02:07,  2.32s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the impact of binge eating disorder on mental health, aligning well with the query.
    - Semantic Similarity: The response captures the essence of the reference answer, mentioning feelings of guilt and shame, although it elaborates further with additional details.
    - Accuracy: The information presented is accurate and logically sound regarding the mental health implications of binge eating disorder.
    - Factual Correctness: The facts are consistent with what is known about binge eating disorder and its effects on mental health, aligning with recognized psychological impacts.
    - Completeness: The response is thorough, covering various aspects of how binge eating disorder affects mental health, including feelings of depression and anxiety, which enhances its completeness.
    - Clarity: The response is clear and easy to understand, with straightforward language that ef

Evaluating responses:  46%|████▌     | 46/100 [02:24<03:25,  3.81s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response directly addresses the query about the goal of the Garrett Lee Smith Campus program, highlighting its focus on mental health services for students, which is relevant to the question.
    - Semantic Similarity: The response captures the essence of the reference answer, emphasizing the enhancement of mental health services and the reduction of suicide risks, though it elaborates more extensively.
    - Accuracy: The information provided is accurate and aligns with the known objectives of the Garrett Lee Smith Campus program, particularly regarding mental health and suicide prevention.
    - Factual Correctness: The details about supporting students at risk for various mental health issues are factually correct and consistent with the program's goals.
    - Completeness: The response is comprehensive, covering multiple facets of the program's goals, including various mental health conditions and promoting help-seek

Evaluating responses:  47%|████▋     | 47/100 [02:26<02:56,  3.33s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by detailing how obsessive-compulsive disorder (OCD) manifests through behaviors and emotional responses, making it relevant to the question asked.
    - Semantic Similarity: The RAG response captures the essence of the reference answer by mentioning both obsessions and compulsions, although it does not explicitly mention "obsessions" as the reference does.
    - Accuracy: The information provided about compulsive behaviors and emotional responses is accurate and aligns with common understandings of OCD.
    - Factual Correctness: The facts presented are consistent with reliable sources on OCD and accurately reflect the disorder's symptoms.
    - Completeness: The response covers several manifestations of OCD, including both compulsions and emotional responses, making it more comprehensive than the reference answer.
    - Clarity: The response is clear and straightforward, making 

Evaluating responses:  48%|████▊     | 48/100 [02:29<02:42,  3.12s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the query about resources for self-harm prevention and provides specific examples. However, it could more explicitly connect these resources to the prevention of self-harm rather than just listing them.
    - Semantic Similarity: The response does not capture the essence of the reference answer, which focuses on specialized programs and crisis helplines. The RAG response is broader and less aligned with the specific intent of the reference.
    - Accuracy: The information provided is accurate and aligns with known resources for self-harm prevention. However, it lacks a direct mention of crisis helplines, which are crucial for immediate support.
    - Factual Correctness: The facts presented are correct and sourced from reputable organizations, making the information reliable.
    - Completeness: The response is fairly complete, listing various resources; however, it misses mentioning 

Evaluating responses:  49%|████▉     | 49/100 [02:31<02:31,  2.97s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by discussing the community's role in suicide prevention, highlighting various community sectors involved, which is pertinent to the question.
    - Semantic Similarity: The response captures the essence of the reference answer, emphasizing community support and resources, although it expands on this idea in more detail.
    - Accuracy: The information provided is accurate, as it reflects the collaborative nature of suicide prevention within communities.
    - Factual Correctness: The response mentions organizations like SAMHSA's Suicide Prevention Resource Center, which is a reliable source, thus maintaining factual correctness.
    - Completeness: The response covers multiple aspects of community responsibility in suicide prevention, making it more complete than the reference answer, which is quite brief.
    - Clarity: The response is clear and easy to understand, presenting id

Evaluating responses:  50%|█████     | 50/100 [02:34<02:20,  2.82s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the symptoms of schizophrenia, which is the main focus of the query.
    - Semantic Similarity: The RAG response captures the essence of the reference answer, mentioning similar symptoms, but it includes additional details that slightly diverge from the succinctness of the reference.
    - Accuracy: The response accurately describes symptoms associated with schizophrenia, aligning well with established knowledge on the disorder.
    - Factual Correctness: The information presented is consistent with reliable sources regarding schizophrenia symptoms, confirming its factual correctness.
    - Completeness: The response is complete, covering multiple symptoms and elaborating on them, which adds depth beyond the reference answer.
    - Clarity: The response is clear and easy to understand, with well-organized points that enhance comprehension.
    - Conciseness: While the respons

Evaluating responses:  51%|█████     | 51/100 [02:36<02:08,  2.63s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses how family members can support a loved one with schizophrenia by discussing emotional support, education, and connecting to professional help.
    - Semantic Similarity: The response captures the essence of the reference answer regarding emotional support and treatment adherence, but it expands on this with additional details.
    - Accuracy: The information provided is accurate and logically sound, discussing appropriate ways family members can assist their loved one.
    - Factual Correctness: The response presents facts that are consistent with general understanding and knowledge about supporting individuals with schizophrenia.
    - Completeness: The response is fairly complete, covering multiple aspects of support, including emotional support, education, and practical assistance.
    - Clarity: The response is clear and easy to understand, with straightforward language that conve

Evaluating responses:  52%|█████▏    | 52/100 [02:39<02:14,  2.81s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the query about treatments for eating disorders, mentioning several types of treatment. However, it includes some information that may not be directly related to the core treatments.
    - Semantic Similarity: The response captures the general intent of the reference answer but introduces additional concepts that diverge from the simplicity of the reference, making it less semantically similar.
    - Accuracy: The information provided about treatment options is generally accurate, though it introduces some concepts (like behavioral analysis and DBT) that are more specific and may not be universally applicable to all eating disorders.
    - Factual Correctness: The response is factually correct regarding the types of treatments mentioned; however, it could mislead readers by implying that behavioral analysis and DBT are essential for all cases.
    - Completeness: The response is more 

Evaluating responses:  53%|█████▎    | 53/100 [02:43<02:20,  2.99s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it directly addresses how panic disorder affects relationships, discussing avoidance behaviors and the impact of panic attacks.
    - Semantic Similarity: The response captures the essence of the reference answer, which mentions tension and avoidance, while expanding on these points, thus showing good semantic similarity.
    - Accuracy: The information presented is accurate regarding the effects of panic disorder on relationships, aligning with established understanding of the condition.
    - Factual Correctness: The response is factually correct, as it describes the impact of panic disorder on relationships based on psychological insights.
    - Completeness: The response is complete, covering multiple aspects of how panic disorder can affect relationships, including avoidance behaviors and the importance of communication.
    - Clarity: The response is clear and easy to understand, with

Evaluating responses:  54%|█████▍    | 54/100 [02:46<02:20,  3.05s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Semantic Similarity: The RAG response captures some key ideas from the reference answer, such as listening without judgment and staying with the person, but it introduces additional elements that diverge from the core intent.
    - Accuracy: The information provided is accurate and aligns with general advice on how to support someone in crisis. 
    - Completeness: The response is somewhat complete but lacks the directness of the reference answer. It includes some helpful advice but could benefit from more specific actions like asking about suicidal thoughts.
    - Clarity: The response is clear and easy to understand, with straightforward language.
    - Conciseness: The response is somewhat verbose. While it conveys important information, it includes some redundancy, such as the mention of self-care, which could be stated more succinctly.

### Overall Scores:
- RAG Overall Score: 7


Evaluating responses:  55%|█████▌    | 55/100 [02:50<02:30,  3.35s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses how parents should approach mental health conversations, providing practical suggestions for communication.
    - Semantic Similarity: The response captures the essence of the reference answer by emphasizing the importance of creating a safe environment and using appropriate language, although it elaborates more than the reference.
    - Accuracy: The suggestions provided are accurate and logically sound for the context of discussing mental health with children.
    - Factual Correctness: The information presented is factually correct and aligns well with best practices for mental health conversations with children.
    - Completeness: The response is complete, covering various aspects such as communication style, emotional awareness, and listening, which are essential in mental health discussions.
    - Clarity: The response is clear and easy to understand, with straightforward langu

Evaluating responses:  56%|█████▌    | 56/100 [02:56<03:05,  4.21s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about treatments for phobias by focusing on exposure therapy, which is a relevant treatment option. However, it could have mentioned cognitive-behavioral techniques as well, which are also important treatments.
    - Semantic Similarity: The response captures the intent of the reference answer but lacks the mention of cognitive-behavioral techniques, which reduces its semantic similarity to the reference.
    - Accuracy: The information provided about exposure therapy is accurate, as it is a well-established treatment for phobias.
    - Factual Correctness: The response is factually correct regarding exposure therapy and the need for a trained therapist, aligning with established psychological practices.
    - Completeness: The response is somewhat complete but could be improved by including other effective treatments, like cognitive-behavioral therapy, to provide a more rounded v

Evaluating responses:  57%|█████▋    | 57/100 [03:01<03:03,  4.27s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about the purpose of the Native Connections program, focusing on prevention and mental health, which is directly relevant.
    - Semantic Similarity: The response captures the intent of the reference answer well, although it includes additional details not present in the reference.
    - Accuracy: The information provided is accurate regarding the program's goals and target demographic.
    - Factual Correctness: The facts presented align with known objectives of the Native Connections program, confirming the response's reliability.
    - Completeness: While the response is more detailed than the reference answer, it covers all essential aspects of the program's purpose, making it complete.
    - Clarity: The response is clear and straightforward, making it easy to understand the program's objectives.
    - Conciseness: The response is slightly longer due to the additional details

Evaluating responses:  58%|█████▊    | 58/100 [03:03<02:38,  3.78s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by distinguishing between social anxiety disorder and shyness, thus meeting the intent of the question.
    - Semantic Similarity: The response captures the essence of the reference answer, highlighting the persistent fear associated with social anxiety disorder, but it does not directly mirror the phrasing.
    - Accuracy: The information provided about social anxiety disorder is accurate, emphasizing the debilitating fear aspect, which is a correct representation of the condition.
    - Factual Correctness: The response is factually correct, as it aligns with the established understanding of social anxiety disorder and its distinction from shyness.
    - Completeness: While the response covers the essential differences, it could be more concise in defining shyness, which is somewhat vague in the context provided.
    - Clarity: The response is generally clear, but the explanatio

Evaluating responses:  59%|█████▉    | 59/100 [03:06<02:18,  3.38s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by explaining why consistent treatment is important for individuals with schizophrenia, thus meeting the intent of the question.
    - Semantic Similarity: The response captures the essence of the reference answer by highlighting symptom management and relapse prevention, although it expands on these points with additional details.
    - Accuracy: The information provided is accurate and aligns with common understanding of the treatment of schizophrenia.
    - Factual Correctness: The facts presented are consistent with established knowledge about schizophrenia treatment, making the response factually correct.
    - Completeness: The response is comprehensive, covering various aspects of why consistent treatment is important, including symptom management and coping strategies.
    - Clarity: The response is clearly articulated and easy to understand, with no ambiguous language.
  

Evaluating responses:  60%|██████    | 60/100 [03:08<02:08,  3.20s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the query about triggers for eating disorders, mentioning several factors that can contribute to these disorders.
    - Semantic Similarity: The response captures the general meaning of the reference answer but lacks the inclusion of societal pressures, which is a significant aspect mentioned in the reference.
    - Accuracy: The information provided is accurate in identifying genetic factors and co-morbid mental disorders as triggers for eating disorders.
    - Factual Correctness: The facts presented align with common knowledge about eating disorders and are consistent with reliable sources.
    - Completeness: The response is somewhat incomplete as it omits societal pressures, which are a key trigger mentioned in the reference answer.
    - Clarity: The response is clear and easy to understand, effectively communicating the identified triggers.
    - Conciseness: The response is co

Evaluating responses:  61%|██████    | 61/100 [03:11<01:53,  2.91s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response addresses the query by listing factors that can trigger generalized anxiety disorder, which is relevant to the question.
    - Semantic Similarity: The response captures the essence of the reference answer but introduces additional factors that are not mentioned in the reference, showing a moderate level of similarity.
    - Accuracy: The factors mentioned (genetics, brain function, personality, development, perception of threats) are generally accurate but may not all be considered direct triggers in a clinical sense.
    - Factual Correctness: The information provided is mostly correct; however, "one’s perception of threats" may not be universally recognized as a trigger in the same way as environmental stressors.
    - Completeness: The response is somewhat complete but lacks mention of environmental stressors, which are significant triggers according to the reference answer.
    - Clarity: The response is cl

Evaluating responses:  62%|██████▏   | 62/100 [03:13<01:46,  2.81s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the query about NCCA temperament therapy, although it could be more focused on the primary aspects.
    - Semantic Similarity: The response captures the intent of the reference answer but adds additional context that is not present in the reference, which slightly alters its alignment.
    - Accuracy: The information provided about understanding and managing one's temperament is generally accurate but introduces a religious aspect that may not be universally applicable.
    - Factual Correctness: The reference to "God-given temperament" may not be factually incorrect but could misrepresent the broader understanding of temperament therapy, which is not exclusively religious.
    - Completeness: The response includes additional context that, while informative, may not be necessary for a basic understanding of NCCA temperament therapy, thus impacting its completeness.
    - Clarity: The 

Evaluating responses:  63%|██████▎   | 63/100 [03:16<01:40,  2.71s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses effective interventions for children facing mental health issues, aligning with the query's intent.
    - Semantic Similarity: The response captures some of the main ideas of the reference answer, but it introduces additional concepts (like early intervention and the role of educators) that are not present in the reference, leading to a lower similarity.
    - Accuracy: The information provided is accurate and reflects common practices in addressing children's mental health issues.
    - Factual Correctness: The facts presented are consistent with known approaches to mental health interventions for children, thus maintaining factual correctness.
    - Completeness: The response is more comprehensive than the reference answer, covering a broader range of interventions and considerations.
    - Clarity: The response is clear and easy to understand, making it accessible to a wide audienc

Evaluating responses:  64%|██████▍   | 64/100 [03:21<02:05,  3.48s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the importance of observing behavior changes in children, focusing on stress indicators and the need for professional help.
    - Semantic Similarity: The response captures the intent of the reference answer but expands on it significantly, which may lead to a lower similarity score due to the added details.
    - Accuracy: The information provided is accurate regarding the implications of behavior changes in children and the need for intervention.
    - Factual Correctness: The facts presented about stressors and the potential need for professional assistance are consistent with established knowledge in child psychology.
    - Completeness: The response is quite complete, offering a broader perspective on the reasons for observing behavior changes, though it diverges from the succinctness of the reference answer.
    - Clarity: The response is clear and understandable, effectively co

Evaluating responses:  65%|██████▌   | 65/100 [03:25<02:06,  3.61s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the factors contributing to mental health conditions directly, aligning well with the query.
    - Semantic Similarity: The response captures the essential meaning of the reference answer, though the phrasing is slightly different. It effectively conveys the same ideas.
    - Accuracy: The information provided is accurate and logically sound, aligning with established knowledge about mental health conditions.
    - Factual Correctness: The facts presented are consistent with reliable sources and the reference answer, demonstrating a correct understanding of the topic.
    - Completeness: The response covers the main factors contributing to mental health conditions but could potentially expand on the types of life experiences or provide examples for better depth.
    - Clarity: The response is clearly stated and easy to understand, with straightforward language that conveys the intende

Evaluating responses:  66%|██████▌   | 66/100 [03:28<01:53,  3.33s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about how early and consistent treatment helps people with mental health conditions, specifically mentioning medication and psychotherapy, which are key components of treatment.
    - Semantic Similarity: The response captures the core meaning of the reference answer, emphasizing management of symptoms and leading a meaningful life, though it includes additional detail.
    - Accuracy: The information presented is accurate regarding the benefits of early and consistent treatment for mental health conditions.
    - Factual Correctness: The response is factually correct, aligning well with established knowledge about mental health treatment.
    - Completeness: While the response covers essential aspects of the query, it could benefit from more elaboration on the specific ways treatment helps individuals beyond just managing symptoms.
    - Clarity: The response is clear and easy to

Evaluating responses:  67%|██████▋   | 67/100 [03:37<02:50,  5.16s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the common causes of depression in Seasonal Affective Disorder (SAD) by mentioning lack of sunlight, which is a primary factor.
    - Semantic Similarity: The response captures the essence of the reference answer, focusing on the seasonal changes and their impact on mood, although it uses different phrasing.
    - Accuracy: The information provided about the lack of sunlight affecting circadian rhythms and serotonin levels is accurate and relevant to SAD.
    - Factual Correctness: The facts presented are consistent with established knowledge about SAD and its triggers, particularly the role of sunlight.
    - Completeness: The response could be considered somewhat incomplete as it does not mention the specific seasons (fall and winter) that are typically associated with SAD, which is a key detail in the reference answer.
    - Clarity: The response is clear and straightforward, makin

Evaluating responses:  68%|██████▊   | 68/100 [03:40<02:20,  4.38s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by explaining what makes social anxiety disorder unique, focusing on the fear of judgment and its impact on daily life.
    - Semantic Similarity: The response captures the essence of the reference answer, as both emphasize the persistent fear of judgment and its effects on social and work life, though it is more detailed.
    - Accuracy: The information provided is accurate regarding the nature of social anxiety disorder and its implications for individuals.
    - Factual Correctness: The facts presented align with established understandings of social anxiety disorder and are consistent with reliable sources.
    - Completeness: While the response covers key aspects of social anxiety disorder, it could be more complete by explicitly mentioning the persistent nature of the fear, which is a critical component.
    - Clarity: The response is clear and easy to understand, effectively

Evaluating responses:  69%|██████▉   | 69/100 [03:43<02:05,  4.06s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it directly addresses how the 988 Suicide & Crisis Lifeline supports individuals in crisis by outlining the services provided.
    - Semantic Similarity: The response captures the meaning of the reference answer well, although it elaborates more on the services offered, which slightly diverges from the brevity of the reference.
    - Accuracy: The information presented is accurate and aligns with the known functions of the 988 Lifeline.
    - Factual Correctness: The response contains factually correct details about the Lifeline's services, such as the availability of trained counselors and various communication methods.
    - Completeness: The response is complete as it covers multiple aspects of the Lifeline's support, including emotional support and connection to local resources, which enhances the understanding of its role.
    - Clarity: The response is clear and easy to understand, wi

Evaluating responses:  70%|███████   | 70/100 [03:46<01:54,  3.82s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses lifestyle changes for managing Seasonal Affective Disorder (SAD), aligning well with the query's intent.
    - Semantic Similarity: The response does not capture the essence of the reference answer, which emphasizes exposure to natural light and sleep consistency, while the RAG response focuses on a broader range of lifestyle changes.
    - Accuracy: The information provided is accurate and aligns with common recommendations for managing SAD, such as nutrition and exercise.
    - Factual Correctness: The suggestions made in the RAG response are factually correct and supported by general knowledge about managing SAD.
    - Completeness: The response is somewhat complete, covering various aspects of lifestyle changes, but it omits the critical elements of light exposure and sleep consistency highlighted in the reference answer.
    - Clarity: The response is clear and easy to understand

Evaluating responses:  71%|███████   | 71/100 [03:49<01:39,  3.44s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the causes of self-harm behaviors, mentioning emotional distress and coping mechanisms, which aligns with the query.
    - Semantic Similarity: The response captures the essence of the reference answer by discussing emotional distress and coping, although it expands on these concepts more than the reference answer.
    - Accuracy: The information provided is accurate, discussing the typical onset of self-harm and the emotional reasons behind it, which aligns with common psychological understandings.
    - Factual Correctness: The facts presented are consistent with reliable sources on self-harm behaviors, particularly regarding the age of onset and emotional motivations.
    - Completeness: The response is more complete than the reference answer, providing additional context about the behaviors and feelings associated with self-harm, though it may include slightly more detail than nec

Evaluating responses:  72%|███████▏  | 72/100 [03:57<02:14,  4.80s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by listing behaviors that indicate self-harm, making it highly relevant.
    - Semantic Similarity: The response captures the intent of the reference answer well, as both mention similar behaviors associated with self-harm, though the RAG response provides more detail.
    - Accuracy: The behaviors listed in the RAG response are accurate and commonly recognized indicators of self-harm.
    - Factual Correctness: The information presented aligns with established understanding of self-harm behaviors, maintaining factual correctness.
    - Completeness: The response is comprehensive, covering a wide range of behaviors and including associated physical symptoms, which adds depth to the answer.
    - Clarity: The response is clear and easy to understand, with a straightforward presentation of information.
    - Conciseness: While the response is somewhat lengthy, it remains focused on 

Evaluating responses:  73%|███████▎  | 73/100 [04:02<02:10,  4.83s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it directly addresses the symptoms of depression in individuals with Seasonal Affective Disorder (SAD), aligning well with the query.
    - Semantic Similarity: The response captures the meaning of the reference answer effectively, although it includes additional symptoms that are not mentioned in the reference.
    - Accuracy: The information provided is accurate regarding the symptoms associated with SAD and aligns with established knowledge on the topic.
    - Factual Correctness: The facts presented are consistent with reliable sources and are factually correct concerning the symptoms of SAD.
    - Completeness: The response is complete, listing several symptoms that encompass the range of experiences individuals with SAD may face, surpassing the brevity of the reference answer.
    - Clarity: The response is clear and well-structured, making it easy to understand the symptoms listed.
 

Evaluating responses:  74%|███████▍  | 74/100 [04:05<01:56,  4.48s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Factual Correctness: The facts provided are consistent with reliable sources regarding mental health and suicide prevention.
    - Conciseness: While the response is thorough, it could be considered slightly lengthy; however, the additional details do not detract significantly from its overall effectiveness.

### Overall Scores:
- RAG Overall Score: 9


Evaluating responses:  75%|███████▌  | 75/100 [04:08<01:38,  3.92s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query about what 'mental health' encompasses by providing relevant aspects of mental health.
    - Semantic Similarity: The response captures the essence of the reference answer, discussing emotional, psychological, and social well-being, though it adds more detail.
    - Accuracy: The information presented is accurate and logically sound, aligning with established definitions of mental health.
    - Factual Correctness: The facts are consistent with reliable sources and the reference answer, providing a correct understanding of the term.
    - Completeness: The response is more complete than the reference answer as it elaborates on how mental health affects behavior and decision-making, covering additional relevant aspects.
    - Clarity: The response is clear and straightforward, making it easy to understand the concept of mental health.
    - Conciseness: While the response is slight

Evaluating responses:  76%|███████▌  | 76/100 [04:11<01:28,  3.69s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses how therapy aids individuals in understanding their thought processes, specifically mentioning Cognitive Behavioral Therapy (CBT).
    - Semantic Similarity: The response captures the essence of the reference answer by discussing the impact of thoughts on actions and interpreting challenges, although it is somewhat more detailed than the reference.
    - Accuracy: The information about CBT and its role in managing thoughts and feelings is accurate and aligns well with established therapeutic practices.
    - Factual Correctness: The facts presented regarding CBT and its function in therapy are consistent with reliable sources and are factually correct.
    - Completeness: The response provides a good amount of detail regarding CBT, but it could elaborate more on the general concept of therapy beyond just CBT to enhance completeness.
    - Clarity: The response is clear and ea

Evaluating responses:  77%|███████▋  | 77/100 [04:15<01:28,  3.85s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the query about overcoming self-harm tendencies by suggesting coping mechanisms and treatment options. However, it could be more directly focused on individual strategies rather than treatment facilities.
    - Semantic Similarity: The response captures the general intent of the reference answer but does not align closely in terms of phrasing. It elaborates more on treatment options rather than summarizing the key strategies as in the reference.
    - Accuracy: The information provided about therapy and hospitalization as treatment options is accurate. However, it lacks emphasis on personal coping mechanisms, which are crucial for overcoming self-harm tendencies.
    - Factual Correctness: The facts presented are consistent with common knowledge about self-harm treatment; however, the mention of when self-harm typically begins (teen or early adult years) may not be directly relevant t

Evaluating responses:  78%|███████▊  | 78/100 [04:19<01:27,  3.98s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses how the Native Connections initiative supports youth, specifically focusing on American Indian/Alaska Native youth.
    - Semantic Similarity: The response captures some of the meaning of the reference answer but lacks the emphasis on culturally tailored programs, which is a key aspect of the reference.
    - Accuracy: The information presented is accurate regarding the focus on suicide prevention and mental health for AI/AN youth.
    - Factual Correctness: The facts provided are consistent with known goals of the Native Connections initiative, ensuring factual correctness.
    - Completeness: The response is somewhat incomplete as it does not mention the culturally tailored programs that are crucial for understanding the initiative's support for youth.
    - Clarity: The response is clear and easy to understand, effectively conveying the main points without confusion.
    - Concisen

Evaluating responses:  79%|███████▉  | 79/100 [04:23<01:19,  3.79s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Semantic Similarity: The response captures some of the intent of the reference answer, particularly the notion of managing symptoms, but does not directly convey the idea of preventing worsening symptoms.
    - Accuracy: The information provided about the signs and symptoms of depression is accurate and aligns with common understanding, but it does not focus on the early intervention aspect as much as the reference answer does.
    - Factual Correctness: The mention of the DSM-V is factually correct, and the symptoms described are consistent with established knowledge about depression.
    - Completeness: While the response provides a good overview of symptoms, it lacks a direct mention of the benefits of early intervention, making it less complete compared to the reference answer.
    - Clarity: The response is generally clear and understandable, though some sentences could be simplified for better readability.
    - Conciseness: The 

Evaluating responses:  80%|████████  | 80/100 [04:28<01:26,  4.32s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the aim of the Zero Suicide framework, focusing on suicide prevention.
    - Semantic Similarity: The response captures the essence of the reference answer regarding the focus on reducing suicide within healthcare systems, though it adds more detail about the approach.
    - Accuracy: The information provided is accurate regarding the goals of the Zero Suicide framework and aligns with known principles of suicide prevention.
    - Factual Correctness: The mention of "health equity principles" and the target demographic (adults 18 and older) is consistent with the framework's objectives, thus factually correct.
    - Completeness: The response is somewhat complete, covering the framework's aim and approach; however, it could include more about the specific strategies employed within healthcare systems.
    - Clarity: The response is clear and straightforward, making it easy to

Evaluating responses:  81%|████████  | 81/100 [04:32<01:15,  3.99s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by explaining how increased substance use can signal suicide risk, making it relevant to the question.
    - Semantic Similarity: The response captures the essence of the reference answer by highlighting coping mechanisms and emotional pain, although it elaborates more extensively.
    - Accuracy: The information provided is accurate, linking substance use with mental health issues and suicide risk appropriately.
    - Factual Correctness: The facts presented are consistent with established knowledge about substance use and its relationship with mental health and suicide risk.
    - Completeness: The response is comprehensive, covering multiple aspects of how substance use can relate to suicide risk, which is more detailed than the reference answer.
    - Clarity: The response is clear and easy to understand, presenting ideas in a logical manner.
    - Conciseness: While the respo

Evaluating responses:  82%|████████▏ | 82/100 [04:35<01:06,  3.72s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Semantic Similarity: The response captures the essence of the reference answer, as it includes similar concepts such as changes in energy levels and withdrawal from activities, although it is more extensive.
    - Factual Correctness: The facts presented are consistent with established understanding of mental health issues and are logically sound.
    - Completeness: The response is thorough and covers a wide range of signs, providing more detail than the reference answer, which enhances completeness.
    - Clarity: The response is clear and easy to understand, with each sign presented in a straightforward manner.
    - Conciseness: While the response is comprehensive, it could be considered slightly lengthy. However, it does not contain unnecessary details, so it remains relatively concise.

### Overall Scores:
- RAG Overall Score: 9


Evaluating responses:  83%|████████▎ | 83/100 [04:38<01:02,  3.67s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses how caregivers can discuss mental health with children, focusing on communication strategies.
    - Semantic Similarity: The response captures the intent of the reference answer by emphasizing age-appropriate communication and a comfortable environment, but it expands on these ideas significantly.
    - Accuracy: The information provided is accurate and aligns with best practices for discussing mental health with children.
    - Factual Correctness: The response is factually correct, as it includes valid approaches for engaging children in conversations about their mental health.
    - Completeness: The response is quite complete, covering multiple aspects of effective communication, including the child's emotional state and the caregiver's role in listening.
    - Clarity: The response is clear and easy to understand, with straightforward language that is appropriate for the context.

Evaluating responses:  84%|████████▍ | 84/100 [04:42<00:58,  3.64s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by contrasting phobias with general anxiety, making it relevant to the question.
    - Semantic Similarity: The response captures the main ideas of the reference answer, maintaining the core distinctions between phobias and general anxiety, though it elaborates more.
    - Accuracy: The information presented is accurate, distinguishing between specific fears associated with phobias and the broader nature of general anxiety.
    - Factual Correctness: The details regarding the nature of phobias and general anxiety are factually correct and consistent with psychological understanding.
    - Completeness: The response is complete, covering key aspects such as the irrationality of phobias, the intensity of the fear response, and avoidance behaviors.
    - Clarity: The response is clear and well-structured, making it easy to follow and understand the differences between the two concept

Evaluating responses:  85%|████████▌ | 85/100 [04:47<01:03,  4.20s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it specifically addresses how the 988 Lifeline provides crisis support, aligning well with the query.
    - Semantic Similarity: The response captures the essence of the reference answer, though it provides more detail. It maintains the intent of connecting individuals to trained counselors.
    - Accuracy: The information presented is accurate, reflecting the Lifeline's services correctly.
    - Factual Correctness: The facts regarding the service being confidential, available 24/7, and the existence of specialized lines are correct and consistent with reliable sources.
    - Completeness: The response is complete as it covers various aspects of the Lifeline's support, including emotional support and resources for specific populations.
    - Clarity: The response is clear and easy to understand, with a logical flow of information.
    - Conciseness: While the response is somewhat lengthy, 

Evaluating responses:  86%|████████▌ | 86/100 [04:53<01:03,  4.56s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses coping techniques for social anxiety, aligning well with the query.
    - Semantic Similarity: The response captures the general intent of the reference answer, but it includes additional techniques that are not mentioned in the reference, leading to a lower similarity.
    - Accuracy: The techniques listed are accurate and valid methods for coping with social anxiety, which supports the response's credibility.
    - Factual Correctness: The information provided is factually correct and aligns with common psychological practices for managing social anxiety.
    - Completeness: The response is more comprehensive than the reference answer, covering a wider range of techniques, which enhances its completeness.
    - Clarity: The response is clear and easy to understand, with a logical flow of ideas that makes it accessible to readers.
    - Conciseness: While the response is thorough, it

Evaluating responses:  87%|████████▋ | 87/100 [04:59<01:04,  5.00s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response partially addresses the query by mentioning Dialectical Behavior Therapy (DBT) as a support option, but it fails to mention other support options like support groups or nutrition counseling, which are also relevant.
    - Semantic Similarity: The response does not capture the essence of the reference answer well. While it discusses a therapeutic approach, it does not align closely with the succinct nature of the reference answer.
    - Accuracy: The mention of DBT is accurate as a treatment option for binge eating disorder, but the response lacks a broader perspective on available support options.
    - Factual Correctness: The information about DBT is factually correct; however, it does not encompass the full range of support options that individuals with binge eating disorder might seek.
    - Completeness: The response is incomplete as it only focuses on DBT without acknowledging other important support mecha

Evaluating responses:  88%|████████▊ | 88/100 [05:02<00:51,  4.33s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by discussing the importance of early intervention in children's mental health, which is directly related to the question.
    - Semantic Similarity: The response captures the overall intent of the reference answer but does so in a more detailed manner. However, it does not align closely in phrasing.
    - Accuracy: The information provided is accurate, as it discusses the significance of early intervention in mental health, which is well-supported in research.
    - Factual Correctness: The facts presented are consistent with established knowledge about mental health in children, particularly regarding early signs and intervention.
    - Completeness: The response is thorough, covering multiple aspects of why early intervention is important, including the prevention of worsening conditions and promoting healthy development.
    - Clarity: The response is clear and easy to underst

Evaluating responses:  89%|████████▉ | 89/100 [05:07<00:52,  4.76s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the role of extreme mood swings in suicide risk, which directly relates to the query.
    - Semantic Similarity: The response captures the essence of the reference answer by indicating that mood swings can signify emotional distress, although it expands on the topic with additional details.
    - Accuracy: The information provided is accurate, highlighting the connection between mood swings and increased suicide risk effectively.
    - Factual Correctness: The response is factually correct, as it aligns with established knowledge regarding mood swings and their implications for mental health.
    - Clarity: The response is clear and easy to understand, presenting the information in a straightforward manner.
    - Conciseness: While the response is generally concise, it could be slightly more succinct by eliminating some repetitive phrases about seeking help.

### Overall Scores:
- RAG

Evaluating responses:  90%|█████████ | 90/100 [05:13<00:48,  4.89s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by discussing ways schools can prevent self-harm, which is the main focus of the question.
    - Semantic Similarity: While the RAG response expands on the topic more than the reference answer, it captures the intent of providing support and education, which aligns with the reference.
    - Accuracy: The information presented is accurate regarding school interventions for preventing self-harm, highlighting social and emotional development and mental health supports.
    - Factual Correctness: The response is factually correct, as it discusses recognized methods for intervention in schools that are commonly accepted in mental health discourse.
    - Clarity: The response is clear and well-structured, making it easy to understand the proposed strategies for prevention.
    - Conciseness: While the response is somewhat longer than the reference answer, it remains focused and does not

Evaluating responses:  91%|█████████ | 91/100 [05:15<00:38,  4.30s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses the differences between SAD and general depression, which is the core of the query.
    - Semantic Similarity: The response captures the essence of the reference answer, discussing seasonal aspects of SAD and contrasting them with general depression, though it expands on details not present in the reference.
    - Accuracy: The information provided is accurate, correctly identifying the seasonal nature of SAD and the various factors that can contribute to general depression.
    - Factual Correctness: The facts presented are consistent with established knowledge about SAD and depression, making the response factually correct.
    - Completeness: The response is thorough, covering multiple aspects of the differences between SAD and general depression, which adds depth to the answer compared to the reference.
    - Clarity: The response is clear and easy to understand, with wel

Evaluating responses:  92%|█████████▏| 92/100 [05:18<00:31,  3.92s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it directly addresses practices that can improve emotional and spiritual well-being, which aligns with the query.
    - Semantic Similarity: The RAG response captures the intent of the reference answer by discussing practices for emotional and spiritual support, though it elaborates more than the reference.
    - Accuracy: The information provided is accurate and logically sound, discussing well-known practices that contribute to emotional and spiritual well-being.
    - Factual Correctness: The practices mentioned, such as yoga and meditation, are widely recognized for their benefits, making the response factually correct.
    - Completeness: The response is comprehensive, covering a range of practices that can enhance emotional and spiritual well-being without omitting critical information.
    - Clarity: The response is clear and easy to understand, with straightforward language that effectivel

Evaluating responses:  93%|█████████▎| 93/100 [05:22<00:26,  3.72s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses psychological factors contributing to eating disorders, which directly relates to the query.
    - Semantic Similarity: There is a moderate level of semantic similarity with the reference answer, as both mention negative self-image and co-existing mental illnesses, though the RAG response is more expansive.
    - Accuracy: The information provided is generally accurate, listing recognized psychological factors associated with eating disorders.
    - Factual Correctness: The factors mentioned in the RAG response are factually correct and align with established knowledge about eating disorders.
    - Completeness: The response is quite comprehensive, listing multiple psychological factors, which adds depth beyond the reference answer.
    - Clarity: The response is clear and structured well, making it easy to follow the listed factors.
    - Conciseness: While the response is detailed, 

Evaluating responses:  94%|█████████▍| 94/100 [05:25<00:21,  3.53s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by listing several support strategies for individuals with schizophrenia, making it relevant to the question asked.
    - Semantic Similarity: While the response does not directly mirror the reference answer, it captures the essence of providing support strategies, albeit in a more detailed manner.
    - Accuracy: The information provided in the response is generally accurate and aligns with common support strategies for schizophrenia.
    - Factual Correctness: The strategies mentioned, such as sticking to a treatment plan and involving family, are consistent with established practices for managing schizophrenia, indicating factual correctness.
    - Completeness: The response is comprehensive, covering multiple strategies that can benefit individuals with schizophrenia, which goes beyond the brevity of the reference answer.
    - Clarity: The response is clear and easy to unders

Evaluating responses:  95%|█████████▌| 95/100 [05:29<00:19,  3.84s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses behaviors that indicate a child's mental health issues, which is directly related to the query about steps caregivers should take. However, it lacks a broader scope of actions caregivers should consider.
    - Semantic Similarity: The response captures some of the intent of the reference answer but does not align closely in terms of providing a comprehensive approach. The reference answer emphasizes consulting healthcare providers and maintaining communication, which the RAG response does not address.
    - Accuracy: The behavioral indicators provided are accurate and relevant to mental health concerns in children. However, the response does not specify that these behaviors should prompt caregivers to take action, which may lead to some ambiguity.
    - Factual Correctness: The facts presented about specific behaviors are consistent with recognized signs of mental health issues in chi

Evaluating responses:  96%|█████████▌| 96/100 [05:33<00:14,  3.70s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response addresses the query by indicating the impact of untreated OCD on daily life, but it lacks specific details or examples that would enhance its relevance.
    - Semantic Similarity: The response captures the general idea of the reference answer but does not convey the same depth of meaning or specific information regarding compulsions and quality of life.
    - Accuracy: The statement is accurate in suggesting that untreated OCD can significantly affect life, but it is overly simplistic and does not elaborate on how it manifests.
    - Factual Correctness: The statement is factually correct in its claim that OCD can take over one's life, but it does not provide sufficient context or details to fully inform the reader.
    - Completeness: The response is incomplete as it does not mention the specific effects of OCD, such as compulsions or the reduction in quality of life, which are critical aspects of the query.
  

Evaluating responses:  97%|█████████▋| 97/100 [05:35<00:10,  3.40s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response effectively addresses the query by providing specific signs for parents to look out for regarding mental health concerns in children. It is directly relevant to the question.
    - Semantic Similarity: The RAG response captures the essence of the reference answer by mentioning signs of mental health issues, but it elaborates more than the reference, which may lead to some divergence in phrasing.
    - Factual Correctness: The information provided is factually correct and reflects widely recognized signs of mental health concerns, consistent with reliable sources.
    - Completeness: The response is thorough, covering a variety of behaviors that indicate potential mental health issues, thus providing a comprehensive view.
    - Clarity: The response is clear and easy to understand, with each sign presented in a straightforward manner.
    - Conciseness: While the response is detailed, it is somewhat lengthy compa

Evaluating responses:  98%|█████████▊| 98/100 [05:38<00:06,  3.09s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response addresses the query by discussing the causes of social anxiety disorder, which is relevant to the question asked.
    - Semantic Similarity: The response captures the intent of the reference answer but does not align closely in phrasing or completeness, as it introduces additional factors.
    - Accuracy: The information provided is generally accurate, mentioning genetics and learned behavior as potential causes.
    - Factual Correctness: The facts are consistent with known information about social anxiety disorder, but the phrase "not fully known" could imply uncertainty that is not as definitive as the reference answer.
    - Completeness: The response lacks completeness as it does not succinctly summarize the primary causes, focusing instead on a broader explanation that includes learned behavior.
    - Clarity: The response is clear but could be more straightforward in its explanation of the causes without 

Evaluating responses:  99%|█████████▉| 99/100 [05:41<00:02,  2.99s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is highly relevant as it directly addresses the signs that indicate a child may need mental health support, which aligns well with the query.
    - Semantic Similarity: The RAG response captures the essence of the reference answer but goes into more detail. While it does not match the brevity of the reference, it conveys similar concepts of mood swings and behavioral changes.
    - Accuracy: The information presented in the response is accurate and aligns with common signs of mental health issues in children.
    - Factual Correctness: The facts provided are consistent with reliable sources regarding mental health indicators in children, confirming the response's reliability.
    - Completeness: The response is quite complete, listing multiple signs of mental health issues, which provides a broader understanding than the reference answer.
    - Clarity: The response is clear and understandable, although it could

Evaluating responses: 100%|██████████| 100/100 [05:43<00:00,  3.43s/sample]

Evaluation Output:
Feedback:
- RAG Response:
    - Relevance: The response is relevant as it addresses the query regarding biological factors contributing to mental health conditions.
    - Semantic Similarity: The response captures the main idea of the reference answer, though it does not use the same phrasing.
    - Accuracy: The information about genes and brain chemistry is accurate and relevant to mental health conditions.
    - Factual Correctness: The facts presented are consistent with the reference answer and align with established knowledge in the field.
    - Completeness: The response is somewhat limited as it only mentions two factors (genes and brain chemistry) without elaborating or including other potential biological factors.
    - Clarity: The response is clear and easy to understand, presenting the information in a straightforward manner.
    - Conciseness: The response is concise and avoids unnecessary details, effectively conveying the main points.

### Overall Sco




In [None]:
import json

# Write the collected `evaluations` object to a JSON file in the working
# directory, pretty-printed with a 4-space indent.
with open("evaluation_results_rag_1000_100_gte-large.json", "w") as results_file:
    json.dump(obj=evaluations, fp=results_file, indent=4)

In [None]:
# Load the evaluation records into a DataFrame for tabular inspection.
results = pd.DataFrame(data=evaluations)

In [None]:
# Preview the first five rows (the default for `head`) as the cell's rich output.
results.head(n=5)

Unnamed: 0,Question,Ground_Truth,Source,Category,Context,rag_response,RAG_Score
0,What triggers Seasonal Affective Disorder (SAD)?,"SAD is triggered by seasonal changes, especial...",https://www.samhsa.gov/mental-health/seasonal-...,Mental Illness,"['Seasonal affective disorder, or SAD, is a co...","According to the context, Seasonal Affective D...",9
1,What are the health risks of bulimia nervosa?,"Risks include dental decay, dehydration, and s...",https://www.samhsa.gov/mental-health/eating-di...,Mental Illness,['It is common that binge eating will lead to ...,The health risks of bulimia nervosa include ch...,8
2,How can someone with mental health problems ma...,By finding trustworthy individuals who provide...,https://www.samhsa.gov/mental-health/how-to-ta...,Mental Health,"[""Anyone can experience mental health problems...","Find someone—such as a parent, family member, ...",7
3,What are the risk factors for schizophrenia?,Risk factors include genetic predisposition an...,https://www.samhsa.gov/mental-health/schizophr...,Mental Illness,['Schizophrenia is a serious brain disorder th...,"According to the provided context, risk factor...",2
4,How does schizophrenia impact daily functioning?,"It affects work, socialization, and completing...",https://www.samhsa.gov/mental-health/schizophr...,Mental Illness,['Schizophrenia is a serious brain disorder th...,Schizophrenia can be extremely disruptive to a...,9


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# RAG_Score column, selected as a one-column DataFrame.
results.loc[:, ['RAG_Score']].describe()

Unnamed: 0,RAG_Score
count,100.0
mean,7.48
std,1.344499
min,2.0
25%,7.0
50%,8.0
75%,8.0
max,9.0


In [None]:
# Mean of the RAG_Score column across all rows of `results`, reported as a
# single headline number.
average_rag_score = results.loc[:, 'RAG_Score'].mean()
print("Average RAG Score:", average_rag_score)

Average RAG Score: 7.48
