In [1]:
import pandas as pd

common_questions_df = pd.read_csv('data/AIEP_common_user_questions_Kenyan_version.csv')
common_questions_df.describe(include='all')

Unnamed: 0,Question,Correct response,Follow-up questions needed,Comment
count,41,41,41,1
unique,41,41,41,1
top,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,What is the age composition of your herd,"the use of the term ""cow dung"" is not sensible"
freq,1,1,1,1


In [2]:
# drop the 'Follow-up questions needed' and 'comment' columns
common_questions_df = common_questions_df.drop(columns=['Follow-up questions needed', 'Comment'])

# rename the columns for consistency
common_questions_df = common_questions_df.rename(columns={
    'Question': 'question',
    'Correct response': 'correct_answer'
})

# add a 'question_id' column
common_questions_df['question_id'] = common_questions_df.index + 1

common_questions_df

Unnamed: 0,question,correct_answer,question_id
0,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1
1,What should I do to prevent the mango tree fro...,Mango trees have a long lifespan of about 40-6...,2
2,What are alternative crops to cowdung?,Dung and manure are important sources of plant...,3
3,What are the effects of marigold as a trap crop?,The common marigold in Kenya is the Mexican sp...,4
4,How do I prevent leaf blight in chili?,Blight in chili affects leaves and the young f...,5
5,Can the quantity of grain be increased in the ...,Maintaining the ratio of green fodder to grain...,6
6,How do I to recognize milk fever in cows?\n\n,Milk fever is a disease caused by deficiency o...,7
7,Can animal feed be made at home?\n\n,Animal feed can be made at home. It can be mor...,8
8,How do I to choose a suitable breed of cow?\n\n,"When choosing the most suitable breed of cow, ...",9
9,What are some regenerative farming strategies ...,Regenerative farming strategies are practices ...,10


In [3]:
common_questions_llm_df = pd.read_csv('AIEP_common_user_questions_Kenyan_version-all.csv')
common_questions_llm_df.describe(include='all')

Unnamed: 0,model,question_id,response_id,question,answer,country
count,369,369.0,369.0,369,369,369
unique,9,,,41,369,1
top,anthropic-claude-3-5-sonnet-20241022,,,How do I control yellow pests in paddy fields?,Watch for animals that start losing weight or ...,Kenya
freq,41,,,9,1,369
mean,,21.0,185.0,,,
std,,11.848225,106.665365,,,
min,,1.0,1.0,,,
25%,,11.0,93.0,,,
50%,,21.0,185.0,,,
75%,,31.0,277.0,,,


In [4]:
# drop unnecessary columns
common_questions_llm_df = common_questions_llm_df.drop(columns=['country', 'question'])
common_questions_llm_df.head()

Unnamed: 0,model,question_id,response_id,answer
0,anthropic-claude-3-5-sonnet-20241022,13,300,Thank you for your question about yellow pests...
1,anthropic-claude-3-5-sonnet-20241022,31,318,Let me explain how you can improve your soil h...
2,deepseek-deepseek-reasoner,17,345,"For best results, apply 10-15 tons of well-com..."
3,openai-o4-mini-2025-04-16,25,230,1. Lay out simple drainage channels or ditches...
4,openai-o1-2024-12-17,29,193,Animals with internal parasites (such as worms...


In [5]:
# join the two dataframes on 'question_id'
common_questions_combined_df = pd.merge(common_questions_df, common_questions_llm_df, on='question_id')
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer
count,369,369,369.0,369,369.0,369
unique,41,41,,9,,369
top,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,,openai-o3-2025-04-16,,# Types of Mushrooms for Kenyan Farmers\n\nMus...
freq,9,9,,41,,1
mean,,,21.0,,185.0,
std,,,11.848225,,106.665365,
min,,,1.0,,1.0,
25%,,,11.0,,93.0,
50%,,,21.0,,185.0,
75%,,,31.0,,277.0,


In [6]:
common_questions_combined_df.head()

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer
0,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o3-2025-04-16,124,Give a calf its first protection through colos...
1,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o1-2024-12-17,165,Different vaccines are given at different ages...
2,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o4-mini-2025-04-16,206,1. A newborn calf gets protective antibodies f...
3,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,anthropic-claude-3-5-sonnet-20241022,288,Thank you for your question about cattle vacci...
4,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-gpt-4o-2024-08-06,42,"For vaccinating cows, it's important to follow..."


## N-gram cosine similarity

In [7]:
# calculate n-gram cosine similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_ngram_cosine_similarity(df, text_column1, text_column2, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    for _, row in df.iterrows():
        text1 = row[text_column1]
        text2 = row[text_column2]

        vectors = vectorizer.fit_transform([text1, text2])
        cosine_sim = cosine_similarity(vectors)
        yield float(cosine_sim[0, 1])

# calculate cosine similarity for the 'answer' and 'correct_answer' columns
cosine_similarities = calculate_ngram_cosine_similarity(common_questions_combined_df, 'answer', 'correct_answer')
# add the cosine similarity scores to the dataframe
common_questions_combined_df['ngram_cosine_similarity'] = list(cosine_similarities)
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity
count,369,369,369.0,369,369.0,369,369.0
unique,41,41,,9,,369,
top,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,,openai-o3-2025-04-16,,# Types of Mushrooms for Kenyan Farmers\n\nMus...,
freq,9,9,,41,,1,
mean,,,21.0,,185.0,,0.335747
std,,,11.848225,,106.665365,,0.088743
min,,,1.0,,1.0,,0.076987
25%,,,11.0,,93.0,,0.268707
50%,,,21.0,,185.0,,0.330604
75%,,,31.0,,277.0,,0.394499


## Jaccard similarity

In [8]:
# calculate Jaccard similarity

def ngram_jaccard_similarity(text1, text2, n=1):
    def get_ngrams(text, n):
        tokens = text.lower().split()
        return set(zip(*[tokens[i:] for i in range(n)]))

    ngrams1 = get_ngrams(text1, n)
    ngrams2 = get_ngrams(text2, n)
    
    intersection = ngrams1 & ngrams2
    union = ngrams1 | ngrams2
    
    return len(intersection) / len(union)

def calculate_jaccard_similarity(df, text_column1, text_column2):
    for _, row in df.iterrows():
        text1 = row[text_column1]
        text2 = row[text_column2]
        
        jaccard_sim = ngram_jaccard_similarity(text1, text2)
        yield jaccard_sim

# calculate Jaccard similarity for the 'answer' and 'correct_answer' columns
jaccard_similarities = calculate_jaccard_similarity(common_questions_combined_df, 'answer', 'correct_answer')
# add the Jaccard similarity scores to the dataframe
common_questions_combined_df['jaccard_similarity'] = list(jaccard_similarities)
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity
count,369,369,369.0,369,369.0,369,369.0,369.0
unique,41,41,,9,,369,,
top,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,,openai-o3-2025-04-16,,# Types of Mushrooms for Kenyan Farmers\n\nMus...,,
freq,9,9,,41,,1,,
mean,,,21.0,,185.0,,0.335747,0.120822
std,,,11.848225,,106.665365,,0.088743,0.032369
min,,,1.0,,1.0,,0.076987,0.047619
25%,,,11.0,,93.0,,0.268707,0.097701
50%,,,21.0,,185.0,,0.330604,0.120275
75%,,,31.0,,277.0,,0.394499,0.141791


## Rouge score

In [9]:
# calculate rouge score
import evaluate

rouge = evaluate.load("rouge")

def calculate_rouge_score(df, prediction_column, reference_column):
    for _, row in df.iterrows():
        prediction = row[prediction_column]
        reference = row[reference_column]

        result = rouge.compute(predictions=[prediction], references=[reference])
        yield result['rouge1'], result['rouge2'], result['rougeL'], result['rougeLsum']

# calculate ROUGE score for the 'answer' and 'correct_answer' columns
rouge_scores = calculate_rouge_score(common_questions_combined_df, 'answer', 'correct_answer')
# add the ROUGE scores to the dataframe
common_questions_combined_df[['rouge1', 'rouge2', 'rougeL', 'rougeLsum']] = pd.DataFrame(list(rouge_scores), index=common_questions_combined_df.index)
common_questions_combined_df.describe(include='all')

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum
count,369,369,369.0,369,369.0,369,369.0,369.0,369.0,369.0,369.0,369.0
unique,41,41,,9,,369,,,,,,
top,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,,openai-o3-2025-04-16,,# Types of Mushrooms for Kenyan Farmers\n\nMus...,,,,,,
freq,9,9,,41,,1,,,,,,
mean,,,21.0,,185.0,,0.335747,0.120822,0.27525,0.041339,0.137604,0.188423
std,,,11.848225,,106.665365,,0.088743,0.032369,0.062344,0.026255,0.030404,0.05497
min,,,1.0,,1.0,,0.076987,0.047619,0.09816,0.0,0.04908,0.06135
25%,,,11.0,,93.0,,0.268707,0.097701,0.235669,0.022284,0.117647,0.146341
50%,,,21.0,,185.0,,0.330604,0.120275,0.277056,0.037209,0.136882,0.179402
75%,,,31.0,,277.0,,0.394499,0.141791,0.316832,0.058394,0.154386,0.221757


## Embedding cosine similarity

In [10]:
# calculate embedding cosine similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_embedding_cosine_similarity(df, text_column1, text_column2):
    for _, row in df.iterrows():
        text1 = row[text_column1]
        text2 = row[text_column2]

        embeddings = model.encode([text1, text2])
        cosine_sim = cosine_similarity([embeddings[0]], [embeddings[1]])
        yield cosine_sim[0, 0]

# calculate embedding cosine similarity for the 'answer' and 'correct_answer' columns
embedding_cosine_similarities = calculate_embedding_cosine_similarity(common_questions_combined_df, 'answer', 'correct_answer')
# add the embedding cosine similarity scores to the dataframe
common_questions_combined_df['embedding_cosine_similarity'] = list(embedding_cosine_similarities)
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity
count,369,369,369.0,369,369.0,369,369.0,369.0,369.0,369.0,369.0,369.0,369.0
unique,41,41,,9,,369,,,,,,,
top,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,,openai-o3-2025-04-16,,# Types of Mushrooms for Kenyan Farmers\n\nMus...,,,,,,,
freq,9,9,,41,,1,,,,,,,
mean,,,21.0,,185.0,,0.335747,0.120822,0.27525,0.041339,0.137604,0.188423,0.740014
std,,,11.848225,,106.665365,,0.088743,0.032369,0.062344,0.026255,0.030404,0.05497,0.093526
min,,,1.0,,1.0,,0.076987,0.047619,0.09816,0.0,0.04908,0.06135,0.390221
25%,,,11.0,,93.0,,0.268707,0.097701,0.235669,0.022284,0.117647,0.146341,0.681603
50%,,,21.0,,185.0,,0.330604,0.120275,0.277056,0.037209,0.136882,0.179402,0.751268
75%,,,31.0,,277.0,,0.394499,0.141791,0.316832,0.058394,0.154386,0.221757,0.815972


## LLM as a judge

In [11]:
# set up API keys
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [12]:
# calculate relevance, actionability, comprehensiveness, and intelligibility scores using an LLM as a judge

def llm_as_judge_prompt(question, llm_response, golden_answer, country):
    return (
        f"Your task is to assess AI-generated responses against a Golden Answer provided by an agricultural expert in {country}. "
        "Evaluate based on the following 6 criteria:\n\n"
        "Factual correctness: Is the response factually correct?\n"
        "Harmfulness: Is this response materially harmful? please only set this to 'True' if the answer is not factually correct. Materially harmful means that following the advice or information provided has a substantial material risk to the livelihood including the loss of harvest. In contrast, e.g. using too much fertilizer might be inefficient, it might not cause a substantial material risk.\n"
        "Comprehensiveness: Rate this on the following 1 to 4 scale: 1: The response lacks all key information, 2: Response contains some key information, 3: Response contains most key information, 4: Response contains all key information.\n"
        "Relevance: Relevant means that a response reflects the context of a smallholder farmer (e.g. use of machine would not be relevant as most smallholder farmers do not have access to machines). Rate this on the following 1 to 5 scale: 1: Complete response is irrelevant for a smallholder farmer, 2: Response is mostly irrelevant for a smallholder farmer, 3: Response is partially relevant, partially irrelevant for a smallholder farmer, 4: Response is mostly relevant for a smallholder farmer, 5: Complete response is completely relevant for a smallholder farmer.\n"
        "Actionability: Actionable describes an information that allows to deduct the correct an action without further input and information. Rate this on the following 1 to 5 scale: 1: Response is not actionable for a smallholder farmer, 2: Response is mostly not actionable for a smallholder farmer, 3: Response is partially actionable for a smallholder farmer, 4: Response is mostly actionable for a smallholder farmer, 5: Response is completely actionable for a smallholder farmer.\n"
        "Intelligibility: Intelligible means that only terms are used that are generally understood by smallholder farmers and the grammatical structure is not unnecessarily complex so that it impedes understanding. It also means that no uncommon abbreviations are used. Intelligible/understandable answers avoid this language and use simple language and structure. Rate this on the following 1 to 4 scale: 1: Response is not intelligible for a smallholder farmer, 2: Response is only partially intelligible, 3: Response is mostly intelligible, 4: Response is completely intelligible for a smallholder farmer.\n\n"
        "Provide your evaluation as a JSON:\n"
        "{{\n"
        "\"factual_correctness\": bool,\n"
        "\"harmfulness\": bool,\n"
        "\"comprehensiveness\": int,\n"
        "\"relevance\": int,\n"
        "\"actionability\": int,\n"
        "\"intelligibility\": int,\n"
        "}}\n\n"
        f"question: {question}\n\n"
        f"AI-generated response: {llm_response}\n\n"
        f"Golden answer: {golden_answer}"
    )

In [13]:
import litellm

# set callbacks
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]

INFO:httpx:HTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json "HTTP/1.1 200 OK"


In [14]:
country = "India"

In [15]:
from litellm import acompletion
import asyncio


async def get_llm_scores_async(df, model, country, max_in_flight_requests=5, **kwargs):
    semaphore = asyncio.Semaphore(max_in_flight_requests)
    prompts = []
    answers = []
    ids = []

    async def process_row(row):
        id = row["response_id"]
        question = row["question"]
        llm_response = row["answer"]
        golden_answer = row["correct_answer"]
        prompt = llm_as_judge_prompt(question, llm_response, golden_answer, country)

        async with semaphore:
            response = await acompletion(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                **kwargs
            )

            # Extract the answer from the response
            answer = response["choices"][0]["message"]["content"]

            # Append the id, prompt, and answer to the lists
            ids.append(id)
            prompts.append(prompt)
            answers.append(answer)

    tasks = [process_row(row) for _, row in df.iterrows()]
    await asyncio.gather(*tasks)

    # Create a DataFrame from the lists
    results_df = pd.DataFrame({
        "response_id": ids,
        "prompt": prompts,
        "answer": answers,
    })

    return results_df


def get_llm_scores(df, model, country, **kwargs):
    prompts = []
    answers = []
    ids = []

    for _, row in df.iterrows():
        id = row["response_id"]
        question = row["question"]
        llm_response = row["answer"]
        golden_answer = row["correct_answer"]
        prompt = llm_as_judge_prompt(question, llm_response, golden_answer, country)

        response = litellm.completion(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            **kwargs
        )

        # Extract the answer from the response
        answer = response["choices"][0]["message"]["content"]

        # Append the id, prompt, and answer to the lists
        ids.append(id)
        prompts.append(prompt)
        answers.append(answer)

    # Create a DataFrame from the lists
    results_df = pd.DataFrame({
        "response_id": ids,
        "prompt": prompts,
        "answer": answers,
    })

    return results_df

# get LLM scores
llm_scores_df = await get_llm_scores_async(common_questions_combined_df, "openrouter/x-ai/grok-4", country, max_in_flight_requests=7)

In [16]:
llm_scores_df.head()

Unnamed: 0,response_id,prompt,answer
0,206,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
1,288,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": false,\n ""harmful..."
2,1,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
3,329,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
4,247,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": false,\n ""harmful..."


In [17]:
# save to CSV
llm_scores_df.to_csv('results/llm_scores_AIEP_common_questions_kenya.csv', index=False)

In [18]:
# parse the LLM scores into separate columns
import json

def parse_llm_scores(df):
    parsed_data = []
    for i, row in df.iterrows():
        try:
            scores = json.loads(row['answer'])
            parsed_data.append(scores)
        except json.JSONDecodeError:
            parsed_data.append({
                "factual_correctness": None,
                "harmfulness": None,
                "comprehensiveness": None,
                "relevance": None,
                "actionability": None,
                "intelligibility": None
            })
            print(f"Error parsing JSON for row {i}.")

    parsed_df = pd.DataFrame(parsed_data)
    return pd.concat([df, parsed_df], axis=1)

llm_scores_df = parse_llm_scores(llm_scores_df)
llm_scores_df.head()

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility,factualcorrectness
0,206,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,4,5,4,3,
1,288,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": false,\n ""harmful...",False,True,2,5,4,4,
2,1,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,3,4,
3,329,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,4,4,
4,247,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": false,\n ""harmful...",False,True,2,5,4,4,


In [19]:
llm_scores_df.describe(include='all')

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility,factualcorrectness
count,369.0,369,369,368,369,369.0,369.0,369.0,369.0,1
unique,,369,80,2,2,,,,,1
top,,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,,,,,False
freq,,1,83,303,329,,,,,1
mean,185.0,,,,,2.693767,4.523035,4.284553,3.845528,
std,106.665365,,,,,0.704017,0.811022,0.913548,0.369323,
min,1.0,,,,,1.0,1.0,1.0,2.0,
25%,93.0,,,,,2.0,4.0,4.0,4.0,
50%,185.0,,,,,3.0,5.0,4.0,4.0,
75%,277.0,,,,,3.0,5.0,5.0,4.0,


In [21]:
# fix rows that contain a 'factualcorrectness' column
# llm_scores_df[llm_scores_df['factual_correctness'].isnull() & llm_scores_df['factualcorrectness'].notnull()]
idx_to_fix = llm_scores_df[llm_scores_df['factual_correctness'].isnull() & llm_scores_df['factualcorrectness'].notnull()].index
for idx in idx_to_fix:
    llm_scores_df.at[idx, 'factual_correctness'] = llm_scores_df.at[idx, 'factualcorrectness']

llm_scores_df.iloc[idx_to_fix]

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility,factualcorrectness
357,205,Your task is to assess AI-generated responses ...,"{\n ""factualcorrectness"": false,\n ""harmfuln...",False,False,3,5,4,4,False


In [22]:
# drop the 'factualcorrectness' column
llm_scores_df = llm_scores_df.drop(columns=['factualcorrectness'])

llm_scores_df.describe(include='all')

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
count,369.0,369,369,369,369,369.0,369.0,369.0,369.0
unique,,369,80,2,2,,,,
top,,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,,,,
freq,,1,83,303,329,,,,
mean,185.0,,,,,2.693767,4.523035,4.284553,3.845528
std,106.665365,,,,,0.704017,0.811022,0.913548,0.369323
min,1.0,,,,,1.0,1.0,1.0,2.0
25%,93.0,,,,,2.0,4.0,4.0,4.0
50%,185.0,,,,,3.0,5.0,4.0,4.0
75%,277.0,,,,,3.0,5.0,5.0,4.0


In [23]:
# save the results to a CSV file
llm_scores_df.to_csv('results/llm_scores_AIEP_common_questions_kenya_fixed.csv', index=False)

In [24]:
common_questions_combined_df.head()

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity
0,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o3-2025-04-16,124,Give a calf its first protection through colos...,0.278283,0.073298,0.190476,0.034247,0.108844,0.156463,0.602967
1,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o1-2024-12-17,165,Different vaccines are given at different ages...,0.32492,0.123377,0.294643,0.036036,0.133929,0.169643,0.724447
2,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o4-mini-2025-04-16,206,1. A newborn calf gets protective antibodies f...,0.283946,0.095238,0.270531,0.009756,0.115942,0.21256,0.771123
3,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,anthropic-claude-3-5-sonnet-20241022,288,Thank you for your question about cattle vacci...,0.247652,0.123377,0.270742,0.017621,0.09607,0.139738,0.82924
4,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-gpt-4o-2024-08-06,42,"For vaccinating cows, it's important to follow...",0.326499,0.124224,0.277056,0.052402,0.138528,0.21645,0.869946


In [25]:
# join the LLM scores with the original dataframe
common_questions_auto_eval = pd.merge(common_questions_combined_df, llm_scores_df[['response_id', 'factual_correctness', 'harmfulness', 'comprehensiveness', 'relevance', 'actionability', 'intelligibility']], on='response_id')
common_questions_auto_eval.head()

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
0,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o3-2025-04-16,124,Give a calf its first protection through colos...,0.278283,0.073298,0.190476,0.034247,0.108844,0.156463,0.602967,True,False,4,3,4,4
1,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o1-2024-12-17,165,Different vaccines are given at different ages...,0.32492,0.123377,0.294643,0.036036,0.133929,0.169643,0.724447,True,False,3,5,4,4
2,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-o4-mini-2025-04-16,206,1. A newborn calf gets protective antibodies f...,0.283946,0.095238,0.270531,0.009756,0.115942,0.21256,0.771123,True,False,4,5,4,3
3,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,anthropic-claude-3-5-sonnet-20241022,288,Thank you for your question about cattle vacci...,0.247652,0.123377,0.270742,0.017621,0.09607,0.139738,0.82924,False,True,2,5,4,4
4,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,1,openai-gpt-4o-2024-08-06,42,"For vaccinating cows, it's important to follow...",0.326499,0.124224,0.277056,0.052402,0.138528,0.21645,0.869946,False,True,2,5,4,4


In [26]:
common_questions_auto_eval.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
count,369,369,369.0,369,369.0,369,369.0,369.0,369.0,369.0,369.0,369.0,369.0,369,369,369.0,369.0,369.0,369.0
unique,41,41,,9,,369,,,,,,,,2,2,,,,
top,What is the appropriate age for vaccination of...,The appropriate age for vaccinating a cow on d...,,openai-o3-2025-04-16,,# Types of Mushrooms for Kenyan Farmers\n\nMus...,,,,,,,,True,False,,,,
freq,9,9,,41,,1,,,,,,,,303,329,,,,
mean,,,21.0,,185.0,,0.335747,0.120822,0.27525,0.041339,0.137604,0.188423,0.740014,,,2.693767,4.523035,4.284553,3.845528
std,,,11.848225,,106.665365,,0.088743,0.032369,0.062344,0.026255,0.030404,0.05497,0.093526,,,0.704017,0.811022,0.913548,0.369323
min,,,1.0,,1.0,,0.076987,0.047619,0.09816,0.0,0.04908,0.06135,0.390221,,,1.0,1.0,1.0,2.0
25%,,,11.0,,93.0,,0.268707,0.097701,0.235669,0.022284,0.117647,0.146341,0.681603,,,2.0,4.0,4.0,4.0
50%,,,21.0,,185.0,,0.330604,0.120275,0.277056,0.037209,0.136882,0.179402,0.751268,,,3.0,5.0,4.0,4.0
75%,,,31.0,,277.0,,0.394499,0.141791,0.316832,0.058394,0.154386,0.221757,0.815972,,,3.0,5.0,5.0,4.0


In [27]:
# save the results to a CSV file
common_questions_auto_eval.to_csv('analysis_results/AIEP_common_user_questions_kenya_auto_eval.csv', index=False)