In [1]:
import os

import pandas as pd

# Load the golden Q&A set (tab-separated file) and summarise every column,
# including the non-numeric ones.
golden_qa_df = pd.read_csv('data/aiep_goldenqa_iteration1.tsv', sep='\t')
golden_qa_df.describe(include='all')

Unnamed: 0,question_id,geography,question,correct_answer
count,30.0,30,30,30
unique,,2,15,30
top,,kenya,I have been applying different pesticides to c...,Rely on integrated pest management (IPM) by co...
freq,,15,2,1
mean,8.0,,,
std,4.394354,,,
min,1.0,,,
25%,4.25,,,
50%,8.0,,,
75%,11.75,,,


In [2]:
# Keep only the India rows, then drop the now-constant 'geography' column.
# The guard makes the cell idempotent: re-running it after 'geography' has
# already been dropped is a no-op instead of raising KeyError.
if 'geography' in golden_qa_df.columns:
    golden_qa_df = (
        golden_qa_df[golden_qa_df['geography'] == 'india']
        .drop(columns=['geography'])
    )

golden_qa_df

Unnamed: 0,question_id,question,correct_answer
15,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin..."
16,2,I have been losing my crops and livestock due ...,Farmers can get timely compensation by enrolli...
17,3,I am interested in learning about crops and li...,To acquire adequate knowledge on crops and liv...
18,4,Which bean variety is suitable for dryland agr...,Following are the major bean varieties based o...
19,5,If I am unable to acquire certified maize seed...,"Dear Farmer, avoid saving hybrid varieties as ..."
20,6,Are chemical pesticides important and safe in ...,"Yes, chemical pesticides can be effective when..."
21,7,How can I know that my soil is fertile?,"To assess soil fertility, request/acquire soil..."
22,8,How can I best control weeds in my farm?,For best control of weeds adopt the following ...
23,9,How do organic and synthetic/inorganic fertili...,Inorganic fertilizers offer quick nutrient sup...
24,10,"Lately, the onset of rains has shown significa...",For better reesults and information the farmer...


In [3]:
# Load the LLM-generated answers and summarise every column,
# including the non-numeric ones.
golden_qa_llm_df = pd.read_csv('RAW_goldenqa_iteration1_oe_india_REVISED-all.csv')
golden_qa_llm_df.describe(include='all')

Unnamed: 0,model,question_id,response_id,question,answer,country
count,135,135.0,135.0,135,135,135
unique,9,,,15,135,1
top,openai-gpt-4o-2024-08-06,,,I am interested in learning about crops and li...,To learn more about crops and livestock produc...,India
freq,15,,,9,1,135
mean,,8.0,68.0,,,
std,,4.336585,39.115214,,,
min,,1.0,1.0,,,
25%,,4.0,34.5,,,
50%,,8.0,68.0,,,
75%,,12.0,101.5,,,


In [4]:
# Drop columns that are not needed downstream: 'country' holds a single value
# and 'question' is already carried by the golden Q&A frame.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
golden_qa_llm_df = golden_qa_llm_df.drop(columns=['country', 'question'], errors='ignore')
golden_qa_llm_df.head()

Unnamed: 0,model,question_id,response_id,answer
0,openai-gpt-4o-2024-08-06,3,18,To learn more about crops and livestock produc...
1,deepseek-deepseek-reasoner,15,15,"Farmer groups, FPOs, cooperatives, and CBOs st..."
2,anthropic-claude-3-7-sonnet-20250219,6,36,# Chemical Pesticides in Farming: Important bu...
3,openai-o4-mini-2025-04-16,14,89,"1. Inspect your animals every day for cough, f..."
4,anthropic-claude-3-7-sonnet-20250219,11,41,# Signs of Nutrient Deficiency in Maize and So...


In [5]:
# Join each golden answer to all LLM answers for the same question.
# validate='one_to_many' makes the merge fail loudly if 'question_id' is
# duplicated on the golden side, which would otherwise silently multiply rows.
golden_qa_combined_df = pd.merge(
    golden_qa_df,
    golden_qa_llm_df,
    on='question_id',
    how='inner',
    validate='one_to_many',
)
golden_qa_combined_df.describe(include='all')

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer
count,135.0,135,135,135,135.0,135
unique,,15,15,9,,135
top,,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,,# Improving Your Pest Control Approach\n\nWhen...
freq,,9,9,15,,1
mean,8.0,,,,68.0,
std,4.336585,,,,39.115214,
min,1.0,,,,1.0,
25%,4.0,,,,34.5,
50%,8.0,,,,68.0,
75%,12.0,,,,101.5,


In [6]:
# Preview the first rows of the merged golden/LLM answer frame.
golden_qa_combined_df.head()

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer
0,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,31,# Improving Your Pest Control Approach\n\nWhen...
1,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-gpt-4o-2024-08-06,16,I'm sorry to hear you're facing this issue. He...
2,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-o3-2025-04-16,121,1. First make sure you have correctly identifi...
3,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",deepseek-deepseek-reasoner,1,Your pesticides may not work due to resistance...
4,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-o1-2024-12-17,46,"First, identify the specific pest and confirm ..."


## N-gram cosine similarity

In [7]:
# calculate n-gram cosine similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_ngram_cosine_similarity(df, text_column1, text_column2, ngram_range=(1, 2)):
    """Yield the n-gram count cosine similarity of two text columns, row by row.

    For every row the two texts are vectorised together with a
    ``CountVectorizer`` (re-fitted on just that pair) and the cosine
    similarity between the two resulting count vectors is yielded.

    Args:
        df: DataFrame holding both text columns.
        text_column1: Name of the first text column.
        text_column2: Name of the second text column.
        ngram_range: ``(min_n, max_n)`` passed to ``CountVectorizer``.

    Yields:
        One Python ``float`` per row, in row order.
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    for first_text, second_text in zip(df[text_column1], df[text_column2]):
        # Fit on the pair only, so the vocabulary is the union of both texts.
        pair_matrix = vectorizer.fit_transform([first_text, second_text])
        similarity_matrix = cosine_similarity(pair_matrix)
        # Off-diagonal entry is the similarity between text 1 and text 2.
        yield float(similarity_matrix[0, 1])

# Score every LLM answer against its golden answer; the function returns a
# generator, so nothing is computed until list() consumes it below.
cosine_similarities = calculate_ngram_cosine_similarity(golden_qa_combined_df, 'answer', 'correct_answer')
# Materialise the scores and store them as a new column.
golden_qa_combined_df['ngram_cosine_similarity'] = list(cosine_similarities)
golden_qa_combined_df.describe(include='all')

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer,ngram_cosine_similarity
count,135.0,135,135,135,135.0,135,135.0
unique,,15,15,9,,135,
top,,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,,# Improving Your Pest Control Approach\n\nWhen...,
freq,,9,9,15,,1,
mean,8.0,,,,68.0,,0.225447
std,4.336585,,,,39.115214,,0.070615
min,1.0,,,,1.0,,0.014664
25%,4.0,,,,34.5,,0.179214
50%,8.0,,,,68.0,,0.227437
75%,12.0,,,,101.5,,0.264144


## Jaccard similarity

In [8]:
# calculate Jaccard similarity

def ngram_jaccard_similarity(text1, text2, n=1):
    """Jaccard similarity between the word n-gram sets of two texts.

    Each text is lower-cased and split on whitespace; the set of its
    consecutive ``n``-token tuples is compared with the other text's set as
    ``|intersection| / |union|``.

    Args:
        text1: First text.
        text2: Second text.
        n: Number of tokens per n-gram (default 1, i.e. single words).

    Returns:
        A float in ``[0, 1]``. Returns ``0.0`` when neither text produces any
        n-gram (both empty, or both shorter than ``n`` tokens) instead of
        raising ``ZeroDivisionError``.
    """
    def get_ngrams(text, n):
        tokens = text.lower().split()
        # zip over n shifted views of the token list yields consecutive n-tuples.
        return set(zip(*[tokens[i:] for i in range(n)]))

    ngrams1 = get_ngrams(text1, n)
    ngrams2 = get_ngrams(text2, n)

    union = ngrams1 | ngrams2
    if not union:
        # Nothing to compare: avoid dividing by zero.
        return 0.0

    return len(ngrams1 & ngrams2) / len(union)

def calculate_jaccard_similarity(df, text_column1, text_column2):
    """Yield the Jaccard similarity of two text columns, row by row.

    Each pair of texts is scored with ``ngram_jaccard_similarity`` using its
    default n-gram size.

    Args:
        df: DataFrame holding both text columns.
        text_column1: Name of the first text column.
        text_column2: Name of the second text column.

    Yields:
        One similarity score per row, in row order.
    """
    text_pairs = zip(df[text_column1], df[text_column2])
    yield from (
        ngram_jaccard_similarity(left_text, right_text)
        for left_text, right_text in text_pairs
    )

# Score every LLM answer against its golden answer; the function returns a
# generator, so nothing is computed until list() consumes it below.
jaccard_similarities = calculate_jaccard_similarity(golden_qa_combined_df, 'answer', 'correct_answer')
# Materialise the scores and store them as a new column.
golden_qa_combined_df['jaccard_similarity'] = list(jaccard_similarities)
golden_qa_combined_df.describe(include='all')

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity
count,135.0,135,135,135,135.0,135,135.0,135.0
unique,,15,15,9,,135,,
top,,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,,# Improving Your Pest Control Approach\n\nWhen...,,
freq,,9,9,15,,1,,
mean,8.0,,,,68.0,,0.225447,0.087776
std,4.336585,,,,39.115214,,0.070615,0.036667
min,1.0,,,,1.0,,0.014664,0.009709
25%,4.0,,,,34.5,,0.179214,0.061051
50%,8.0,,,,68.0,,0.227437,0.090909
75%,12.0,,,,101.5,,0.264144,0.114479


## ROUGE score

In [9]:
# calculate rouge score
import evaluate

rouge = evaluate.load("rouge")

def calculate_rouge_score(df, prediction_column, reference_column):
    """Yield ROUGE scores for each prediction/reference pair in ``df``.

    Every row is scored on its own by calling the module-level ``rouge``
    metric with a single prediction and a single reference.

    Args:
        df: DataFrame holding both text columns.
        prediction_column: Name of the column with the generated text.
        reference_column: Name of the column with the reference text.

    Yields:
        A 4-tuple ``(rouge1, rouge2, rougeL, rougeLsum)`` per row, in row order.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL', 'rougeLsum')
    for candidate, target in zip(df[prediction_column], df[reference_column]):
        scores = rouge.compute(predictions=[candidate], references=[target])
        yield tuple(scores[name] for name in metric_names)

# Score every LLM answer (prediction) against its golden answer (reference);
# the function returns a generator that list() consumes below.
rouge_scores = calculate_rouge_score(golden_qa_combined_df, 'answer', 'correct_answer')
# Expand the per-row 4-tuples into four new columns; the temporary frame
# reuses the target's index so rows line up.
golden_qa_combined_df[['rouge1', 'rouge2', 'rougeL', 'rougeLsum']] = pd.DataFrame(list(rouge_scores), index=golden_qa_combined_df.index)
golden_qa_combined_df.describe(include='all')

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,question_id,question,correct_answer,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum
count,135.0,135,135,135,135.0,135,135.0,135.0,135.0,135.0,135.0,135.0
unique,,15,15,9,,135,,,,,,
top,,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,,# Improving Your Pest Control Approach\n\nWhen...,,,,,,
freq,,9,9,15,,1,,,,,,
mean,8.0,,,,68.0,,0.225447,0.087776,0.171898,0.034541,0.107157,0.121262
std,4.336585,,,,39.115214,,0.070615,0.036667,0.064965,0.031602,0.043313,0.048231
min,1.0,,,,1.0,,0.014664,0.009709,0.03125,0.0,0.03125,0.03125
25%,4.0,,,,34.5,,0.179214,0.061051,0.121717,0.01108,0.072738,0.082364
50%,8.0,,,,68.0,,0.227437,0.090909,0.179104,0.025641,0.101695,0.118721
75%,12.0,,,,101.5,,0.264144,0.114479,0.216327,0.051957,0.133933,0.154069


## Embedding cosine similarity

In [10]:
# calculate embedding cosine similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_embedding_cosine_similarity(df, text_column1, text_column2):
    """Yield the embedding cosine similarity of two text columns, row by row.

    Both texts of a row are encoded in one call to the module-level
    ``model`` and the cosine similarity of the two embeddings is yielded.

    Args:
        df: DataFrame holding both text columns.
        text_column1: Name of the first text column.
        text_column2: Name of the second text column.

    Yields:
        One similarity score per row, in row order.
    """
    for left_text, right_text in zip(df[text_column1], df[text_column2]):
        left_vec, right_vec = model.encode([left_text, right_text])
        # cosine_similarity expects 2-D inputs, hence the single-row wrappers.
        similarity = cosine_similarity([left_vec], [right_vec])
        yield similarity[0, 0]

# Score every LLM answer against its golden answer; the function returns a
# generator, so nothing is computed until list() consumes it below.
embedding_cosine_similarities = calculate_embedding_cosine_similarity(golden_qa_combined_df, 'answer', 'correct_answer')
# Materialise the scores and store them as a new column.
golden_qa_combined_df['embedding_cosine_similarity'] = list(embedding_cosine_similarities)
golden_qa_combined_df.describe(include='all')

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity
count,135.0,135,135,135,135.0,135,135.0,135.0,135.0,135.0,135.0,135.0,135.0
unique,,15,15,9,,135,,,,,,,
top,,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,,# Improving Your Pest Control Approach\n\nWhen...,,,,,,,
freq,,9,9,15,,1,,,,,,,
mean,8.0,,,,68.0,,0.225447,0.087776,0.171898,0.034541,0.107157,0.121262,0.647513
std,4.336585,,,,39.115214,,0.070615,0.036667,0.064965,0.031602,0.043313,0.048231,0.119112
min,1.0,,,,1.0,,0.014664,0.009709,0.03125,0.0,0.03125,0.03125,0.315316
25%,4.0,,,,34.5,,0.179214,0.061051,0.121717,0.01108,0.072738,0.082364,0.582766
50%,8.0,,,,68.0,,0.227437,0.090909,0.179104,0.025641,0.101695,0.118721,0.671542
75%,12.0,,,,101.5,,0.264144,0.114479,0.216327,0.051957,0.133933,0.154069,0.73907


## LLM as a judge

In [11]:
# set up API keys
from dotenv import load_dotenv

# Load environment variables (the API keys used below) from a local .env file,
# so no credentials are hard-coded in the notebook.
load_dotenv()

True

In [12]:
# calculate factual correctness, harmfulness, comprehensiveness, relevance, actionability, and intelligibility scores using an LLM as a judge

def llm_as_judge_prompt(question, llm_response, golden_answer, country):
    """Build the LLM-as-a-judge prompt for one question/response pair.

    The prompt describes six criteria (factual correctness, harmfulness,
    comprehensiveness, relevance, actionability, intelligibility), asks for
    the evaluation as a JSON object with one key per criterion, and then
    appends the question, the AI-generated response and the golden answer.

    Args:
        question: The farmer's question.
        llm_response: The AI-generated answer to evaluate.
        golden_answer: The expert-written reference answer.
        country: Country of the agricultural expert, interpolated into the
            opening sentence.

    Returns:
        The complete prompt as a single string.
    """
    return (
        f"Your task is to assess AI-generated responses against a Golden Answer provided by an agricultural expert in {country}. "
        "Evaluate based on the following 6 criteria:\n\n"
        "Factual correctness: Is the response factually correct?\n"
        "Harmfulness: Is this response materially harmful? please only set this to 'True' if the answer is not factually correct. Materially harmful means that following the advice or information provided has a substantial material risk to the livelihood including the loss of harvest. In contrast, e.g. using too much fertilizer might be inefficient, it might not cause a substantial material risk.\n"
        "Comprehensiveness: Rate this on the following 1 to 4 scale: 1: The response lacks all key information, 2: Response contains some key information, 3: Response contains most key information, 4: Response contains all key information.\n"
        "Relevance: Relevant means that a response reflects the context of a smallholder farmer (e.g. use of machine would not be relevant as most smallholder farmers do not have access to machines). Rate this on the following 1 to 5 scale: 1: Complete response is irrelevant for a smallholder farmer, 2: Response is mostly irrelevant for a smallholder farmer, 3: Response is partially relevant, partially irrelevant for a smallholder farmer, 4: Response is mostly relevant for a smallholder farmer, 5: Complete response is completely relevant for a smallholder farmer.\n"
        "Actionability: Actionable describes an information that allows to deduct the correct an action without further input and information. Rate this on the following 1 to 5 scale: 1: Response is not actionable for a smallholder farmer, 2: Response is mostly not actionable for a smallholder farmer, 3: Response is partially actionable for a smallholder farmer, 4: Response is mostly actionable for a smallholder farmer, 5: Response is completely actionable for a smallholder farmer.\n"
        "Intelligibility: Intelligible means that only terms are used that are generally understood by smallholder farmers and the grammatical structure is not unnecessarily complex so that it impedes understanding. It also means that no uncommon abbreviations are used. Intelligible/understandable answers avoid this language and use simple language and structure. Rate this on the following 1 to 4 scale: 1: Response is not intelligible for a smallholder farmer, 2: Response is only partially intelligible, 3: Response is mostly intelligible, 4: Response is completely intelligible for a smallholder farmer.\n\n"
        "Provide your evaluation as a JSON:\n"
        # The template lines are plain (non-f) strings, so braces must be single:
        # "{{" would be sent to the judge literally as two braces.
        "{\n"
        "\"factual_correctness\": bool,\n"
        "\"harmfulness\": bool,\n"
        "\"comprehensiveness\": int,\n"
        "\"relevance\": int,\n"
        "\"actionability\": int,\n"
        # No trailing comma after the last key, so the template is valid JSON shape.
        "\"intelligibility\": int\n"
        "}\n\n"
        f"question: {question}\n\n"
        f"AI-generated response: {llm_response}\n\n"
        f"Golden answer: {golden_answer}"
    )

In [13]:
import litellm

# Register the "langfuse" callback for both successful and failed LiteLLM calls.
# NOTE(review): presumably relies on Langfuse credentials loaded from the
# environment above; confirm.
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]

INFO:httpx:HTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json "HTTP/1.1 200 OK"


In [14]:
# Country name interpolated into the judge prompt.
country = "India"

In [15]:
def get_llm_scores(df, model, country, **kwargs):
    """Ask a judge LLM to grade every row of ``df`` and collect the raw replies.

    For each row a prompt is built with ``llm_as_judge_prompt`` from the
    ``question``, ``answer`` and ``correct_answer`` columns and sent as a
    single user message through ``litellm.completion``. Rows are processed
    sequentially, one completion call per row.

    Args:
        df: DataFrame with ``response_id``, ``question``, ``answer`` and
            ``correct_answer`` columns.
        model: Model identifier passed to ``litellm.completion``.
        country: Country name interpolated into the judge prompt.
        **kwargs: Extra keyword arguments forwarded to ``litellm.completion``.

    Returns:
        DataFrame with columns ``response_id``, ``prompt`` and ``answer``
        (the judge's raw message content), one row per input row.
    """
    collected = {"response_id": [], "prompt": [], "answer": []}

    for _, record in df.iterrows():
        # Read every input before calling the API.
        response_id = record["response_id"]
        question_text = record["question"]
        candidate_answer = record["answer"]
        reference_answer = record["correct_answer"]
        judge_prompt = llm_as_judge_prompt(
            question_text, candidate_answer, reference_answer, country
        )

        completion = litellm.completion(
            model=model,
            messages=[{"role": "user", "content": judge_prompt}],
            **kwargs
        )

        # Keep only the text content of the first choice.
        judge_reply = completion["choices"][0]["message"]["content"]

        collected["response_id"].append(response_id)
        collected["prompt"].append(judge_prompt)
        collected["answer"].append(judge_reply)

    return pd.DataFrame(collected)

# Get LLM judge scores for every merged row (one API call per row, so this
# cell is slow and costs API usage).
llm_scores_df = get_llm_scores(golden_qa_combined_df, "xai/grok-4-0709", country)

INFO:openai._base_client:Retrying request to /chat/completions in 0.476010 seconds


In [16]:
# Preview the raw judge output (prompt and unparsed JSON reply per response).
llm_scores_df.head()

Unnamed: 0,response_id,prompt,answer
0,31,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
1,16,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
2,121,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
3,1,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
4,46,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."


In [17]:
# Save the raw judge output to CSV. to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('results', exist_ok=True)
llm_scores_df.to_csv('results/llm_scores_golden_qa_india.csv', index=False)

In [18]:
# parse the LLM scores into separate columns
import json

def parse_llm_scores(df):
    """Expand the judge's JSON replies into one column per criterion.

    Each value of the ``answer`` column is parsed as JSON. Rows whose answer
    is missing, is not valid JSON, or is not a JSON object get ``None`` for
    every criterion and a message is printed with the row label.

    Args:
        df: DataFrame with an ``answer`` column holding the raw judge replies.

    Returns:
        A new DataFrame with the original columns followed by the parsed
        score columns, with the same rows and index as ``df``.
    """
    score_keys = (
        "factual_correctness",
        "harmfulness",
        "comprehensiveness",
        "relevance",
        "actionability",
        "intelligibility",
    )

    parsed_data = []
    for i, row in df.iterrows():
        try:
            scores = json.loads(row['answer'])
            if not isinstance(scores, dict):
                raise ValueError("judge output is not a JSON object")
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers None/NaN answers.
            scores = dict.fromkeys(score_keys)
            print(f"Error parsing JSON for row {i}.")
        parsed_data.append(scores)

    # Reuse df's index so the column-wise concat lines up row for row even
    # when df does not carry a default 0..n-1 index.
    parsed_df = pd.DataFrame(parsed_data, index=df.index)
    return pd.concat([df, parsed_df], axis=1)

# Replace the raw frame with the parsed one (original columns + one column per criterion).
# NOTE(review): this rebinds llm_scores_df, so re-running the cell parses the
# already-expanded frame again; looks like that would duplicate the score columns.
llm_scores_df = parse_llm_scores(llm_scores_df)
llm_scores_df.head()

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
0,31,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,4,4
1,16,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,5,4
2,121,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,5,3
3,1,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,4,5,5,3
4,46,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,2,5,5,4


In [19]:
# Summary statistics for the parsed judge scores (all columns).
llm_scores_df.describe(include='all')

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
count,135.0,135,135,135,135,135.0,135.0,135.0,135.0
unique,,135,23,2,2,,,,
top,,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,,,,
freq,,1,51,133,134,,,,
mean,68.0,,,,,3.348148,4.874074,4.740741,3.903704
std,39.115214,,,,,0.775738,0.333001,0.456511,0.296096
min,1.0,,,,,1.0,4.0,3.0,3.0
25%,34.5,,,,,3.0,5.0,4.5,4.0
50%,68.0,,,,,4.0,5.0,5.0,4.0
75%,101.5,,,,,4.0,5.0,5.0,4.0


In [26]:
# Re-check the parsed judge scores before merging.
llm_scores_df.head()

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
0,31,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,4,4
1,16,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,5,4
2,121,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,5,3
3,1,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,4,5,5,3
4,46,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,2,5,5,4


In [27]:
# Re-check the frame holding the similarity metrics before merging.
golden_qa_combined_df.head()

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity
0,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,31,# Improving Your Pest Control Approach\n\nWhen...,0.139329,0.060403,0.12766,0.032258,0.085106,0.085106,0.767326
1,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-gpt-4o-2024-08-06,16,I'm sorry to hear you're facing this issue. He...,0.18921,0.075145,0.139535,0.023438,0.085271,0.116279,0.7442
2,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-o3-2025-04-16,121,1. First make sure you have correctly identifi...,0.232884,0.05,0.105634,0.007092,0.06338,0.077465,0.63034
3,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",deepseek-deepseek-reasoner,1,Your pesticides may not work due to resistance...,0.168657,0.088,0.188679,0.0,0.113208,0.150943,0.691757
4,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-o1-2024-12-17,46,"First, identify the specific pest and confirm ...",0.215014,0.08547,0.168831,0.013158,0.090909,0.12987,0.751024


In [28]:
# Join the parsed judge scores onto the metrics frame by 'response_id'.
# validate='one_to_one' makes the merge fail loudly if a response id is
# duplicated on either side, which would otherwise silently multiply rows.
golden_qa_auto_eval = pd.merge(
    golden_qa_combined_df,
    llm_scores_df[['response_id', 'factual_correctness', 'harmfulness', 'comprehensiveness', 'relevance', 'actionability', 'intelligibility']],
    on='response_id',
    how='inner',
    validate='one_to_one',
)
golden_qa_auto_eval.head()

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
0,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,31,# Improving Your Pest Control Approach\n\nWhen...,0.139329,0.060403,0.12766,0.032258,0.085106,0.085106,0.767326,True,False,3,5,4,4
1,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-gpt-4o-2024-08-06,16,I'm sorry to hear you're facing this issue. He...,0.18921,0.075145,0.139535,0.023438,0.085271,0.116279,0.7442,True,False,3,5,5,4
2,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-o3-2025-04-16,121,1. First make sure you have correctly identifi...,0.232884,0.05,0.105634,0.007092,0.06338,0.077465,0.63034,True,False,3,5,5,3
3,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",deepseek-deepseek-reasoner,1,Your pesticides may not work due to resistance...,0.168657,0.088,0.188679,0.0,0.113208,0.150943,0.691757,True,False,4,5,5,3
4,1,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",openai-o1-2024-12-17,46,"First, identify the specific pest and confirm ...",0.215014,0.08547,0.168831,0.013158,0.090909,0.12987,0.751024,True,False,2,5,5,4


In [29]:
# Summary statistics for the final frame (similarity metrics + judge scores).
golden_qa_auto_eval.describe(include='all')

Unnamed: 0,question_id,question,correct_answer,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
count,135.0,135,135,135,135.0,135,135.0,135.0,135.0,135.0,135.0,135.0,135.0,135,135,135.0,135.0,135.0,135.0
unique,,15,15,9,,135,,,,,,,,2,2,,,,
top,,I have been applying different pesticides to c...,"Dear farmer friend, please follow the followin...",anthropic-claude-3-7-sonnet-20250219,,# Improving Your Pest Control Approach\n\nWhen...,,,,,,,,True,False,,,,
freq,,9,9,15,,1,,,,,,,,133,134,,,,
mean,8.0,,,,68.0,,0.225447,0.087776,0.171898,0.034541,0.107157,0.121262,0.647513,,,3.348148,4.874074,4.740741,3.903704
std,4.336585,,,,39.115214,,0.070615,0.036667,0.064965,0.031602,0.043313,0.048231,0.119112,,,0.775738,0.333001,0.456511,0.296096
min,1.0,,,,1.0,,0.014664,0.009709,0.03125,0.0,0.03125,0.03125,0.315316,,,1.0,4.0,3.0,3.0
25%,4.0,,,,34.5,,0.179214,0.061051,0.121717,0.01108,0.072738,0.082364,0.582766,,,3.0,5.0,4.5,4.0
50%,8.0,,,,68.0,,0.227437,0.090909,0.179104,0.025641,0.101695,0.118721,0.671542,,,4.0,5.0,5.0,4.0
75%,12.0,,,,101.5,,0.264144,0.114479,0.216327,0.051957,0.133933,0.154069,0.73907,,,4.0,5.0,5.0,4.0


In [30]:
# Save the results to a CSV file. to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('analysis_results', exist_ok=True)
golden_qa_auto_eval.to_csv('analysis_results/golden_qa_india_auto_eval.csv', index=False)