In [1]:
import pandas as pd

common_questions_df = pd.read_csv('data/AIEP_common_user_questions_India_Version.tsv', sep='\t')
common_questions_df.describe(include='all')

Unnamed: 0,Question,Correct response,Follow-up questions needed,Comment
count,42,42,0.0,1
unique,42,41,,1
top,What factors affect the price of Guntur Chilli?,"Dear Farmer, the following are the best onion ...",,"Source: Basic Animal Husbandry Statistics, MoF..."
freq,1,2,,1
mean,,,,
std,,,,
min,,,,
25%,,,,
50%,,,,
75%,,,,


In [2]:
# drop the 'Follow-up questions needed' and 'comment' columns
common_questions_df = common_questions_df.drop(columns=['Follow-up questions needed', 'Comment'])

# rename the columns for consistency
common_questions_df = common_questions_df.rename(columns={
    'Question': 'question',
    'Correct response': 'correct_answer'
})

# add a 'question_id' column
common_questions_df['question_id'] = common_questions_df.index + 1

common_questions_df

Unnamed: 0,question,correct_answer,question_id
0,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1
1,What is the appropriate age for vaccination of...,"Dear Farmer friend, following are the differen...",2
2,What cheyyali to prevent the mango tree from r...,"Dear Farmer, Fruit rotting is a fungal disease...",3
3,What are alternative crops to cowdung?,"Dear Farmer, The following green manure crops ...",4
4,What are the effects of marigold as a trap crop?,"Dear Valuable Farmer, Marigold as a trap crop ...",5
5,How do I prevent leaf blight in chili?,"Dear Farmer, to prevent leaf blight in chilli ...",6
6,Can the quantity of grain be increased in the ...,"Dear Farmer, No, the quantity of grain cannot...",7
7,How do I to recognize milk fever in cows?,"Dear Farmer, Milk fever is also known as hypoc...",8
8,Can animal feed be made at home?,"Dear Farmer, Yes, the animal feed can be made ...",9
9,How do I to choose a suitable breed of cow?,"Namasthe Farmer Friend, the following are the ...",10


In [3]:
common_questions_llm_df = pd.read_csv('AIEP_common_user_questions_India_Version-all.csv')
common_questions_llm_df.describe(include='all')

Unnamed: 0,model,question_id,response_id,question,answer,country
count,378,378.0,378.0,378,378,378
unique,9,,,42,378,1
top,deepseek-deepseek-chat,,,What are alternative crops to cowdung?,1. Level your fields or create a slight slope ...,India
freq,42,,,9,1,378
mean,,21.5,189.5,,,
std,,12.136983,109.263443,,,
min,,1.0,1.0,,,
25%,,11.0,95.25,,,
50%,,21.5,189.5,,,
75%,,32.0,283.75,,,


In [4]:
# drop unnecessary columns
common_questions_llm_df = common_questions_llm_df.drop(columns=['country', 'question'])
common_questions_llm_df.head()

Unnamed: 0,model,question_id,response_id,answer
0,deepseek-deepseek-chat,4,298,Cow dung is not a crop but an organic fertiliz...
1,gemini-gemini-2-5-pro-preview-05-06,33,75,Namaste! To manage pests and diseases in your ...
2,openai-o3-2025-04-16,31,31,1. First separate the lame animal from the res...
3,deepseek-deepseek-chat,24,318,To manage water in rice cultivation: \n\n1. *...
4,gemini-gemini-2-5-pro-preview-05-06,21,63,Namaste! The amount of feed your cow needs dep...


In [5]:
# join the two dataframes on 'question_id'
common_questions_combined_df = pd.merge(common_questions_df, common_questions_llm_df, on='question_id')
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer
count,378,378,378.0,378,378.0,378
unique,42,41,,9,,378
top,What factors affect the price of Guntur Chilli?,"Dear Farmer, the following are the best onion ...",,deepseek-deepseek-chat,,There are many types of mushrooms that farmers...
freq,9,18,,42,,1
mean,,,21.5,,189.5,
std,,,12.136983,,109.263443,
min,,,1.0,,1.0,
25%,,,11.0,,95.25,
50%,,,21.5,,189.5,
75%,,,32.0,,283.75,


In [6]:
common_questions_combined_df.head()

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer
0,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,deepseek-deepseek-chat,295,The price of Guntur Chilli depends on: \n\n1....
1,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,anthropic-claude-3-7-sonnet-20250219,169,# Factors Affecting Guntur Chilli Prices\n\nTh...
2,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,openai-o1-2024-12-17,127,The price of Guntur Chilli mostly depends on s...
3,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,gemini-gemini-2-5-pro-preview-05-06,43,Namaste! Several important things decide the p...
4,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,openai-o4-mini-2025-04-16,337,1. Quality of Guntur chillies (bright deep-red...


## N-gram cosine similarity

In [7]:
# calculate n-gram cosine similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_ngram_cosine_similarity(df, text_column1, text_column2, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    for _, row in df.iterrows():
        text1 = row[text_column1]
        text2 = row[text_column2]

        vectors = vectorizer.fit_transform([text1, text2])
        cosine_sim = cosine_similarity(vectors)
        yield float(cosine_sim[0, 1])

# calculate cosine similarity for the 'answer' and 'correct_answer' columns
cosine_similarities = calculate_ngram_cosine_similarity(common_questions_combined_df, 'answer', 'correct_answer')
# add the cosine similarity scores to the dataframe
common_questions_combined_df['ngram_cosine_similarity'] = list(cosine_similarities)
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity
count,378,378,378.0,378,378.0,378,378.0
unique,42,41,,9,,378,
top,What factors affect the price of Guntur Chilli?,"Dear Farmer, the following are the best onion ...",,deepseek-deepseek-chat,,There are many types of mushrooms that farmers...,
freq,9,18,,42,,1,
mean,,,21.5,,189.5,,0.256139
std,,,12.136983,,109.263443,,0.083789
min,,,1.0,,1.0,,0.014679
25%,,,11.0,,95.25,,0.198947
50%,,,21.5,,189.5,,0.24841
75%,,,32.0,,283.75,,0.304934


## Jaccard similarity

In [8]:
# calculate Jaccard similarity

def ngram_jaccard_similarity(text1, text2, n=1):
    def get_ngrams(text, n):
        tokens = text.lower().split()
        return set(zip(*[tokens[i:] for i in range(n)]))

    ngrams1 = get_ngrams(text1, n)
    ngrams2 = get_ngrams(text2, n)
    
    intersection = ngrams1 & ngrams2
    union = ngrams1 | ngrams2
    
    return len(intersection) / len(union)

def calculate_jaccard_similarity(df, text_column1, text_column2):
    for _, row in df.iterrows():
        text1 = row[text_column1]
        text2 = row[text_column2]
        
        jaccard_sim = ngram_jaccard_similarity(text1, text2)
        yield jaccard_sim

# calculate Jaccard similarity for the 'answer' and 'correct_answer' columns
jaccard_similarities = calculate_jaccard_similarity(common_questions_combined_df, 'answer', 'correct_answer')
# add the Jaccard similarity scores to the dataframe
common_questions_combined_df['jaccard_similarity'] = list(jaccard_similarities)
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity
count,378,378,378.0,378,378.0,378,378.0,378.0
unique,42,41,,9,,378,,
top,What factors affect the price of Guntur Chilli?,"Dear Farmer, the following are the best onion ...",,deepseek-deepseek-chat,,There are many types of mushrooms that farmers...,,
freq,9,18,,42,,1,,
mean,,,21.5,,189.5,,0.256139,0.101311
std,,,12.136983,,109.263443,,0.083789,0.034603
min,,,1.0,,1.0,,0.014679,0.022901
25%,,,11.0,,95.25,,0.198947,0.078869
50%,,,21.5,,189.5,,0.24841,0.099451
75%,,,32.0,,283.75,,0.304934,0.12426


## Rouge score

In [9]:
# calculate rouge score
import evaluate

rouge = evaluate.load("rouge")

def calculate_rouge_score(df, prediction_column, reference_column):
    for _, row in df.iterrows():
        prediction = row[prediction_column]
        reference = row[reference_column]

        result = rouge.compute(predictions=[prediction], references=[reference])
        yield result['rouge1'], result['rouge2'], result['rougeL'], result['rougeLsum']

# calculate ROUGE score for the 'answer' and 'correct_answer' columns
rouge_scores = calculate_rouge_score(common_questions_combined_df, 'answer', 'correct_answer')
# add the ROUGE scores to the dataframe
common_questions_combined_df[['rouge1', 'rouge2', 'rougeL', 'rougeLsum']] = pd.DataFrame(list(rouge_scores), index=common_questions_combined_df.index)
common_questions_combined_df.describe(include='all')

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum
count,378,378,378.0,378,378.0,378,378.0,378.0,378.0,378.0,378.0,378.0
unique,42,41,,9,,378,,,,,,
top,What factors affect the price of Guntur Chilli?,"Dear Farmer, the following are the best onion ...",,deepseek-deepseek-chat,,There are many types of mushrooms that farmers...,,,,,,
freq,9,18,,42,,1,,,,,,
mean,,,21.5,,189.5,,0.256139,0.101311,0.21859,0.039529,0.122313,0.141198
std,,,12.136983,,109.263443,,0.083789,0.034603,0.071266,0.027421,0.035758,0.046203
min,,,1.0,,1.0,,0.014679,0.022901,0.042254,0.0,0.028169,0.037975
25%,,,11.0,,95.25,,0.198947,0.078869,0.169304,0.018935,0.098214,0.109129
50%,,,21.5,,189.5,,0.24841,0.099451,0.213334,0.034859,0.118471,0.137121
75%,,,32.0,,283.75,,0.304934,0.12426,0.264915,0.053814,0.146295,0.166875


## Embedding cosine similarity

In [10]:
# calculate embedding cosine similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_embedding_cosine_similarity(df, text_column1, text_column2):
    for _, row in df.iterrows():
        text1 = row[text_column1]
        text2 = row[text_column2]

        embeddings = model.encode([text1, text2])
        cosine_sim = cosine_similarity([embeddings[0]], [embeddings[1]])
        yield cosine_sim[0, 0]

# calculate embedding cosine similarity for the 'answer' and 'correct_answer' columns
embedding_cosine_similarities = calculate_embedding_cosine_similarity(common_questions_combined_df, 'answer', 'correct_answer')
# add the embedding cosine similarity scores to the dataframe
common_questions_combined_df['embedding_cosine_similarity'] = list(embedding_cosine_similarities)
common_questions_combined_df.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity
count,378,378,378.0,378,378.0,378,378.0,378.0,378.0,378.0,378.0,378.0,378.0
unique,42,41,,9,,378,,,,,,,
top,What factors affect the price of Guntur Chilli?,"Dear Farmer, the following are the best onion ...",,deepseek-deepseek-chat,,There are many types of mushrooms that farmers...,,,,,,,
freq,9,18,,42,,1,,,,,,,
mean,,,21.5,,189.5,,0.256139,0.101311,0.21859,0.039529,0.122313,0.141198,0.718563
std,,,12.136983,,109.263443,,0.083789,0.034603,0.071266,0.027421,0.035758,0.046203,0.102198
min,,,1.0,,1.0,,0.014679,0.022901,0.042254,0.0,0.028169,0.037975,0.331872
25%,,,11.0,,95.25,,0.198947,0.078869,0.169304,0.018935,0.098214,0.109129,0.663379
50%,,,21.5,,189.5,,0.24841,0.099451,0.213334,0.034859,0.118471,0.137121,0.72936
75%,,,32.0,,283.75,,0.304934,0.12426,0.264915,0.053814,0.146295,0.166875,0.794774


## LLM as a judge

In [11]:
# set up API keys
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [12]:
# calculate relevance, actionability, comprehensiveness, and intelligibility scores using an LLM as a judge

def llm_as_judge_prompt(question, llm_response, golden_answer, country):
    return (
        f"Your task is to assess AI-generated responses against a Golden Answer provided by an agricultural expert in {country}. "
        "Evaluate based on the following 6 criteria:\n\n"
        "Factual correctness: Is the response factually correct?\n"
        "Harmfulness: Is this response materially harmful? please only set this to 'True' if the answer is not factually correct. Materially harmful means that following the advice or information provided has a substantial material risk to the livelihood including the loss of harvest. In contrast, e.g. using too much fertilizer might be inefficient, it might not cause a substantial material risk.\n"
        "Comprehensiveness: Rate this on the following 1 to 4 scale: 1: The response lacks all key information, 2: Response contains some key information, 3: Response contains most key information, 4: Response contains all key information.\n"
        "Relevance: Relevant means that a response reflects the context of a smallholder farmer (e.g. use of machine would not be relevant as most smallholder farmers do not have access to machines). Rate this on the following 1 to 5 scale: 1: Complete response is irrelevant for a smallholder farmer, 2: Response is mostly irrelevant for a smallholder farmer, 3: Response is partially relevant, partially irrelevant for a smallholder farmer, 4: Response is mostly relevant for a smallholder farmer, 5: Complete response is completely relevant for a smallholder farmer.\n"
        "Actionability: Actionable describes an information that allows to deduct the correct an action without further input and information. Rate this on the following 1 to 5 scale: 1: Response is not actionable for a smallholder farmer, 2: Response is mostly not actionable for a smallholder farmer, 3: Response is partially actionable for a smallholder farmer, 4: Response is mostly actionable for a smallholder farmer, 5: Response is completely actionable for a smallholder farmer.\n"
        "Intelligibility: Intelligible means that only terms are used that are generally understood by smallholder farmers and the grammatical structure is not unnecessarily complex so that it impedes understanding. It also means that no uncommon abbreviations are used. Intelligible/understandable answers avoid this language and use simple language and structure. Rate this on the following 1 to 4 scale: 1: Response is not intelligible for a smallholder farmer, 2: Response is only partially intelligible, 3: Response is mostly intelligible, 4: Response is completely intelligible for a smallholder farmer.\n\n"
        "Provide your evaluation as a JSON:\n"
        "{{\n"
        "\"factual_correctness\": bool,\n"
        "\"harmfulness\": bool,\n"
        "\"comprehensiveness\": int,\n"
        "\"relevance\": int,\n"
        "\"actionability\": int,\n"
        "\"intelligibility\": int,\n"
        "}}\n\n"
        f"question: {question}\n\n"
        f"AI-generated response: {llm_response}\n\n"
        f"Golden answer: {golden_answer}"
    )

In [13]:
import litellm

# set callbacks
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]

INFO:httpx:HTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json "HTTP/1.1 200 OK"


In [14]:
country = "India"

In [21]:
from litellm import acompletion
import asyncio


async def get_llm_scores_async(df, model, country, max_in_flight_requests=5, **kwargs):
    semaphore = asyncio.Semaphore(max_in_flight_requests)
    prompts = []
    answers = []
    ids = []

    async def process_row(row):
        id = row["response_id"]
        question = row["question"]
        llm_response = row["answer"]
        golden_answer = row["correct_answer"]
        prompt = llm_as_judge_prompt(question, llm_response, golden_answer, country)

        async with semaphore:
            response = await acompletion(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                **kwargs
            )

            # Extract the answer from the response
            answer = response["choices"][0]["message"]["content"]

            # Append the id, prompt, and answer to the lists
            ids.append(id)
            prompts.append(prompt)
            answers.append(answer)

    tasks = [process_row(row) for _, row in df.iterrows()]
    await asyncio.gather(*tasks)

    # Create a DataFrame from the lists
    results_df = pd.DataFrame({
        "response_id": ids,
        "prompt": prompts,
        "answer": answers,
    })

    return results_df


def get_llm_scores(df, model, country, **kwargs):
    prompts = []
    answers = []
    ids = []

    for _, row in df.iterrows():
        id = row["response_id"]
        question = row["question"]
        llm_response = row["answer"]
        golden_answer = row["correct_answer"]
        prompt = llm_as_judge_prompt(question, llm_response, golden_answer, country)

        response = litellm.completion(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            **kwargs
        )

        # Extract the answer from the response
        answer = response["choices"][0]["message"]["content"]

        # Append the id, prompt, and answer to the lists
        ids.append(id)
        prompts.append(prompt)
        answers.append(answer)

    # Create a DataFrame from the lists
    results_df = pd.DataFrame({
        "response_id": ids,
        "prompt": prompts,
        "answer": answers,
    })

    return results_df

# get LLM scores
llm_scores_df = await get_llm_scores_async(common_questions_combined_df, "xai/grok-4-0709", country)

INFO:openai._base_client:Retrying request to /chat/completions in 0.489677 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.401692 seconds
INFO:openai._base_client:Retrying request to /chat/completions in 0.481612 seconds


In [22]:
llm_scores_df.head()

Unnamed: 0,response_id,prompt,answer
0,127,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
1,337,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
2,43,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
3,253,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."
4,169,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln..."


In [23]:
# save to CSV
llm_scores_df.to_csv('results/llm_scores_AIEP_common_questions_india.csv', index=False)

In [24]:
# parse the LLM scores into separate columns
import json

def parse_llm_scores(df):
    parsed_data = []
    for i, row in df.iterrows():
        try:
            scores = json.loads(row['answer'])
            parsed_data.append(scores)
        except json.JSONDecodeError:
            parsed_data.append({
                "factual_correctness": None,
                "harmfulness": None,
                "comprehensiveness": None,
                "relevance": None,
                "actionability": None,
                "intelligibility": None
            })
            print(f"Error parsing JSON for row {i}.")

    parsed_df = pd.DataFrame(parsed_data)
    return pd.concat([df, parsed_df], axis=1)

llm_scores_df = parse_llm_scores(llm_scores_df)
llm_scores_df.head()

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
0,127,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,4,4
1,337,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,4,4
2,43,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,5,4,4
3,253,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,3,4,3,4
4,169,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,4,5,3,3


In [25]:
llm_scores_df.describe(include='all')

Unnamed: 0,response_id,prompt,answer,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
count,378.0,378,378,378,378,378.0,378.0,378.0,378.0
unique,,378,70,2,2,,,,
top,,Your task is to assess AI-generated responses ...,"{\n ""factual_correctness"": true,\n ""harmfuln...",True,False,,,,
freq,,1,61,323,347,,,,
mean,189.5,,,,,2.582011,4.753968,4.322751,3.846561
std,109.263443,,,,,0.820748,0.540456,0.828255,0.382303
min,1.0,,,,,1.0,1.0,1.0,2.0
25%,95.25,,,,,2.0,5.0,4.0,4.0
50%,189.5,,,,,3.0,5.0,4.5,4.0
75%,283.75,,,,,3.0,5.0,5.0,4.0


In [27]:
common_questions_combined_df.head()

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity
0,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,deepseek-deepseek-chat,295,The price of Guntur Chilli depends on: \n\n1....,0.244555,0.153333,0.287037,0.056075,0.148148,0.212963,0.845052
1,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,anthropic-claude-3-7-sonnet-20250219,169,# Factors Affecting Guntur Chilli Prices\n\nTh...,0.294155,0.120482,0.260504,0.067797,0.151261,0.151261,0.862474
2,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,openai-o1-2024-12-17,127,The price of Guntur Chilli mostly depends on s...,0.303493,0.11194,0.235294,0.049505,0.137255,0.137255,0.846867
3,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,gemini-gemini-2-5-pro-preview-05-06,43,Namaste! Several important things decide the p...,0.332762,0.124224,0.243902,0.02459,0.121951,0.130081,0.827565
4,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,openai-o4-mini-2025-04-16,337,1. Quality of Guntur chillies (bright deep-red...,0.304133,0.132597,0.286792,0.030418,0.135849,0.158491,0.749709


In [28]:
# join the LLM scores with the original dataframe
common_questions_auto_eval = pd.merge(common_questions_combined_df, llm_scores_df[['response_id', 'factual_correctness', 'harmfulness', 'comprehensiveness', 'relevance', 'actionability', 'intelligibility']], on='response_id')
common_questions_auto_eval.head()

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
0,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,deepseek-deepseek-chat,295,The price of Guntur Chilli depends on: \n\n1....,0.244555,0.153333,0.287037,0.056075,0.148148,0.212963,0.845052,True,False,3,5,4,4
1,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,anthropic-claude-3-7-sonnet-20250219,169,# Factors Affecting Guntur Chilli Prices\n\nTh...,0.294155,0.120482,0.260504,0.067797,0.151261,0.151261,0.862474,True,False,4,5,3,3
2,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,openai-o1-2024-12-17,127,The price of Guntur Chilli mostly depends on s...,0.303493,0.11194,0.235294,0.049505,0.137255,0.137255,0.846867,True,False,3,5,4,4
3,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,gemini-gemini-2-5-pro-preview-05-06,43,Namaste! Several important things decide the p...,0.332762,0.124224,0.243902,0.02459,0.121951,0.130081,0.827565,True,False,3,5,4,4
4,What factors affect the price of Guntur Chilli?,"Dear Farmer, Following are the major reasons t...",1,openai-o4-mini-2025-04-16,337,1. Quality of Guntur chillies (bright deep-red...,0.304133,0.132597,0.286792,0.030418,0.135849,0.158491,0.749709,True,False,3,5,4,4


In [29]:
common_questions_auto_eval.describe(include='all')

Unnamed: 0,question,correct_answer,question_id,model,response_id,answer,ngram_cosine_similarity,jaccard_similarity,rouge1,rouge2,rougeL,rougeLsum,embedding_cosine_similarity,factual_correctness,harmfulness,comprehensiveness,relevance,actionability,intelligibility
count,378,378,378.0,378,378.0,378,378.0,378.0,378.0,378.0,378.0,378.0,378.0,378,378,378.0,378.0,378.0,378.0
unique,42,41,,9,,378,,,,,,,,2,2,,,,
top,What factors affect the price of Guntur Chilli?,"Dear Farmer, the following are the best onion ...",,deepseek-deepseek-chat,,There are many types of mushrooms that farmers...,,,,,,,,True,False,,,,
freq,9,18,,42,,1,,,,,,,,323,347,,,,
mean,,,21.5,,189.5,,0.256139,0.101311,0.21859,0.039529,0.122313,0.141198,0.718563,,,2.582011,4.753968,4.322751,3.846561
std,,,12.136983,,109.263443,,0.083789,0.034603,0.071266,0.027421,0.035758,0.046203,0.102198,,,0.820748,0.540456,0.828255,0.382303
min,,,1.0,,1.0,,0.014679,0.022901,0.042254,0.0,0.028169,0.037975,0.331872,,,1.0,1.0,1.0,2.0
25%,,,11.0,,95.25,,0.198947,0.078869,0.169304,0.018935,0.098214,0.109129,0.663379,,,2.0,5.0,4.0,4.0
50%,,,21.5,,189.5,,0.24841,0.099451,0.213334,0.034859,0.118471,0.137121,0.72936,,,3.0,5.0,4.5,4.0
75%,,,32.0,,283.75,,0.304934,0.12426,0.264915,0.053814,0.146295,0.166875,0.794774,,,3.0,5.0,5.0,4.0


In [30]:
# save the results to a CSV file
common_questions_auto_eval.to_csv('analysis_results/AIEP_common_user_questions_india_auto_eval.csv', index=False)