In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# load the contents of MCQs-all.csv into a DataFrame
mcq_df = pd.read_csv('MCQs-all.csv')
# summary statistics for every column; include='all' also covers the non-numeric ones
mcq_df.describe(include='all')

Unnamed: 0,model,question_id,question,answer,correct_answer,correct_rationale,country
count,135,135.0,135,135,135,135,135
unique,9,,15,135,8,15,1
top,openai-o3-2025-04-16,,Which irrigation method is most efficient for ...,Correct answer: A\n\nRationale: Drip irrigatio...,B,"In dry regions, water is a limited resource an...",Kenya
freq,15,,9,1,36,9,135
mean,,8.0,,,,,
std,,4.336585,,,,,
min,,1.0,,,,,
25%,,4.0,,,,,
50%,,8.0,,,,,
75%,,12.0,,,,,


In [3]:
# number of rows per distinct value of the 'correct_answer' column
mcq_df['correct_answer'].value_counts()

correct_answer
B          36
C          27
A          18
D          18
A, C, E     9
B, C, D     9
B, D        9
B, E        9
Name: count, dtype: int64

In [4]:
"""
The format of the answers is:
Correct answer: [OPTION(S)]


Rationale: [Not longer than 10 sentences]
"""
import re

# parse the answers
def parse_answer(answer):
    """Split a raw model response into its selected option(s) and its rationale.

    The response is expected to look like::

        Correct answer: [OPTION(S)]

        Rationale: [free text]

    Parameters
    ----------
    answer : str
        Raw response text. Anything that is not a string (for example the
        NaN pandas produces for an empty CSV cell) is treated as unparseable.

    Returns
    -------
    tuple
        ``(answer_options, rationale)``. ``answer_options`` is the text that
        follows ``Correct answer:`` on the same line, with square brackets
        removed, anything after the first ``.`` dropped, and ``' and '``
        replaced by ``', '``. ``rationale`` is everything after
        ``Rationale:`` up to the end of the response. Either element is
        ``None`` when its marker is not found.
    """
    # re.search raises TypeError on non-string input, so bail out early
    if not isinstance(answer, str):
        return None, None

    options_match = re.search(r'Correct answer: (.+)', answer)
    # DOTALL lets '.' cross newlines so multi-line rationales (e.g. bullet
    # lists) are captured in full instead of being cut at the first line break
    rationale_match = re.search(r'Rationale:\s*(.+)', answer, flags=re.DOTALL)

    answer_options = None
    if options_match:
        answer_options = options_match.group(1).strip()
        # remove brackets
        answer_options = answer_options.replace('[', '').replace(']', '')
        # handle verbose answers: keep only the text before the first period
        answer_options = answer_options.split('.')[0].strip()
        # replace 'and' with comma for consistency
        answer_options = answer_options.replace(' and ', ', ')

    rationale = rationale_match.group(1).strip() if rationale_match else None

    return answer_options, rationale

# run the parser over every raw response; each result is an (options, rationale) tuple
parsed_pairs = mcq_df['answer'].apply(parse_answer)
mcq_df['parsed_answer'] = parsed_pairs

# expand the tuples into a two-column frame aligned on the original index,
# then store those columns as 'answer_options' and 'rationale'
parsed_frame = pd.DataFrame(parsed_pairs.tolist(), index=mcq_df.index)
mcq_df[['answer_options', 'rationale']] = parsed_frame

mcq_df.describe(include='all')

Unnamed: 0,model,question_id,question,answer,correct_answer,correct_rationale,country,parsed_answer,answer_options,rationale
count,135,135.0,135,135,135,135,135,135,135,135
unique,9,,15,135,8,15,1,135,17,135
top,openai-o3-2025-04-16,,Which irrigation method is most efficient for ...,Correct answer: A\n\nRationale: Drip irrigatio...,B,"In dry regions, water is a limited resource an...",Kenya,"(A, Drip irrigation delivers small, precise am...",B,"Drip irrigation delivers small, precise amount..."
freq,15,,9,1,36,9,135,1,29,1
mean,,8.0,,,,,,,,
std,,4.336585,,,,,,,,
min,,1.0,,,,,,,,
25%,,4.0,,,,,,,,
50%,,8.0,,,,,,,,
75%,,12.0,,,,,,,,


In [5]:
# preview the first rows, including the newly added parsed columns
mcq_df.head()

Unnamed: 0,model,question_id,question,answer,correct_answer,correct_rationale,country,parsed_answer,answer_options,rationale
0,openai-o3-2025-04-16,1,Which irrigation method is most efficient for ...,Correct answer: A\n\nRationale: Drip irrigatio...,A,"In dry regions, water is a limited resource an...",Kenya,"(A, Drip irrigation delivers small, precise am...",A,"Drip irrigation delivers small, precise amount..."
1,openai-o3-2025-04-16,2,How do I apply fertilizer in my farm to maximi...,Correct answer: B\n\nRationale: Effective fert...,B,Fertilizer application is a technique that req...,Kenya,"(B, Effective fertilizer use follows the “4Rs”...",B,Effective fertilizer use follows the “4Rs”: th...
2,openai-o3-2025-04-16,3,What are the best practices in the intercroppi...,Correct answer: B\n\nRationale: Successful cer...,B,Intercropping enhances productivity when compa...,Kenya,"(B, Successful cereal–legume intercrops depend...",B,Successful cereal–legume intercrops depend on ...
3,openai-o3-2025-04-16,4,Which is the best way to apply herbicide in my...,Correct answer: C\n\nRationale: Effective weed...,C,Weed suppression before and after emergence of...,Kenya,"(C, Effective weed control in maize relies on ...",C,Effective weed control in maize relies on usin...
4,openai-o3-2025-04-16,5,Which is the best practice for storing grain t...,Correct answer: A\n\nRationale: Grain should f...,A,Post harvest losses in grain occur due to pest...,Kenya,"(A, Grain should first be properly dried, then...",A,"Grain should first be properly dried, then kep..."


In [6]:
# number of rows per distinct parsed option string in 'answer_options'
mcq_df['answer_options'].value_counts()

answer_options
B             29
A             18
C             16
B, C, D       10
C, D           9
B, D           9
D              9
A, C, E        8
D, E           8
C, E           5
B, C           4
A, B, C, E     3
B, E           3
A, B, E        1
A, D, E        1
B, C, D, E     1
E              1
Name: count, dtype: int64

In [7]:
def _option_set(options):
    """Return the comma-separated labels in ``options`` as a frozenset (empty for non-strings)."""
    if not isinstance(options, str):
        return frozenset()
    return frozenset(label.strip() for label in options.split(',') if label.strip())


def _same_options(expected, given):
    """True when both strings name the same non-empty set of options."""
    expected_set = _option_set(expected)
    # an empty/missing answer key can never count as a correct response
    return bool(expected_set) and expected_set == _option_set(given)


# compare the selected options as sets so that ordering and spacing
# ("B, E" vs "E,B") do not turn a right answer into a wrong one
mcq_df['is_correct'] = [
    _same_options(expected, given)
    for expected, given in zip(mcq_df['correct_answer'], mcq_df['answer_options'])
]
# calculate accuracy per model
accuracy_results = mcq_df.groupby('model')['is_correct'].mean().reset_index().round(2)

# calculate overall accuracy
overall_accuracy = mcq_df['is_correct'].mean()
print(f'Overall accuracy: {overall_accuracy:.2f}')

# sort the results by accuracy
accuracy_results = accuracy_results.sort_values(by='is_correct', ascending=False)

# write accuracy results to a CSV file
accuracy_results = accuracy_results.rename(columns={'is_correct': 'accuracy'})
output_path = Path('analysis_results/mcq_accuracy.csv')
# to_csv does not create missing directories, so make sure the target exists
output_path.parent.mkdir(parents=True, exist_ok=True)
accuracy_results.to_csv(output_path, index=False)

Overall accuracy: 0.66


In [8]:
# display the per-model accuracy table, sorted from highest to lowest accuracy
accuracy_results

Unnamed: 0,model,accuracy
0,anthropic-claude-3-5-sonnet-20241022,0.8
1,anthropic-claude-3-7-sonnet-20250219,0.73
3,deepseek-deepseek-reasoner,0.73
7,openai-o3-2025-04-16,0.67
6,openai-o1-2024-12-17,0.67
8,openai-o4-mini-2025-04-16,0.67
2,deepseek-deepseek-chat,0.6
4,gemini-gemini-2-5-pro-preview-05-06,0.53
5,openai-gpt-4o-2024-08-06,0.53


In [9]:
# inspect the rows whose parsed answer was not marked correct
wrong_mask = ~mcq_df['is_correct']
mcq_df[wrong_mask]

Unnamed: 0,model,question_id,question,answer,correct_answer,correct_rationale,country,parsed_answer,answer_options,rationale,is_correct
5,openai-o3-2025-04-16,6,Which livestock breed is best for dairy produc...,"Correct answer: C, E\n\nRationale: Friesians (...","A, C, E",Best livestock breeds are bred for high milk y...,Kenya,"(C, E, Friesians (Holsteins) have the highest ...","C, E",Friesians (Holsteins) have the highest average...,False
6,openai-o3-2025-04-16,7,Which livestock breed is best for beef product...,"Correct answer: C, D\n\nRationale: Jersey and ...","B, C, D",Best livestock breeds for beef are bred for me...,Kenya,"(C, D, Jersey and Guernsey are specialized dai...","C, D",Jersey and Guernsey are specialized dairy bree...,False
9,openai-o3-2025-04-16,10,What are the ways of controlling fall armyworm...,"Correct answer: B, C, D\n\nRationale: Fall arm...",C,Fall armyworm (FAW) is a destructive pest that...,Kenya,"(B, C, D, Fall armyworm (FAW) is an insect pes...","B, C, D","Fall armyworm (FAW) is an insect pest, so fung...",False
10,openai-o3-2025-04-16,11,Which fertilizer application method is more ef...,"Correct answer: D, E\n\nRationale: Broadcastin...",D,The most effective fertilizer application meth...,Kenya,"(D, E, Broadcasting leaves most fertilizer on ...","D, E",Broadcasting leaves most fertilizer on or near...,False
12,openai-o3-2025-04-16,13,Which disease should I look at during cold wea...,"Correct answer: A, B, E\n\nRationale: Cold wea...","B, E","During cold weather, poultry is more vulnerabl...",Kenya,"(A, B, E, Cold weather stresses birds and lead...","A, B, E",Cold weather stresses birds and leads to crowd...,False
16,deepseek-deepseek-chat,2,How do I apply fertilizer in my farm to maximi...,"Correct answer: B, E \n\nRationale: \n- **Op...",B,Fertilizer application is a technique that req...,Kenya,"(B, E, - **Option B** is correct because spot ...","B, E",- **Option B** is correct because spot applica...,False
24,deepseek-deepseek-chat,10,What are the ways of controlling fall armyworm...,"Correct answer: B, C, D\n\nRationale: \n- **B...",C,Fall armyworm (FAW) is a destructive pest that...,Kenya,"(B, C, D, - **B (Remove attacked crops)**: Thi...","B, C, D",- **B (Remove attacked crops)**: This helps re...,False
25,deepseek-deepseek-chat,11,Which fertilizer application method is more ef...,"Correct answer: A, D, E\n\nRationale: For rain...",D,The most effective fertilizer application meth...,Kenya,"(A, D, E, For rainfed crops, broadcasting (A) ...","A, D, E","For rainfed crops, broadcasting (A) is effecti...",False
26,deepseek-deepseek-chat,12,Which fertilizer is suitable during top-dressi...,"Correct answer: C, D \n\nRationale: During to...",C,"During top dressing, maize requires fertilizer...",Kenya,"(C, D, During top-dressing of maize, nitrogen-...","C, D","During top-dressing of maize, nitrogen-rich fe...",False
27,deepseek-deepseek-chat,13,Which disease should I look at during cold wea...,"Correct answer: A, C, E\n\nRationale: During c...","B, E","During cold weather, poultry is more vulnerabl...",Kenya,"(A, C, E, During cold weather, poultry are mor...","A, C, E","During cold weather, poultry are more suscepti...",False
