In [1]:
import os

import pandas as pd

In [2]:
# Read the MCQ results CSV into a DataFrame, then summarise every column
# (include='all' adds the non-numeric columns to the describe() output).
harder_mcq_df = pd.read_csv('20_harder_MCQs-all.csv')
harder_mcq_df.describe(include='all')

Unnamed: 0,model,question_id,question,answer,correct_answer,country
count,162,162.0,162,162,162,162
unique,9,,18,162,4,1
top,openai-o3-2025-04-16,,"How do I, in the shortest duration and cost ef...",Correct answer: A\n\nRationale: A follows the ...,A,Kenya
freq,18,,9,1,54,162
mean,,9.5,,,,
std,,5.204215,,,,
min,,1.0,,,,
25%,,5.0,,,,
50%,,9.5,,,,
75%,,14.0,,,,


In [3]:
# Count how often each value appears in the `correct_answer` column.
harder_mcq_df['correct_answer'].value_counts()

correct_answer
A    54
B    54
C    27
D    27
Name: count, dtype: int64

In [4]:
"""
The format of the answers is:
Correct answer: [OPTION(S)]


Rationale: [Not longer than 10 sentences]
"""
import re

# parse the answers
def parse_answer(answer):
    """Split a raw model response into its selected option(s) and rationale.

    The response is expected to contain a ``Correct answer: ...`` line and a
    ``Rationale: ...`` section.

    Args:
        answer: Raw response text. Non-string values (for example NaN from an
            empty CSV cell) are treated as unparseable.

    Returns:
        A ``(answer_options, rationale)`` tuple. ``answer_options`` is the text
        following ``Correct answer:`` on the same line, with square brackets
        removed and anything from the first ``.`` onward dropped.
        ``rationale`` is the first line of text following ``Rationale:``.
        Either element is ``None`` when its marker is not found.
    """
    # Guard against missing values so one bad row cannot abort a whole .apply().
    if not isinstance(answer, str):
        return None, None

    options_match = re.search(r'Correct answer: (.+)', answer)
    # `\s*` skips the newline after the marker; `.+` then stops at the end of
    # that line, so only the first line of a multi-line rationale is captured.
    rationale_match = re.search(r'Rationale:\s*(.+)', answer)

    answer_options = None
    if options_match:
        answer_options = options_match.group(1).strip()
        # remove brackets
        answer_options = answer_options.replace('[', '').replace(']', '')
        # handle verbose answers
        if '.' in answer_options:
            answer_options = answer_options.split('.')[0].strip()

    rationale = rationale_match.group(1).strip() if rationale_match else None

    return answer_options, rationale

# Run the parser over every raw response; each result is an
# (answer_options, rationale) tuple stored in 'parsed_answer'.
harder_mcq_df['parsed_answer'] = harder_mcq_df['answer'].apply(parse_answer)
# Expand the tuples into a two-column frame that shares the original index,
# then attach those columns under their final names.
parsed_columns = pd.DataFrame(
    harder_mcq_df['parsed_answer'].tolist(),
    index=harder_mcq_df.index,
)
harder_mcq_df[['answer_options', 'rationale']] = parsed_columns
harder_mcq_df.describe(include='all')

Unnamed: 0,model,question_id,question,answer,correct_answer,country,parsed_answer,answer_options,rationale
count,162,162.0,162,162,162,162,162,162,162
unique,9,,18,162,4,1,162,12,162
top,openai-o3-2025-04-16,,"How do I, in the shortest duration and cost ef...",Correct answer: A\n\nRationale: A follows the ...,A,Kenya,"(A, A follows the full “4R” nutrient-managemen...",B,A follows the full “4R” nutrient-management pr...
freq,18,,9,1,54,162,1,50,1
mean,,9.5,,,,,,,
std,,5.204215,,,,,,,
min,,1.0,,,,,,,
25%,,5.0,,,,,,,
50%,,9.5,,,,,,,
75%,,14.0,,,,,,,


In [5]:
# Preview the first rows, including the newly added parsed columns.
harder_mcq_df.head()

Unnamed: 0,model,question_id,question,answer,correct_answer,country,parsed_answer,answer_options,rationale
0,openai-o3-2025-04-16,1,"How do I, in the shortest duration and cost ef...",Correct answer: A\n\nRationale: A follows the ...,A,Kenya,"(A, A follows the full “4R” nutrient-managemen...",A,A follows the full “4R” nutrient-management pr...
1,openai-o3-2025-04-16,2,How do I implement integrated pest management ...,"Correct answer: A, B\n\nRationale: IPM begins ...",A,Kenya,"(A, B, IPM begins with preventive cultural mea...","A, B",IPM begins with preventive cultural measures (...
2,openai-o3-2025-04-16,3,How do I calibrate a knapsack sprayer to ensur...,Correct answer: A\n\nRationale: Effective cali...,A,Kenya,"(A, Effective calibration follows a logical se...",A,Effective calibration follows a logical sequen...
3,openai-o3-2025-04-16,4,What is the right procedure to harvest onions?,Correct answer: A\n\nRationale: Harvesting oni...,A,Kenya,"(A, Harvesting onions is best done in dry weat...",A,Harvesting onions is best done in dry weather;...
4,openai-o3-2025-04-16,5,How should I wear PPE?,Correct answer: C\n\nRationale: PPE should be ...,C,Kenya,"(C, PPE should be donned from the “inside out,...",C,"PPE should be donned from the “inside out,” st..."


In [6]:
# Count how often each parsed option string (single- or multi-letter) appears.
harder_mcq_df['answer_options'].value_counts()

answer_options
B             50
A             42
C             25
D             12
A, B           8
A, C           7
A, B, D        6
B, D           6
B, C, D        2
A, B, C        2
A, B, C, D     1
B, C           1
Name: count, dtype: int64

In [7]:
# Strict exact-match scoring: a response is correct only when the parsed option
# string equals the reference answer, so a multi-option response such as
# "A, B" is scored as incorrect against a single-letter reference.
harder_mcq_df['is_correct'] = harder_mcq_df['correct_answer'] == harder_mcq_df['answer_options']
# calculate accuracy per model
accuracy_results = harder_mcq_df.groupby('model')['is_correct'].mean().reset_index().round(2)

# calculate overall accuracy
overall_accuracy = harder_mcq_df['is_correct'].mean()
print(f'Overall accuracy: {overall_accuracy:.2f}')

# sort the results by accuracy
accuracy_results = accuracy_results.sort_values(by='is_correct', ascending=False)

# write accuracy results to a CSV file
accuracy_results = accuracy_results.rename(columns={'is_correct': 'accuracy'})
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (keeps Restart & Run All working on a fresh checkout).
os.makedirs('analysis_results', exist_ok=True)
accuracy_results.to_csv('analysis_results/harder_mcq_accuracy.csv', index=False)

Overall accuracy: 0.77


In [8]:
# Display the per-model accuracy table, sorted from highest to lowest accuracy.
accuracy_results

Unnamed: 0,model,accuracy
0,anthropic-claude-3-5-sonnet-20241022,0.94
1,anthropic-claude-3-7-sonnet-20250219,0.89
3,deepseek-deepseek-reasoner,0.89
7,openai-o3-2025-04-16,0.89
6,openai-o1-2024-12-17,0.83
8,openai-o4-mini-2025-04-16,0.72
2,deepseek-deepseek-chat,0.67
5,openai-gpt-4o-2024-08-06,0.67
4,gemini-gemini-2-5-pro-preview-05-06,0.39


In [9]:
# Inspect the rows whose parsed answer did not match the reference answer.
harder_mcq_df[~harder_mcq_df['is_correct']]

Unnamed: 0,model,question_id,question,answer,correct_answer,country,parsed_answer,answer_options,rationale,is_correct
1,openai-o3-2025-04-16,2,How do I implement integrated pest management ...,"Correct answer: A, B\n\nRationale: IPM begins ...",A,Kenya,"(A, B, IPM begins with preventive cultural mea...","A, B",IPM begins with preventive cultural measures (...,False
12,openai-o3-2025-04-16,13,Which diseases should I look out for during co...,"Correct answer: A, B\n\nRationale: Cold, damp ...",D,Kenya,"(A, B, Cold, damp weather stresses birds and a...","A, B","Cold, damp weather stresses birds and aids sur...",False
28,deepseek-deepseek-reasoner,11,Which maize varieties are suitable lowlands of...,"Correct answer: A, B, D\n\nRationale: The lowl...",B,Kenya,"(A, B, D, The lowlands of Kenya require drough...","A, B, D",The lowlands of Kenya require drought-tolerant...,False
30,deepseek-deepseek-reasoner,13,Which diseases should I look out for during co...,Correct answer: C\n\nRationale: During cold we...,D,Kenya,"(C, During cold weather, poultry are more susc...",C,"During cold weather, poultry are more suscepti...",False
36,gemini-gemini-2-5-pro-preview-05-06,1,"How do I, in the shortest duration and cost ef...","Correct answer: [A, B]\n\nRationale:\nOption A...",A,Kenya,"(A, B, Option A describes the most technically...","A, B",Option A describes the most technically sound ...,False
37,gemini-gemini-2-5-pro-preview-05-06,2,How do I implement integrated pest management ...,"Correct answer: [A, B]\n\nRationale: Option A ...",A,Kenya,"(A, B, Option A provides the most comprehensiv...","A, B",Option A provides the most comprehensive and a...,False
39,gemini-gemini-2-5-pro-preview-05-06,4,What is the right procedure to harvest onions?,"Correct answer: [A, C]\n\nRationale: Both opti...",A,Kenya,"(A, C, Both options A and C describe key corre...","A, C",Both options A and C describe key correct step...,False
41,gemini-gemini-2-5-pro-preview-05-06,6,How do I use animal waste management on a mixe...,"Correct answer: A, C\n\nRationale: Both compos...",C,Kenya,"(A, C, Both composting (Option C) and direct a...","A, C",Both composting (Option C) and direct applicat...,False
42,gemini-gemini-2-5-pro-preview-05-06,7,How do I determine the seed rate required for ...,"Correct answer: [A, B, D]\n\nRationale:\nOptio...",D,Kenya,"(A, B, D, Options A and B correctly outline es...","A, B, D",Options A and B correctly outline essential in...,False
43,gemini-gemini-2-5-pro-preview-05-06,8,Last year I experienced significant losses in ...,"Correct answer: [B, D]\n\nRationale:\nOption D...",D,Kenya,"(B, D, Option D provides the most comprehensiv...","B, D",Option D provides the most comprehensive and i...,False
