In [None]:
# ORIGINAL DATASETS FROM THESE SOURCES:
# dermavqa-iiyi subset: https://osf.io/72rp3/overview: Files/data/iiyi/{'valid_ht','test_ht_spanishtestsetcorrected'}.json
# (v2 and v3 contain translation corrections, but they were released after this ratings work was done - these later versions should be used in future work.)
# Please also include iiyi_{valid,test}_extension.json - supplied in this challenge. These are extra gold standards we collected.
#
# woundcarevqa subset: https://osf.io/xsj5u/overview: Files/dataset-challenge-mediqa-2025-wv/{valid,test}.json

In [1]:
# List the top-level folders of the downloaded gold data.
!ls original_datasets

[34miiyi[m[m      [34mwoundcare[m[m


In [2]:
# List the iiyi gold files; the *_extension.json files sit next to the original ones.
!ls original_datasets/iiyi

iiyi_test_extension.json             test_ht_spanishtestsetcorrected.json
iiyi_valid_extension.json            valid_ht.json


In [3]:
# List the woundcare gold files.
!ls original_datasets/woundcare

test.json  valid.json


In [4]:
import json
# Validation gold file for each dataset.
gold_file_by_dataset = {
    'iiyi': 'original_datasets/iiyi/valid_ht.json',
    'woundcare': 'original_datasets/woundcare/valid.json',
}

# Gold items indexed first by dataset name, then by encounter_id.
data_gold_by_ds = {}
for ds_name, gold_path in gold_file_by_dataset.items() :
    with open( gold_path ) as f :
        records = json.load(f)
    # If an encounter_id repeats within a file, the later record replaces the earlier one.
    data_gold_by_ds[ds_name] = { record['encounter_id']: record for record in records }

In [5]:
#add iiyi extension datasets
# Merge the last two responses of every extension item into the matching iiyi gold item.
# The membership check keeps this cell idempotent: running it again does not append the
# same extension responses a second time (which would silently change the reference set).
with open( 'original_datasets/iiyi/iiyi_valid_extension.json' ) as f :
    data = json.load(f)

for item in data :
    encounter_id = item['encounter_id']
    responses = item['responses']
    merged_responses = data_gold_by_ds['iiyi'][encounter_id]['responses']
    # Index explicitly (rather than slicing) so an item with fewer than two responses
    # still raises instead of being merged partially.
    for extra_response in ( responses[-2], responses[-1] ) :
        if extra_response not in merged_responses :
            merged_responses.append( extra_response )

In [6]:
# Inspect the merged responses of one iiyi item. `encounter_id` is whatever value the
# extension loop in the previous cell left behind (its last item), so that cell must
# have been run first.
data_gold_by_ds['iiyi'][encounter_id]['responses']

[{'author_id': 'U00780',
  'content_zh': '可能是酒红斑',
  'content_en': 'Could be alcoholic erythema.',
  'content_es': 'Podría ser un eritema alcohólico.',
  'completeness': 0.5,
  'contains_freq_ans': 1.0},
 {'author_id': 'U00758',
  'content_zh': '酒精过敏。',
  'content_en': 'Could be allergy to alcohol.',
  'content_es': 'Podría ser alergia al alcohol.',
  'completeness': 0.5,
  'contains_freq_ans': 0.0},
 {'author_id': 'U00121',
  'content_zh': '酒精过敏？胆碱能荨麻疹？',
  'content_en': 'Allergy to alcohol?  Cholinergic urticaria?',
  'content_es': '¿Alergia al alcohol?  ¿Urticaria colinérgica?',
  'completeness': 0.5,
  'contains_freq_ans': 0.0},
 {'author_id': 'U00000',
  'content_zh': '酒精过敏。同意。',
  'content_en': 'Allergy to alcohol.  Concur.',
  'content_es': 'Alergia al alcohol, estoy de acuerdo.',
  'completeness': 0.5,
  'contains_freq_ans': 0.0},
 {'author_id': 'U00120',
  'content_zh': '酒性红斑',
  'content_en': 'alcoholic erythema',
  'content_es': 'Eritema alcohólico',
  'completeness': 0.5,
 

In [None]:
# read in input file - make sure to deduplicate so you don't get duplicates due to multiple raters and axes

In [7]:
import pandas as pd

In [8]:
# Load the validation ratings file (path is relative to the working directory).
df_valid = pd.read_csv( 'mediqa-eval-2026-valid.csv')

In [9]:
# Total number of rows in the ratings file.
len( df_valid )

4872

In [10]:
# Columns available in the ratings file.
df_valid.columns

Index(['dataset', 'encounter_id', 'lang', 'candidate', 'candidate_author_id',
       'metric', 'rater_id', 'value'],
      dtype='object')

In [11]:
# Number of rows contributed by each rater.
df_valid['rater_id'].value_counts()

rater_id
A1    2856
NM    1008
SG    1008
Name: count, dtype: int64

In [12]:
# Columns that identify one candidate answer; metric, rater and value are left out so
# that a candidate rated on several axes / by several raters is kept only once.
candidate_key_columns = ['dataset', 'encounter_id', 'lang', 'candidate', 'candidate_author_id']
df_valid_candidates = df_valid[ candidate_key_columns ]
df_valid_deduplicated = df_valid_candidates.drop_duplicates()

In [13]:
# Number of distinct candidate rows left after deduplication.
len( df_valid_deduplicated )

966

In [14]:
# Distinct candidates per dataset / language / system.
df_valid_deduplicated[['dataset','lang','candidate_author_id']].value_counts()

dataset    lang  candidate_author_id
woundcare  en    SYSTEM001              105
                 SYSTEM002              105
                 SYSTEM003              105
           zh    SYSTEM001              105
                 SYSTEM002              105
                 SYSTEM003              105
iiyi       en    SYSTEM001               56
                 SYSTEM002               56
                 SYSTEM003               56
           zh    SYSTEM001               56
                 SYSTEM002               56
                 SYSTEM003               56
Name: count, dtype: int64

In [None]:
# Assume we want to test how well BLEU (the example metric defined below), used as a general-purpose metric, correlates with the different eval axes.
# To optimize scores, you can further create a separate automatic metric for each eval axis.

In [None]:
# Go through the encounter id's and create a csv file with the following columns:
# dataset - dataset name {iiyi, woundcare}
# encounter_id - encounter id
# candidate - system output candidate
# candidate_author_id - candidate author_id
# lang - language {'en','zh'}
# rater_id - you can put your eval metric system id here
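# metric - the metric you are evaluating: for English ('en') include {disagree_flag,completeness,factual-accuracy,relevance,writing-style,overall}; for Chinese ('zh') include {factual-consistency-wgold,writing-style,overall} (note: the example code below emits only factual-consistency-wgold and writing-style for Chinese)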
# value - your metric value

In [16]:
# Run configuration: which split, datasets and languages the example covers.
split = 'valid'
datasets = ['iiyi', 'woundcare']
langs = ['en', 'zh']

# Author ids of the candidate systems; both names refer to the same list object.
systems = [ 'SYSTEM{:03d}'.format(system_number) for system_number in (1, 2, 3) ]
candidate_author_ids = systems

In [17]:
# define an evaluation metric here. for simplicity we'll say this bleu implementation is my new evaluation metric
import sacrebleu_deltableu
def get_new_automatic_metric( candidate, references, lang ) :
    """
    Score one candidate text against its gold references with sacrebleu_deltableu.

    candidate: text to be evaluated
    references: a list of gold responses for the same question; every reference is
                passed with the same weight (1)
    lang: language; 'en' selects the '13a' tokenizer, any other value selects 'zh'

    Returns the library score divided by 100 (presumably mapping a 0-100 BLEU scale
    onto 0-1 - confirm against sacrebleu_deltableu). Returns 0.0 without calling the
    library when `references` is empty.
    """
    # Nothing to compare against: report the lowest score instead of handing the
    # scorer an empty reference list together with an empty weight list.
    if len( references ) == 0 :
        return 0.0

    deltableu = sacrebleu_deltableu.corpus_bleu_t( [candidate],
                                            [references],
                                            ref_weights= [[1]*len(references)],
                                            tokenize='13a' if lang=='en' else 'zh',
                                            lowercase=True,
                                            use_effective_order=True )
    return deltableu.score/100.0

In [18]:
# One result record per (dataset, lang, encounter, candidate); reset on every run of
# this cell so re-running it does not accumulate duplicates.
my_new_eval_metric_results = []

for dataset in datasets :
    in_dataset = df_valid_deduplicated['dataset'] == dataset

    for lang in langs :
        in_lang = df_valid_deduplicated['lang'] == lang
        content_key = 'content_{}'.format(lang)

        # Walk the gold items; encounters without any candidate row simply yield nothing.
        for encounter_id, data_item in data_gold_by_ds[ dataset ].items() :
            gold_responses = [ response[content_key] for response in data_item['responses'] ]

            for_encounter = df_valid_deduplicated['encounter_id'] == encounter_id
            df_subset = df_valid_deduplicated[ in_dataset & in_lang & for_encounter ]

            for _, row in df_subset.iterrows() :
                # A missing candidate is scored as an empty string.
                candidate = row['candidate']
                if pd.isnull( candidate ) :
                    candidate = ''
                candidate_author_id = row['candidate_author_id']

                score = get_new_automatic_metric( candidate, gold_responses, lang )

                my_new_eval_metric_results.append(
                    dict(
                        dataset=dataset,
                        lang=lang,
                        encounter_id=encounter_id,
                        candidate=candidate,
                        candidate_author_id=candidate_author_id,
                        rater_id='bleu',
                        metric='overall',
                        value=score,
                    )
                )

In [19]:
# Build the results frame in one go from the list of per-candidate records.
df_myresponses = pd.DataFrame( my_new_eval_metric_results )

In [20]:
# Preview the first few scored rows.
df_myresponses.head()

Unnamed: 0,dataset,lang,encounter_id,candidate,candidate_author_id,rater_id,metric,value
0,iiyi,en,ENC00852,Unclear diagnosis without biopsy. Could be se...,SYSTEM001,bleu,overall,0.03018
1,iiyi,en,ENC00852,This appears more consistent with eczema. A bi...,SYSTEM002,bleu,overall,0.032147
2,iiyi,en,ENC00852,"Yes, it is possible that this could be Vitilig...",SYSTEM003,bleu,overall,0.050914
3,iiyi,en,ENC00853,Likely chronic eczema or psoriasis. See a der...,SYSTEM001,bleu,overall,0.116988
4,iiyi,en,ENC00853,This appears to be chronic eczema. Consult der...,SYSTEM002,bleu,overall,0.176093


In [21]:
# Scored candidates per dataset / language / system.
df_myresponses[['dataset','lang','candidate_author_id']].value_counts()

dataset    lang  candidate_author_id
woundcare  en    SYSTEM001              105
                 SYSTEM002              105
                 SYSTEM003              105
           zh    SYSTEM001              105
                 SYSTEM002              105
                 SYSTEM003              105
iiyi       en    SYSTEM001               56
                 SYSTEM002               56
                 SYSTEM003               56
           zh    SYSTEM001               56
                 SYSTEM002               56
                 SYSTEM003               56
Name: count, dtype: int64

In [None]:
# In English the expectation is to have 6 metrics per candidate; in Chinese, 2 (factual-consistency-wgold and writing-style).
# For simplicity, we'll just assign the value of our new metric to every one of them. You may want to come up with more complex strategies in your submissions.

In [22]:
# Accumulates one DataFrame per (language, metric) pair; the cells below append to it.
# Re-run this cell before re-running them, otherwise the frames are appended twice.
dfs_results = []

In [23]:
# Split the scored rows by language; each language gets its own set of metrics below.
is_english_row = df_myresponses['lang'] == 'en'
is_chinese_row = df_myresponses['lang'] == 'zh'
df_myresponses_en = df_myresponses.loc[ is_english_row ]
df_myresponses_zh = df_myresponses.loc[ is_chinese_row ]

In [24]:
metrics_en = ['disagree_flag','completeness','factual-accuracy','relevance','writing-style']
# One relabelled copy of the English scores per additional axis (same value, new metric name) ...
dfs_results.extend( df_myresponses_en.assign( metric=metric_name ) for metric_name in metrics_en )
# ... followed by the unmodified frame, whose rows already carry metric == 'overall'.
dfs_results.append( df_myresponses_en )

In [25]:
metrics_zh = ['factual-consistency-wgold','writing-style']
# One relabelled copy of the Chinese scores per axis; unlike English, the unmodified
# ('overall') frame is not added here.
dfs_results += [ df_myresponses_zh.assign( metric=metric_name ) for metric_name in metrics_zh ]

In [26]:
# Stack all per-metric frames into one. Every frame keeps the row labels it had in
# df_myresponses, so without ignore_index the same label would occur once per metric;
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index.
df_results = pd.concat( dfs_results, ignore_index=True )

In [27]:
# Total number of rows across all languages and metrics.
len( df_results )

3864

In [28]:
# Preview the first few rows of the combined results.
df_results.head()

Unnamed: 0,dataset,lang,encounter_id,candidate,candidate_author_id,rater_id,metric,value
0,iiyi,en,ENC00852,Unclear diagnosis without biopsy. Could be se...,SYSTEM001,bleu,disagree_flag,0.03018
1,iiyi,en,ENC00852,This appears more consistent with eczema. A bi...,SYSTEM002,bleu,disagree_flag,0.032147
2,iiyi,en,ENC00852,"Yes, it is possible that this could be Vitilig...",SYSTEM003,bleu,disagree_flag,0.050914
3,iiyi,en,ENC00853,Likely chronic eczema or psoriasis. See a der...,SYSTEM001,bleu,disagree_flag,0.116988
4,iiyi,en,ENC00853,This appears to be chronic eczema. Consult der...,SYSTEM002,bleu,disagree_flag,0.176093


In [29]:
# Rows per dataset / language / system, summed over all metrics.
df_results[['dataset','lang','candidate_author_id']].value_counts()

dataset    lang  candidate_author_id
woundcare  en    SYSTEM001              630
                 SYSTEM002              630
                 SYSTEM003              630
iiyi       en    SYSTEM001              336
                 SYSTEM002              336
                 SYSTEM003              336
woundcare  zh    SYSTEM001              210
                 SYSTEM002              210
                 SYSTEM003              210
iiyi       zh    SYSTEM001              112
                 SYSTEM002              112
                 SYSTEM003              112
Name: count, dtype: int64

In [30]:
# Rows per dataset / language, summed over all systems and metrics.
df_results[['dataset','lang']].value_counts()

dataset    lang
woundcare  en      1890
iiyi       en      1008
woundcare  zh       630
iiyi       zh       336
Name: count, dtype: int64

In [31]:
# Write only the data columns. index=False keeps the DataFrame's row index out of the
# file, so the output has the same kind of layout as the input ratings CSV (named
# columns only, no unnamed index column).
df_results.to_csv('mediqa-eval-2026-valid-system.csv', index=False)