In [1]:
import pandas as pd
from pathlib import Path




In [2]:
def get_summarization_accuracy(data_path):
    if not Path(data_path).exists():
        raise ValueError(f"Data file {data_path} does not exist.")
    
    data = pd.read_csv(data_path)
    data['correct'] = data['label'] == 1
    accuracy_per_scope = data.groupby('scope')['correct'].mean().reset_index()
    accuracy_per_scope.rename(columns={'correct': 'accuracy'}, inplace=True)
    overall_accuracy = data['correct'].mean()
    accuracy_summary = pd.concat([accuracy_per_scope, pd.DataFrame({'scope': ['Overall'], 'accuracy': [overall_accuracy]})], ignore_index=True)
    return accuracy_summary

def get_files_by_m3(data_dir="../xmode/evaluation/checked_manually/", lang_list=["de", "en", "zh"], model_list=["openai", "m3ae"], modality_list=["vqa", "qa"]):
    # Get all files in the directory
    model_dict = {"openai": "gpt", "m3ae": "m3ae"}
    file_list = [f for f in Path(data_dir).rglob("*.csv") if f.is_file()]
    res = []
    for f in file_list:
        row = {}
        file_name = f.stem
        for lang in lang_list:
            if f"-{lang}" in file_name:
                row.update({"lang": lang})
                break
        for model in model_list:
            if model in file_name:
                row.update({"model": model_dict[model]})
                break
        for modality in modality_list:
            if modality in file_name:
                row.update({"modality": modality})
                break
        if row.get("lang", None) and row.get("model", None) and row.get("modality",None):
            results = get_summarization_accuracy(f).to_dict(orient="records")
            row.update({"results": results})
        res.append(row)
    return res
    
data = get_files_by_m3()    

In [3]:
data

[{'lang': 'zh',
  'model': 'gpt',
  'modality': 'vqa',
  'results': [{'scope': 'IMAGE-SINGLE-1', 'accuracy': 0.7},
   {'scope': 'IMAGE-SINGLE-2', 'accuracy': 0.3},
   {'scope': 'MULTIMODAL-SINGLE', 'accuracy': 0.4},
   {'scope': 'Overall', 'accuracy': 0.4666666666666667}]},
 {'lang': 'zh',
  'model': 'm3ae',
  'modality': 'vqa',
  'results': [{'scope': 'IMAGE-SINGLE-1', 'accuracy': 0.3},
   {'scope': 'IMAGE-SINGLE-2', 'accuracy': 0.3},
   {'scope': 'MULTIMODAL-SINGLE', 'accuracy': 0.6},
   {'scope': 'Overall', 'accuracy': 0.4}]},
 {'lang': 'zh',
  'model': 'gpt',
  'modality': 'qa',
  'results': [{'scope': 'IMAGE-SINGLE-1', 'accuracy': 0.5},
   {'scope': 'IMAGE-SINGLE-2', 'accuracy': 0.4},
   {'scope': 'MULTIMODAL-SINGLE', 'accuracy': 0.4},
   {'scope': 'Overall', 'accuracy': 0.43333333333333335}]},
 {'lang': 'zh',
  'model': 'm3ae',
  'modality': 'vqa',
  'results': [{'scope': 'IMAGE-SINGLE-1', 'accuracy': 0.5},
   {'scope': 'IMAGE-SINGLE-2', 'accuracy': 0.5},
   {'scope': 'MULTIMODAL

In [4]:
# Correct the dataframe construction by ensuring all columns are properly named and present

# Initialize a list of possible columns to avoid KeyErrors
columns = [
    'Question Scope', 'EN VQA_GPT', 'EN VQA_M3AE', 'EN QA_GPT',
    'DE VQA_GPT', 'DE VQA_M3AE', 'DE QA_GPT',
    'ZH VQA_GPT', 'ZH VQA_M3AE', 'ZH QA_GPT'
]

# Initialize the dictionary to hold the data for the dataframe with default values
df_data = {col: [None] * 4 for col in columns}
df_data['Question Scope'] = ['IMAGE-SINGLE-1', 'IMAGE-SINGLE-2', 'MULTIMODAL-SINGLE', 'Overall']

# Populate the dataframe with the corrected column names
for entry in data:
    lang = entry['lang']
    model = entry['model']
    modality = entry['modality']
    
    # Determine the column in the dataframe
    if modality == 'vqa':
        column_name = f"{lang.upper()} VQA_{model.upper()}"
    else:  # modality == 'v'
        column_name = f"{lang.upper()} QA_{model.upper()}"
    
    # Handle if column is not in the initial columns list (e.g., QA for M3AE is not mentioned)
    if column_name not in df_data:
        df_data[column_name] = [None] * 4
    
    # Fill in the accuracies
    for result in entry['results']:
        scope = result['scope']
        accuracy = round(result['accuracy'] * 100, 2) # Convert to percentage
        scope_index = df_data['Question Scope'].index(scope)
        df_data[column_name][scope_index] = accuracy

# Create the dataframe
df_corrected = pd.DataFrame(df_data)


In [5]:
df_corrected

Unnamed: 0,Question Scope,EN VQA_GPT,EN VQA_M3AE,EN QA_GPT,DE VQA_GPT,DE VQA_M3AE,DE QA_GPT,ZH VQA_GPT,ZH VQA_M3AE,ZH QA_GPT
0,IMAGE-SINGLE-1,20.0,20.0,20.0,40.0,20.0,30.0,70.0,50.0,50.0
1,IMAGE-SINGLE-2,0.0,20.0,10.0,20.0,20.0,10.0,30.0,50.0,40.0
2,MULTIMODAL-SINGLE,90.0,60.0,60.0,80.0,80.0,40.0,40.0,40.0,40.0
3,Overall,36.67,33.33,30.0,46.67,40.0,26.67,46.67,46.67,43.33


In [6]:
df_corrected.to_csv("../xmode/evaluation/xmode_summary.csv", index=False)