In [40]:
import os
import ast
import ineqpy
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

from pathlib import Path
from collections import defaultdict

In [41]:
# Location of the metadata folder, relative to the notebook's working directory.
path_meta = Path("../meta")
# Shared font-size constant (presumably for the plotly figures; confirm at the call sites).
font_size = 15

# Workflow pattern identifiers are stored as strings, not integers.
workflow_pwc = ['1']
workflow_patterns_with_submission = ['1', '2', '3', '4', '5']

# Display-format groupings and publication types referenced by name further down.
display_format_pwc = ['Rankable Table', 'Scatter Plot']
display_format_rankable = ['Rankable Table', 'Rankable Bar Chart']
non_accepted_publication_type = ['blog', 'report', 'white paper']

# Leaderboard systems grouped by the artefact each list name says is absent.
leaderboard_system_without_evaluation_datasets = ['Ko Chatbot Arena Leaderboard']
leaderboard_system_without_pull_requests = [
    'SEAL',
    'FSMBench',
    'CompMix',
    'DocVQA',
    'GENIE',
    'InfographicVQA',
    'LMExamQA',
    'Models Leaderboard',
    'MP-DocVQA',
    'OpenEval (text)',
    'Program Synthesis Models Leaderboard',
    'ProtoQA',
    'TextSynth Server',
]
leaderboard_system_without_contacts = ['OpenEval (text)']
leaderboard_system_without_documentation = ['FSMBench']
leaderboard_system_without_evaluation_harness = ['LLM-Leaderboard']

# Coarse display category -> the concrete display formats that belong to it.
macro_display_format_mapping = {
    'Table': [
        'Regular Table',
        'Rankable Table',
        'Table Screenshot',
    ],
    'Figure': [
        'Bar Chart',
        'Box Plot',
        'Heat Map',
        'Line Chart',
        'Pie Chart',
        'Radar Chart',
        'Scatter Plot',
        'Rankable Bar Chart',
    ],
}

# Short platform code -> full platform name.
platform_abbreviation_mapping = dict(
    gh='GitHub',
    hf='Hugging Face Spaces',
    pwc='Papers With Code',
    ip='independent platform',
)

# Organization-tactic category -> the individual tactics grouped under it.
ranking_dataframe_organization_tactics_mapping = {
    'Algorithm': ['Algorithm'],
    'Evaluation Configuration': [
        'Dataset Slice',
        'Evaluator',
        'Metric',
    ],
    'Leaderboard Aggregation': ['Aggregated Result'],
    'Leaderboard Release': [
        'Leaderboard Launch Date',
        'Leaderboard Version',
    ],
    'Model Capability': [
        'Benchmark Dataset',
        'Service Load',
        'Supported Functionality',
        'Supported Language',
        'Supported Modality',
        'Task',
    ],
    'Model Information': [
        'Model Accessibility',
        'Model Size',
        'Model Type',
    ],
    'Prompt Engineering': [
        '#Prompt Example',
        'Output Length',
        'Prompt Design',
        'Prompt Length',
        'Relation Extractor',
        'Tokenizer',
    ],
}

# Canonical publisher name -> alternative names that are folded into it.
publisher_synonyms = {
    "Alibaba Group": ["DAMO Academy", "Aliyun"],
    "Amazon": ["Alexa AI Lab", "AWS AI Lab"],
    "Agency for Science Technology and Research": ["A*STAR I2R", "CFAR A*STAR"],
    "Bauhaus University": ["Bauhaus University Weimar"],
    "ByteDance": ["ByteDance AI Lab"],
    "Chinese Academy of Sciences": ["University of Chinese Academy of Sciences"],
    "CSIRO": ["Data61"],
    "Fraunhofer Society": ["Fraunhofer Institute for Integrated Circuits"],
    "Frédéric Joliot Institute for Life Sciences": ["NeuroSpin"],
    "Google": ["Google Brain", "Google DeepMind", "Google Blueshift"],
    "Huawei": ["Huawei Noah's Ark Lab"],
    "Jingdong": ["Explore Academy"],
    "Kunlun Tech": ["Skywork"],
    "Max Planck Society": [
        "Max Planck Institute for Informatics",
        "Max Planck Institute for Intelligent Systems",
        "Max Planck Institute for Mathematics in the Sciences",
    ],
    "Meta": ["Meta FAIR", "Meta GenAI"],
    "Microsoft": ["Microsoft Research Asia"],
    "National Public School": ["National Public School HSR Layout"],
    "Naver": ["Naver Labs Europe"],
    "Queen's University Kingston": ["Queen's University"],
    "Tencent": ["Tencent AI Lab", "Tencent ARC Lab", "Tencent Youtu Lab"],
    "Toyota Technological Institute": ["Toyota Technological Institute Chicago"],
    "University of California": [
        "University of California Berkeley",
        "University of California Davis",
        "University of California Irvine",
        "University of California Los Angeles",
        "University of California Santa Barbara",
        "University of California San Diego",
    ],
    "University of Michigan": ["University of Michigan Ann Arbor"],
    "University of North Carolina": ["University of North Carolina Chapel Hill"],
    "University of Illinois": [
        "University of Illinois Chicago",
        "University of Illinois Urbana Champaign",
    ],
    "University of Massachusetts": ["University of Massachusetts Amherst"],
    "University of Maryland": ["University of Maryland College Park"],
    "University of Tennessee": ["University of Tennessee Knoxville"],
    "University of Texas": ["University of Texas Austin"],
    "University of Wisconsin": ["University of Wisconsin Madison"],
}

# Canonical (lower-case) column/metric name -> the spellings that are normalised to it.
# Misspelled and non-English variants are part of the data and must be kept as-is.
metrics_synonyms = {
    'accuracy': ['acc', 'accuarcy', 'qa accuracy'],
    'average': ['avg', '平均'],
    'average accuracy': ['avg. accuracy'],
    'average score': ['平均分'],
    'bleu': ['bleu score'],
    'bleu-1': ['narrativeqa bleu-1', 'socialiqa bleu-1', 'mcscript bleu-1', 'cosmosqa bleu-1'],
    'bleu-4': ['bleu4'],
    'bertscore': ['bert score'],
    'code': ['代码'],
    'elo rating': ['chatbot arena elo', 'elo'],
    'exact match': ['em', 'exact match accuracy'],
    'lerc': ['cosmosqa lerc', 'mcscript lerc', 'socialiqa lerc', 'narrativeqa lerc'],
    'link': ['url'],
    'mean rank': ['text-to-video mean rank', 'video-to-text mean rank'],
    'median rank': [
        'text-to-video median rank',
        'video-to-text median rank',
        'text-to-videomedian rank',
        'text-to-video medianr',
    ],
    'meteor': ['cosmosqa meteor', 'narrativeqa meteor', 'socialiqa meteor', 'mcscript meteor'],
    'neg mean rank': ['i->t neg mean rank', 't->i neg mean rank'],
    'organization': ['发布机构', '机构', 'orgaisation'],
    'others': ['其他'],
    'overall': ['xiezhi overall'],
    'overall score': ['总分', '总体分数'],
    'pass@1': ['interview pass@1', 'competition pass@1', 'introductory pass@1'],
    'pass@5': ['interview pass@5', 'introductory pass@5', 'competition pass@5'],
    'pass@1000': ['interview pass@1000', 'competition pass@1000', 'introductory pass@1000'],
    'pass@any': ['introductory pass@any', 'competition pass@any', 'interview pass@any'],
    '#parameters': [
        '#p', '#params', '# params', '#size', '参数量', 'model size',
        'model size/b', 'number of params', 'param', 'parameters', 'params', 'size',
    ],
    'perplexity': ['ppl'],
    'precision@1': ['i->t p@1'],
    'precision@20': ['p@20'],
    'recall@1': [
        'r@1', 'text-to-videor@1', 'video-to-text r@1',
        'text-to-video r@1', 'text-to-image r@1', 'image-to-text r@1',
    ],
    'recall@5': [
        'text-to-image r@5', 'video-to-text r@5', 'image-to-text r@5',
        'text-to-video r@5', 'r@5',
    ],
    'recall@10': [
        'recall@10 on 1 rounds', 'recall@10 on 2 rounds', 'recall@10 on 3 rounds',
        'r@10', 'video-to-text r@10', 'text-to-image r@10',
        'text-to-video r@10', 'image-to-text r@10',
    ],
    'recall@50': ['text-to-video r@50', 'video-to-text r@50'],
    'score': ['分数'],
    'submission date': ['提交时间'],
    'top-1 accuracy': ['top 1 accuracy'],
    'top-5 accuracy': ['top 5 accuracy'],
    'type': ['model type'],
    'win rate': ['胜率'],
    'word error rate': ['wer'],
}

# Display name of a benchmark -> list of slug strings associated with it.
# NOTE(review): judging by the `pwc_` prefix the slugs are presumably Papers With Code
# leaderboard identifiers; confirm against the code that consumes this mapping.
# NOTE(review): some slugs are listed under more than one key, e.g.
# 'spatio-temporal-action-localization-on-ava' under both 'AVA' and 'Kinetics', and the
# two 'multi-task-language-understanding-on-bbh-*' slugs under both 'BBH' and 'BIG-Bench'.
# Verify that this is intended if the mapping is ever inverted or used for counting.
pwc_leaderboard_mapping = {
    'A-OKVQA': ['visual-question-answering-on-a-okvqa'],
    'ACE (2005)': ['relation-extraction-on-ace-2005'],
    'ADE20K': ['semantic-segmentation-on-ade20k'],
    'AI2D': ['visual-question-answering-vqa-on-ai2d'],
    'AISHELL-1': ['speech-recognition-on-aishell-1'],
    'ANLI': ['natural-language-inference-on-anli-test'],
    'APPS': ['code-generation-on-apps'],
    'ARC Challenge': ['common-sense-reasoning-on-arc-challenge', 'common-sense-reasoning-on-arc-easy'],
    'ASDiv': ['math-word-problem-solving-on-asdiv-a'],
    'AVA': ['action-recognition-on-ava-v2-2', 'spatio-temporal-action-localization-on-ava'],
    # NOTE(review): the same slug is listed twice for 'ActivityNet Captions'.
    'ActivityNet Captions': ['video-captioning-on-activitynet-captions', 'video-captioning-on-activitynet-captions'],
    'ActivityNet': ['temporal-action-localization-on-activitynet', 'video-retrieval-on-activitynet', 'zero-shot-video-retrieval-on-activitynet'],
    'ActivityNet-QA': ['video-question-answering-on-activitynet-qa'],
    'AudioCaps': ['audio-to-text-retrieval-on-audiocaps', 'text-to-audio-retrieval-on-audiocaps', 'zero-shot-audio-captioning-on-audiocaps'],
    'BACE': ['molecular-property-prediction-on-bace-1'],
    'BANKING77': ['intent-detection-on-banking77'],
    'BBBP': ['molecular-property-prediction-on-bbbp-1'],
    'BBH': ['multi-task-language-understanding-on-bbh-nlp', 'multi-task-language-understanding-on-bbh-alg'],
    'BC5CDR': ['named-entity-recognition-ner-on-bc5cdr'],
    'BEIR': ['argument-retrieval-on-arguana-beir', 'argument-retrieval-on-touche-2020-beir', 'biomedical-information-retrieval-on-bioasq-1', 'biomedical-information-retrieval-on-nfcorpus-1', 'biomedical-information-retrieval-on-trec-1', 'citation-prediction-on-scidocs-beir', 'duplicate-question-retrieval-on-cqadupstack-1', 'duplicate-question-retrieval-on-quora-beir', 'entity-retrieval-on-dbpedia-beir', 'fact-checking-on-scifact-beir', 'fact-checking-on-fever-beir', 'fact-checking-on-climate-fever-beir', 'zero-shot-text-search-on-beir', 'news-retrieval-on-trec-news-beir', 'passage-retrieval-on-msmarco-beir', 'question-answering-on-fiqa-2018-beir', 'question-answering-on-hotpotqa-beir', 'tweet-retrieval-on-signal-1m-rt-beir'],
    'BIG-Bench': ['mathematical-induction-on-big-bench', 'physics-mc-on-big-bench', 'riddle-sense-on-big-bench', 'figure-of-speech-detection-on-big-bench', 'fantasy-reasoning-on-big-bench', 'common-sense-reasoning-on-big-bench-logical', 'logical-reasoning-on-big-bench-logical', 'analogical-similarity-on-big-bench', 'implicit-relations-on-big-bench', 'intent-recognition-on-big-bench', 'lambada-on-big-bench', 'movie-dialog-same-or-different-on-big-bench', 'nonsense-words-grammar-on-big-bench', 'phrase-relatedness-on-big-bench', 'question-selection-on-big-bench', 'misconceptions-on-big-bench', 'sentence-ambiguity-on-big-bench', 'general-knowledge-on-big-bench', 'analytic-entailment-on-big-bench', 'entailed-polarity-on-big-bench', 'epistemic-reasoning-on-big-bench', 'evaluating-information-essentiality-on-big', 'logical-args-on-big-bench', 'metaphor-boolean-on-big-bench', 'physical-intuition-on-big-bench', 'presuppositions-as-nli-on-big-bench', 'discourse-marker-prediction-on-big-bench', 'empirical-judgments-on-big-bench', 'irony-identification-on-big-bench', 'english-proverbs-on-big-bench', 'similarities-abstraction-on-big-bench', 'gre-reading-comprehension-on-big-bench', 'crash-blossom-on-big-bench', 'human-organs-senses-multiple-choice-on-big', 'odd-one-out-on-big-bench', 'identify-odd-metapor-on-big-bench', 'dark-humor-detection-on-big-bench', 'understanding-fables-on-big-bench', 'timedial-on-big-bench', 'common-sense-reasoning-on-big-bench-known', 'language-modelling-on-big-bench-lite', 'auto-debugging-on-big-bench-lite', 'memorization-on-big-bench-hindu-knowledge', 'logical-reasoning-on-big-bench-strategyqa', 'logical-reasoning-on-big-bench-logic-grid', 'crass-ai-on-big-bench', 'multiple-choice-question-answering-mcqa-on-31', 'word-sense-disambiguation-on-big-bench', 'multi-task-language-understanding-on-bbh-alg', 'sarcasm-detection-on-big-bench-snarks', 'common-sense-reasoning-on-big-bench-sports', 'multiple-choice-question-answering-mcqa-on-30', 
'logical-reasoning-on-big-bench-temporal', 'common-sense-reasoning-on-big-bench', 'common-sense-reasoning-on-big-bench-causal', 'common-sense-reasoning-on-big-bench-date', 'logical-reasoning-on-big-bench-formal', 'multiple-choice-question-answering-mcqa-on-27', 'multiple-choice-question-answering-mcqa-on-28', 'multiple-choice-question-answering-mcqa-on-29', 'logical-reasoning-on-big-bench-penguins-in-a', 'logical-reasoning-on-big-bench-reasoning', 'multi-task-language-understanding-on-bbh-nlp'],
    'BenchLMM': ['visual-question-answering-on-benchlmm'],
    'BioASQ': ['question-answering-on-bioasq'],
    'BioLAMA': ['knowledge-probing-on-biolama'],
    'BoolQ': ['question-answering-on-boolq'],
    'C4': ['language-modelling-on-c4'],
    'CB': ['natural-language-inference-on-commitmentbank'],
    'CFQ': ['semantic-parsing-on-cfq'],
    'CIFAR-10': ['image-classification-on-cifar-10'],
    'CLUE': ['language-modelling-on-clue-afqmc', 'language-modelling-on-clue-ocnli-50k', 'language-modelling-on-clue-drcd', 'language-modelling-on-clue-cmnli', 'language-modelling-on-clue-wsc1-1', 'language-modelling-on-clue-c3', 'language-modelling-on-clue-cmrc2018'],
    'CNN DM': ['document-summarization-on-cnn-daily-mail', 'abstractive-text-summarization-on-cnn-daily'],
    'COCO Captions': ['image-captioning-on-coco-captions'],
    'COPA': ['question-answering-on-copa'],
    'ChEBI-20': ['cross-modal-retrieval-on-chebi-20', 'text-based-de-novo-molecule-generation-on', 'molecule-captioning-on-chebi-20'],
    'Charades': ['zero-shot-action-recognition-on-charades-1'],
    'Charades-STA': ['moment-retrieval-on-charades-sta'],
    'ChartQA': ['chart-question-answering-on-chartqa'],
    'Civil Comments': ['toxic-comment-classification-on-civil'],
    'Clotho': ['text-to-audio-retrieval-on-clotho', 'audio-captioning-on-clotho', 'audio-to-text-retrieval-on-clotho', 'zero-shot-text-to-audio-retrieval-on-clotho'],
    'CoLA': ['linguistic-acceptability-on-cola'],
    'CoNLL': ['coreference-resolution-on-conll-2012', 'coreference-resolution-on-conll12', 'joint-entity-and-relation-extraction-on-2', 'named-entity-recognition-ner-on-conll-2003', 'named-entity-recognition-on-conll03', 'relation-extraction-on-conll04', 'semantic-role-labeling-on-conll-2005', 'semantic-role-labeling-on-conll05-wsj', 'semantic-role-labeling-on-conll12'],
    'CoNaLa': ['code-generation-on-conala'],
    'CoQA': ['question-answering-on-coqa'],
    'CodeContests': ['code-generation-on-codecontests'],
    'Common Voice': ['speech-recognition-on-common-voice-8-0-13', 'speech-recognition-on-common-voice-8-0-14', 'speech-recognition-on-common-voice-8-0-35', 'speech-recognition-on-common-voice-8-0-21', 'speech-recognition-on-common-voice-8-0-19', 'speech-recognition-on-common-voice-8-0-15', 'speech-recognition-on-common-voice-7-0-irish', 'speech-recognition-on-common-voice-8-0-irish', 'speech-recognition-on-common-voice-2', 'speech-recognition-on-common-voice-7-0-3', 'speech-recognition-on-common-voice-8-0-german', 'speech-recognition-on-common-voice-8-0-6', 'speech-recognition-on-common-voice-7-0-4', 'speech-recognition-on-common-voice-7-0-13', 'speech-recognition-on-common-voice-8-0-37', 'speech-recognition-on-common-voice-8-0-22', 'speech-recognition-on-common-voice-8-0', 'speech-recognition-on-common-voice-frisian', 'speech-recognition-on-common-voice-8-0-4', 'speech-recognition-on-common-voice-italian', 'speech-recognition-on-common-voice-8-0-french', 'speech-recognition-on-common-voice-english', 'speech-recognition-on-common-voice-8-0-hindi', 'speech-recognition-on-common-voice-7-0-5', 'speech-recognition-on-common-voice-7-0-hindi', 'speech-recognition-on-common-voice-8-0-dutch', 'speech-recognition-on-common-voice-spanish', 'speech-recognition-on-common-voice-french', 'speech-recognition-on-common-voice-german'],
    'ComplexWebQuestions': ['knowledge-base-question-answering-on'],
    'CrossNER': ['zero-shot-named-entity-recognition-ner-on-1'],
    'CrowS-Pairs': ['stereotypical-bias-analysis-on-crows-pairs'],
    'DROP': ['question-answering-on-drop-test', 'question-answering-on-drop'],
    'DiDeMo': ['video-retrieval-on-didemo', 'zero-shot-video-retrieval-on-didemo'],
    'DocVQA': ['visual-question-answering-on-docvqa-test'],
    'FLEURS': ['automatic-speech-recognition-on-fleurs-1'],
    'FSD50K': ['audio-classification-on-fsd50k'],
    'FewCLUE': ['language-modelling-on-fewclue-eprstmt', 'language-modelling-on-fewclue-ocnli-fc', 'language-modelling-on-fewclue-bustm', 'language-modelling-on-fewclue-chid-fc', 'language-modelling-on-fewclue-cluewsc-fc'],
    'FinQA': ['question-answering-on-finqa'],
    'FineAction': ['temporal-action-localization-on-fineaction'],
    'Flickr30K': ['zero-shot-cross-modal-retrieval-on-flickr30k', 'image-to-text-retrieval-on-flickr30k'],
    'Food-101': ['zero-shot-transfer-image-classification-on-17'],
    'GEM': ['extreme-summarization-on-gem-xsum'],
    'GENIA': ['named-entity-recognition-on-genia'],
    'GQA': ['visual-question-answering-on-gqa-test-dev'],
    'GSM8K': ['arithmetic-reasoning-on-gsm8k'],
    'HIV': ['molecular-property-prediction-on-hiv-dataset'],
    'HMDB51': ['action-recognition-in-videos-on-hmdb-51', 'zero-shot-action-recognition-on-hmdb51'],
    'HallusionBench': ['visual-question-answering-vqa-on-3'],
    'HellaSwag': ['sentence-completion-on-hellaswag'],
    'HumanEval': ['code-generation-on-humaneval'],
    'IFEval': ['instruction-following-on-ifeval'],
    'IUPAC': ['iupac-name-prediction-on-iupac'],
    'ImageNet': ['few-shot-image-classification-on-imagenet-10', 'few-shot-image-classification-on-imagenet-1-1', 'few-shot-image-classification-on-imagenet-5', 'image-classification-on-imagenet', 'self-supervised-image-classification-on', 'self-supervised-image-classification-on-1', 'zero-shot-transfer-image-classification-on-1', 'zero-shot-transfer-image-classification-on-3'],
    'InfoSeek': ['visual-question-answering-vqa-on-infoseek'],
    'InfographicVQA': ['visual-question-answering-vqa-on'],
    'JFT-300M': ['image-classification-on-jft-300m'],
    'Kinetics': ['action-classification-on-kinetics-400', 'action-classification-on-kinetics-600', 'action-classification-on-kinetics-700', 'spatio-temporal-action-localization-on-ava', 'video-generation-on-kinetics-600-12-frames', 'zero-shot-action-recognition-on-kinetics'],
    'LAMBADA': ['language-modelling-on-lambada'],
    'LLaVA-Bench': ['visual-instruction-following-on-llava-bench'],
    'LibriSpeech': ['speech-recognition-on-librispeech-test-clean', 'speech-recognition-on-librispeech-test-other'],
    'MATH': ['math-word-problem-solving-on-math'],
    'MAWPS': ['math-word-problem-solving-on-mawps'],
    'MBPP': ['code-generation-on-mbpp'],
    'MGSM': ['multi-task-language-understanding-on-mgsm'],
    'MM-Vet': ['visual-question-answering-on-mm-vet'],
    'MMLU': ['multi-task-language-understanding-on-mmlu', 'mathematical-reasoning-on-mmlu-mathematics', 'multiple-choice-question-answering-mcqa-on-11', 'multiple-choice-question-answering-mcqa-on-8', 'multiple-choice-question-answering-mcqa-on-25', 'multiple-choice-question-answering-mcqa-on-7', 'multiple-choice-question-answering-mcqa-on-9', 'multiple-choice-question-answering-mcqa-on-10', 'multiple-choice-question-answering-mcqa-on-13', 'multiple-choice-question-answering-mcqa-on-12', 'multiple-choice-question-answering-mcqa-on-2', 'multiple-choice-question-answering-mcqa-on-3', 'multiple-choice-question-answering-mcqa-on-4', 'multiple-choice-question-answering-mcqa-on-5', 'multiple-choice-question-answering-mcqa-on-16', 'multiple-choice-question-answering-mcqa-on-17', 'multiple-choice-question-answering-mcqa-on-18', 'multiple-choice-question-answering-mcqa-on-20', 'multiple-choice-question-answering-mcqa-on-14', 'multiple-choice-question-answering-mcqa-on-19', 'multiple-choice-question-answering-mcqa-on-6', 'multiple-choice-question-answering-mcqa-on-15', 'multiple-choice-question-answering-mcqa-on-24', 'multiple-choice-question-answering-mcqa-on-26', 'multiple-choice-question-answering-mcqa-on-23'],
    'MRPC': ['semantic-textual-similarity-on-mrpc'],
    'MSCOCO': ['image-captioning-on-coco', 'image-retrieval-on-coco', 'image-to-text-retrieval-on-coco', 'text-to-image-generation-on-coco', 'zero-shot-cross-modal-retrieval-on-coco-2014'],
    'MSRVTT': ['text-to-video-generation-on-msr-vtt', 'video-captioning-on-msr-vtt-1', 'video-retrieval-on-msr-vtt', 'zero-shot-video-retrieval-on-msr-vtt', 'video-retrieval-on-msr-vtt-1ka'],
    'MSRVTT-QA': ['zeroshot-video-question-answer-on-msrvtt-qa', 'video-question-answering-on-msrvtt-qa', 'visual-question-answering-on-msrvtt-qa-1'],
    'MSVD': ['video-retrieval-on-msvd', 'video-captioning-on-msvd-1', 'zero-shot-video-retrieval-on-msvd'],
    # NOTE(review): the same slug is listed twice for 'MSVD-QA'.
    'MSVD-QA': ['zeroshot-video-question-answer-on-msvd-qa', 'zeroshot-video-question-answer-on-msvd-qa'],
    'MathQA': ['math-word-problem-solving-on-mathqa'],
    'MedMCQA': ['multiple-choice-question-answering-mcqa-on-21'],
    'MedQA': ['question-answering-on-medqa-usmle'],
    'MiT': ['action-classification-on-moments-in-time'],
    'MoleculeNet': ['molecular-property-prediction-on-moleculenet'],
    'MultiRC': ['question-answering-on-multirc'],
    'MusicCaps': ['text-to-music-generation-on-musiccaps'],
    'MusicQA': ['music-question-answering-on-musicqa'],
    'NExT-QA': ['temporal-casual-qa-on-next-qa', 'question-answering-on-next-qa-open-ended', 'video-question-answering-on-next-qa', 'zero-shot-video-question-answer-on-next-qa', 'zero-shot-video-question-answer-on-next-gqa'],
    'NLVR': ['visual-reasoning-on-nlvr2-test', 'visual-reasoning-on-nlvr2-dev'],
    'NQ': ['passage-retrieval-on-natural-questions', 'question-answering-on-natural-questions'],
    'NoCaps': ['image-captioning-on-nocaps-entire', 'image-captioning-on-nocaps-near-domain', 'image-captioning-on-nocaps-out-of-domain', 'image-captioning-on-nocaps-in-domain', 'image-captioning-on-nocaps-val-in-domain', 'image-captioning-on-nocaps-val-overall', 'image-captioning-on-nocaps-val-out-domain', 'image-captioning-on-nocaps-val-near-domain'],
    'OK-VQA': ['visual-question-answering-on-ok-vqa'],
    'OVEN': ['fine-grained-image-recognition-on-oven'],
    'OmniBenchmark': ['image-classification-on-omnibenchmark'],
    'OpenbookQA': ['question-answering-on-openbookqa', 'question-answering-on-obqa'],
    'PIQA': ['question-answering-on-piqa'],
    'PMC-VQA': ['generative-visual-question-answering-on-pmc', 'visual-question-answering-vqa-on-pmc-vqa', 'medical-visual-question-answering-on-pmc-vqa'],
    'Pets37': ['fine-grained-image-classification-on-oxford-1'],
    'PubChemQA': ['question-answering-on-pubchemqa'],
    'PubMedQA': ['question-answering-on-pubmedqa'],
    'QNLI': ['natural-language-inference-on-qnli'],
    'RACE': ['reading-comprehension-on-race', 'question-answering-on-race'],
    'RAFT': ['few-shot-text-classification-on-raft'],
    'RTE': ['natural-language-inference-on-rte'],
    'RareAct': ['action-recognition-on-rareact'],
    'ReCoRD': ['common-sense-reasoning-on-record'],
    # NOTE(review): 'referring-expression-comprehension-on-refcoco-1' appears twice in this list.
    'RefCOCO': ['referring-expression-comprehension-on-refcoco-1', 'referring-expression-comprehension-on-refcoco-1', 'referring-expression-segmentation-on-refcocog', 'referring-expression-segmentation-on-refcoco', 'referring-expression-comprehension-on-1', 'referring-expression-comprehension-on', 'referring-expression-comprehension-on-refcoco'],
    'Robust (2004)': ['ad-hoc-information-retrieval-on-trec-robust04'],
    'SCROLLS': ['long-range-modeling-on-scrolls'],
    'SICK': ['semantic-textual-similarity-on-sick', 'semantic-textual-similarity-on-sick-r-1'],
    'SIDER': ['molecular-property-prediction-on-sider-1'],
    'SIQA': ['question-answering-on-social-iqa'],     
    'SPGISpeech': ['speech-recognition-on-spgispeech'],
    'SQuAD': ['question-answering-on-squad11-dev'],
    'SST': ['sentiment-analysis-on-sst-2-binary'],
    'STAR': ['video-question-answering-on-situated', 'zero-shot-video-question-answer-on-star', 'zero-shot-video-question-answer-on-star-1'],
    'STS-B': ['semantic-textual-similarity-on-sts-benchmark', 'semantic-textual-similarity-on-sts13', 'semantic-textual-similarity-on-sts14', 'semantic-textual-similarity-on-sts12', 'semantic-textual-similarity-on-sts15', 'semantic-textual-similarity-on-sts16', 'semantic-similarity-on-sts-benchmark'],
    'SVAMP': ['math-word-problem-solving-on-svamp'],
    'SWE-bench': ['bug-fixing-on-swe-bench'],
    'SciQ': ['text-generation-on-sciq'],
    'ScienceQA': ['science-question-answering-on-scienceqa'],
    'Something-Something': ['action-recognition-in-videos-on-something'],
    'Spider': ['text-to-sql-on-spider', 'semantic-parsing-on-spider'],
    'StereoSet': ['bias-detection-on-stereoset-1'],
    'StoryCloze': ['question-answering-on-storycloze'],
    'StrategyQA': ['question-answering-on-strategyqa'],
    'TACRED': ['relation-extraction-on-tacred'],
    'TED-LIUM': ['speech-recognition-on-tedlium'],
    'TGIF-QA': ['tgif-frame-on-tgif-qa', 'zeroshot-video-question-answer-on-tgif-qa', 'zeroshot-video-question-answer-on-tgif-qa-1', 'visual-question-answering-on-tgif-qa'],
    'TREC-COVID': ['zero-shot-text-search-on-trec-covid'],
    'TVQA': ['video-question-answering-on-tvqa', 'zero-shot-video-question-answer-on-tvqa'],
    'TextVQA': ['visual-question-answering-on-textvqa-test-1'],
    'The Pile': ['language-modelling-on-the-pile'],
    # NOTE(review): 'Tox21' carries exactly the same slug as 'The Pile' on the line above;
    # this looks like a copy-paste slip. Verify the intended slug for Tox21.
    'Tox21': ['language-modelling-on-the-pile'],
    'TriviaQA': ['question-answering-on-triviaqa'],
    'TruthfulQA': ['question-answering-on-truthfulqa'],
    'TyDiQA-GoldP': ['cross-lingual-question-answering-on-tydiqa'],
    'UCF101': ['action-recognition-in-videos-on-ucf101', 'self-supervised-action-recognition-on-ucf101', 'text-to-video-generation-on-ucf-101', 'zero-shot-action-recognition-on-ucf101', 'video-generation-on-ucf-101'],
    'UniProtQA': ['question-answering-on-uniprotqa'],
    'VATEX': ['video-retrieval-on-vatex', 'video-captioning-on-vatex-1', 'zero-shot-video-retrieval-on-vatex'],
    'VCR': ['visual-question-answering-on-vcr-q-ar-test', 'visual-question-answering-on-vcr-qa-r-test', 'visual-question-answering-on-vcr-q-a-test'],
    'VGG-Sound': ['audio-classification-on-vggsound'],
    'VNHSGE': ['question-answering-on-vnhsge-english', 'question-answering-on-vnhsge-physics', 'question-answering-on-vnhsge-chemistry', 'question-answering-on-vnhsge-biology', 'question-answering-on-vnhsge-history', 'question-answering-on-vnhsge-geography', 'question-answering-on-vnhsge-literature', 'question-answering-on-vnhsge-mathematics-1', 'question-answering-on-vnhsge-civic'],
    'VQA': ['visual-question-answering-on-vqa-v2-val-1', 'visual-question-answering-on-vqa-v2-test-dev-1', 'visual-question-answering-on-vqa-v2-val', 'visual-question-answering-on-vqa-v2-test-std', 'visual-question-answering-on-vqa-v2-test-dev'],
    'VeRi-776': ['vehicle-re-identification-on-veri-776'],
    'VeRi-Wild': ['vehicle-re-identification-on-veri-wild-small'],
    'ViP-Bench': ['visual-question-answering-on-vip-bench'],
    'VideoInstruct': ['video-based-generative-performance', 'video-based-generative-performance-1', 'video-based-generative-performance-2', 'video-based-generative-performance-3', 'video-based-generative-performance-4', 'video-based-generative-performance-5'],
    'VisDial': ['chat-based-image-retrieval-on-visdial'],
    'VizWiz': ['visual-question-answering-on-vizwiz-2020-vqa'],
    'WHOOPS!': ['explanation-generation-on-whoops', 'image-to-text-retrieval-on-whoops', 'image-captioning-on-whoops', 'visual-question-answering-vqa-on-whoops'],
    'WMT (2014)': ['unsupervised-machine-translation-on-wmt2014-1', 'machine-translation-on-wmt2014-french-english', 'machine-translation-on-wmt2014-english-german', 'unsupervised-machine-translation-on-wmt2014-2', 'machine-translation-on-wmt2014-english-french'],
    'WNLI': ['natural-language-inference-on-wnli'],
    'WSC': ['coreference-resolution-on-winograd-schema'],
    'WebQuestions': ['question-answering-on-webquestions'],
    'WebQuestionsSP': ['knowledge-base-question-answering-on-1', 'semantic-parsing-on-webquestionssp'],
    'WenetSpeech': ['speech-recognition-on-wenetspeech'],
    'WikiText-103': ['language-modelling-on-wikitext-103'],
    'WikiText-2': ['language-modelling-on-wikitext-2'],
    'WinoGrande': ['common-sense-reasoning-on-winogrande'],
    'Winoground': ['visual-reasoning-on-winoground'],
    'XCOPA': ['cross-lingual-transfer-on-xcopa'],
    'XSUM': ['text-summarization-on-x-sum'],
    'YouCook2': ['video-retrieval-on-youcook2', 'video-captioning-on-youcook2', 'zero-shot-video-retrieval-on-youcook2'],
    'iNaturalist': ['long-tail-learning-on-inaturalist-2018', 'image-classification-on-inaturalist-2018'],
    'iVQA': ['video-question-answering-on-ivqa'],
}

# Leaderboard URLs grouped by the orientation named in each variable.
dataset_oriented_leaderboards = [
    'https://www.datacomp.ai/dclm/leaderboard.html',
    'https://www.datacomp.ai/dcclip/leaderboard.html',
]
metric_oriented_leaderboards = [
    'https://github.com/yuh-zha/AlignScore',
]
method_oriented_leaderboards = [
    'https://siml.earth/Julia-LLM-Leaderboard/stable/examples/summarize_results_prompts',
    'https://huggingface.co/spaces/locuslab/tofu_leaderboard',
]
solution_oriented_leaderboards = [
    'https://www.kaggle.com/competitions',
    'https://eval.ai/web/challenges/list',
    'https://taostats.io',
    'https://artificialanalysis.ai/leaderboards/providers',
    'https://huggingface.co/spaces/dreambooth-hackathon/leaderboard',
    'https://huggingface.co/spaces/jax-diffusers-event/leaderboard',
    'https://huggingface.co/spaces/keras-dreambooth/leaderboard',
    'https://huggingface.co/spaces/Weyaxi/huggingface-leaderboard',
    'https://huggingface.co/spaces/Nymbo/followers-leaderboard',
]

In [42]:
def _invert_one_to_many(mapping):
    """Flip ``{group: [member, ...]}`` into ``{member: group}``.

    If a member is listed under several groups, the group that comes last in
    ``mapping`` wins.
    """
    return {member: group for group, members in mapping.items() for member in members}


# Full platform name -> short code (the forward mapping is one-to-one).
platform_abbreviation_mapping_inverse = dict(
    (full_name, short_code)
    for short_code, full_name in platform_abbreviation_mapping.items()
)

# Synonym -> canonical name lookups derived from the one-to-many tables.
metrics_synonyms_inverse = _invert_one_to_many(metrics_synonyms)
publisher_synonyms_inverse = _invert_one_to_many(publisher_synonyms)
ranking_dataframe_organization_tactics_mapping_inverse = _invert_one_to_many(
    ranking_dataframe_organization_tactics_mapping
)

def convert_df_to_dict(df):
    """
    Convert a DataFrame of leaderboards and metrics to a dictionary.

    The first column of every row is used as the key; the remaining cells of
    that row, with missing values (as judged by ``pd.notna``) dropped, become
    the value list. If the same key occurs in several rows, the last row wins.

    :param df: DataFrame with leaderboards and metrics
    :return: Dictionary with leaderboards as keys and list of metrics as values
    """
    leaderboard_data = {}

    for _, row in df.iterrows():
        # Use explicit positional access. ``row[0]`` is a *label* lookup on the
        # row Series: with string column names it only works through a
        # deprecated positional fallback, and with integer column labels that
        # do not include 0 it raises KeyError.
        key = row.iloc[0]
        leaderboard_data[key] = [value for value in row.iloc[1:] if pd.notna(value)]

    return leaderboard_data

def string_to_list(text, platform=False):
    """Split a comma-separated string into a list.

    :param text: comma-separated string; a missing value (``pd.isna``) or the
        literal string ``'nan'`` yields an empty list
    :param platform: when truthy, every item is looked up in
        ``platform_abbreviation_mapping`` and replaced by the mapped value
        (an unknown item raises ``KeyError``)
    :return: list of items; no whitespace trimming is performed
    """
    if pd.isna(text) or text == 'nan':
        return []

    items = text.split(',')
    if not platform:
        return items
    return [platform_abbreviation_mapping[abbreviation] for abbreviation in items]

def string_to_dict(s, platform=False, list_format=False):
    """Parse a ``key:v1+v2,key2:v3`` style string.

    :param s: string of comma-separated ``key:values`` pairs in which the
        values of one key are joined by ``+``; a missing value (``pd.isna``)
        yields an empty result
    :param platform: when truthy, each key is translated through
        ``platform_abbreviation_mapping`` before being stored
    :param list_format: when truthy, return one flat list with the values of
        all pairs (in input order) instead of the dictionary
    :return: ``{key: [values]}`` or, with ``list_format``, the flat value list
    :raises ValueError: if a pair does not contain exactly one ``:``
    """
    if pd.isna(s):
        return [] if list_format else {}

    grouped = {}
    flattened = []
    for entry in s.split(','):
        name, joined_values = entry.split(':')
        items = joined_values.split('+')
        dict_key = platform_abbreviation_mapping[name] if platform else name
        grouped[dict_key] = items
        flattened.extend(items)

    return flattened if list_format else grouped

def format_individual_value(x):
    """Render a float without trailing zeros; pass anything else through.

    Floats are formatted in fixed-point notation (``:f``, i.e. six digits after
    the decimal point), after which trailing zeros and a dangling decimal
    point are stripped. Non-float inputs are returned unchanged.
    """
    if not isinstance(x, float):
        return x

    fixed_point = f"{x:f}"
    without_zeros = fixed_point.rstrip('0')
    return without_zeros.rstrip('.')

def print_empty_folders(root_dir):
    """Print every folder below ``root_dir`` that holds neither files nor subfolders.

    ``root_dir`` itself is never reported. One line of the form
    ``Empty folder: <path>`` is printed per empty folder.

    :param root_dir: Directory to scan recursively (``str`` or path-like)
    """
    # os.walk yields string paths; normalise the root so that the comparison
    # below also works when a pathlib.Path is passed in.
    root = os.fspath(root_dir)
    for dirpath, dirnames, filenames in os.walk(root):
        if dirpath == root:
            continue
        # A folder is empty only if it has no files AND no subdirectories.
        if not dirnames and not filenames:
            print(f"Empty folder: {dirpath}")
            
def list_directories(folder_path):
    """Return the paths of the immediate subdirectories of ``folder_path``.

    Each returned path is ``folder_path`` joined with the entry name, in the
    order in which ``os.listdir`` reports the entries.
    """
    directories = []
    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)
        if os.path.isdir(candidate):
            directories.append(candidate)
    return directories

def publisher_synonyms_mapping(publishers):
    """Replace known publisher synonyms and de-duplicate the result.

    Names that are keys of ``publisher_synonyms_inverse`` are replaced by the
    mapped value; every other name is kept as given.

    :param publishers: Iterable of publisher names
    :return: Set of processed publisher names
    """
    return {
        publisher_synonyms_inverse[name] if name in publisher_synonyms_inverse else name
        for name in publishers
    }

def keep_rows_by_list_column(df, column_name, keyword_list):
    """
    Keep the rows whose list column shares at least one element with a keyword list.

    Parameters:
    - df: Pandas DataFrame.
    - column_name: The name of the column containing lists of strings.
    - keyword_list: A list of keywords to check against.

    Returns:
    - A filtered Pandas DataFrame holding only the rows where ANY element of
      the list is in keyword_list. Rows with an empty list are dropped.
    """
    # Convert the keyword list to a set for faster membership testing
    keyword_set = set(keyword_list)

    def has_any_keyword(elements):
        return any(element in keyword_set for element in elements)

    # Force a boolean dtype: on an empty frame ``apply`` may not produce a
    # boolean Series, and a non-boolean key would be treated as a list of
    # column labels rather than as a row mask.
    mask = df[column_name].apply(has_any_keyword).astype(bool)

    return df[mask]

def filter_rows_by_list_column(df, column_name, keyword_list):
    """
    Drop the rows whose list column consists solely of elements from a keyword list.

    Parameters:
    - df: Pandas DataFrame.
    - column_name: The name of the column containing lists of strings.
    - keyword_list: A list of keywords to check against.

    Returns:
    - A filtered Pandas DataFrame holding only the rows that have at least one
      element NOT in keyword_list. Rows with an empty list are dropped as
      well, since they contain no such element.
    """
    # Convert the keyword list to a set for faster membership testing
    keyword_set = set(keyword_list)

    def has_other_element(elements):
        # Equivalent to "not all elements are keywords".
        return any(element not in keyword_set for element in elements)

    # Force a boolean dtype: on an empty frame ``apply`` may not produce a
    # boolean Series, and a non-boolean key would be treated as a list of
    # column labels rather than as a row mask.
    mask = df[column_name].apply(has_other_element).astype(bool)

    return df[mask]

In [43]:
# Load the 'Leaderboard' sheet and expand PWC-only entries listed in
# `pwc_leaderboard_mapping` into one row per mapped sub-leaderboard name.
with pd.ExcelFile(path_meta / 'Foundation Model Leaderboards.xlsx') as excel_file:
    df_leaderboard = pd.read_excel(excel_file, sheet_name='Leaderboard')

    expanded_rows = []
    for _, record in df_leaderboard.iterrows():
        is_pwc_group = record['Name'] in pwc_leaderboard_mapping and record['Platforms'] == 'pwc'
        if not is_pwc_group:
            # Everything else is carried over unchanged.
            expanded_rows.append(record)
            continue
        # Duplicate the row once per sub-leaderboard, changing only the name.
        for sub_name in pwc_leaderboard_mapping[record['Name']]:
            clone = record.copy()
            clone['Name'] = sub_name
            expanded_rows.append(clone)

    df_leaderboard_expanded = pd.DataFrame(expanded_rows)
    df_leaderboard_expanded.to_csv(path_meta / 'leaderboard_processed.csv', index=False)

In [44]:
# Derive the combined per-leaderboard columns ('Display formats', 'Development workflows',
# 'Release organizations'), persist them, and report how leaderboards spread over host platforms.
leaderboard_hosted_on_multiple_platforms = 0
leaderboard_without_model_linkage = 0
# Number of leaderboards per individual platform abbreviation (a leaderboard on several
# platforms is counted once for each of them).
leaderboard_host_platform_mapping = defaultdict(int)

df_leaderboard_expanded = pd.read_csv(path_meta / 'leaderboard_processed.csv')
# astype(str) turns missing cells into the string 'nan', which string_to_list maps to [].
df_leaderboard_expanded['Development workflows (non-pwc)'] = df_leaderboard_expanded['Development workflows (non-pwc)'].astype(str).apply(string_to_list)
# list_format=True flattens the "key:a+b" pairs into one list of display formats.
df_leaderboard_expanded['Display formats (non-pwc)'] = df_leaderboard_expanded['Display formats (non-pwc)'].apply(lambda x: string_to_dict(x, list_format=True))
df_leaderboard_expanded['Publication venues (non-pwc)'] = df_leaderboard_expanded['Publication venues (non-pwc)'].apply(string_to_list)
df_leaderboard_expanded['Release organizations (non-pwc)'] = df_leaderboard_expanded['Release organizations (non-pwc)'].apply(string_to_list)

# Placeholder columns: a distinct empty list per row, filled in by the loop below.
df_leaderboard_expanded['Display formats'] = [[] for _ in range(len(df_leaderboard_expanded))]
df_leaderboard_expanded['Development workflows'] = [[] for _ in range(len(df_leaderboard_expanded))]
df_leaderboard_expanded['Release organizations'] = [[] for _ in range(len(df_leaderboard_expanded))]

for index, row in df_leaderboard_expanded.iterrows():
    display_formats = set(row['Display formats (non-pwc)'])
    publishers = publisher_synonyms_mapping(row['Release organizations (non-pwc)'])
    workflows = set(row['Development workflows (non-pwc)'])
        
    # 'Platforms' is a comma-separated list of platform abbreviations.
    for platform in row['Platforms'].split(','):
        leaderboard_host_platform_mapping[platform] += 1
            
    if len(row['Platforms'].split(',')) > 1:
        leaderboard_hosted_on_multiple_platforms += 1
        
    # Exact match: only leaderboards hosted solely on PWC get PWC added as a release organization.
    if row['Platforms'] == 'pwc':
        publishers.add('Papers With Code')
            
    # Substring match: any leaderboard whose platform string contains 'pwc' also gets the
    # PWC display formats and workflow pattern.
    if 'pwc' in row['Platforms']:
        display_formats = display_formats.union(display_format_pwc)
        workflows = workflows.union(workflow_pwc)
    # Missing model linkage is only counted for leaderboards that did not take the 'pwc' branch above.
    elif pd.isna(row['Model linkage (non-pwc)']):
        leaderboard_without_model_linkage += 1

    df_leaderboard_expanded.at[index, 'Display formats'] = list(display_formats)
    df_leaderboard_expanded.at[index, 'Development workflows'] = list(workflows)
    df_leaderboard_expanded.at[index, 'Release organizations'] = list(publishers)

# Overwrites the file this cell read from; the list-valued columns are stored as their
# Python list representation.
# NOTE(review): because the '(non-pwc)' columns were replaced by lists above, re-running this
# cell on its own output would feed list representations back into string_to_list /
# string_to_dict -- confirm the cell is always run right after the one that creates the CSV.
df_leaderboard_expanded.to_csv(path_meta / 'leaderboard_processed.csv', index=False)

print(f"Total number of leaderboards: {len(df_leaderboard_expanded)}")
print(f"{round(leaderboard_without_model_linkage/len(df_leaderboard_expanded)*100,2)}% ({leaderboard_without_model_linkage} out of {len(df_leaderboard_expanded)}) leaderboards do not have any model provenence links.")
print(f"{round(leaderboard_hosted_on_multiple_platforms/len(df_leaderboard_expanded)*100,2)}% ({leaderboard_hosted_on_multiple_platforms} out of {len(df_leaderboard_expanded)}) leaderboards are hosted on multiple platforms.")

# Counts per exact 'Platforms' string, i.e. per platform combination rather than per platform.
leaderboard_distribution = df_leaderboard_expanded['Platforms'].value_counts().to_dict()

fig = px.bar(
    x=leaderboard_distribution.keys(),
    y=leaderboard_distribution.values(),
    text_auto=True,
    labels={'x': 'Host Platforms', 'y': 'Number of Leaderboards'},
)
# Update the layout for a tighter look
fig.update_layout(
    autosize=True,
    margin=dict(
        l=10,  # Left margin
        r=10,  # Right margin
        b=10,  # Bottom margin
        t=10,  # Top margin
        pad=4  # Padding between the plot and the margin
    ),
    xaxis=dict(title_font=dict(size=18, family='Arial, bold', color='black'),tickfont=dict(color='black')),
    yaxis=dict(title_font=dict(size=18, family='Arial, bold', color='black'),tickfont=dict(color='black')),
)
fig.show()
# Save the platform-combination chart as a PDF next to the metadata files.
pio.write_image(fig, f'{path_meta}/platform-combo-distribution.pdf')

# Per-platform totals; multi-platform leaderboards contribute to several lines, so the
# percentages need not add up to 100.
for key, value in leaderboard_host_platform_mapping.items():
    print(f"{key.upper()}: {value} ({round(value/len(df_leaderboard_expanded)*100,2)}%)")

Total number of leaderboards: 731
19.43% (142 out of 731) leaderboards do not have any model provenence links.
7.66% (56 out of 731) leaderboards are hosted on multiple platforms.


PWC: 447 (61.15%)
IP: 95 (13.0%)
GH: 151 (20.66%)
HF: 97 (13.27%)


In [45]:
# Distribution of leaderboards across development workflow patterns.
df = pd.read_csv(path_meta / 'leaderboard_processed.csv')
# The column was written as the text form of a Python list; parse it back into a list.
df['Development workflows'] = df['Development workflows'].apply(ast.literal_eval)

# df = df[df['Development workflows'].map(len) > 0]
# One row per (leaderboard, workflow pattern) pair, then count leaderboards per pattern.
df_workflow = df.explode('Development workflows')
df_workflow = df_workflow.groupby('Development workflows').size().reset_index(name='Frequency')
# Label every pattern as 'Pattern <x>', leaving the literal 'Unknown' untouched.
df_workflow['Development workflows'] = df_workflow['Development workflows'].apply(lambda x: f'Pattern {x}' if x != 'Unknown' else x)
df_workflow.sort_values(by='Frequency', ascending=False, inplace=True)
# Uniform weight column handed to the Gini computation below.
df_workflow['Weight'] = 1

# After the descending sort, row 0 is the most frequent pattern.
print(f"{df_workflow['Development workflows'].iloc[0]} is the most prevalent ({df_workflow['Frequency'].iloc[0]} out of {len(df)}) workflow patterns that accounts for {round(df_workflow['Frequency'].iloc[0]/len(df)*100,2)}%.")
# Leaderboards hosted exclusively on Papers With Code.
df_pwc = df[df['Platforms'] == 'pwc']
# Numerator: all leaderboards with more than one pattern; denominator: leaderboards that are
# not PWC-only.
print(f"There are {round(len(df[df['Development workflows'].map(len) > 1])/(len(df)-len(df_pwc))*100,2)}% ({len(df[df['Development workflows'].map(len) > 1])} out of {len(df)-len(df_pwc)}) non-PWC leaderboards with multiple workflow patterns.")
print(f'In {df_workflow["Development workflows"].iloc[0]}, {round(len(df_pwc)/df_workflow["Frequency"].iloc[0]*100,2)}% ({len(df_pwc)} out of {df_workflow["Frequency"].iloc[0]}) of the leaderboards are hosted on Papers With Code.')

# NOTE(review): Mean/Median/IQR are computed over every row of df_workflow, whereas the Gini
# coefficient excludes the 'Unknown' row -- confirm this asymmetry is intended.
statistics = {
    'Distribution': 'Leaderboards across Workflow Patterns',
    'Mean': np.mean(df_workflow['Frequency']),
    'Median': np.median(df_workflow['Frequency']),
    'IQR': df_workflow['Frequency'].quantile(0.75) - df_workflow['Frequency'].quantile(0.25),
    'Gini Coefficient': ineqpy.inequality.gini(data=df_workflow[df_workflow['Development workflows'] != 'Unknown'], income='Frequency', weights='Weight'),
}
# Upsert this distribution's row into the shared statistics CSV: append, keep only the
# latest row per 'Distribution', and store the rows sorted by name.
if os.path.exists(path_meta / 'leaderboard_statistics.csv'):
    df_statistics = pd.read_csv(path_meta / 'leaderboard_statistics.csv')
else:
    df_statistics = pd.DataFrame()
df_statistics = pd.concat([df_statistics, pd.DataFrame([statistics])], ignore_index=True)
df_statistics.drop_duplicates(subset=['Distribution'], keep='last', inplace=True)
df_statistics.sort_values(by='Distribution', inplace=True)
df_statistics.to_csv(path_meta / 'leaderboard_statistics.csv', index=False)

# Plot only the known patterns, as a percentage of all leaderboards.
df_workflow = df_workflow[df_workflow['Development workflows'] != 'Unknown']
df_workflow['Ratio'] = round(df_workflow['Frequency'] / len(df) * 100, 2)
# NOTE(review): the x-axis label reads 'Deployment workflows' while the plotted column is
# 'Development workflows' -- confirm which wording is intended.
fig = px.bar(
    x=df_workflow['Development workflows'], 
    y=df_workflow['Ratio'],
    text_auto=True,
    labels={'x': 'Deployment workflows', 'y': 'Leaderboard Percentage'},
)
fig.show()

Pattern 1 is the most prevalent (473 out of 731) workflow patterns that accounts for 64.71%.
There are 4.86% (14 out of 288) non-PWC leaderboards with multiple workflow patterns.
In Pattern 1, 93.66% (443 out of 473) of the leaderboards are hosted on Papers With Code.


In [46]:
# Share of leaderboards with a recorded value in '#Empty ranking dataframes (non-pwc)',
# followed by the frequency of each recorded value.
df = pd.read_csv(path_meta / 'leaderboard_processed.csv')
empty_dataframe_column = df['#Empty ranking dataframes (non-pwc)']
flagged_count = len(df[empty_dataframe_column.notna()])
print(f"{round(flagged_count/len(df)*100,2)}% ({flagged_count} out of {len(df)}) leaderboards have empty ranking dataframes.")
empty_dataframe_column.value_counts().reset_index(name='Frequency')

0.41% (3 out of 731) leaderboards have empty ranking dataframes.


Unnamed: 0,#Empty ranking dataframes (non-pwc),Frequency
0,25,1
1,Unknown,1
2,2,1


In [47]:
# Distribution of leaderboards across release organizations.
df = pd.read_csv(path_meta / 'leaderboard_processed.csv')
# The column was written as the text form of a Python list; parse it back into a list.
df['Release organizations'] = df['Release organizations'].apply(ast.literal_eval)

# One row per (leaderboard, organization) pair, then count leaderboards per organization.
df_split = (
    df.explode('Release organizations')
    .groupby('Release organizations')
    .size()
    .reset_index(name='Frequency')
)
df_split['Weight'] = 1  # uniform weight column handed to the Gini computation

frequency = df_split['Frequency']
statistics = {
    'Distribution': 'Leaderboards across Release Organizations',
    'Mean': np.mean(frequency),
    'Median': np.median(frequency),
    'IQR': frequency.quantile(0.75) - frequency.quantile(0.25),
    'Gini Coefficient': ineqpy.inequality.gini(data=df_split, income='Frequency', weights='Weight'),
}

# Upsert this distribution's row into the shared statistics CSV: append, keep only the
# latest row per 'Distribution', and store the rows sorted by name.
statistics_path = path_meta / 'leaderboard_statistics.csv'
df_statistics = pd.read_csv(statistics_path) if os.path.exists(statistics_path) else pd.DataFrame()
df_statistics = (
    pd.concat([df_statistics, pd.DataFrame([statistics])], ignore_index=True)
    .drop_duplicates(subset=['Distribution'], keep='last')
    .sort_values(by='Distribution')
)
df_statistics.to_csv(statistics_path, index=False)

# Ten most frequent organizations; row 0 is the most frequent one.
df_top = df_split.sort_values(by='Frequency', ascending=False).head(10)
top_organization = df_top['Release organizations'].iloc[0]
top_frequency = df_top['Frequency'].iloc[0]
# The reported organization count is the number of distinct names minus one.
organization_count = len(df_split['Release organizations'].unique())-1
print(f"{top_organization} stands out as the most prolific release organization among the {organization_count} identified organizations, contributing to a notable {round(top_frequency/len(df)*100,2)}% ({top_frequency} out of {len(df)}) leaderboards.")

# Percentage of all leaderboards, shown as the text on each bar.
df_top['Ratio'] = round(df_top['Frequency'] / len(df) * 100, 2)

top_bar = go.Bar(
    x=df_top['Frequency'],
    y=df_top['Release organizations'],
    orientation='h',
    text=df_top['Ratio'],
    textposition='auto',
)
fig = go.Figure(top_bar)
fig.update_layout(
    title='Number of Leaderboards across Release Organizations (Top 10)',
    xaxis_title='Leaderboard Number',
    yaxis_title='Publisher Name',
    yaxis_autorange='reversed',  # list the bars top-down
)
fig.show()


Papers With Code stands out as the most prolific release organization among the 263 identified organizations, contributing to a notable 60.6% (443 out of 731) leaderboards.


In [48]:
# Distribution of leaderboards across display formats.
df = pd.read_csv(path_meta / 'leaderboard_processed.csv')
# The column was written as the text form of a Python list; parse it back into a list.
df['Display formats'] = df['Display formats'].apply(ast.literal_eval)

df_multiple = df[df['Display formats'].map(len) > 1]
print(f"{round(len(df_multiple)/len(df)*100,2)}% ({len(df_multiple)} out of {len(df)}) leaderboards support multiple display formats.")
# `display_format_rankable` comes from the configuration cell; it is deliberately not
# redefined here so the two definitions cannot drift apart.
df_rankable = keep_rows_by_list_column(df, 'Display formats', display_format_rankable)
print(f"{round(len(df_rankable)/len(df)*100,2)}% ({len(df_rankable)} out of {len(df)}) leaderboards support rankable display formats.")
# value_counts sorts by descending frequency, so row 0 is the most common format.
df_rankable_split = df_rankable.explode('Display formats')['Display formats'].value_counts().reset_index(name='Frequency')
print(f"{df_rankable_split['Display formats'].iloc[0]} is the most popular display format of rankable leaderboards, accounting for {round(df_rankable_split['Frequency'].iloc[0]/len(df_rankable)*100,2)}% ({df_rankable_split['Frequency'].iloc[0]} out of {len(df_rankable)}) rankable leaderboards.")
df_split = df.explode('Display formats')['Display formats'].value_counts().reset_index(name='Frequency')
df_split['Weight'] = 1  # uniform weight column handed to the Gini computation

statistics = {
    'Distribution': 'Leaderboards across Display Formats',
    'Mean': np.mean(df_split['Frequency']),
    'Median': np.median(df_split['Frequency']),
    'IQR': df_split['Frequency'].quantile(0.75) - df_split['Frequency'].quantile(0.25),
    'Gini Coefficient': ineqpy.inequality.gini(data=df_split, income='Frequency', weights='Weight'),
}
# Upsert this distribution's row into the shared statistics CSV: append, keep only the
# latest row per 'Distribution', and store the rows sorted by name.
if os.path.exists(path_meta / 'leaderboard_statistics.csv'):
    df_statistics = pd.read_csv(path_meta / 'leaderboard_statistics.csv')
else:
    df_statistics = pd.DataFrame()
df_statistics = (
    pd.concat([df_statistics, pd.DataFrame([statistics])], ignore_index=True)
    .drop_duplicates(subset=['Distribution'], keep='last')
    .sort_values(by='Distribution')
)
df_statistics.to_csv(path_meta / 'leaderboard_statistics.csv', index=False)

# Percentage of all leaderboards offering each display format.
df_split['Ratio'] = round(df_split['Frequency'] / len(df) * 100, 2)

fig = px.bar(
    x=df_split['Display formats'],
    y=df_split['Ratio'],
    text_auto=True,
    # The y values are percentages ('Ratio'), not raw counts.
    labels={'x': 'Display format', 'y': 'Leaderboard Percentage'},
)
# Update the layout for a tighter look
fig.update_layout(
    autosize=True,
    margin=dict(
        l=10,  # Left margin
        r=10,  # Right margin
        b=10,  # Bottom margin
        t=10,  # Top margin
        pad=4  # Padding between the plot and the margin
    )
)
fig.show()

74.28% (543 out of 731) leaderboards support multiple display formats.
83.86% (613 out of 731) leaderboards support rankable display formats.
Rankable Table is the most popular display format of rankable leaderboards, accounting for 99.84% (612 out of 613) rankable leaderboards.


In [49]:
# Distribution of non-PWC-only leaderboards across publication venues.
df = pd.read_csv(path_meta / 'leaderboard_processed.csv')
df = df[df['Platforms'] != 'pwc']
# The column was written as the text form of a Python list; parse it back into a list.
df['Publication venues (non-pwc)'] = df['Publication venues (non-pwc)'].apply(ast.literal_eval)

# Leaderboards with at least one recorded publication venue.
df_pub = df[df['Publication venues (non-pwc)'].apply(len) > 0]
print(f"{round(len(df_pub)/len(df)*100,2)}% ({len(df_pub)} out of {len(df)}) leaderboards are associated with specific publications, including research articles, blog posts, and white papers.")

# Drop the leaderboards whose venues are all of a non-accepted publication type.
df_literature = filter_rows_by_list_column(df_pub, 'Publication venues (non-pwc)', non_accepted_publication_type)
print(f"{round(len(df_literature)/len(df_pub)*100,2)}% ({len(df_literature)} out of {len(df_pub)}) publications have been accepted in a specific workshop, conference, magzine, or journal.")

# One row per (leaderboard, venue) pair, then count leaderboards per venue.
df_split = (
    df_literature.explode('Publication venues (non-pwc)')
    .groupby('Publication venues (non-pwc)')
    .size()
    .reset_index(name='Frequency')
)
df_split['Weight'] = 1  # uniform weight column handed to the Gini computation

frequency = df_split['Frequency']
statistics = {
    'Distribution': 'Leaderboards across Publication Venues',
    'Mean': np.mean(frequency),
    'Median': np.median(frequency),
    'IQR': frequency.quantile(0.75) - frequency.quantile(0.25),
    'Gini Coefficient': ineqpy.inequality.gini(data=df_split, income='Frequency', weights='Weight'),
}

# Upsert this distribution's row into the shared statistics CSV: append, keep only the
# latest row per 'Distribution', and store the rows sorted by name.
statistics_path = path_meta / 'leaderboard_statistics.csv'
df_statistics = pd.read_csv(statistics_path) if os.path.exists(statistics_path) else pd.DataFrame()
df_statistics = (
    pd.concat([df_statistics, pd.DataFrame([statistics])], ignore_index=True)
    .drop_duplicates(subset=['Distribution'], keep='last')
    .sort_values(by='Distribution')
)
df_statistics.to_csv(statistics_path, index=False)

# Ten most frequent venues; row 0 is the most frequent one.
df_top = df_split.sort_values(by='Frequency', ascending=False).head(10)
top_venue = df_top['Publication venues (non-pwc)'].iloc[0]
top_frequency = df_top['Frequency'].iloc[0]
print(f"{top_venue} emerges as the conference with the most number of accepted publications, accounting for {round(top_frequency/len(df_literature)*100,2)}% ({top_frequency} out of {len(df_literature)}) of those published.")

# Percentage of the leaderboards with accepted publications, shown as the text on each bar.
df_top['Ratio'] = round(df_top['Frequency'] / len(df_literature) * 100, 2)

top_bar = go.Bar(
    x=df_top['Frequency'],
    y=df_top['Publication venues (non-pwc)'],
    orientation='h',
    text=df_top['Ratio'],
    textposition='auto',
)
fig = go.Figure(top_bar)
fig.update_layout(
    title='Number of non-PWC Leaderboards with Accepted Publications across Publication Venues (Top 10)',
    xaxis_title='Leaderboard Number',
    yaxis_title='Publication Name',
    yaxis_autorange='reversed',  # list the bars top-down
)
fig.show()


76.74% (221 out of 288) leaderboards are associated with specific publications, including research articles, blog posts, and white papers.
34.84% (77 out of 221) publications have been accepted in a specific workshop, conference, magzine, or journal.
NeurIPS emerges as the conference with the most number of accepted publications, accounting for 23.38% (18 out of 77) of those published.


In [50]:
# Share of leaderboards that provide each kind of supporting artefact. Most counts are the
# total minus a hand-maintained exclusion list (and, for some, minus the PWC-only rows).
df = pd.read_csv(path_meta / 'leaderboard_processed.csv')
leaderboard_total = len(df)

def _print_provision_share(count, offering):
    """Print ``count`` as a percentage of all leaderboards for the given offering."""
    print(f"{round(count/leaderboard_total*100, 2)}% ({count} out of {leaderboard_total}) leaderboards provide {offering}.")

_print_provision_share(leaderboard_total - len(leaderboard_system_without_documentation), 'documentation')
_print_provision_share(leaderboard_total - len(leaderboard_system_without_contacts), 'contact information')

# Submission channels: leaderboards with at least one workflow pattern from the list.
df['Development workflows'] = df['Development workflows'].apply(ast.literal_eval)
df_submission = keep_rows_by_list_column(df, 'Development workflows', workflow_patterns_with_submission)
_print_provision_share(len(df_submission), 'submission channels')

# The remaining counts additionally exclude leaderboards hosted only on Papers With Code.
df_pwc = df[df['Platforms'] == 'pwc']
_print_provision_share(leaderboard_total - len(leaderboard_system_without_evaluation_harness) - len(df_pwc), 'evaluation harness')
_print_provision_share(leaderboard_total - len(leaderboard_system_without_pull_requests) - len(df_pwc), 'pull request channels')
# NOTE(review): the discussion-forum figure reuses the pull-request exclusion list, so it is
# always identical to the line above -- confirm whether a separate list was intended.
_print_provision_share(leaderboard_total - len(leaderboard_system_without_pull_requests) - len(df_pwc), 'discussion forums')

99.86% (730 out of 731) leaderboards provide documentation.
99.86% (730 out of 731) leaderboards provide contact information.
100.0% (731 out of 731) leaderboards provide submission channels.
39.26% (287 out of 731) leaderboards provide evaluation harness.
37.62% (275 out of 731) leaderboards provide pull request channels.
37.62% (275 out of 731) leaderboards provide discussion forums.


In [51]:
# Render the accumulated distribution statistics as a LaTeX table.
df = pd.read_csv(path_meta / 'leaderboard_statistics.csv')

# Placeholder printed for any statistic that is missing.
missing_marker = '$\\times$'
column_templates = {
    'Mean': '{:.2f}',
    'Median': '{:.1f}',
    'IQR': '{:.0f}',
    'Gini Coefficient': '{:.3f}',
}
# Format each numeric column cell by cell and substitute the marker for missing values
# BEFORE any string lands in the column; applying a float format to the marker string
# itself would raise ValueError.
for column, template in column_templates.items():
    df[column] = [
        missing_marker if pd.isna(value) else template.format(value)
        for value in df[column]
    ]
# Cover any missing cells in the remaining (non-numeric) columns.
df = df.fillna(missing_marker)
print(df.to_latex(index=False))

\begin{tabular}{lllll}
\toprule
Distribution & Mean & Median & IQR & Gini Coefficient \\
\midrule
Leaderboards across Display Formats & 118.45 & 6.0 & 82 & 0.838 \\
Leaderboards across Publication Venues & 3.52 & 2.0 & 2 & 0.556 \\
Leaderboards across Release Organizations & 4.55 & 1.0 & 1 & 0.712 \\
Leaderboards across Workflow Patterns & 186.50 & 130.5 & 197 & 0.667 \\
\bottomrule
\end{tabular}



In [52]:
# Count the distinct non-empty entries of every column in every sheet of the workbook,
# except 'Leaderboard' (skipped) and 'Self-admitted Technical Debt' (only its size is reported).
total_smell_count = 0

with pd.ExcelFile(path_meta / 'Foundation Model Leaderboards.xlsx') as xls:

    # sheet name -> {column name -> number of distinct non-null values}
    unique_counts = {}

    for sheet_name in xls.sheet_names:
        if sheet_name == 'Leaderboard':
            continue

        df = pd.read_excel(xls, sheet_name=sheet_name)

        if sheet_name == 'Self-admitted Technical Debt':
            # NOTE(review): the reported count is one more than the number of rows read --
            # confirm why the extra 1 is added.
            print(f'There are {len(df)+1} SATD examples.')
            continue

        for column in df.columns:
            distinct_values = df[column].dropna().nunique()
            unique_counts.setdefault(sheet_name, {})[column] = distinct_values
            total_smell_count += distinct_values

print(f'There are {total_smell_count} leaderboard smell examples.')

# One row per sheet, one column per sheet column; blanks where a sheet lacks a column,
# and float counts shown as integers.
result_df = (
    pd.DataFrame(unique_counts)
    .transpose()
    .fillna('')
    .map(lambda x: int(x) if isinstance(x, float) else x)
)
result_df

There are 24 SATD examples.
There are 292 leaderboard smell examples.


Unnamed: 0,Benchmark Metric,Benchmark Protocol,Benchmark Raw Dataset,Evaluation Record/Result,Model (Information),Ranking Dataframe,Submission Channel/Guideline,Benchmark Task,Uncategorized
Confusing Entity,4.0,9.0,23.0,11.0,7.0,4.0,21.0,,
Deprecated Entity,,,3.0,1.0,1.0,7.0,3.0,,
Inaccessible Entity,,,2.0,3.0,1.0,14.0,6.0,,
Misdisplayed Entity,5.0,2.0,,,,3.0,,,
Mismatched Entity,3.0,1.0,2.0,2.0,6.0,1.0,,2.0,
Missing Entity,,1.0,,6.0,,12.0,1.0,,
Redundant Entity,,,,3.0,2.0,4.0,,,
Unresponsive Entity,,,,,,96.0,2.0,,
Others,,,,,,,,,18.0
