In [1]:
import os
import ast
import csv
import glob
import json
import ineqpy
import numpy as np
import pandas as pd
import scipy.stats as stats
import plotly.express as px
import plotly.graph_objects as go

from scipy import stats
from pathlib import Path
from collections import Counter
from collections import defaultdict

In [2]:
path_data = Path("../data")
path_meta = Path("../meta")
path_rq1 = path_meta / 'rq1'
path_rq2 = path_meta / 'rq2'

platform_abbreviation_mapping = {
    'ai2': 'Allen Institute for AI',
    'gh': 'GitHub',
    'hf': 'HuggingFace',
    'pwc': 'PapersWithCode',
    'shw': 'Self-hosted website',
}

platform_template = [
    'ai2',
    'pwc'
]

platform_nontemplate = [
    'gh',
    'hf',
    'shw'
]

platform_template_fullname = [platform_abbreviation_mapping[platform] for platform in platform_template]
platform_nontemplate_fullname = [platform_abbreviation_mapping[platform] for platform in platform_nontemplate]

leaderboard_segmentation_criteria_mapping = {
    'Functionality Evaluation': ['Domain Task', 'Model Capability', 'Evaluation Benchmark', 'Language Support', 'Task Modality'],
    'Evaluation Configuration': ['#Prompt Example', '#Prompt Token', 'Evaluation Dataset', 'Evaluation Metrics', 'Evaluator', 'Tokenizer'],
    'Leaderboard Version': ['Evaluation Aggregation', 'Leaderboard Series', 'Release Date'],
    'Model Information': ['Model Accessibility', 'Model Size', 'Model Type'],
}

organizer_synonyms = {
    'Alibaba Group': ['Alibaba Group DAMO Academy'],
    'Amazon': ['Amazon Alexa AI Lab', 'Amazon AWS AI Lab'],
    'Google': ['Google DeepMind'],
    'Huawei': ["Huawei Noah's Ark Lab"],
    'Meta': [
        'Meta FAIR',
        'Meta GenAI'
    ],
    'Microsoft': ['Microsoft Research Asia'],
    'Tencent': [
        'Tencent AI Lab',
        'Tencent PCG ARC Lab',
        'Tencent Youtu Lab'
    ],
    'University of California': [
        'University of California Berkeley',
        'University of California San Diego',
        'University of California Los Angeles',
    ],
    'University of North Carolina': ['University of North Carolina Chapel Hill'],
    'University of Illinois': ['University of Illinois Urbana-Champaign'],
    'University of Massachusetts': ['University of Massachusetts Amherst'],
    'University of Maryland': ['University of Maryland College Park']
}

metrics_synonyms = {
    'accuracy': [
        'acc',
        'accuarcy',
        'qa accuracy'
    ],
    'average': [
        'avg',
        '平均'
    ],
    'average accuracy': [
        'avg. accuracy'
    ],
    'average score': ['平均分'],
    'bleu': ['bleu score'],
    'bleu-1': [
        'narrativeqa bleu-1',
        'socialiqa bleu-1',
        'mcscript bleu-1',
        'cosmosqa bleu-1'
    ],
    'bleu-4': ['bleu4'],
    'bertscore': ['bert score'],
    'code': ['代码'],
    'elo rating': [
        'chatbot arena elo',
        'elo'
    ],
    'exact match': ['em', 'exact match accuracy'],
    'lerc': [
        'cosmosqa lerc',
        'mcscript lerc',
        'socialiqa lerc',
        'narrativeqa lerc'
    ],
    'link': ['url'],
    'mean rank': [
        'text-to-video mean rank',
        'video-to-text mean rank'
    ],
    'median rank': [
        'text-to-video median rank',
        'video-to-text median rank',
        'text-to-videomedian rank',
        'text-to-video medianr'
    ],
    'meteor': [
        'cosmosqa meteor',
        'narrativeqa meteor',
        'socialiqa meteor',
        'mcscript meteor'
    ],
    'neg mean rank': [
        'i->t neg mean rank',
        't->i neg mean rank'
    ],
    'organization': [
        '发布机构',
        '机构',
        'orgaisation',
    ],
    'others': ['其他'],
    'overall': ['xiezhi overall'],
    'overall score': [
        '总分',
        '总体分数'
    ],
    'pass@1': [
        'interview pass@1',
        'competition pass@1',
        'introductory pass@1'
    ],
    'pass@5': [
        'interview pass@5',
        'introductory pass@5',
        'competition pass@5'
    ],
    'pass@1000': [
        'interview pass@1000',
        'competition pass@1000',
        'introductory pass@1000'
    ],
    'pass@any': [
        'introductory pass@any',
        'competition pass@any',
        'interview pass@any'
    ],
    '#parameters':  [
        '#params',
        '# params',
        '#size',
        '参数量',
        'model size',
        'model size/b',
        'number of params',
        'param',
        'parameters',
        'params',
        'size'
    ],
    'perplexity': ['ppl'],
    'precision@1': ['i->t p@1'],
    'precision@20': ['p@20'],
    'recall@1': [
        'r@1',
        'text-to-videor@1',
        'video-to-text r@1',
        'text-to-video r@1',
        'text-to-image r@1',
        'image-to-text r@1'
    ],
    'recall@5': [
        'text-to-image r@5',
        'video-to-text r@5',
        'image-to-text r@5',
        'text-to-video r@5',
        'r@5',
    ],
    'recall@10': [
        'recall@10 on 1 rounds',
        'recall@10 on 2 rounds',
        'recall@10 on 3 rounds',
        'r@10',
        'video-to-text r@10',
        'text-to-image r@10',
        'text-to-video r@10',
        'image-to-text r@10'
    ],
    'recall@50': [
        'text-to-video r@50',
        'video-to-text r@50',
    ],
    'score': ['分数'],
    'submission date': ['提交时间'],
    'top-1 accuracy': ['top 1 accuracy'],
    'top-5 accuracy': ['top 5 accuracy'],
    'type': ['model type'],
    'win rate': ['胜率'],
    'word error rate': ['wer']
}

model_naming_convention = {
    'Model Publisher (first line)': [
        'Provider Leaderboard'
    ],
    'Repository Username': [
        'EQ-Bench (v2)',
        'HHEM Leaderboard',
        'Big Code Models Leaderboard',
        'Colossal-AI',
        'DecodingTrust',
        'LLMPerf Leaderboard',
        'LLM-Perf Leaderboard',
        'Open LLM Leaderboard',
        'Open Ko-LLM Leaderboard',
        'ScandEval'
    ],
}

metrics_synonyms_inverse = {}
for key, values in metrics_synonyms.items():
    for value in values:
        metrics_synonyms_inverse[value] = key

organizer_synonyms_inverse = {}
for key, values in organizer_synonyms.items():
    for value in values:
        organizer_synonyms_inverse[value] = key

leaderboard_segmentation_criteria_mapping_inverse = {}
for key, values in leaderboard_segmentation_criteria_mapping.items():
    for value in values:
        leaderboard_segmentation_criteria_mapping_inverse[value] = key

model_naming_convention_count = {
    k: len(v) for k, v in model_naming_convention.items()}

def convert_csv_to_dict(file_path):
    """
    Convert a CSV file of leaderboards and metrics to a dictionary.

    :param file_path: Path to the CSV file
    :return: Dictionary with leaderboards as keys and list of metrics as values
    """
    leaderboard_data = {}

    # Read the CSV file
    with open(file_path, mode='r') as file:
        csv_reader = csv.reader(file)
        next(csv_reader)  # Skip the header row

        # Iterate through each row
        for row in csv_reader:
            if row:  # Check if row is not empty
                key = row[0]  # The first element is the key
                # List comprehension to get non-empty values
                values = [value for value in row[1:] if value]
                leaderboard_data[key] = values

    return leaderboard_data


def transform_platform(groups):
    groups_processed = []
    for group in groups:
        group_processed = []
        for platform in group.split('_'):
            group_processed.append(platform_abbreviation_mapping[platform])
        groups_processed.append(group_processed)
    return groups_processed


def faltten_flatten_platform(groups):
    groups_processed = []
    for group in groups:
        group_processed = []
        for platform in group.split('_'):
            group_processed.append(platform_abbreviation_mapping[platform])
        groups_processed.extend(group_processed)
    return groups_processed


def filter_platform(groups, filters):
    groups_processed = []
    for group in groups:
        group_processed = []
        for platform in group:
            if platform not in filters:
                group_processed.append(platform)
        if group_processed:
            groups_processed.append(group_processed)
    return groups_processed


def split_string(text):
    if pd.isna(text):
        return []
    return text.split(',')

In [27]:
df = pd.read_csv(path_rq1 / 'Leaderboard Pattern.csv')
df.fillna('', inplace=True)
df.to_latex(
    path_rq1 / 'Leaderboard Pattern.tex',
    index=False,
    caption='Leaderboard Patterns and Their Characteristics',
    label='tab:pattern'
)

In [118]:
# Function to format each value based on its own decimal places
def format_individual_value(x):
    if isinstance(x, float):
        # Use string formatting to maintain original decimal places
        return "{:f}".format(x).rstrip('0').rstrip('.')
    return x

df = pd.read_csv(path_rq1 / 'Platform Ranking.csv')
formatted_df = df.map(format_individual_value)
formatted_df.to_latex(
    path_rq1 / 'Platform Ranking.tex',
    index=False,
    escape=True,
    caption='Leaderboard Platform and Their Rankings',
    label='tab:platform'
)

In [4]:
df = pd.read_csv(path_rq1 / 'Leaderboard.csv')

for index, row in df.iterrows():
    folder_path = os.path.join(path_data, row['Leaderboard'])
    os.makedirs(folder_path, exist_ok=True)


In [17]:
def print_empty_folders(root_dir):
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Check if both lists of subdirectories and filenames are empty
        if not dirnames and not filenames:
            print(f"Empty folder: {dirpath}")

# Replace with your root directory path
print_empty_folders(str(path_data))

In [27]:
directory = path_data / "EQ-Bench (v2)"
channel = ''

def file_rename(name, channel):
    name = name.lower()
    name = name.replace(' - ', '-')
    name = name.replace(' ', '_')
    name = name.replace(".csv", ".json")
    if channel:
        name = f'{channel}-{name}'
    return name

for file in glob.glob(str(directory / "*.csv")):
    df = pd.read_csv(file)
    filename = file.split('/')[-1]
    filename = file_rename(filename, channel)
    df.to_json(directory / filename, orient="records", indent=4)
    os.remove(file)


In [28]:
# Aligning Diverse Leaderboards for Standardized Analysis 

def process_model(model_str):
    if ('*' in model_str) and ('*' == model_str[-1]):
        model_str = model_str[:-1]
        
    if '](' in model_str:
        model_str = model_str.split('](')[0][1:]
        
    # Substrings to be removed
    substrings_to_remove = ['🥇', '🥈', '🥉', '🗒️', '📄', '🌍']
    
    # Remove each unwanted substring
    for substring in substrings_to_remove:
        model_str = model_str.replace(substring, '')
    
    # Strip whitespace
    return model_str.strip()

def process_json_files(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".json"):
                file_path = os.path.join(root, file)
                try:
                    df = pd.read_json(file_path)
                    if 'Model' not in df.columns:
                        if file.startswith('ai2'):
                            df.rename(columns={'Submission': 'Model'}, inplace=True)
                        else:
                            print(file_path)
                    df = df.dropna(subset=['Model'])
                    df['Model'] = df['Model'].apply(process_model)
                    df.to_json(file_path, orient='records', indent=4)
                except:
                    print(file_path)

process_json_files(str(path_data))


In [67]:
df = pd.read_csv(path_rq1 / 'Leaderboard.csv')
df['Feedback manner (shw-only)'] = df['Feedback manner (shw-only)'].apply(split_string)
df['Language'] = df['Language'].apply(split_string)
df['Leaderboard segmentation criteria for all platforms (non-pwc)'] = df['Leaderboard segmentation criteria for all platforms (non-pwc)'].apply(split_string)
df['Modality'] = df['Modality'].apply(split_string)
df['Platform'] = df['Platform'].apply(split_string)
df['Platform'] = df['Platform'].apply(transform_platform)
df['Platform (non-template) with model provenance links'] = df['Platform (non-template) with model provenance links'].apply(split_string)
df['Publication'] = df['Publication'].apply(split_string)
df['Publisher (non-template)'] = df['Publisher (non-template)'].apply(split_string)
df['Submission artifact (non-template)'] = df['Submission artifact (non-template)'].apply(split_string)
df['Submission manner (non-template)'] = df['Submission manner (non-template)'].apply(split_string)
df.to_csv(path_rq1 / 'Leaderboard_processed.csv', index=False)


In [162]:
benchmark_mapping = convert_csv_to_dict(path_rq1 / 'Benchmark.csv')

leaderboard_benchmarks = set()
for key in benchmark_mapping.keys():
    benchmarks = [benchmark if benchmark == 'ARC (The Abstraction and Reasoning Corpus)' else benchmark.split(' (')[0] for benchmark in benchmark_mapping[key]]
    leaderboard_benchmarks = leaderboard_benchmarks.union(set(benchmarks))
print('Benchmarks:', len(leaderboard_benchmarks))

leaderboard_tasks = set()
for key in benchmark_mapping.keys():
    tasks = set([benchmark for benchmark in benchmark_mapping[key]])
    leaderboard_tasks = leaderboard_tasks.union(tasks)
    benchmark_mapping[key] = list(tasks)
print('Tasks:', len(leaderboard_tasks))

df_benchmark = pd.DataFrame(list(benchmark_mapping.items()), columns=['Leaderboard', 'Benchmark'])
df_benchmark.to_csv(path_rq1 / 'Benchmark_processed.csv', index=False)

Benchmarks: 685
Tasks: 819


In [87]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Platform'] = df['Platform'].apply(ast.literal_eval)
df['Platform'] = df['Platform'].apply(filter_platform, filters=platform_template_fullname)
df = df[df['Platform'].map(len) > 0]
        
df['Submission manner (non-template)'] = df['Submission manner (non-template)'].apply(ast.literal_eval)
df['Submission artifact (non-template)'] = df['Submission artifact (non-template)'].apply(ast.literal_eval)
print(f"Of the {len(df)} non-template leaderboards collected, {len(df[df['Submission manner (non-template)'].map(len) > 0])} (accounting for {round(len(df[df['Submission manner (non-template)'].map(len) > 0])/len(df)*100,2)}%) do not explicitly encourage the submission of evaluation records.")

df = df[df['Submission manner (non-template)'].map(len) > 0]
df_manner = df.explode('Submission manner (non-template)')
df_manner = df_manner['Submission manner (non-template)'].value_counts().reset_index()
print(f"{df_manner['Submission manner (non-template)'].iloc[0]} is the most popular Submission manner for non-template leaderboards, accounting for {round(df_manner['count'].iloc[0]/len(df)*100,2)}% of the total.")

fig = px.bar(
    x=df_manner['Submission manner (non-template)'], 
    y=df_manner['count'],
    text_auto=True,
    labels={'x': 'Submission manner', 'y': 'Count of Benchmark-based Leaderboards'},
    title='Frequency of Submission manner across Different Platforms (non-template)'
)
# fig.write_image(path_rq1 / 'platform distribution.pdf')
fig.show()

df_artifact = df.explode('Submission artifact (non-template)')
df_artifact = df_artifact['Submission artifact (non-template)'].value_counts().reset_index()
print(f"{df_artifact['Submission artifact (non-template)'].iloc[0]} is the most popular Submission manner for non-template leaderboards, accounting for {round(df_artifact['count'].iloc[0]/len(df)*100,2)}% of the total.")

fig = px.bar(
    x=df_artifact['Submission artifact (non-template)'],
    y=df_artifact['count'],
    text_auto=True,
    labels={'x': 'Submission artifact', 'y': 'Count of Benchmark-based Leaderboards'},
    title='Frequency of Submission artifact across Different Platforms (non-template)'
)
# fig.write_image(path_rq1 / 'platform distribution.pdf')
fig.show()

Of the 138 non-template leaderboards collected, 69 (accounting for 50.0%) do not explicitly encourage the submission of evaluation records.
Submission Portal is the most popular Submission manner for non-template leaderboards, accounting for 40.58% of the total.


Evaluation Result is the most popular Submission manner for non-template leaderboards, accounting for 72.46% of the total.


In [86]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Platform'] = df['Platform'].apply(ast.literal_eval)
df['Platform'] = df['Platform'].apply(filter_platform, filters=platform_template_fullname)
df = df[df['Platform'].map(len) > 0]

df['Platform (non-template) with model provenance links'] = df['Platform (non-template) with model provenance links'].apply(ast.literal_eval)
print(f"{len(df[df['Platform (non-template) with model provenance links'].map(len) > 0])} non-template leaderboards have accessible linkage to the evaluated models in their leaderboards, accounting for {round(len(df[df['Platform (non-template) with model provenance links'].map(len) > 0])/len(df),4)*100}% of the total.")
df_provenance = df[df['Platform (non-template) with model provenance links'].map(len) > 0]
df_provenance = df_provenance.explode('Platform (non-template) with model provenance links')
df_provenance['Platform (non-template) with model provenance links'] = df_provenance['Platform (non-template) with model provenance links'].map(platform_abbreviation_mapping)
df_provenance = df_provenance['Platform (non-template) with model provenance links'].value_counts().reset_index()

fig = px.bar(
    x=df_provenance['Platform (non-template) with model provenance links'],
    y=df_provenance['count'],
    text_auto=True,
    labels={'x': 'Platform', 'y': 'Count of leaderboards with model provenance linkage'},
    title='Frequency of Leaderboards with Model Provenance Linkage across Different Platforms (non-template)'
)
# fig.write_image(path_rq1 / 'platform distribution.pdf')
fig.show()

total_data = 0
# total_paper = 0
# total_code = 0
total_provenance = 0

# average_paper_rate = []
# average_code_rate = []
average_provenance_rate = []

for root, dirs, files in os.walk(path_data):
    for file in files:
        if file.startswith('pwc') and file.endswith(".json"):
            with open(os.path.join(root, file), 'r') as file:
                json_data = json.load(file)
                total_data += len(json_data)
                json_external = [external for external in json_data if type(external['external_source_url']) == str]
                total_provenance += len(json_external)
                if not len(json_external):
                    json_internal = [paper for paper in json_data if type(paper['paper']['id']) == int]
                    total_provenance += len(json_internal)
                    average_provenance_rate.append(len(json_internal)/len(json_data))
                    # json_paper = [paper for paper in json_data if type(paper['paper']['id']) == int]
                    # total_paper += len(json_paper)
                    # average_paper_rate.append(len(json_paper)/len(json_data)*100)
                    # json_code = [code for code in json_paper if code['paper']['code'] == True]
                    # if len(json_code):
                    #     total_code += len(json_code)
                    #     average_code_rate.append(len(json_code)/len(json_paper)*100)

print(f'Total provenance rate across all leaderboard: {round(total_provenance/total_data, 4)*100}%, namely {total_provenance} out of {total_data} evaluation records.')
print(f'Average provenance rate across crowdsourced leaderboard: {round(sum(average_provenance_rate)/len(average_provenance_rate), 4)*100}%')


55 non-template leaderboards have accessible linkage to the evaluated models in their leaderboards, accounting for 39.86% of the total.


Total provenance rate across all leaderboard: 97.97%, namely 6628 out of 6765 evaluation records.
Average provenance rate across crowdsourced leaderboard: 98.95%


In [98]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Platform'] = df['Platform'].apply(ast.literal_eval)
df['Platform'] = df['Platform'].apply(filter_platform, filters=['PapersWithCode'])
for index, row in df.iterrows():
    if (len(row['Platform']) == 1) and (row['Platform'][0] == ['Self-hosted website']):
        continue
    df.drop(index, inplace=True)
    
df['Feedback manner (shw-only)'] = df['Feedback manner (shw-only)'].apply(ast.literal_eval)
print(f"Among leaderboards exclusively hosted on self-hosted websites (up to {len(df)}), {round(len(df[df['Feedback manner (shw-only)'].map(len) > 0])/len(df)*100,2)}% of them provide feedback channels, with only {len(df[df['Feedback manner (shw-only)'].map(len) == 0])} exception.")
df_provenance = df[df['Feedback manner (shw-only)'].map(len) > 0]
df_provenance = df_provenance.explode('Feedback manner (shw-only)')
df_provenance = df_provenance['Feedback manner (shw-only)'].value_counts().reset_index()

fig = px.bar(
    x=df_provenance['Feedback manner (shw-only)'],
    y=df_provenance['count'],
    text_auto=True,
    labels={'x': 'Platform', 'y': 'Count of leaderboards with model provenance linkage'},
    title='Frequency of Leaderboards with Model Provenance Linkage in Self-hosted Websites'
)
# fig.write_image(path_rq1 / 'platform distribution.pdf')
fig.show()

Among leaderboards exclusively hosted on self-hosted websites (up to 26), 96.15% provide feedback channels, with only 1 exception.


In [3]:
total_records = 0
total_publishers = 0
total_scenarios = 0
total_benchmarks = 0
total_template_scenarios = 0
total_template_publishers = 0
total_nontemplate_scenarios = 0
total_nontemplate_publishers = 0
    
def list_directories(folder_path):
    # List all subdirectories within the folder
    directories = [os.path.join(folder_path, d) for d in os.listdir(
        folder_path) if os.path.isdir(os.path.join(folder_path, d))]
    return directories

for folder in list_directories(path_data):
    total_benchmarks += 1
    prefixes = []
    
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.endswith('.json'):
                file_path = os.path.join(root, file)
                df = pd.read_json(file_path)
                total_records += len(df)
                total_scenarios += 1
                for prefix in platform_abbreviation_mapping.keys():
                    if file.startswith(prefix):
                        prefixes.append(prefix)
                        break
                    
    total_publishers += len(set(prefixes))
    template_publishers = [p for p in prefixes if p not in platform_nontemplate]
    nontemplate_publishers = [p for p in prefixes if p in platform_nontemplate]
    total_template_publishers += len(set(template_publishers))
    total_nontemplate_publishers += len(set(nontemplate_publishers))
    total_template_scenarios += len(template_publishers)
    total_nontemplate_scenarios += len(nontemplate_publishers)

print(f"Total number of evaluation records: {total_records}")
print(f"Total number of leaderboards (benchmark): {total_benchmarks}")
print(f"Total number of leaderboards (publisher): {total_publishers}")
print(f"Total number of leaderboards (scenario): {total_scenarios}")
print(f"Total number of leaderboards (publisher) (template): {total_template_publishers}")
print(f"Total number of leaderboards (scenario) (template): {total_template_scenarios}")
print(f"Total number of leaderboards (publisher) (non-template): {total_nontemplate_publishers}")
print(f"Total number of leaderboards (scenario) (non-template): {total_nontemplate_scenarios}")


Total number of evaluation records: 51047
Total number of leaderboards (benchmark): 263
Total number of leaderboards (publisher): 289
Total number of leaderboards (scenario): 2230
Total number of leaderboards (publisher) (template): 153
Total number of leaderboards (scenario) (template): 542
Total number of leaderboards (publisher) (non-template): 136
Total number of leaderboards (scenario) (non-template): 1688


In [26]:
# from openpyxl import load_workbook, Workbook

# # Path to your Excel file and output file
# input_file = '/Users/jimmy/Downloads/a.xlsx'
# output_file = '/Users/jimmy/Downloads/b.xlsx'

# # Load the workbook and select the active worksheet
# wb = load_workbook(input_file)
# ws = wb.active

# # Dictionary to store unique values and their hyperlinks
# unique_values = {}

# # Loop through all cells in the worksheet
# for row in ws.iter_rows():
#     for cell in row:
#         # Check if cell has a value
#         if cell.value:
#             # Store value and hyperlink (if any)
#             value_filtered = cell.value.split(' (')[0]
#             unique_values[value_filtered] = cell.hyperlink.target if cell.hyperlink else None

# # Sort the unique values
# sorted_unique_values = sorted(unique_values.items())

# # Create a new workbook and select the active worksheet
# new_wb = Workbook()
# new_ws = new_wb.active

# # Write unique values and hyperlinks to the new worksheet
# for i, (text, hyperlink) in enumerate(sorted_unique_values, start=1):
#     new_ws.cell(row=i, column=1, value=text)
#     if hyperlink:
#         new_ws.cell(row=i, column=1).hyperlink = hyperlink

# # Save the new workbook
# new_wb.save(output_file)

# print(f"Unique values with hyperlinks written to {output_file}")

Unique values with hyperlinks written to /Users/jimmy/Downloads/b.xlsx


In [None]:
# import pandas as pd
# import plotly.graph_objects as go

# # Load your DataFrame

# # Group by 'Platform' and 'Modality', then count occurrences
# grouped = df.groupby(['Modality', 'Platform']).size().reset_index(name='counts')

# # Pivot the DataFrame to get 'Platform' as index, 'Modality' as columns, and 'counts' as values
# pivot_df = grouped.pivot(index='Modality', columns='Platform', values='counts').fillna(0)

# # Create the traces for each 'Platform'
# traces = []
# for Platform in pivot_df.columns:
#     traces.append(go.Bar(
#         x=pivot_df.index,
#         y=pivot_df[Platform],
#         name=Platform,
#         text=pivot_df[Platform],
#         textposition='outside'  # This positions the text on top of the bars
#     ))

# # Create the figure and add traces
# fig = go.Figure(data=traces)

# # Update layout for a better look
# fig.update_layout(
#     barmode='group',
#     title='Frequency of Leaderboard by Platform and Modality',
#     xaxis_title='Task Modality',
#     yaxis_title='Count of Platform-based Leaderboards',
# )

# # Show the plot
# fig.write_image(path_rq1 / 'publisher modality distribution.pdf')

# # Show the figure
# fig.show()


In [8]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Publication'] = df['Publication'].apply(ast.literal_eval)
print(f"{len(df[df['Publication'].map(len) > 0])} benchmark-based leaderboards have related papers, preprints or reports, accounting for {round(len(df[df['Publication'].map(len) > 0])/len(df),4)*100}% of the total.")
df = df[df['Publication'].map(len) > 0]
print(f"{len(df[df['Publication'].apply(lambda x: 'Preprint' in x)])} benchmark-based leaderboards have papers from workshop, conference or journal within those publication, accounting for {round(len(df[df['Publication'].apply(lambda x: 'Preprint' in x)])/len(df),4)*100}% of the total.")
df = df[~df['Publication'].apply(lambda x: 'Preprint' in x)]
organizer_mapping = df.set_index('Leaderboard')['Publication'].to_dict()
org_item_count = Counter(org for orgs in organizer_mapping.values() for org in orgs)
df = pd.DataFrame(list(org_item_count.items()), columns=['Publication', '#Leaderboard'])
df['Weight'] = 1

# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Leaderboard', weights='Weight')
print("Gini Coefficient:", gini_coefficient)
df_top = df.sort_values(by='#Leaderboard', ascending=False).head(10)
print(f"{df_top['Publication'].iloc[0]} emerges as the most represented conference, accounting for {round(df_top['#Leaderboard'].iloc[0]/df['#Leaderboard'].sum(),4)*100}% of the total, with {df_top['#Leaderboard'].iloc[0]} out of {df['#Leaderboard'].sum()} papers originating from this conference.")

fig = go.Figure(go.Bar(
        x=df_top['#Leaderboard'],  # Values for the bar lengths
        y=df_top['Publication'],  # Categories for each bar
        orientation='h',  # Sets the bars to be horizontal
        text=df_top['#Leaderboard'],  # Adds the values as text on each bar
        textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
        title=f'Number of Leaderboards in Publication (Top 10)',
        xaxis_title='Leaderboard Number',
        yaxis_title='Publication Name',
        yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'publication distribution (top-10).pdf')
fig.show()


231 benchmark-based leaderboards have related papers, preprints or reports, accounting for 87.17% of the total.
89 benchmark-based leaderboards have accepted papers within those publication, accounting for 38.53% of the total.
Gini Coefficient: 0.5968253968253971
EMNLP emerges as the most represented conference, accounting for 15.97% of the total, with 23 out of 144 papers originating from this conference.


In [60]:
def organizer_synonyms_mapping(organizers, leaderboard):
    organizers_processed = set()
    for organizer in organizers:
        if organizer in organizer_synonyms_inverse:
            organizers_processed.add(organizer_synonyms_inverse[organizer])
        elif 'Independent Contributor' == organizer:
            organizers_processed.add(leaderboard)
        else:
            organizers_processed.add(organizer)
    return list(organizers_processed)

df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Publisher (non-template)'] = df['Publisher (non-template)'].apply(ast.literal_eval)
for index, row in df.iterrows():
    organizers = organizer_synonyms_mapping(row['Publisher (non-template)'], row['Leaderboard'])
    if 'pwc' in row['Platform']:
        organizers.append('PapersWithCode')
    if 'ai2' in row['Platform']:
        organizers.append('Allen Institute for AI')
    df.at[index, 'Publisher (non-template)'] = organizers
organizer_mapping = df.set_index('Leaderboard')['Publisher (non-template)'].to_dict()
org_item_count = Counter(org for orgs in organizer_mapping.values() for org in orgs)
df = pd.DataFrame(list(org_item_count.items()), columns=['Publisher', '#Leaderboard'])
df['Weight'] = 1

# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Leaderboard', weights='Weight')
print("Gini Coefficient:", gini_coefficient)
df_top = df.sort_values(by='#Leaderboard', ascending=False).head(10)
print(f"{df_top['Publisher'].iloc[0]} is the most productive publisher, accounting for {round(df_top['#Leaderboard'].iloc[0]/df['#Leaderboard'].sum(),4)*100}% benchmark-based leaderboards.")

fig = go.Figure(go.Bar(
        x=df_top['#Leaderboard'],  # Values for the bar lengths
        y=df_top['Publisher'],  # Categories for each bar
        orientation='h',  # Sets the bars to be horizontal
        text=df_top['#Leaderboard'],  # Adds the values as text on each bar
        textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
        title=f'Number of Leaderboards in Publishers (Top 10)',
        xaxis_title='Leaderboard Number',
        yaxis_title='Publisher Name',
        yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'publisher distribution (top-10).pdf')
fig.show()


Gini Coefficient: 0.6173701298701308
PapersWithCode is the most productive publisher, accounting for 25.6% benchmark-based leaderboards.


In [68]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Platform'] = df['Platform'].apply(ast.literal_eval)
df = df.explode('Platform')
print(f"Only {len(df[df['Platform'].map(len) > 1])} publishers host their leaderboards on multiple platforms, accounting for {round(len(df[df['Platform'].map(len) > 1])/len(df)*100,2)}% of the total.")

df = df.explode('Platform')
platform_counts = df['Platform'].value_counts().reset_index()
platform_counts['Weight'] = 1

# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=platform_counts, income='count', weights='Weight')
print("Gini Coefficient:", gini_coefficient)

print(f"{platform_counts['Platform'].iloc[0]} is the most popular host platform, accounting for {round(platform_counts['count'].iloc[0]/platform_counts['count'].sum(),4)*100}% benchmark-based leaderboards.")
fig = px.bar(
    x=platform_counts['Platform'],
    y=platform_counts['count'],
    text_auto=True,
    labels={'x': 'Platform', 'y': 'Count of benchmark-based Leaderboards'},
    title='Frequency of Leaderboard across Different Platform'
)
# fig.write_image(path_rq1 / 'platform distribution.pdf')
fig.show()

Only 32 publishers host their leaderboards on multiple platforms, accounting for 11.07% of the total.
Gini Coefficient: 0.47222222222222215
PapersWithCode is the most popular host platform, accounting for 42.59% benchmark-based leaderboards.


In [100]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Modality'] = df['Modality'].apply(ast.literal_eval)
modality_counts = df.explode('Modality')['Modality'].value_counts().reset_index()
modality_counts['Weight'] = 1

# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=modality_counts, income='count', weights='Weight')
print("Gini Coefficient:", gini_coefficient)

fig = px.bar(
    x=modality_counts['Modality'],
    y=modality_counts['count'],
    text_auto=True,
    labels={'x': 'Modality', 'y': 'Count of benchmark-based Leaderboards'},
    title='Frequency of Leaderboard across Different Modalities'
)
# fig.write_image(path_rq1 / 'task modality distribution.pdf')
fig.show()


Gini Coefficient: 0.7663157894736843


In [None]:
# df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
# df['Platform'] = df['Platform'].apply(ast.literal_eval)
# df['Platform'] = df['Platform'].apply(faltten_flatten_platform)
# df['Modality'] = df['Modality'].apply(ast.literal_eval)
# df = df.explode('Platform').explode('Modality')

# df = df[['Platform', 'Modality']].value_counts().reset_index(name='count')
# pivot_table = df.pivot(index='Modality', columns='Platform', values='count').fillna(0)
# text_labels = pivot_table.values.astype(int)

# fig = go.Figure(data=go.Heatmap(
#     z=pivot_table.values,
#     x=pivot_table.columns.tolist(),
#     y=pivot_table.index.tolist(),
#     colorscale='GnBu',
#     text=text_labels,
#     texttemplate="%{text}",
#     textfont={"size":10}
# ))

# fig.update_layout(
#     title='Frequency of Leaderboard by Platform and Modality',
#     xaxis_title='Leaderboard Platform',
#     yaxis_title='Task Modality'
# )

# fig.write_image(path_rq1 / 'platform modality distribution.pdf')
# fig.show()

In [108]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Language'] = df['Language'].apply(ast.literal_eval)
print(f"{len(df[df['Language'].apply(lambda x: 'English' in x)])} benchmark-based ones focus on English tasks, comprising {round(len(df[df['Language'].apply(lambda x: 'English' in x)])/len(df),4)*100}% of all leaderboards.")
language_counts = df.explode('Language')['Language'].value_counts().reset_index()
language_counts['Weight'] = 1

# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=language_counts, income='count', weights='Weight')
print("Gini Coefficient:", gini_coefficient)

language_counts = language_counts[language_counts['count'] > 3]
fig = px.bar(
    x=language_counts['Language'],
    y=language_counts['count'],
    text_auto=True,
    labels={'x': 'Language', 'y': 'Count of benchmark-based Leaderboards'},
    title='Frequency of Leaderboard across Different Languages'
)
# fig.write_image(path_rq1 / 'language distribution.pdf')
fig.show()


235 benchmark-based ones focus on English tasks, comprising 88.68% of all leaderboards.
Gini Coefficient: 0.7058285016783629


In [9]:
leaderboard_scenario_count = {}
leaderboard_count = 0
for directory in os.listdir(path_data):
    subdirectory_path = os.path.join(path_data, directory)
    leaderboard_count += 1
    scenario = 0
    for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
        file_name = file.split('/')[-1]
        if not file_name.startswith('pwc'):
            scenario += 1
    if scenario:
        leaderboard_scenario_count[directory] = scenario
        
df = pd.DataFrame(list(leaderboard_scenario_count.items()), columns=['Leaderboard', '#Scenario'])
print(f'{len(df[df["#Scenario"] > 1])} publisher-based leaderboards (excluding PapersWithCode) have multiple scenarios, accounting for {round(len(df[df["#Scenario"] > 1])/len(df)*100,2)}% of the total.')

df_top = df.sort_values(by='#Scenario', ascending=False).head(10)
print(f"{df_top['Leaderboard'].iloc[0]} holds the most number of scenario up to {df_top['#Scenario'].iloc[0]}.")

fig = go.Figure(go.Bar(
        x=df_top['#Scenario'],  # Values for the bar lengths
        y=df_top['Leaderboard'],  # Categories for each bar
        orientation='h',  # Sets the bars to be horizontal
        text=df_top['#Scenario'],  # Adds the values as text on each bar
        textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
        title=f'Number of Scenario in Leaderboards (Top 10)',
        xaxis_title='Scenario Number',
        yaxis_title='Leaderboard Name',
        yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'scenario distribution (top-10).pdf')
fig.show()

scenario_counts = df.explode('#Scenario')['#Scenario'].value_counts().reset_index()
scenario_counts['Weight'] = 1

# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=scenario_counts, income='count', weights='Weight')
print("Gini Coefficient:", gini_coefficient)

scenario_counts = scenario_counts[scenario_counts['#Scenario'] < 10]
fig = px.bar(
    x=scenario_counts['#Scenario'],
    y=scenario_counts['count'],
    text_auto=True,
    labels={'x': '#Scenario', 'y': 'Count of Publisher-based Leaderboards'},
    title='Frequency of Scenarios across Different Leaderboards'
)
# fig.write_image(path_rq1 / 'scenario distribution.pdf')
fig.show()

65 publisher-based leaderboards (excluding PapersWithCode) have multiple scenarios, accounting for 42.76% of the total.
LMExamQA holds the most number of scenario up to 884.


Gini Coefficient: 0.7952302631578947


In [140]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Leaderboard segmentation criteria for all platforms (non-pwc)'] = df['Leaderboard segmentation criteria for all platforms (non-pwc)'].apply(ast.literal_eval)
df = df[df['Leaderboard segmentation criteria for all platforms (non-pwc)'].map(len) > 0]
print(f"{len(df[df['#Empty multi-scenario leaderboards (non-pwc)'].notna()])} multi-scenario leaderboards (excluding PapersWithCode) have missing scenarios, accounting for {round(len(df[df['#Empty multi-scenario leaderboards (non-pwc)'].notna()])/len(df)*100,2)}% of the total.")
df['#Empty multi-scenario leaderboards (non-pwc)'].value_counts().reset_index()


5 multi-scenario leaderboards (non-PapersWithCode) have missing scenarios, accounting for 7.25% of the total.


Unnamed: 0,#Empty multi-scenario leaderboards (non-pwc),count
0,2,3
1,18,1
2,?,1


In [156]:
df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
df['Leaderboard segmentation criteria for all platforms (non-pwc)'] = df['Leaderboard segmentation criteria for all platforms (non-pwc)'].apply(ast.literal_eval)
df = df.explode('Leaderboard segmentation criteria for all platforms (non-pwc)')
df = df[df['Leaderboard segmentation criteria for all platforms (non-pwc)'].notna()]

df_number = pd.DataFrame()
for index, (name, group) in enumerate(df.groupby('Leaderboard segmentation criteria for all platforms (non-pwc)')):
    entry = {
        'Index': index, # Temporarily store the original index
        'Segmentation criteria': name,
        'Major category': leaderboard_segmentation_criteria_mapping_inverse[name],
        'Proportion (%)': len(group)/len(df)*100,
    }
    df_number = pd.concat([df_number, pd.DataFrame([entry])], ignore_index=True)

df_number = df_number.sort_values(by=['Major category', 'Segmentation criteria']).reset_index(drop=True)
df_number['Index'] = df_number.index + 1
df_number['Index'] = df_number['Index'].apply(lambda x: f'$C_{{0{x}}}$' if x < 10 else f'$C_{{{x}}}$')
print(df_number.to_latex(float_format="%.2f", index=False))

df['Leaderboard segmentation criteria'] = df['Leaderboard segmentation criteria for all platforms (non-pwc)'].map(leaderboard_segmentation_criteria_mapping_inverse)
split_counts = df['Leaderboard segmentation criteria'].value_counts().reset_index()
# df.drop_duplicates(subset=['Leaderboard', 'Leaderboard segmentation criteria'], inplace=True)
# 
# df.head()

# scenario_counts = scenario_counts[scenario_counts['#Scenario'] < 10]
fig = px.bar(
    x=split_counts['Leaderboard segmentation criteria'],
    y=split_counts['count'],
    text_auto=True,
    labels={'x': 'Major segmentation criteria', 'y': 'Count of Publisher-based Leaderboards'},
    title='Frequency of Major Scenario Segmentation Criteria across Different Leaderboards (Non-PapersWithCode)'
)
# fig.write_image(path_rq1 / 'scenario distribution.pdf')
fig.show()

\begin{tabular}{lllr}
\toprule
Index & Segmentation criteria & Major category & Proportion (%) \\
\midrule
$C_{01}$ & #Prompt Example & Evaluation Configuration & 3.17 \\
$C_{02}$ & #Prompt Token & Evaluation Configuration & 0.79 \\
$C_{03}$ & Evaluation Dataset & Evaluation Configuration & 6.35 \\
$C_{04}$ & Evaluation Metrics & Evaluation Configuration & 10.32 \\
$C_{05}$ & Evaluator & Evaluation Configuration & 3.97 \\
$C_{06}$ & Tokenizer & Evaluation Configuration & 0.79 \\
$C_{07}$ & Domain Task & Functionality Evaluation & 19.84 \\
$C_{08}$ & Evaluation Benchmark & Functionality Evaluation & 7.14 \\
$C_{09}$ & Language Support & Functionality Evaluation & 7.14 \\
$C_{10}$ & Model Capability & Functionality Evaluation & 13.49 \\
$C_{11}$ & Task Modality & Functionality Evaluation & 3.17 \\
$C_{12}$ & Evaluation Aggregation & Leaderboard Version & 12.70 \\
$C_{13}$ & Leaderboard Series & Leaderboard Version & 1.59 \\
$C_{14}$ & Release Date & Leaderboard Version & 1.59 \\
$C_{15}$

In [189]:
leaderboard_submission_count = {}
for directory in os.listdir(path_data):
    subdirectory_path = os.path.join(path_data, directory)
    submission = 0
    for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
        df = pd.read_json(file)
        submission += len(df)
    leaderboard_submission_count[directory] = submission
    
# Basic statistics
df = pd.DataFrame(list(leaderboard_submission_count.items()), columns=['Leaderboard', '#Submission'])
mean = np.mean(df['#Submission'])
median = np.median(df['#Submission'])
mode = stats.mode(df['#Submission'])[0]  # mode() returns a ModeResult object, hence the indexing
variance = np.var(df['#Submission'])
standard_deviation = np.std(df['#Submission'])

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Mode: {mode}")
print(f"Variance: {variance}")
print(f"Standard Deviation: {standard_deviation}")

# lambda_poisson = np.mean(df['#Submission'])
# # Generating a Poisson distribution with the same λ and number of observations
# poisson_dist = stats.poisson.rvs(mu=lambda_poisson, size=len(df))
# # Conducting a Kolmogorov-Smirnov test to compare the data to the theoretical Poisson distribution
# ks_statistic, p_value = stats.ks_2samp(df['#Submission'], poisson_dist)
# print(f'KS statistic: {ks_statistic}')
# print(f'p-value: {p_value}')
# skewness = stats.skew(df['#Submission'])
# kurtosis = stats.kurtosis(df['#Submission'])
# print('Skewness:', skewness)
# print('Kurtosis:', kurtosis)

# Calculate the Gini coefficient
df['Weight'] = 1
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Submission', weights='Weight')
print("Gini Coefficient:", gini_coefficient)
df_top = df.sort_values(by='#Submission', ascending=False).head(10)

fig = go.Figure(go.Bar(
    x=df_top['#Submission'],  # Values for the bar lengths
    y=df_top['Leaderboard'],  # Categories for each bar
    orientation='h',  # Sets the bars to be horizontal
    text=df_top['#Submission'],  # Adds the values as text on each bar
    textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
    title=f'Number of Evaluation Records in Leaderboards (Top 10)',
    xaxis_title='Number of Evaluation Records',
    yaxis_title='Leaderboard Name',
    yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'evaluation record distribution (top-10).pdf')
fig.show()

# df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
# df['Modality'] = df['Modality'].apply(ast.literal_eval)
# leaderboard_modality_mapping = df.set_index('Leaderboard')['Modality'].to_dict()

# leaderboard_modality_mapping_inverse = defaultdict(list)
# for k, v in leaderboard_modality_mapping.items():
#     for item in v:
#         leaderboard_modality_mapping_inverse[item].append(k)

# fig = go.Figure()
# for k, v in leaderboard_modality_mapping_inverse.items():
#     submissions = [leaderboard_submission_count[i] for i in v]
#     fig.add_trace(go.Box(y=submissions, name=k))
# fig.update_layout(title="Distribution of Number of Evaluation Records of Benchmark-based Leaderboards by Task Modality",
#                   yaxis=dict(title='Number of Evaluation Records', type='log'),
#                   xaxis=dict(title='Task Modality'))
# fig.write_image(path_rq1 / 'evaluation record distribution by task modality.pdf')
# fig.show()

# df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
# df['Platform'] = df['Platform'].apply(ast.literal_eval)
# df['Platform'] = df['Platform'].apply(flatten_string)
# df = df.explode('Platform')
# leaderboard_platform_mapping = df.set_index('Leaderboard')['Platform'].to_dict()

# leaderboard_platform_mapping_inverse = defaultdict(list)
# for k, v in leaderboard_platform_mapping.items():
#     leaderboard_platform_mapping_inverse[v].append(k)

# fig = go.Figure()
# for k, v in leaderboard_platform_mapping_inverse.items():
#     submissions = [leaderboard_submission_count[i] for i in v]
#     fig.add_trace(go.Box(y=submissions, name=k))
# fig.update_layout(title="Distribution of Number of Evaluation Records of Benchmark-based Leaderboards by Platform",
#                   yaxis=dict(title='Number of Evaluation Records', type='log'),
#                   xaxis=dict(title='Leaderboard Platform'))
# # fig.write_image(path_rq1 / 'evaluation record distribution by leaderboard platform.pdf')
# fig.show()


Mean: 198.1401515151515
Median: 30.0
Mode: 16
Variance: 1387414.9917211889
Standard Deviation: 1177.8858143815082
Gini Coefficient: 0.86035991014785


In [171]:
df_leaderboard = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
leaderboard_mapping = {leaderboard: [leaderboard] for leaderboard in df_leaderboard['Leaderboard'].tolist()}

df_benchmark = pd.read_csv(path_rq1 / 'Benchmark_processed.csv')
df_benchmark['Benchmark'] = df_benchmark['Benchmark'].apply(ast.literal_eval)

benchmark_mapping = dict(zip(df_benchmark['Leaderboard'], df_benchmark['Benchmark']))
for key in benchmark_mapping.keys():
    benchmarks = set([benchmark if benchmark == 'ARC (The Abstraction and Reasoning Corpus)' else benchmark.split(' (')[0] for benchmark in benchmark_mapping[key]])
    benchmark_mapping[key] = list(benchmarks)

for leaderboard in leaderboard_mapping.keys():
    if leaderboard in benchmark_mapping:
        leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard]
    elif leaderboard == 'MTEB':
        leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard] + benchmark_mapping['CMTEB']

benchmark_mapping_count = {key: len(value) for key, value in leaderboard_mapping.items()}

df = pd.DataFrame(list(benchmark_mapping_count.items()), columns=['Leaderboard', '#Benchmark'])
df['Weight'] = 1
# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Benchmark', weights='Weight')
print("Gini coefficient:", gini_coefficient)

# lambda_poisson = np.mean(benchmark_distribution)
# # Generating a Poisson distribution with the same λ and number of observations
# poisson_dist = stats.poisson.rvs(mu=lambda_poisson, size=len(benchmark_distribution))
# # Conducting a Kolmogorov-Smirnov test to compare the data to the theoretical Poisson distribution
# ks_statistic, p_value = stats.ks_2samp(benchmark_distribution, poisson_dist)
# print(f'KS statistic: {ks_statistic}')
# print(f'p-value: {p_value}')
# skewness = stats.skew(benchmark_distribution)
# kurtosis = stats.kurtosis(benchmark_distribution)
# print('Skewness:', skewness)
# print('Kurtosis:', kurtosis)

benchmark_distribution = [value for value in benchmark_mapping_count.values() if value < 11]
fig = go.Figure(data=[go.Histogram(x=benchmark_distribution)])
fig.update_layout(
    title="Distribution of Number of Benchmarks across Benchmark-based Leaderboards",
    xaxis_title="Number of Benchmarks",
    yaxis_title="Number of Occurrence",
    bargap=0.1,  # Gap between bars of adjacent location coordinates
)
# fig.write_image(path_rq1 / 'benchmark distribution.pdf')
fig.show()

# df_leaderboard['Modality'] = df_leaderboard['Modality'].apply(ast.literal_eval)
# df_leaderboard = df_leaderboard.explode('Modality')
# fig = go.Figure()
# for name, group in df_leaderboard.groupby('Modality'):
#     leaderboard_count = []
#     for leaderboard in group['Leaderboard'].tolist():
#         leaderboard_count.append(benchmark_mapping_count[leaderboard])
#     fig.add_trace(go.Box(y=leaderboard_count, name=name))
# fig.update_layout(title="Distribution of Number of Benchmarks by Task Modality",
#                   yaxis=dict(title='Number of Benchmarks', type='log'),
#                   xaxis=dict(title='Task Modality'))
# fig.write_image(path_rq1 / 'benchmark distribution by task modality.pdf')
# fig.show()

Gini coefficient: 0.7084220347446712


In [174]:
df_leaderboard = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
leaderboard_mapping = {leaderboard: [leaderboard] for leaderboard in df_leaderboard['Leaderboard'].tolist()}

df_benchmark = pd.read_csv(path_rq1 / 'Benchmark_processed.csv')
df_benchmark['Benchmark'] = df_benchmark['Benchmark'].apply(ast.literal_eval)

benchmark_mapping = dict(zip(df_benchmark['Leaderboard'], df_benchmark['Benchmark']))
for key in benchmark_mapping.keys():
    benchmarks = set([benchmark if benchmark == 'ARC (The Abstraction and Reasoning Corpus)' else benchmark.split(' (')[0] for benchmark in benchmark_mapping[key]])
    benchmark_mapping[key] = list(benchmarks)

for leaderboard in leaderboard_mapping.keys():
    if leaderboard in benchmark_mapping:
        leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard]
    elif leaderboard == 'MTEB':
        leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard] + benchmark_mapping['CMTEB']

benchmark_mapping_inverse = defaultdict(list)
for key, values in leaderboard_mapping.items():
    for value in values:
        benchmark_mapping_inverse[value].append(key)

benchmark_mapping_inverse_count = {key: len(value) for key, value in benchmark_mapping_inverse.items()}
labels, values = zip(*sorted(benchmark_mapping_inverse_count.items(), key=lambda x: x[1], reverse=True))

# Calculate the Gini coefficient
df = pd.DataFrame(list(benchmark_mapping_inverse_count.items()), columns=['Benchmark', '#Leaderboard'])
df['Weight'] = 1
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Leaderboard', weights='Weight')
print("Gini coefficient:", gini_coefficient)

fig = go.Figure(go.Bar(
        x=values[:10],  # Values for the bar lengths
        y=labels[:10],  # Categories for each bar
        orientation='h',  # Sets the bars to be horizontal
        text=values[:10],  # Adds the values as text on each bar
        textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
        title=f'Number of Benchmark in Leaderboards (Top 10)',
        xaxis_title='Leaderboard Number',
        yaxis_title='Benchmark Name',
        yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'benchmark distribution (top-10).pdf')
fig.show()


Gini coefficient: 0.27036373065334235


In [None]:
# df_leaderboard = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
# df_leaderboard['Leaderboard segmentation criteria for all platforms (non-pwc)'] = df_leaderboard['Leaderboard segmentation criteria for all platforms (non-pwc)'].apply(ast.literal_eval)
# leaderboard_mapping = {leaderboard: [leaderboard] for leaderboard in df_leaderboard['Leaderboard'].tolist()}

# df_benchmark = pd.read_csv(path_rq1 / 'Benchmark_processed.csv')
# df_benchmark['Benchmark'] = df_benchmark['Benchmark'].apply(ast.literal_eval)

# benchmark_mapping = dict(zip(df_benchmark['Leaderboard'], df_benchmark['Benchmark']))
# for key in benchmark_mapping.keys():
#     benchmarks = set([benchmark if benchmark == 'ARC (The Abstraction and Reasoning Corpus)' else benchmark.split(' (')[0] for benchmark in benchmark_mapping[key]])
#     benchmark_mapping[key] = list(benchmarks)

# for leaderboard in leaderboard_mapping.keys():
#     if leaderboard in benchmark_mapping:
#         leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard]
#     elif leaderboard == 'MTEB':
#         leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard] + benchmark_mapping['CMTEB']

# leaderboard_split_mapping = dict(zip(df_leaderboard['Leaderboard'], df_leaderboard['Leaderboard segmentation criteria for all platforms (non-pwc)']))
# benchmark_split_mapping = defaultdict(list)
# for leaderboard, benchmarks in leaderboard_mapping.items():
#     for benchmark in benchmarks:
#         if benchmark not in labels:
#             continue
#         for split in leaderboard_split_mapping[leaderboard]:
#             benchmark_split_mapping[benchmark].append(leaderboard_segmentation_criteria_mapping_inverse[split])

# # Convert dictionary to DataFrame
# df = pd.DataFrame([(key, val) for key, vals in benchmark_split_mapping.items() for val in vals], columns=['Benchmark', 'Leaderboard split'])

# # Count the frequency of each value for each key
# df_count = df.groupby(['Benchmark', 'Leaderboard split']).size().reset_index(name='Leaderboard Count')

# # Create a group bar chart
# fig = px.bar(df_count, x='Benchmark', y='Leaderboard Count', color='Leaderboard split', barmode='group', title='Frequency of Scenario-based Leaderboard Split for Each Benchmark')

# fig.show()

In [62]:
# df_leaderboard = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
# df_leaderboard['Leaderboard segmentation criteria for all platforms (non-pwc)'] = df_leaderboard['Leaderboard segmentation criteria for all platforms (non-pwc)'].apply(ast.literal_eval)
# leaderboard_mapping = {leaderboard: [leaderboard] for leaderboard in df_leaderboard['Leaderboard'].tolist()}

# df_benchmark = pd.read_csv(path_rq1 / 'Benchmark_processed.csv')
# df_benchmark['Benchmark'] = df_benchmark['Benchmark'].apply(ast.literal_eval)

# benchmark_mapping = dict(zip(df_benchmark['Leaderboard'], df_benchmark['Benchmark']))
# for key in benchmark_mapping.keys():
#     benchmarks = Counter([benchmark if benchmark == 'ARC (The Abstraction and Reasoning Corpus)' else benchmark.split(' (')[0] for benchmark in benchmark_mapping[key]])
#     benchmark_mapping[key] = dict(benchmarks)

# for leaderboard in leaderboard_mapping.keys():
#     if leaderboard in benchmark_mapping:
#         leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard]
#     elif leaderboard == 'MTEB':
#         leaderboard_mapping[leaderboard] = benchmark_mapping[leaderboard] | benchmark_mapping['CMTEB']
#     else:
#         leaderboard_mapping[leaderboard] = {leaderboard: 1}

# # Creating the Node and Edge tables
# node_data = []
# edge_data = []

# for key in leaderboard_mapping:
#     total_weight = sum(leaderboard_mapping[key].values())
#     node_data.append({"Id": key, "Label": key, "Size": total_weight})
#     for sub_key, weight in leaderboard_mapping[key].items():
#         edge_data.append({"Platform": key, "Target": sub_key, "Weight": weight})

# # Convert to DataFrame
# node_table = pd.DataFrame(node_data)
# edge_table = pd.DataFrame(edge_data)

# node_table.to_excel(path_rq1 / 'node.xlsx', index=False)
# edge_table.to_excel(path_rq1 / 'edge.xlsx', index=False)

In [None]:
# import networkx as nx
# import matplotlib.pyplot as plt

# # Create a graph
# G = nx.Graph()
# df_leaderboard_benchmark = convert_csv_to_dict(path_rq1 / 'Benchmark.csv')

# # Add nodes and edges
# for key, values in df_leaderboard_benchmark.items():
#     G.add_node(key)
#     for value in values:
#         G.add_edge(key, value)

# # # Increase figure size
# plt.figure(figsize=(30, 30))  # You can adjust the size as needed

# # Adjust layout (you can experiment with different layouts)
# pos = nx.spring_layout(G)  # 'spring_layout' is often a good choice

# # Draw the graph
# nx.draw(G, with_labels=True, node_color='lightblue',
#         font_size=10, node_size=500)
# # Save the figure
# plt.savefig(path_rq1 / "benchmark network graph.pdf", format='png', dpi=300)  # Adjust filename, format, and dpi as needed
# plt.show()


In [None]:
# df_leaderboard_metrics = convert_csv_to_dict(path_rq1 / 'Metrics.csv')
# metrics = set()
# for key, value in df_leaderboard_metrics.items():
#     # print(value)
#     metrics = metrics.union(set(value))
# print('Metrics:', len(metrics))

In [None]:
# from collections import Counter
# df_leaderboard_metrics = convert_csv_to_dict(path_rq1 / 'Metrics.csv

# all_values = []
# for value_list in df_leaderboard_metrics.values():
#     all_values.extend(value_list)

# # Count occurrences of each string
# counter = Counter(all_values)

# # Get the top-10 most common strings
# top_10 = counter.most_common(10)

# # Prepare data for Plotly
# labels, values = zip(*top_10)

# fig = go.Figure(go.Bar(
#     x=values,  # Values for the bar lengths
#     y=labels,  # Categories for each bar
#     orientation='h',  # Sets the bars to be horizontal
#     text=values,  # Adds the values as text on each bar
#     textposition='auto'  # Automatically positions the text on the bars
# ))

# fig.update_layout(
#     title=f'Frequency of Metrics in Leaderboards (Top 10)',
#     xaxis_title='Leaderboad Number',
#     yaxis_title='Metrics Name',
#     yaxis_autorange='reversed'  # This line makes the bars go top-down
# )

# fig.write_image(path_rq1 / 'metrics distribution.pdf')
# fig.show()

In [None]:
# import networkx as nx
# import matplotlib.pyplot as plt

# # Create a graph
# G = nx.Graph()
# df_leaderboard_benchmark = convert_csv_to_dict(path_rq1 / 'Metrics.csv')

# # Add nodes and edges
# for key, values in df_leaderboard_benchmark.items():
#     G.add_node(key)
#     for value in values:
#         G.add_edge(key, value)

# # Increase figure size
# plt.figure(figsize=(30, 30))  # You can adjust the size as needed

# # Adjust layout (you can experiment with different layouts)
# pos = nx.spring_layout(G)  # 'spring_layout' is often a good choice

# # Draw the graph
# nx.draw(G, with_labels=True, node_color='lightblue',
#         font_size=10, node_size=500)
# # Save the figure
# plt.savefig(path_rq1 / "metrics network graph.pdf", format='png', dpi=300)  # Adjust filename, format, and dpi as needed
# plt.show()


In [175]:
# pattern = r'( |_|-|\())[0-9.]+b'
pattern = r'[0-9.]+b'

model_with_size = set()
for directory in os.listdir(path_data):
    subdirectory_path = os.path.join(path_data, directory)
    for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
        file_name = file.split('/')[-1]
        for prefix in platform_nontemplate:
            if file_name.startswith(prefix):
                df = pd.read_json(file)
                if df['Model'].str.lower().str.contains(pattern, regex=True).any():
                    model_with_size.add((directory, platform_abbreviation_mapping[prefix]))

model_with_size_leaderboards = list(set([x[0] for x in model_with_size]))
model_with_size_leaderboards

['CMMMU',
 'MMCU',
 'Toloka LLM Leaderboard',
 'UHGEval',
 'SuperGLUE',
 'LLM-Perf Leaderboard',
 'SEED-Bench Leaderboard',
 'MMMU',
 'BenchLMM',
 'Open Multilingual LLM Evaluation Leaderboard',
 'C-Eval',
 'LLMEval',
 'CG-Eval',
 'ANGO',
 'Xiezhi',
 'InfiCoder-Eval',
 'DocVQA',
 'tStoryCloze',
 'HalluQA',
 'SuperCLUElyb',
 'CORE-MM',
 'FacTool',
 'SciGraphQA',
 'ScienceQA',
 'LLM-Leaderboard',
 'Big Code Models Leaderboard',
 'BIRD',
 'C-Eval Hard',
 'LongBench',
 'M3KE',
 'SuperCLUEgkzw',
 'DecodingTrust',
 'MathVista',
 'LawBench',
 'CMMLU',
 'CCBench',
 'MME',
 'CRUXEval',
 'GAOKAO-Bench',
 'SWE-bench',
 'OpenCompass LLM Leaderboard (v2)',
 'MATH401',
 'FinanceIQ',
 'LLMPerf',
 'YALL',
 'BotChat',
 'CLiB',
 'LLMonitor',
 'Coding LLMs Leaderboard',
 'SafetyBench',
 'DyVal',
 'AgentBench',
 'EvalPlus',
 'LAiW Leaderboard',
 'TabMWP',
 'InstructEval',
 'SuperCLUE',
 'HELM Classic',
 'Open LLM Leaderboard',
 'Multi-modal Modal Leaderboard',
 'SuperCLUE-Agent',
 'CLEVA',
 'L-Eval',
 'In

In [176]:
model_with_publisher = set()
for directory in os.listdir(path_data):
    subdirectory_path = os.path.join(path_data, directory)
    for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
        file_name = file.split('/')[-1]
        for prefix in platform_nontemplate:
            if file_name.startswith(prefix):
                df = pd.read_json(file)
                if df['Model'].str.contains('\n').any():
                    model_with_publisher.add((directory, platform_abbreviation_mapping[prefix]))

model_with_publisher_leaderboard = list(set([x[0] for x in model_with_publisher]))
model_with_publisher_leaderboard

['Provider Leaderboard',
 'Spider',
 'HallusionBench',
 'DS-1000',
 'LawBench',
 'HellaSwag Leaderboard',
 'OpenCompass LLM Leaderboard (v2)',
 'MMBench',
 'Multi-modal Modal Leaderboard',
 'QuALITY',
 'CCBench',
 'BIRD',
 'PubMedQA',
 'OpenEval (text)']

In [None]:
# fig = go.Figure(data=[
#     go.Bar(
#         x=list(model_naming_convention_count.keys()),
#         y=list(model_naming_convention_count.values()),
#         text=list(model_naming_convention_count.values()),
#         textposition='outside'
#     )
# ])
# fig.update_layout(
#     title='Frequency of Benchmark-based Leaderboards by Model Naming Convention',
#     xaxis_title='Model Naming Convention',
#     yaxis_title='Count',
#     yaxis=dict(
#         range=[0, max(model_naming_convention_count.values())
#                * 1.1]  # Increase y-axis range
#     )
# )
# fig.show()


In [177]:
models = set()
leaderboard_model_mapping = defaultdict(set)
for directory in os.listdir(path_data):
    subdirectory_path = os.path.join(path_data, directory)
    index = 1 if directory in model_naming_convention['Model Publisher (first line)'] else 0
    for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
        df = pd.read_json(file)
        if directory in model_naming_convention['Repository Username']:
            df['Model'] = df['Model'].apply(lambda x: x.split('/')[-1])
        df['Model'] = df['Model'].apply(lambda x: x.split('\n')[index].split(' ')[0].lower())
        leaderboard_model_mapping[directory] = leaderboard_model_mapping[directory].union(set(df['Model'].tolist()))
    models = models.union(leaderboard_model_mapping[directory])
        
org_item_count = Counter(org for orgs in leaderboard_model_mapping.values() for org in orgs)
df = pd.DataFrame(list(org_item_count.items()), columns=['Model', '#Leaderboard'])
df_top = df.sort_values(by='#Leaderboard', ascending=False).head(10)

# Calculate the Gini coefficient
df['Weight'] = 1
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Leaderboard', weights='Weight')
print("Gini coefficient:", gini_coefficient)

fig = go.Figure(go.Bar(
    x=df_top['#Leaderboard'],  # Values for the bar lengths
    y=df_top['Model'],  # Categories for each bar
    orientation='h',  # Sets the bars to be horizontal
    text=df_top['#Leaderboard'],  # Adds the values as text on each bar
    textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
    title=f'Frequency of Models in Leaderboards (Top 10)',
    xaxis_title='Leaderboard Number',
    yaxis_title='Model Name',
    yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'model distribution (top-10).pdf')
fig.show()

Gini coefficient: 0.2702981793150804


In [183]:
leaderboard_model_count = defaultdict(int)
for directory in os.listdir(path_data):
    subdirectory_path = os.path.join(path_data, directory)
    model_names = []
    for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
        df = pd.read_json(file)
        if directory in model_naming_convention['Repository Username']:
            df['Model'] = df['Model'].apply(lambda x: x.split('/')[-1])
        df['Model'] = df['Model'].apply(lambda x: x.split('\n')[0].split('(')[0].strip())
        model_names.extend(df['Model'].tolist())
    leaderboard_model_count[directory] = len(set(model_names))

df = pd.DataFrame(list(leaderboard_model_count.items()), columns=['Leaderboard', '#Model'])
df['Weight'] = 1
# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Model', weights='Weight')
print("Gini coefficient:", gini_coefficient)
df_top = df.sort_values(by='#Model', ascending=False).head(10)

# Basic statistics
mean = np.mean(df['#Model'])
median = np.median(df['#Model'])
mode = stats.mode(df['#Model'])[0]  # mode() returns a ModeResult object, hence the indexing
variance = np.var(df['#Model'])
standard_deviation = np.std(df['#Model'])

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Mode: {mode}")
print(f"Variance: {variance}")
print(f"Standard Deviation: {standard_deviation}")

fig = go.Figure(go.Bar(
    x=df_top['#Model'],  # Values for the bar lengths
    y=df_top['Leaderboard'],  # Categories for each bar
    orientation='h',  # Sets the bars to be horizontal
    text=df_top['#Model'],  # Adds the values as text on each bar
    textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
    title=f'Number of Model in Leaderboards (Top 10)',
    xaxis_title='Model Number',
    yaxis_title='Leaderboard Name',
    yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'model distribution (top-10).pdf')
fig.show()

# df_leaderboard = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
# df_leaderboard['Modality'] = df_leaderboard['Modality'].apply(ast.literal_eval)
# df_leaderboard = df_leaderboard.explode('Modality')
# fig = go.Figure()
# for name, group in df_leaderboard.groupby('Modality'):
#     leaderboard_count = []
#     for leaderboard in group['Leaderboard'].tolist():
#         leaderboard_count.append(leaderboard_model_count[leaderboard])
#     fig.add_trace(go.Box(y=leaderboard_count, name=name))
# fig.update_layout(title="Distribution of Number of Models by Task Modality",
#                   yaxis=dict(title='Number of Models', type='log'),
#                   xaxis=dict(title='Task Modality'))
# fig.write_image(path_rq1 / 'model distribution by task modality.pdf')
# fig.show()

Gini coefficient: 0.7181755580510726
Mean: 55.571969696969695
Median: 19.0
Mode: 4
Variance: 50950.64633551424
Standard Deviation: 225.722498514247


In [None]:
# import matplotlib.pyplot as plt
# import networkx as nx

# leaderboard_models = {}

# for directory in os.listdir(path_data):
#     subdirectory_path = os.path.join(path_data, directory)
#     model_names = []
#     for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
#         df = pd.read_json(file)
#         if directory in model_naming_convention['Repository Username']:
#             df['Model'] = df['Model'].apply(lambda x: x.split('/')[-1])
#         df['Model'] = df['Model'].apply(lambda x: x.split('\n')[0].split('(')[0].strip())
#         model_names.extend(df['Model'].tolist())
#     leaderboard_models[directory] = set(model_names)

# model_leaderboards = {}

# for key, values in leaderboard_models.items():
#     for value in values:
#         # Add the key to the list of keys for this value
#         if value not in model_leaderboards:
#             model_leaderboards[value] = [key]
#         else:
#             model_leaderboards[value].append(key)

# # Create a graph
# G = nx.Graph()

# # Add nodes and edges
# for key, values in model_leaderboards.items():
#     G.add_node(key)
#     for value in values:
#         G.add_edge(key, value)

# # Increase figure size
# plt.figure(figsize=(80, 80))  # You can adjust the size as needed

# # Adjust layout (you can experiment with different layouts)
# pos = nx.spring_layout(G)  # 'spring_layout' is often a good choice

# # Draw the graph
# nx.draw(G, with_labels=True, node_color='lightblue',
#         font_size=10, node_size=500)
# # Save the figure
# plt.savefig(path_rq1 / "model network graph.pdf", format='png', dpi=300)  # Adjust filename, format, and dpi as needed
# plt.show()

In [179]:
pwc_default_columns = {'extra training data', 'paper', 'code', 'result', 'year', 'tags'}
ai2_default_columns = {'created'}
leaderboard_attributes = {}
leaderboard_metrics_raw = {}
leaderboard_metrics = {}

for directory in os.listdir(path_data):
    subdirectory_path = os.path.join(path_data, directory)
    template = set()
    template_raw = set()
    nontemplate = set()
    for file in glob.glob(os.path.join(subdirectory_path, "*.json")):
        df = pd.read_json(file)
        file_name = file.split('/')[-1]
        if file_name.startswith('pwc'):
            keys = set()
            for item in df['metrics']:
                for key in item.keys():
                    key_processed = key.split('(')[0].lower().strip()
                    if key_processed in metrics_synonyms_inverse:
                        keys.add(metrics_synonyms_inverse[key_processed])
                    else:
                        keys.add(key_processed)
            template = template.union(keys)
            template_raw = template_raw.union(keys).union(pwc_default_columns)
        elif file_name.startswith('ai2'):
            column_names = set()
            for column_name in df.columns.tolist():
                if column_name in ['Model', 'Created']:
                    continue
                column_name = column_name.lower()
                if column_name in metrics_synonyms_inverse:
                    column_names.add(metrics_synonyms_inverse[column_name])
                elif column_name:
                    column_names.add(column_name)
            template = template.union(column_names)
            template_raw = template_raw.union(column_names).union(ai2_default_columns)
        else:
            column_names = set()
            for column_name in df.columns.tolist():
                if column_name == 'Model':
                    continue
                column_name = column_name.split('\n')[0].split('(')[0].split(' - ')[-1].split(']')[-1].lower().strip().rstrip('.')
                if column_name in metrics_synonyms_inverse:
                    column_names.add(metrics_synonyms_inverse[column_name])
                elif column_name:
                    column_names.add(column_name)
            nontemplate = nontemplate.union(column_names)
    if len(template):
        leaderboard_metrics[directory] = template
        leaderboard_metrics_raw[directory] = template_raw
    elif len(nontemplate):
        leaderboard_attributes[directory] = nontemplate

leaderboard_attribute_inverse = defaultdict(int)
for lb, lst_metrics in leaderboard_attributes.items():
    for metrics in lst_metrics:
        leaderboard_attribute_inverse[metrics] += 1

leaderboard_metrics_inverse = defaultdict(int)
for lb, lst_metrics in leaderboard_metrics.items():
    for metrics in lst_metrics:
        leaderboard_metrics_inverse[metrics] += 1


In [186]:
df = pd.DataFrame(list(leaderboard_attribute_inverse.items()), columns=['Leaderboard', '#Attribute'])
df['Weight'] = 1
# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Attribute', weights='Weight')
print("Gini coefficient:", gini_coefficient)
df_top = df.sort_values(by='#Attribute', ascending=False).head(10)

# Basic statistics
mean = np.mean(df['#Attribute'])
median = np.median(df['#Attribute'])
mode = stats.mode(df['#Attribute'])[0]  # mode() returns a ModeResult object, hence the indexing
variance = np.var(df['#Attribute'])
standard_deviation = np.std(df['#Attribute'])

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Mode: {mode}")
print(f"Variance: {variance}")
print(f"Standard Deviation: {standard_deviation}")

fig = go.Figure(go.Bar(
    x=df_top['#Attribute'],  # Values for the bar lengths
    y=df_top['Leaderboard'],  # Categories for each bar
    orientation='h',  # Sets the bars to be horizontal
    text=df_top['#Attribute'],  # Adds the values as text on each bar
    textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
    title=f'Frequency of Attributes in Publisher-based Non-template Leaderboards (Top 10)',
    xaxis_title='Frequency of Leaderboards',
    yaxis_title='Attribute Name',
    yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'attribute distribution (non-template) (top-10).pdf')
fig.show()

fig = go.Figure(data=[go.Histogram(x=values)])
fig.update_layout(
    title="Distribution of Number of Attributes across Publisher-based Leaderboards",
    xaxis_title="Number of Attributes",
    yaxis_title="Number of Occurrence",
    bargap=0.1,  # Gap between bars of adjacent location coordinates
)
# fig.write_image(path_rq1 / 'attribute distribution (non-template).pdf')
fig.show()


Gini coefficient: 0.15379077672308028
Mean: 1.1898648648648649
Median: 1.0
Mode: 1
Variance: 1.5119243060628191
Standard Deviation: 1.229603312480419


In [188]:
df = pd.DataFrame(list(leaderboard_metrics_inverse.items()), columns=['Leaderboard', '#Metrics'])
df['Weight'] = 1
# Calculate the Gini coefficient
gini_coefficient = ineqpy.inequality.gini(data=df, income='#Metrics', weights='Weight')
print("Gini coefficient:", gini_coefficient)
df_top = df.sort_values(by='#Metrics', ascending=False).head(10)

# Basic statistics
mean = np.mean(df['#Metrics'])
median = np.median(df['#Metrics'])
mode = stats.mode(df['#Metrics'])[0]  # mode() returns a ModeResult object, hence the indexing
variance = np.var(df['#Metrics'])
standard_deviation = np.std(df['#Metrics'])

print(f"Mean: {mean}")
print(f"Median: {median}")
print(f"Mode: {mode}")
print(f"Variance: {variance}")
print(f"Standard Deviation: {standard_deviation}")

fig = go.Figure(go.Bar(
    x=df_top['#Metrics'],  # Values for the bar lengths
    y=df_top['Leaderboard'],  # Categories for each bar
    orientation='h',  # Sets the bars to be horizontal
    text=df_top['#Metrics'],  # Adds the values as text on each bar
    textposition='auto'  # Automatically positions the text on the bars
))
fig.update_layout(
    title=f'Frequency of Metrics in Publisher-based Template Leaderboards (Top 10)',
    xaxis_title='Frequency of Leaderboards',
    yaxis_title='Metrics Name',
    yaxis_autorange='reversed'  # This line makes the bars go top-down
)
# fig.write_image(path_rq1 / 'metrics distribution (template) (top-10).pdf')
fig.show()

fig = go.Figure(data=[go.Histogram(x=values)])
fig.update_layout(
    title="Distribution of Number of Metrics across Publisher-based Leaderboards",
    xaxis_title="Number of Metrics",
    yaxis_title="Number of Occurrence",
    bargap=0.1,  # Gap between bars of adjacent location coordinates
)
# fig.write_image(path_rq1 / 'metrics distribution (template).pdf')
fig.show()


Gini coefficient: 0.45321013977730457
Mean: 1.995744680851064
Median: 1.0
Mode: 1
Variance: 24.319130828429174
Standard Deviation: 4.931443077683162


In [None]:
# df = pd.read_csv(path_rq1 / 'Leaderboard_processed.csv')
# df = df.explode('Platform')
# df['Modality'] = df['Modality'].apply(ast.literal_eval)
# df_modality = df.explode('Modality')
# leaderboard_attribute_count = {key: len(value) for key, value in leaderboard_attributes.items()} | {key: len(value) for key, value in leaderboard_metrics_raw.items()}

# fig = go.Figure()
# for name, group in df_modality.groupby('Modality'):
#     leaderboard_count = []
#     for leaderboard in group['Leaderboard'].tolist():
#         if leaderboard in leaderboard_attribute_count:
#             leaderboard_count.append(leaderboard_attribute_count[leaderboard])
#     fig.add_trace(go.Box(y=leaderboard_count, name=name))
# fig.update_layout(title="Distribution of Number of Attributes across Publisher-based Leaderboards by Task Modality",
#                   yaxis=dict(title='Number of Attributes', type='log'),
#                   xaxis=dict(title='Task Modality'))
# fig.write_image(path_rq1 / 'attribute distribution by task modality.pdf')
# fig.show()

# fig = go.Figure()
# for name, group in df.groupby('Platform'):
#     leaderboard_count = []
#     for leaderboard in group['Leaderboard'].tolist():
#         if leaderboard in leaderboard_attribute_count:
#             leaderboard_count.append(leaderboard_attribute_count[leaderboard])
#     fig.add_trace(go.Box(y=leaderboard_count, name=name))
# fig.update_layout(title="Distribution of Number of Attributes across Publisher-based Leaderboards by Platform",
#                   yaxis=dict(title='Number of Attributes', type='log'),
#                   xaxis=dict(title='Leaderboard Platform'))
# fig.write_image(path_rq1 / 'attribute distribution by leaderboard platform.pdf')
# fig.show()
