In [9]:
from gradio_client import Client
from pathlib import Path
import json
import pandas as pd

In [10]:
# Data directory (relative to the notebook) and its 'llm' / 'lvlm' sub-directories.
path_data = Path("..") / "data"
path_llm = path_data.joinpath("llm")
path_lvlm = path_data.joinpath("lvlm")

In [4]:
# Query the hosted Gradio Space. The value returned by `predict` is opened as
# a file below, i.e. it is used as the path of a JSON payload.
client = Client("https://felixz-open-llm-leaderboard.hf.space/")
json_data = client.predict("", "", api_name='/predict')

# Parse with an explicit encoding so the result does not depend on the
# platform's default locale, and release the handle before doing more work.
with open(json_data, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The payload carries the table rows under 'data' and the column names under
# 'headers'; the 'Model' column is dropped before export.
df = pd.DataFrame(data['data'], columns=data['headers']).drop(columns=['Model'])

# Make sure the target directory exists so the export does not fail after the
# network round-trip has already been paid for.
path_llm.mkdir(parents=True, exist_ok=True)
df.to_json(path_llm / 'HuggingFace-Open-llm-leaderboard-20231116.json', orient='records', indent=4)

Loaded as API: https://felixz-open-llm-leaderboard.hf.space/ ✔


In [4]:
import os
import glob
import pandas as pd
import plotly.express as px

# Materialise and sort the matches: Path.glob yields a one-shot generator in
# filesystem order, which would make the positional file_1, file_2, ... labels
# assigned further down non-reproducible and the list unusable on a re-run.
# NOTE(review): other cells read and write the leaderboard JSONs under
# path_llm / path_lvlm; confirm that a non-recursive glob of path_data is intended.
json_files = sorted(path_data.glob('*.json'))

# Load one JSON document as a DataFrame
def read_json_file(file_path):
    """Parse the JSON found at ``file_path`` with pandas and return the DataFrame."""
    frame = pd.read_json(file_path)
    return frame

# Function to find a column that contains 'model' in its name
def find_model_column(df):
    """Return the first column label whose text contains 'model'.

    The match is case-insensitive and is done on the string form of each
    label, so frames with non-string labels (for example integer column
    labels) are handled instead of raising ``AttributeError``.

    Args:
        df: Object exposing an iterable ``columns`` attribute (a DataFrame).

    Returns:
        The matching label exactly as it appears in ``df.columns``, or
        ``None`` when no label contains 'model'.
    """
    for col in df.columns:
        # Labels are not guaranteed to be str; stringify before lowering.
        if 'model' in str(col).lower():
            return col
    return None

# Read each file and store in a list of DataFrames
dataframes = [read_json_file(file) for file in json_files]

# Extract and count models in each DataFrame
model_counts = []
for i, df in enumerate(dataframes):
    model_col = find_model_column(df)
    if model_col is None:
        # No column mentions 'model'; this file contributes nothing to the chart.
        continue
    count = df[model_col].value_counts().reset_index()
    count.columns = ['model', 'count']
    count['file'] = f'file_{i+1}'  # Naming files as file_1, file_2, etc.
    model_counts.append(count)

# pd.concat rejects an empty list with an opaque "No objects to concatenate";
# fail with a message that says where the files were looked for instead.
if not model_counts:
    raise ValueError(
        f"No JSON file under {path_data} has a column containing 'model'; nothing to plot."
    )

# Combine all counts into a single DataFrame
combined_df = pd.concat(model_counts, ignore_index=True)

# Create a bubble chart: one bubble per (model, file), sized by occurrence count
fig = px.scatter(combined_df, x="model", y="file", size="count",
                 size_max=60, title="Model Coverage Across Different Files")

# Show the plot
fig.show()


In [3]:
import glob
import os

# Strip the trailing "-YYYYMMDD" / "-latest" stamp from every JSON file name
# in path_llm (e.g. "LucyEval-20230814.json" -> "LucyEval.json").
json_files = glob.glob(os.path.join(path_llm, "*.json"))

for file_path in json_files:
    print(file_path)
    # Work on the file name only, so hyphens in a directory name can never be
    # mistaken for the separator in front of the stamp.
    directory, file_name = os.path.split(file_path)
    stem, extension = os.path.splitext(file_name)
    base, separator, suffix = stem.rpartition('-')
    is_stamp = suffix == 'latest' or (len(suffix) == 8 and suffix.isdigit())
    if not (separator and base and is_stamp):
        # Nothing to strip. Without this guard a name with no hyphen would be
        # renamed to ".json" in the working directory, and re-running the cell
        # would keep eating one more name segment each time.
        continue
    new_file_path = os.path.join(directory, base + extension)
    if os.path.exists(new_file_path):
        # os.rename silently replaces an existing file on POSIX; keep both.
        print(f"  skipped, target already exists: {new_file_path}")
        continue
    # Rename the file
    os.rename(file_path, new_file_path)

../data/llm/FlagEval-sft-20231113.json
../data/llm/HELM-question_answering_robustness-20231024.json
../data/llm/HELM-synthetic_efficiency_synthetic_efficiency_num_prompt_tokens:512,num_instances:10,tokenizer:cohere_cohere-20231024.json
../data/llm/HELM-synthetic_efficiency_synthetic_efficiency_num_prompt_tokens:1536,num_instances:10,tokenizer:meta_opt-20231024.json
../data/llm/HellaSwag-text_generation-paperwithcode-latest.json
../data/llm/HELM-ablation_multiple_choice_efficiency-20231024.json
../data/llm/HELM-ablation_prompts_accuracy-20231024.json
../data/llm/Opencompass-llm-all_benchmark-reasoning-latest.json
../data/llm/HELM-boolq_boolq_only_contrast:True-20231024.json
../data/llm/LongBench-english-20231108.json
../data/llm/LongBench-synthetic_tasks-20231108.json
../data/llm/Opencompress-lawbench-zero_shot-20231113.json
../data/llm/LucyEval-20230814.json
../data/llm/HELM-imdb_imdb-20231024.json
../data/llm/HELM-blimp_blimp_phenomenon:irregular_forms-20231024.json
../data/llm/HELM-s

In [4]:
import glob
import os

# Strip the trailing "-YYYYMMDD" / "-latest" stamp from every JSON file name
# in path_lvlm (e.g. "ScienceQA-20230729.json" -> "ScienceQA.json").
json_files = glob.glob(os.path.join(path_lvlm, "*.json"))

for file_path in json_files:
    print(file_path)
    # Work on the file name only, so hyphens in a directory name can never be
    # mistaken for the separator in front of the stamp.
    directory, file_name = os.path.split(file_path)
    stem, extension = os.path.splitext(file_name)
    base, separator, suffix = stem.rpartition('-')
    is_stamp = suffix == 'latest' or (len(suffix) == 8 and suffix.isdigit())
    if not (separator and base and is_stamp):
        # Nothing to strip. Without this guard a name with no hyphen would be
        # renamed to ".json" in the working directory, and re-running the cell
        # would keep eating one more name segment each time.
        continue
    new_file_path = os.path.join(directory, base + extension)
    if os.path.exists(new_file_path):
        # os.rename silently replaces an existing file on POSIX; keep both.
        print(f"  skipped, target already exists: {new_file_path}")
        continue
    # Rename the file
    os.rename(file_path, new_file_path)

../data/lvlm/ReForm_Eval-20231117.json
../data/lvlm/LVLM_eHub-embodied_intelligence-latest.json
../data/lvlm/MME-perception-20231120.json
../data/lvlm/Opencompass-MMBench-CCBench-dev-latest.json
../data/lvlm/Opencompass-MMBench-test_CN-latest.json
../data/lvlm/LAMM-desiderata-20231106.json
../data/lvlm/MME-color-20231120.json
../data/lvlm/MME-landmark-20231120.json
../data/lvlm/MME-code_reasoning-20231120.json
../data/lvlm/MME-position-20231120.json
../data/lvlm/Opencompass-MMBench-dev_CN-latest.json
../data/lvlm/MME-existence-20231120.json
../data/lvlm/ScienceQA-20230729.json
../data/lvlm/MME-artwork-20231120.json
../data/lvlm/LVLM_eHub-quantitative-latest.json
../data/lvlm/LVLM_eHub-visual_commonsense-latest.json
../data/lvlm/FMTI-20231020.json
../data/lvlm/MME-count-20231120.json
../data/lvlm/LVLM_eHub-visual_knowledge_acquisition-latest.json
../data/lvlm/MME-commonsense_reasoning-20231120.json
../data/lvlm/HallusionBench-latest.json
../data/lvlm/MME-text_translation-20231120.json
.

In [32]:
# Re-export a.csv from path_lvlm as a records-oriented JSON file and show the row count.
df = pd.read_csv(path_lvlm.joinpath('a.csv'))
df.to_json(path_lvlm.joinpath('MME-code_reasoning.json'), indent=4, orient='records')
len(df)

35

In [16]:
# Number of distinct values in the 'Model' column of MINT.json.
df = pd.read_json(path_llm.joinpath('MINT.json'))
df.loc[:, 'Model'].nunique()

34

In [15]:
scenarios = '''BoolQ
NarrativeQA
NaturalQuestions (closed-book)
NaturalQuestions (open-book)
QuAC (Question Answering in Context)
HellaSwag
OpenbookQA
TruthfulQA
MMLU (Massive Multitask Language Understanding)
MS MARCO (regular track)
MS MARCO (TREC track)
CNN/DailyMail
XSUM
IMDB
CivilComments
RAFT (Real-world Annotated Few-Shot)
ICE (International Corpus of English)
The Pile
TwitterAAE
BLiMP (The Benchmark of Linguistic Minimal Pairs for English)
NaturalQuestions (closed-book)
HellaSwag
OpenbookQA
TruthfulQA
MMLU (Massive Multitask Language Understanding)
WikiFact
bAbI
Dyck
Synthetic reasoning (abstract symbols)
Synthetic reasoning (natural language)
GSM8K (Grade school math word problems)
MATH
MATH (chain-of-thoughts)
APPS (Code)
HumanEval (Code)
LegalSupport
LSAT
Data imputation
Entity matching
Copyright (text)
Copyright (code)
Disinformation (reiteration)
Disinformation (wedging)
BBQ (Bias Benchmark for Question Answering)
BOLD (Bias in Open-Ended Language Generation Dataset)
RealToxicityPrompts
Synthetic efficiency
MMLU (Massive Multitask Language Understanding)
IMDB
RAFT (Real-world Annotated Few-Shot)
CivilComments
NaturalQuestions (open-book)
CNN/DailyMail
IMDB
CivilComments
HellaSwag
OpenbookQA
TruthfulQA
MMLU (Massive Multitask Language Understanding)
BLiMP (The Benchmark of Linguistic Minimal Pairs for English)
LegalSupport
LSAT
BBQ (Bias Benchmark for Question Answering)
NaturalQuestions (open-book)
CNN/DailyMail
IMDB
CivilComments
BoolQ
IMDB'''
# De-duplicate the listing by base name: keep only the text in front of the
# first "(" on each line, so parenthesised variants collapse into one entry.
s = {line.partition('(')[0].strip() for line in scenarios.splitlines()}
s

{'APPS',
 'BBQ',
 'BLiMP',
 'BOLD',
 'BoolQ',
 'CNN/DailyMail',
 'CivilComments',
 'Copyright',
 'Data imputation',
 'Disinformation',
 'Dyck',
 'Entity matching',
 'GSM8K',
 'HellaSwag',
 'HumanEval',
 'ICE',
 'IMDB',
 'LSAT',
 'LegalSupport',
 'MATH',
 'MMLU',
 'MS MARCO',
 'NarrativeQA',
 'NaturalQuestions',
 'OpenbookQA',
 'QuAC',
 'RAFT',
 'RealToxicityPrompts',
 'Synthetic efficiency',
 'Synthetic reasoning',
 'The Pile',
 'TruthfulQA',
 'TwitterAAE',
 'WikiFact',
 'XSUM',
 'bAbI'}

In [16]:
scenarios = '''MS-COCO (base)
Caltech-UCSD Birds-200-2011
DrawBench (image quality categories)
DrawBench (reasoning categories)
DrawBench (knowledge categories)
PartiPrompts (image quality categories)
PartiPrompts (reasoning categories)
PartiPrompts (knowledge categories)
MS-COCO (base)
DrawBench (image quality categories)
PartiPrompts (image quality categories)
MS-COCO (base)
MS-COCO (Art styles)
dailydall.e
Logos
Landing Page
Magazine Cover Photos
dailydall.e
Logos
Landing Page
Magazine Cover Photos
Common Syntactic Processes
DrawBench (reasoning categories)
PartiPrompts (reasoning categories)
Relational Understanding
Detection (PaintSkills)
Winoground
TIME's most significant historical figures
DrawBench (knowledge categories)
PartiPrompts (knowledge categories)
Demographic Stereotypes
Mental Disorders
Inappropriate Image Prompts (I2P)
MS-COCO (fairness - AAVE dialect)
MS-COCO (fairness - gender)
MS-COCO (robustness)
MS-COCO (Chinese)
MS-COCO (Hindi)
MS-COCO (Spanish)
MS-COCO Fidelity
MS-COCO Efficiency
MS-COCO (Art styles)'''
# De-duplicate the listing by base name: keep only the text in front of the
# first "(" on each line, so parenthesised variants collapse into one entry.
s = {line.partition('(')[0].strip() for line in scenarios.splitlines()}
s

{'Caltech-UCSD Birds-200-2011',
 'Common Syntactic Processes',
 'Demographic Stereotypes',
 'Detection',
 'DrawBench',
 'Inappropriate Image Prompts',
 'Landing Page',
 'Logos',
 'MS-COCO',
 'MS-COCO Efficiency',
 'MS-COCO Fidelity',
 'Magazine Cover Photos',
 'Mental Disorders',
 'PartiPrompts',
 'Relational Understanding',
 "TIME's most significant historical figures",
 'Winoground',
 'dailydall.e'}

In [8]:
import glob

# Total number of rows across every JSON file in path_llm whose path contains
# 'big_bench' (case-insensitive).
# The running total used to be stored in a variable called `sum`, which
# replaced the builtin for the rest of the kernel session; the builtin is
# used here instead and the result gets a descriptive name.
big_bench_files = [
    file for file in glob.glob(str(path_llm / "*.json"))
    if 'big_bench' in file.lower()
]
big_bench_rows = sum(len(pd.read_json(file)) for file in big_bench_files)
big_bench_rows

296