Convert summaries previously generated from HuggingFace Dataset repo https://huggingface.co/datasets/vectara/leaderboard_results to new output format of the LB.


In [1]:
import pandas as pd
import re
import hashlib
from datetime import datetime
import json, os

from collections import Counter
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset, Dataset
ds = load_dataset("vectara/leaderboard_results", split="train")
# failed_models = []
# for row in ds:
#       if row['summary'] == None: 
#           failed_models.append(row['model'])

# display( sorted(Counter(failed_models).items(), key=lambda x: x[0])) 

# print ("Total number of Null summaries: ", sum(Counter(failed_models).values()))

In [19]:
unique_models = set(ds['model'])

# extract the company names from the regex pattern {company}/{model_name}
companies = []
for _id in unique_models: 
  m = re.match(r'([a-zA-Z]+)/([a-zA-Z0-9_-]+)', _id)
  if m :
    companies.append(m.group(1))
  
print (set(companies))

{'mistralai', 'google', 'Qwen', 'internlm', 'CohereForAI', 'databricks', 'allenai', 'Anthropic', 'qwen', 'THUDM', 'snowflake', 'amazon', 'tiiuae', 'Intel', 'xai', 'microsoft', 'apple', 'cohere', 'deepseek', 'anthropic', 'openai'}


In [20]:
[x for x in set(ds['model']) if "google" in x.lower()]

['google/Gemini-1.5-Pro',
 'google/PaLM-2-Chat',
 'google/gemini-2.0-flash-lite-preview-02-05',
 'google/gemma-3-4b-it',
 'google/flan-t5-large',
 'google/gemini-flash-experimental',
 'google/gemini-2.5-flash-preview-04-17',
 'google/gemma-1.1-2b-it',
 'google/gemini-2.0-flash-001',
 'google/gemma-3-27b-it',
 'google/gemini-2.0-pro-exp-02-05',
 'google/Gemini-1.5-flash',
 'google/gemma-2-2b-it',
 'google/Gemini-Pro',
 'google/gemini-1.5-flash-002',
 'google/gemma-3-1b-it',
 'google/gemma-1.1-7b-it',
 'google/gemini-2.0-flash-thinking-exp',
 'google/gemini-pro-experimental',
 'google/gemini-1.5-flash-001',
 'google/gemma-2-9b-it',
 'google/gemma-7b-it',
 'google/gemini-1.5-pro-001',
 'google/gemini-2.5-pro-exp-03-25',
 'google/PaLM-2',
 'google/gemini-1.5-pro-002']

In [8]:
# Load current summaries data CSV
source_articles_df = pd.read_csv("../../datasets/leaderboard_dataset_revised.csv")
source_to_id = dict(zip(source_articles_df["text"], source_articles_df["article_id"]))

{'mistralai', 'google', 'Qwen', 'internlm', 'CohereForAI', 'databricks', 'allenai', 'Anthropic', 'qwen', 'THUDM', 'snowflake', 'amazon', 'tiiuae', 'Intel', 'xai', 'microsoft', 'apple', 'cohere', 'deepseek', 'anthropic', 'openai'}


In [11]:
[m for m in unique_models if ""]

[]

In [None]:
"""
The new data model for summaries is:

class BasicSummary(BaseModel):
    article_id: int
    summary_uid: str
    summary: str

    company: str
    model_name: str
    date_code: str | None = None

    eval_name: str
    summary_date: str

    temperature: float | None = None
    max_tokens: int | None = None
    # prompt: str | None = None # We chose not to include prompt in the summary class because it is too long and normally not change. 
    thinking_tokens: int | None = None
    execution_mode: Literal["cpu", "gpu", "api"] | None = None
"""

In [12]:
def get_date_code(company_name, model_name):

  if company_name == "cohere":
    if model_name in ["Cohere", "Cohere-Chat"]:
      return model_name, None
    m = re.match(r'^(.*?)(?:-(\d{2}-\d{4}))?$', model_name)
    if m:
      return m.groups()
  elif company_name == "anthropic":
    m = re.match(r'(.*)-(202\d{5})$', model_name)
    if m:
      return m.groups()
    else:
      return model_name, None
  elif company_name == "openai":
    m = re.match(r'^(.*?)-(202\d-\d{2}-\d{2}|latest|\d{2}-\d{2})$', model_name)
    if m:
      return m.groups()
    else:
      return model_name, None
  elif company_name == "google":
    m = re.match(r'^(.*?)-(\d{2}-\d{2})$', model_name)
    if m:
      return m.groups()
    else:
      return model_name, None
  else: # no code name in model name
    return model_name, None

print (get_date_code("cohere", "command-r-08-2024"))
print (get_date_code("anthropic", "claude-3-5-sonnet-20240620"))

for model in unique_models:
  if "grok" not in model:
    continue
  try: 
    [company_name, model_name] = model.split("/")
    print (model_name, "==>", get_date_code(company_name, model_name))
  except Exception as e:
    if "gemini" in model:
      company_name = "google"
      model_name = model
      print (model_name, "==>", get_date_code(company_name, model_name))
    else:
      print (model, "==>", "ERROR") 
      break


('command-r', '08-2024')
('claude-3-5-sonnet', '20240620')
grok-3-mini-latest ==> ('grok-3-mini-latest', None)
grok-beta ==> ('grok-beta', None)
grok-2-vision-1212 ==> ('grok-2-vision-1212', None)
grok-2-1212 ==> ('grok-2-1212', None)
grok-3-latest ==> ('grok-3-latest', None)


In [146]:
def convert_HF_summaries_to_respective_jsonl(ds: Dataset, source_to_id: dict):
    # map every row into a BasicSummary

  count = 0
  no_source_count_by_model = {}

  for row in tqdm(ds):
    summary = row['summary']
    source = row['source']
    company_model = row['model']

    if company_model in "gemini-2.0-flash-exp":
      company_model = f"google/{company_model}"

    try: 
      [company, model_name] = company_model.split('/')
    except: 
      print (f"Error splitting company and model name for model: {company_model}")
      continue
    
    current_time = datetime.now().strftime("%Y-%m-%d")
    combined_string = (
        f"{model_name}|{summary if summary else 'None'}|{current_time}"
    )
    summary_uid = hashlib.md5(combined_string.encode('utf-8')).hexdigest()

    try:
      model_name, date_code = get_date_code(company, model_name)
    except: 
      print (f"Error getting date code for model: {model_name}")
      continue

    try: 
      article_id = source_to_id[source]
    except: 
      no_source_count_by_model[model_name] = no_source_count_by_model.setdefault(model_name, 0) + 1
      # print (f"Article ID not found for source.")
      continue

    if summary: # summary can be none 
      summary = summary.strip()

    summ_dict = {
        "article_id": article_id, 
        "summary_uid": summary_uid,
        "summary": summary,
        "company": company,
        "model_name": model_name,
        "date_code": date_code,
        "eval_name": "pre_2025-07",
        "summary_date": "pre_2025-07",
        # "temperature": 0,
        # "max_tokens": 1024,
        # "thinking_tokens": None,
        # "execution_mode": None
    }

    # append the summary_dict to the corresponding JSONL file
    output_dir = f"./output/{company}/{model_name}"
    if not os.path.exists(output_dir):
      os.makedirs(output_dir)
    summaries_jsonl_file = f"{output_dir}/summaries.jsonl"
    with open(summaries_jsonl_file, "a") as f:
      f.write(json.dumps(summ_dict, ensure_ascii=False) + "\n")

    # count += 1
    # if count > 20000:
    #   break

  display (no_source_count_by_model)

convert_HF_summaries_to_respective_jsonl(ds, source_to_id)

100%|██████████| 161222/161222 [00:07<00:00, 20792.86it/s]


{'Phi-2': 59,
 'Titan-Express': 59,
 'Gemini-Pro': 59,
 'Mixtral-8x7B-Instruct-v0.1': 59,
 'Claude-2': 41,
 'Cohere': 41,
 'Cohere-Chat': 41,
 'GPT-4': 41,
 'GPT-3.5-Turbo': 41,
 'Llama-2-13b-chat-hf': 41,
 'Llama-2-70b-chat-hf': 41,
 'Llama-2-7b-chat-hf': 41,
 'PaLM-2': 41,
 'PaLM-2-Chat': 41,
 'Mixtral-8x22B-Instruct-v0.1': 59,
 'c4ai-command-r-plus': 59,
 'dbrx-instruct': 59,
 'gemma-1.1-2b-it': 59,
 'gemma-1.1-7b-it': 59,
 'Llama-3-8B-chat-hf': 59,
 'Llama-3-70B-chat-hf': 59,
 'WizardLM-2-8x22B': 59,
 'GPT-4-Turbo': 59,
 'Gemini-1.5-Pro': 59,
 'snowflake-arctic-instruct': 59,
 'OpenELM-3B-Instruct': 59,
 'gpt-4o': 59,
 'Gemini-1.5-flash': 59,
 'claude-3-5-sonnet-20240620': 59,
 'gemma-2-9b-it': 59,
 'GPT-4o-mini': 59,
 'Phi-3-mini-4k-instruct': 59,
 'Phi-3-mini-128k-instruct': 59,
 'Orca-2-13b': 59,
 'gemma-7b-it': 59,
 'Claude-3-sonnet': 59,
 'falcon-7b-instruct': 59,
 'Claude-3-opus': 59,
 'flan-t5-large': 59,
 'Meta-Llama-3.1-405B-Instruct': 59,
 'Meta-Llama-3.1-8B-Instruct': 59