The summaries in https://huggingface.co/datasets/vectara/leaderboard_results do not cover all LLMs that we have evaluated. 

In [None]:
import pandas as pd
import re
import hashlib
from datetime import datetime
import json, os

from collections import Counter
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset
from huggingface_hub import Repository


# 1. Load summaries from the HF summary repo

In [48]:
summaries = load_dataset("vectara/leaderboard_results", split="train")
models_at_hf_summaries=set(summaries['model'])
models_at_hf_summaries = set([x.lower() for x in models_at_hf_summaries])

In [49]:
[x for x in models_at_hf_summaries if "gpt" in x.lower()]

['openai/gpt-4o-mini',
 'openai/gpt-4.5-preview',
 'openai/gpt-4.1-nano',
 'openai/gpt-4.1-mini',
 'openai/gpt-4o',
 'openai/chatgpt-4o-latest',
 'openai/gpt-4-turbo-2024-04-09',
 'openai/gpt-4',
 'openai/gpt-3.5-turbo',
 'openai/gpt-4.1']

# 2. Load eval stats from HF results repo and get LLMs whose summaries are missing in summaries repo

In [None]:
# Load Previous evaluation results

def pull_results(results_dir: str):
    repo = Repository(local_dir = results_dir, clone_from="vectara/results", repo_type="dataset") 
    repo.git_pull()

def extract_info_from_result_file(result_file):
    """
        {
        "config": {
            "model_dtype": "float16",
            "model_name": "databricks/dbrx-instruct",
            "model_sha": "main"
        },
        "results": {
            "hallucination_rate": {
            "hallucination_rate": 8.34990059642147
            },
            "factual_consistency_rate": {
            "factual_consistency_rate": 91.65009940357854
            },
            "answer_rate": {
            "answer_rate": 100.0
            },
            "average_summary_length": {
            "average_summary_length": 85.9
            }
        }
    """

    info = json.load(open(result_file, 'r'))
    result = {
        "LLM": info["config"]["model_name"],
        "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
        # "Factual Consistency Rate": info["results"]["factual_consistency_rate"]["factual_consistency_rate"],
        "Answer %": info["results"]["answer_rate"]["answer_rate"],
        "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
    }
    return result

def get_latest_result_file(dir: str):
    """
        Get the latest result file in the given directory based on the timestamp in the file name.
    """
    if not os.path.isdir(dir):
        return None
    files = os.listdir(dir)
    files = [f for f in files if f.endswith(".json")]
    if len(files) == 0:
        return None
    files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)))
    # print ("Scanning: ", dir, "found latest file: ", files[0])
    return os.path.join(dir, files[0])

def scan_and_extract(dir: str):
    """Scan all folders recursively and exhaustively to load all JSON files and call `extract_info_from_result_file` on each one.
    """

    results = []
    for root, dirs, files in os.walk(dir):
        if len(dirs) == 0:
            continue
        for dir in dirs:
            result_file = get_latest_result_file(os.path.join(root, dir))
            if result_file is not None:
                results.append(extract_info_from_result_file(result_file))
    return results

def load_results(
        results_dir: str = "./results", 
        results_json: str = "./results.json"
        ):
    
    try: 
        pull_results(results_dir)
        print (f"Successfully pulled results from {results_dir}")
    except Exception as e:
        print(f"Failed to pull and/or extract latest results: {e}")
    
    try: 
        results = scan_and_extract(results_dir)
        if len(results) > 0:
            with open(results_json, "w") as f:
                json.dump(results, f, indent=2)
            print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
        else:
            print(f"No results found in {results_dir}")
    except Exception as e:
        print(f"Failed to scan and extract results from {results_dir}: {e}")
        print(f"Using pre-dumped results from {results_json}")

    results = json.load(open(results_json, "r"))
    # print(results)

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    # replace any value TBD with -1
    results_df = results_df.replace("TBD", 100)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        results_df[column] = results_df[column].apply(lambda x: round(x, 3))

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
    
    return results_df

stats_df = load_results() #

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/forrest/github_hallucination_leaderboard/migration/old_summarie_to_2025_July_format/./results is already a clone of https://huggingface.co/datasets/vectara/results. Make sure you pull the latest changes with `repo.git_pull()`.


Successfully pulled results from ./results
Successfully scanned and extracted results from ./results and saved to ./results.json


In [None]:
models_at_hf_stats = set(stats_df["LLM_lower_case"].to_list())
models_at_hf_stats = set([x.lower() for x in models_at_hf_stats])

# find models in models_at_hf_stats that are not in models_at_hf_summaries using set subtraction

models_missing_summaries = models_at_hf_stats - models_at_hf_summaries
display (models_missing_summaries)

{'anthropic/claude-3-5-sonnet',
 'anthropic/claude-4-opus',
 'anthropic/claude-4-sonnet',
 'deepseek/deepseek-r1-0528',
 'google/gemini-2.5-pro-preview-06-05',
 'google/gemma-3-12b-it',
 'moonshotai/kimi-k2-instruct',
 'openai/gpt-4-turbo',
 'openai/o3-mini-high-reasoning',
 'vectara/mockingbird-2-echo',
 'xai/grok-4-0709'}

Note that some models above are "missing" because they are evaluated using the new framework and are not pushed to HF summary repo `leaderboard_results` which is to be phased out. 

### Now let's confirm that they are really missing, not due to a different name

In [None]:
# Check similar names in models_at_hf_summaries
[x for x in models_at_hf_summaries if "gpt" in x]

['openai/gpt-4o-mini',
 'openai/gpt-4.5-preview',
 'openai/gpt-4.1-nano',
 'openai/gpt-4.1-mini',
 'openai/gpt-4o',
 'openai/chatgpt-4o-latest',
 'openai/gpt-4-turbo-2024-04-09',
 'openai/gpt-4',
 'openai/gpt-3.5-turbo',
 'openai/gpt-4.1']

### Discovery: `gpt-4-turbo` in HF_stats should have been `openai/gpt-4-turbo-2024-04-09`. 

In [54]:
[x for x in models_at_hf_summaries if "gemma" in x]

['google/gemma-1.1-2b-it',
 'google/gemma-3-27b-it',
 'google/gemma-2-2b-it',
 'google/gemma-1.1-7b-it',
 'google/gemma-3-1b-it',
 'google/gemma-7b-it',
 'google/gemma-3-4b-it',
 'google/gemma-2-9b-it']

### Discovery: `gemma-3-12b-it` is really missing

In [55]:
[x for x in models_at_hf_summaries if "gemini" in x]

['google/gemini-2.0-pro-exp-02-05',
 'google/gemini-2.5-flash-preview-04-17',
 'google/gemini-1.5-flash-001',
 'gemini-2.0-flash-exp',
 'google/gemini-2.0-flash-001',
 'google/gemini-1.5-flash',
 'google/gemini-1.5-pro-002',
 'google/gemini-flash-experimental',
 'google/gemini-pro-experimental',
 'google/gemini-2.5-pro-exp-03-25',
 'google/gemini-pro',
 'google/gemini-2.0-flash-thinking-exp',
 'google/gemini-1.5-pro-001',
 'google/gemini-2.0-flash-lite-preview-02-05',
 'google/gemini-1.5-pro',
 'google/gemini-1.5-flash-002']

### Discovery: `google/gemini-2.5-pro-preview-06-05` is really missing

In [56]:
[x for x in models_at_hf_summaries if "claude" in x]

['anthropic/claude-3-5-haiku-20241022',
 'anthropic/claude-3-7-sonnet-latest-think',
 'anthropic/claude-3-7-sonnet-latest',
 'anthropic/claude-3-5-sonnet-20241022',
 'anthropic/claude-3-sonnet',
 'anthropic/claude-2',
 'anthropic/claude-3-opus',
 'anthropic/claude-3-5-sonnet-20240620']

### Discovery: `anthropic/claude-3-5-sonnet` is not missing. Just that it had date-code in the name in HF summary repo `leaderboard_results`. the other two missing ones are produced using new framework. So not in HF summary repo `leaderboard_results` as expected. 

## So the actually missing ones are `gemma-3-12b-it` and `google/gemini-2.5-pro-preview-06-05` We will find them in July 2024 Google Drive snapshot of summaries below

# 3. Load from July 2024 Google Drive snapshot of summaries to see whether missing models' summaies are present

In [27]:
gdrive_summaries = pd.read_csv("gdrive_summaries_july_2024.csv")

In [36]:
[x for x in set(gdrive_summaries['model']) if "claude" in x.lower()]

['claude',
 'claude3-opus',
 'Anthropic/claude-3-5-sonnet-20240620',
 'claude3-sonnet']

### Discovery 2: Google Gemini-2.5-pro-preview-06-05 is missing

In [58]:
[x for x in set(gdrive_summaries['model']) if "gemini" in x.lower()]

['google/Gemini-1.5-Pro', 'google/Gemini-1.5-flash', 'Google Gemini Pro']

### Discovery 3: Gemma-3-12b-it still missing

In [59]:
[x for x in set(gdrive_summaries['model']) if "gemma" in x.lower()]

['google/gemma-1.1-2b-it',
 'google/gemma-2-9b-it',
 'gemma-7b-it',
 'google/gemma-1.1-7b-it']