In [39]:
import requests
from datasets import load_dataset
from huggingface_hub import hf_api, ModelCard
from huggingface_hub.utils import EntryNotFoundError
import pandas as pd
import matplotlib.pyplot as plt
import rich
import random

In [3]:
api = hf_api.HfApi()

In [4]:
rich.inspect(api.list_models)

In [5]:
hub_datasets = list(iter(api.list_datasets(limit=None, filter=None, full=True)))

In [6]:
hub_models = list(iter(api.list_models(limit=None, filter=None, full=True)))

In [7]:
hub_models[0]

ModelInfo(id='albert/albert-base-v1', author='albert', sha='082438ba120d36b97b9288772a41144e941705b9', created_at=datetime.datetime(2022, 3, 2, 23, 29, 4, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2024, 2, 19, 10, 57, 35, tzinfo=datetime.timezone.utc), private=False, gated=False, disabled=None, downloads=13582, likes=7, library_name='transformers', tags=['transformers', 'pytorch', 'tf', 'safetensors', 'albert', 'fill-mask', 'exbert', 'en', 'dataset:bookcorpus', 'dataset:wikipedia', 'arxiv:1909.11942', 'license:apache-2.0', 'autotrain_compatible', 'endpoints_compatible', 'region:us'], pipeline_tag='fill-mask', mask_token=None, card_data=None, widget_data=None, model_index=None, config=None, transformers_info=None, siblings=[RepoSibling(rfilename='.gitattributes', size=None, blob_id=None, lfs=None), RepoSibling(rfilename='README.md', size=None, blob_id=None, lfs=None), RepoSibling(rfilename='config.json', size=None, blob_id=None, lfs=None), RepoSibling(rfilename='mod

In [8]:
hub_item_dict = []
for item in hub_models:
        data = item.__dict__
        hub_item_dict.append(data)

In [17]:
# df = pd.DataFrame.from_dict(hub_item_dict)
df = load_dataset("librarian-bots/model_cards_with_metadata")['train'].to_pandas()

Downloading readme:   0%|          | 0.00/5.89k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/250M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/105M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/747434 [00:00<?, ? examples/s]

In [18]:
df.columns

Index(['modelId', 'author', 'last_modified', 'downloads', 'likes',
       'library_name', 'tags', 'pipeline_tag', 'createdAt', 'card'],
      dtype='object')

In [19]:
df.loc[30, "tags"]

array(['transformers', 'pytorch', 'tf', 'jax', 'safetensors', 'bert',
       'fill-mask', 'exbert', 'en', 'dataset:bookcorpus',
       'dataset:wikipedia', 'arxiv:1810.04805', 'license:apache-2.0',
       'autotrain_compatible', 'endpoints_compatible', 'region:us'],
      dtype=object)

In [21]:
#Check that the labels are inside the readme file
print(df.loc[30,"card"])

---
language: en
tags:
- exbert
license: apache-2.0
datasets:
- bookcorpus
- wikipedia
---

# BERT base model (cased)

Pretrained model on English language using a masked language modeling (MLM) objective. It was introduced in
[this paper](https://arxiv.org/abs/1810.04805) and first released in
[this repository](https://github.com/google-research/bert). This model is case-sensitive: it makes a difference between
english and English.

Disclaimer: The team releasing BERT did not write a model card for this model so this model card has been written by
the Hugging Face team.

## Model description

BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion. This means it
was pretrained on the raw texts only, with no humans labelling them in any way (which is why it can use lots of
publicly available data) with an automatic process to generate inputs and labels from those texts. More precisely, it
was pretrained with two objectives:

- Masked langu

In [25]:
card = ModelCard.load(df.loc[30,"modelId"]).data.to_dict()
print(card)

{'datasets': ['bookcorpus', 'wikipedia'], 'language': 'en', 'license': 'apache-2.0', 'tags': ['exbert']}


In [26]:
#Go through the card dictionary
for item in card.items():
    if type(item[1]) is list:
        for i in item[1]:
            print(i)
    else:
        print(item[1])

bookcorpus
wikipedia
en
apache-2.0
exbert


In [43]:
# create a list of random integers between 0 and 100000
idx_list = [random.randint(0, 100000) for i in range(10)]
# idx_list = [1]

for id in idx_list:
    dump_tags = df.loc[id, "tags"]
    # yml_tags = []
    try:
        card = str(ModelCard.load(df.loc[id,"modelId"]).data)
    except EntryNotFoundError:
        print(f"Error loading model card, no model card found for {df.loc[id, 'modelId']}")
        
    tags_not_in_card = []
    for tag in dump_tags:
        if ":" in tag:
            tag = tag.split(":")[1]
        if tag not in card:
            all_dump_tags = False
            tags_not_in_card.append(tag)
    print(f"ModelId: {df.loc[id, 'modelId']}")
    print("Tags not in card: \n",tags_not_in_card)
    print("\nCard: \n",card)
        
    # print(card)
    

README.md:   0%|          | 0.00/15.9k [00:00<?, ?B/s]

ModelId: lllyasviel/control_v11p_sd15_inpaint
Tags not in card: 
 ['diffusers', 'safetensors', '2302.05543']

Card: 
 base_model: runwayml/stable-diffusion-v1-5
license: openrail
tags:
- art
- controlnet
- stable-diffusion
- controlnet-v1-1
- image-to-image
duplicated_from: ControlNet-1-1-preview/control_v11p_sd15_inpaint


README.md:   0%|          | 0.00/635 [00:00<?, ?B/s]

ModelId: onefish51/dog_w_prior-preservation
Tags not in card: 
 []

Card: 
 base_model: /data2/home/tyu/stable_diffusion/diffusers/stable-diffusion-v1-4
license: creativeml-openrail-m
tags:
- stable-diffusion
- stable-diffusion-diffusers
- text-to-image
- diffusers
- lora
inference: true


README.md:   0%|          | 0.00/816 [00:00<?, ?B/s]

ModelId: Helsinki-NLP/opus-mt-ht-fr
Tags not in card: 
 ['transformers', 'pytorch', 'tf', 'marian', 'text2text-generation', 'ht', 'fr', 'autotrain_compatible', 'endpoints_compatible', 'us']

Card: 
 license: apache-2.0
tags:
- translation
Error loading model card, no model card found for
ModelId: QuantFactory/internlm2_5-7b-chat-GGUF
Tags not in card: 
 ['gguf', 'us']

Card: 
 license: apache-2.0
tags:
- translation


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

ModelId: snunlp/KLUE-RoBERTa-Large-SNUExtended
Tags not in card: 
 ['safetensors', 'roberta', 'feature-extraction', '1910.09700', 'endpoints_compatible', 'text-embeddings-inference', 'us']

Card: 
 library_name: transformers
tags: []


README.md:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

ModelId: tutrinh/catcch-pilot-model-2
Tags not in card: 
 ['tensorboard', 'safetensors', 'us']

Card: 
 base_model: mistralai/Mistral-7B-Instruct-v0.2
library_name: peft
license: apache-2.0
tags:
- generated_from_trainer
model-index:
- name: huggingface_out
  results: []
Error loading model card, no model card found for
ModelId: Jeska/VaccinChatSentenceClassifierDutch
Tags not in card: 
 ['transformers', 'pytorch', 'tensorboard', 'bert', 'text-classification', 'autotrain_compatible', 'endpoints_compatible', 'us']

Card: 
 base_model: mistralai/Mistral-7B-Instruct-v0.2
library_name: peft
license: apache-2.0
tags:
- generated_from_trainer
model-index:
- name: huggingface_out
  results: []


README.md:   0%|          | 0.00/847 [00:00<?, ?B/s]

ModelId: MarkBW/cinematic-style-xl
Tags not in card: 
 []

Card: 
 base_model: stabilityai/stable-diffusion-xl-base-1.0
tags:
- text-to-image
- stable-diffusion
- lora
- diffusers
- template:sd-lora
widget:
- text: "UNICODE\0\0D\0a\0r\0k\0 \0F\0a\0n\0t\0a\0s\0y\0 \0A\0r\0t\0 \0o\0f\0 \0 \0\
    <\0l\0o\0r\0a\0:\0C\0i\0n\0e\0m\0a\0t\0i\0c\0 \0H\0o\0l\0l\0y\0w\0o\0o\0d\0 \0\
    F\0i\0l\0m\0:\01\0.\05\0>\0"
  output:
    url: images/00450-2279182498.jpeg
instance_prompt: Cinematic Hollywood Film, Cinematic Hollywood Film style


README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

ModelId: KaibaZax/MCC
Tags not in card: 
 []

Card: 
 base_model: runwayml/stable-diffusion-v1-5
license: unknown
tags:
- text-to-image
- stable-diffusion
- lora
- diffusers
- template:sd-lora
widget:
- text: Amelia
  output:
    url: images/DADE8E2605E536E42EE74F3E45618133A34AA464DBE61D24BF0AE4E750669EBC.jpg
- text: Jessie
  output:
    url: images/29A6CEC2B135070B14C2008B6FE5230ECD91A6303614912DAB59F1E84252964A.jpg
- text: Kimberly
  output:
    url: images/15C55224FFF20855B0856BEF8DEC1D2A321335177E34C6E27D4A094BB659E5DE.jpg
- text: Sandy
  output:
    url: images/FEFE2D7642128FA160FF50A6A60DDCE28C1E7962374C612447A6D5A5B15D629A.jpg
- text: Amelia
  output:
    url: images/9CB611B6BA1D1D3452463AF7A169F4661E8A3B4998ECE22949AF49CAD0081725.jpg
- text: Vanessa
  output:
    url: images/AA48592695B13C315B49F8C4BC60CB206BEB96BFABF3DF383B01201F401B86C1.jpg
- text: Amelia
  output:
    url: images/B4DE05F90047CC25DE45951535513DC4D5B8E9601EFF086B5EAD41388BD559FB.jpg
- text: Deloris
  output:
 

README.md:   0%|          | 0.00/5.13k [00:00<?, ?B/s]

ModelId: DionTimmer/controlnet_qrcode
Tags not in card: 
 ['diffusers', 'safetensors']

Card: 
 language:
- en
license: openrail++
tags:
- stable-diffusion
- controlnet


In [47]:
# Going throug the tag types

response = requests.get(
  "https://huggingface.co/api/models-tags-by-type",
  params={},
  headers={}
)
tags = response.json()

In [51]:
print(rich.inspect(tags))

None
