# Hugging Face Hub Exploration
This notebook explores the model data and metadata available through the `huggingface_hub` package.

The goal is to investigate whether it is feasible to filter models on the Hub by their size and by model type.

In [1]:
import huggingface_hub as hf_hub
from huggingface_hub import HfApi

  from .autonotebook import tqdm as notebook_tqdm


## Setup

In [2]:
# Shared Hub client; every query in this notebook goes through it.
api = HfApi()

In [3]:
# Report the account the client is authenticated as; if that lookup fails
# for any reason, fall back to the interactive login.
try:
    result = api.whoami()
    print(f"Logged in as: {result['type']} {result['name']}")
except Exception:
    # `except Exception` instead of a bare `except` lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the cell does not open a login prompt.
    hf_hub.login()

Logged in as: user willdmar)


## Filter Models
First, we filter models by domain, keeping only the ones that are relevant to us. Domains are identified by the model `Tasks` defined on Hugging Face.

In [100]:
# Task tags used to select the models we care about, one per line so that
# additions and removals show up as single-line diffs.
domains = [
    "image-classification",
    "object-detection",
    "image-segmentation",
    "text-to-image",
    "image-to-text",
    "image-to-image",
    "zero-shot-image-classification",
    "zero-shot-object-detection",
    "image-feature-extraction",
    "keypoint-detection",
    "text-classification",
    "token-classification",
    "zero-shot-classification",
    "text-to-speech",
    "text-to-audio",
    "automatic-speech-recognition",
    "audio-to-audio",
    "audio-classification",
    "voice-activity-detection",
    "tabular-classification",
    "tabular-regression",
    "time-series-forecasting",
]

# Library tags a model must carry to be counted.
# NOTE(review): the Hub tag for KerasHub may be spelled "keras-hub" - confirm.
libraries = [
    "pytorch",
    "tensorflow",
    "keras",
    "transformers",
    "safetensors",
    "kerashub",
    "tf-keras",
    "tflite",
    "timm",
    "onnx",
]

In [101]:
# Bytes per element for every dtype label we accept. Two spellings are listed:
# the descriptive ones (FP32, INT8, ...) and the short codes used in
# safetensors metadata (F32, I8, U8, BOOL, ...).
_DATA_TYPE_SIZES = {
    # descriptive labels
    'BF16': 2,
    'FP16': 2,
    'FP32': 4,
    'FP64': 8,
    'INT8': 1,
    'INT16': 2,
    'INT32': 4,
    'INT64': 8,
    # safetensors dtype codes
    'BOOL': 1,
    'U8': 1,
    'I8': 1,
    'F8_E4M3': 1,
    'F8_E5M2': 1,
    'U16': 2,
    'I16': 2,
    'F16': 2,
    'U32': 4,
    'I32': 4,
    'F32': 4,
    'U64': 8,
    'I64': 8,
    'F64': 8,
}


def get_data_type_size(data_type):
    """Return the size in bytes of one element of `data_type`.

    The lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        data_type: dtype label such as 'FP32', 'bf16' or the safetensors
            code 'F32'.

    Returns:
        The element size in bytes, or None when the label is not a string
        or is not in the table.
    """
    if not isinstance(data_type, str):
        # Callers treat None as "unknown dtype"; avoid crashing on .upper().
        return None

    return _DATA_TYPE_SIZES.get(data_type.strip().upper())

def calc_safetensor_size(data_type, number_of_elements):
    """Return the size in megabytes of `number_of_elements` values of `data_type`.

    Args:
        data_type: dtype label understood by `get_data_type_size`.
        number_of_elements: how many values of that dtype are stored.

    Returns:
        The footprint in decimal megabytes (1 MB = 1_000_000 bytes).

    Raises:
        ValueError: if `get_data_type_size` does not recognise `data_type`.
    """
    bytes_per_element = get_data_type_size(data_type)
    if bytes_per_element is None:
        raise ValueError(f"Unknown data type: {data_type}")

    return bytes_per_element * number_of_elements / 1_000_000

In [102]:
def calc_size(model_info: hf_hub.ModelInfo):
    """Estimate a model's weight size in megabytes from its safetensors metadata.

    `model_info.safetensors.parameters` maps a dtype label to the number of
    parameters stored in that dtype. A checkpoint may mix several dtypes, so
    the sizes of all entries are added together rather than reading only the
    first one.

    Args:
        model_info: model description whose `safetensors` attribute is either
            None or an object exposing a `parameters` mapping.

    Returns:
        The summed size in megabytes, or -1 when the model has no safetensors
        metadata or its parameter table is empty.

    Raises:
        ValueError: propagated from `calc_safetensor_size` when a dtype label
            is not recognised.
    """
    safetensors_info = model_info.safetensors
    if safetensors_info is None or not safetensors_info.parameters:
        # -1 is the "size unknown" sentinel.
        return -1

    return sum(
        calc_safetensor_size(dtype, param_count)
        for dtype, param_count in safetensors_info.parameters.items()
    )

In [103]:
# Number of Hub models tagged with each library of interest.
# A count is obtained by walking the complete listing for that library,
# one model at a time; results are stored as soon as each library finishes.
library_counts = {}

for lib in libraries:
    models = api.list_models(library=lib)
    library_counts[lib] = sum(1 for _ in models)

# Display the finished table as the cell output.
library_counts

KeyboardInterrupt: 

In [None]:
# Number of Hub models carrying each task tag, keyed by the tag.
# A count is obtained by walking the complete listing for that tag.
domain_counts = {}

for domain in domains:
    models = api.list_models(filter=domain)
    domain_counts[domain] = sum(1 for _ in models)

In [None]:
# For every task domain, tally the models built with one of the tracked libraries:
#   total_count             - models whose library is listed in `libraries`
#   count_with_safe_tensors - of those, models whose detailed info carries
#                             safetensors metadata
#   model_type_count        - histogram of config['model_type']
#                             ('unknown' when the config or the key is missing)
domain_counts = {}

for domain in domains:
    models = api.list_models(filter=domain, fetch_config=True)

    # setdefault keeps an existing entry should `domains` list a task twice.
    stats = domain_counts.setdefault(domain, {
        'total_count': 0,
        'count_with_safe_tensors': 0,
        'model_type_count': {},
    })
    model_type_count = stats['model_type_count']

    for model_info in models:
        # ModelInfo documents the library under `library_name`; reading a
        # non-existent `library` attribute raises AttributeError. `library` is
        # kept only as a fallback in case an entry exposes it under that name.
        library = getattr(model_info, 'library_name', None) or getattr(model_info, 'library', None)
        if library not in libraries:
            continue

        stats['total_count'] += 1

        # One extra request per matching model: safetensors metadata is read
        # from the detailed model info rather than from the listing entry.
        meta_data = api.model_info(model_info.modelId, files_metadata=True)
        if meta_data.safetensors is not None:
            stats['count_with_safe_tensors'] += 1

        model_type = model_info.config.get('model_type', 'unknown') if model_info.config else 'unknown'
        model_type_count[model_type] = model_type_count.get(model_type, 0) + 1

    print(f"{domain}: {domain_counts[domain]['total_count']}")
    print(f"SafeTensor Count: {domain_counts[domain]['count_with_safe_tensors']}")
    print(domain_counts[domain]['model_type_count'])


KeyboardInterrupt: 

In [89]:
# Listing of image-classification models, requested together with each
# model's config (`fetch_config`) and card data (`cardData`).
models = api.list_models(
    filter='image-classification',
    fetch_config=True,
    cardData=True,
)