# 对 HuggingFace 公开大模型的数据分析

最受用户喜欢的文本生成模型是怎样的？

# 数据爬取

Hugging Face 仅提供 100 页（每页30个模型）的数据供浏览，也即 3000 个模型，但存放的数据远比这个多（截至2024.12.15有 1,201,412 个模型）。

因此爬取全站的模型是不可行的：哪怕将范围缩小到最近 7 天内的文本生成模型，也有超过 3000 个。因此，我们将爬取策略调整为：

- 爬取前 3000 个最受用户喜欢的文本生成模型

对其进行数据分析。

In [3]:
import requests
import json
import os
from typing import List
# this url was reverse engineered
base_url = "https://huggingface.co/models-json"

def get_single_page_models(p) -> List[dict]:
    '''Return the text-generation models listed on one page, sorted by likes.

    Each page holds 30 models. Results are cached as JSON in
    ``cache/models_page_{p}.json``; when that file exists it is returned
    without any network request.

    Args:
        p: Zero-based page index, 0 to 99 inclusive.

    Returns:
        The list of model dicts from the ``models`` field of the response
        (an empty list when the field is absent).

    Raises:
        ValueError: if ``p`` is outside 0..99.
        requests.HTTPError: if the listing endpoint answers with an error
            status.
    '''
    # Raise instead of assert: asserts are stripped under ``python -O``.
    if not 0 <= p <= 99:
        raise ValueError(f'page must be between 0 and 99 inclusive, got {p!r}')

    cache_path = f'cache/models_page_{p}.json'
    # read from cache if exists
    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    resp = requests.get(
        base_url,
        params={
            "p": p,
            "sort": "likes",
            "pipeline_tag": "text-generation",
            "withCount": True,    # default true
        },
        timeout=30,
    )
    # Fail loudly on an error status so a failed request is never cached
    # as an empty page.
    resp.raise_for_status()
    models = resp.json().get('models', [])

    os.makedirs('cache', exist_ok=True)
    with open(cache_path, 'w', encoding='utf-8') as f:
        json.dump(models, f)

    return models

# Example record: the first model on page index 1 (pages are zero-based,
# so this is the second page of the likes-sorted listing).
get_single_page_models(1)[0]

{'author': '01-ai',
 'authorData': {'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6536187279f1de44b5e02d0f/-T8Xw0mX67_R73b7Re1y-.png',
  'fullname': '01-ai',
  'name': '01-ai',
  'type': 'org',
  'isHf': False,
  'isMod': False,
  'isEnterprise': False,
  'followerCount': 570},
 'downloads': 4572,
 'gated': False,
 'id': '01-ai/Yi-34B',
 'inference': 'not-popular-enough',
 'lastModified': '2024-11-11T03:31:31.000Z',
 'likes': 1287,
 'pipeline_tag': 'text-generation',
 'private': False,
 'repoType': 'model',
 'isLikedByUser': False,
 'widgetOutputUrls': []}

## 二级爬取

模型参数量不会作为标签出现在概览数据中。为了分析模型大小，我们首先考虑用正则表达式从模型名中提取，例如

`meta-llama/Llama-3.3-70B-Instruct` 就可以pattern匹配得到模型大小为70B。

问题：如果模型名不包含参数大小，该方法就会失效。

此外，有一些模型的参数量比较特殊，例如 `mistralai/Mixtral-8x7B-Instruct-v0.1`，按名称提取出来的是 8x7B 还是 7B？但实际参数量是 46.7B。

因此，我们认为有必要进入模型详情页爬取具体的参数量。

![model size](images/model_size.png)

In [None]:
from lxml import html
import re

def get_model_para(model: str):
    '''Return the parameter count, in billions, shown on a model's page.

    The page HTML is cached in ``cache/{author}-{model_name}.html`` and
    reused on later calls instead of being downloaded again.

    >>> print(get_model_para('CohereForAI/c4ai-command-r7b-12-2024'))
    8.03

    Args:
        model: Repository id in ``author/model_name`` form.

    Returns:
        The number that precedes ``B params`` on the page, as a string
        (for example ``'8.03'``).

    Raises:
        ValueError: if ``model`` does not split into exactly two parts
            on ``/``.
        requests.HTTPError: if the page request returns an error status.
        IndexError: if no ``<number>B params`` text is found on the page.
    '''
    author, model_name = model.split('/')
    cache_path = f'cache/{author}-{model_name}.html'

    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            raw_html = f.read()
    else:
        resp = requests.get('https://huggingface.co/' + model, timeout=30)
        # Never cache an error page: it would be re-read on every later call.
        resp.raise_for_status()
        raw_html = resp.content.decode('utf-8')

        # The cache directory may not exist yet if this runs first.
        os.makedirs('cache', exist_ok=True)
        with open(cache_path, 'w', encoding='utf-8') as f:
            f.write(raw_html)

    tree = html.fromstring(raw_html)
    # Check every candidate div rather than only the first, and tolerate
    # divs without leading text (``.text`` is None) and integer sizes.
    for node in tree.xpath('//div[@class="px-1.5"]'):
        found = re.search(r'(\d+(?:\.\d+)?)B params', node.text or '')
        if found:
            return found.group(1)
    raise IndexError(f'no "B params" size found on the page for {model!r}')


# Example usage: print the parameter count (in billions) parsed from the
# model's page.
print(get_model_para('CohereForAI/c4ai-command-r7b-12-2024'))

8.03


## SQLite ORM

定义数据模型，将数据模型映射到数据库表中，数据存储在本地 SQLite 数据库中。

In [6]:
import datetime

from sqlalchemy import Integer, String, DateTime, ForeignKey
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped, mapped_column, relationship

class Base(DeclarativeBase):
    """Declarative base whose metadata collects the ORM tables defined below."""
    pass


class Author(Base):
    """ORM mapping for the ``Author`` table: one row per model author."""
    __tablename__ = 'Author'
    
    # Surrogate primary key.
    id: Mapped[int] = mapped_column(primary_key=True)
    # Author name; the unique constraint means each author is stored once.
    name: Mapped[str] = mapped_column(String(40), unique=True)
    # Author kind string (the sample listing record shows 'org').
    type: Mapped[str] = mapped_column(String(40))
    # Boolean flag persisted in an Integer column.
    isEnterprise: Mapped[bool] = mapped_column(Integer)
    
    # Models owned by this author, kept in sync with Model.author.
    # cascade="all, delete-orphan": models follow the author's session
    # operations and are deleted once detached from this collection.
    models: Mapped[List["Model"]] = relationship(
        "Model", back_populates="author", cascade="all, delete-orphan"
    )
    
    def __repr__(self) -> str:
        """Return the author name wrapped in angle brackets."""
        return f"<{self.name}>"
    
    def to_dict(self) -> dict:
        """Return the column values as a plain dict (the relationship is omitted)."""
        return {
            "id": self.id,
            "name": self.name,
            "type": self.type,
            "isEnterprise": self.isEnterprise,
        }


class Model(Base):
    """ORM mapping for the ``Model`` table: one row per crawled model."""
    __tablename__ = 'Model'
    
    # Surrogate primary key.
    id: Mapped[int] = mapped_column(primary_key=True)
    # Model name; unique, so inserting the same name twice fails.
    # NOTE(review): String(40) may be shorter than some names - confirm the
    # target database tolerates longer values.
    name: Mapped[str] = mapped_column(String(40), unique=True)
    # Timestamp of the model's last modification.
    lastModified: Mapped[datetime.datetime] = mapped_column(DateTime)
    # Download and like counters.
    downloads: Mapped[int] = mapped_column(Integer)
    likes: Mapped[int] = mapped_column(Integer)
    
    # Foreign key to the owning Author row and the matching ORM relationship.
    author_id: Mapped[int] = mapped_column(Integer, ForeignKey('Author.id'))
    author: Mapped[Author] = relationship("Author", back_populates="models")
    
    def __repr__(self) -> str:
        """Return the model name wrapped in angle brackets."""
        return f"<{self.name}>"
    
    def to_dict(self) -> dict:
        """Return the column values as a plain dict (the author object is omitted)."""
        return {
            "id": self.id,
            "name": self.name,
            "lastModified": self.lastModified,
            "downloads": self.downloads,
            "likes": self.likes,
            "author_id": self.author_id,
        }

模型序列化器：给一个 dict，返回一个对象

In [9]:
def author_seralizer(author_data) -> Author:
    """Build an unsaved ``Author`` from an author-data dict.

    ``name`` and ``type`` are required keys; ``isEnterprise`` falls back to
    ``False`` when it is missing.
    """
    name = author_data['name']
    kind = author_data['type']
    enterprise = author_data.get('isEnterprise', False)
    return Author(name=name, type=kind, isEnterprise=enterprise)

def model_serializer(model_data) -> Model:
    """Build an unsaved ``Model`` (with a fresh ``Author``) from a listing dict.

    Reads the ``id``, ``lastModified``, ``downloads``, ``likes`` and
    ``authorData`` keys. ``lastModified`` must look like
    ``2024-11-11T03:31:31.000Z`` and is parsed into a naive ``datetime``.
    """
    repo_id = model_data['id']
    modified_at = datetime.datetime.strptime(
        model_data['lastModified'], '%Y-%m-%dT%H:%M:%S.%fZ'
    )
    download_count = model_data['downloads']
    like_count = model_data['likes']
    owner = author_seralizer(model_data['authorData'])
    return Model(
        name=repo_id,
        lastModified=modified_at,
        downloads=download_count,
        likes=like_count,
        author=owner,
    )

## 数据库操纵

### 创建数据库

In [13]:
from sqlalchemy import create_engine
# SQLite database stored in the file data.db; echo=False disables SQL logging.
engine = create_engine('sqlite:///data.db', echo=False)
# No migrations are used: every mapped table is dropped and recreated, so
# rows already stored in those tables are discarded each time this runs.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

### 爬取与存放数据

In [16]:
from sqlalchemy.orm import Session

# Crawl every listing page and store its models, reusing one Author row
# per author name.
mx_page = 99  # last page index (inclusive)
with Session(engine) as session:
    # Author.name is unique, so each author must be created only once.
    # Keep a name -> Author map (seeded with rows already in the database)
    # instead of issuing one SELECT per model.
    authors_by_name = {author.name: author for author in session.query(Author)}
    for page in range(mx_page + 1):
        models = get_single_page_models(page)
        for model in models:
            model_obj = model_serializer(model)
            known_author = authors_by_name.get(model_obj.author.name)
            if known_author is not None:
                # Reuse the existing author; the freshly built duplicate is
                # dropped and never reaches the session.
                model_obj.author = known_author
            else:
                authors_by_name[model_obj.author.name] = model_obj.author
            session.add(model_obj)

    session.commit()

## 转换为 Dataframe

In [18]:
import pandas as pd

# Load the whole Model table into a DataFrame and preview the first rows.
models = pd.read_sql_table('Model', 'sqlite:///data.db')
models.head()

Unnamed: 0,id,name,lastModified,downloads,likes,author_id
0,1,meta-llama/Meta-Llama-3-8B,2024-09-27 15:52:33,634445,5914,1
1,2,bigscience/bloom,2023-07-28 17:50:20,14597,4794,2
2,3,mistralai/Mixtral-8x7B-Instruct-v0.1,2024-08-19 13:18:42,1627931,4226,3
3,4,meta-llama/Llama-2-7b,2024-04-17 08:12:44,0,4187,1
4,5,meta-llama/Llama-2-7b-chat-hf,2024-04-17 08:40:48,1048193,4066,1


In [21]:
# Load the whole Author table into a DataFrame and preview the first rows.
authors = pd.read_sql_table('Author', 'sqlite:///data.db')
authors.head()

Unnamed: 0,id,name,type,isEnterprise
0,1,meta-llama,org,1
1,2,bigscience,org,0
2,3,mistralai,org,1
3,4,microsoft,org,0
4,5,google,org,0


In [23]:
# Summary statistics (count, mean, quartiles, ...) for the Model table.
models.describe()

Unnamed: 0,id,lastModified,downloads,likes,author_id
count,3000.0,3000,3000.0,3000.0,3000.0
mean,1500.5,2024-02-22 14:44:39.612333312,54864.75,119.233,200.751
min,1.0,2021-03-03 01:44:59,0.0,19.0,1.0
25%,750.75,2023-09-29 20:49:27.750000128,230.75,27.0,25.0
50%,1500.5,2024-03-14 18:41:07,1321.5,43.0,119.0
75%,2250.25,2024-07-29 06:11:10.750000128,5669.5,92.0,310.25
max,3000.0,2024-12-15 01:55:41,18395960.0,5914.0,787.0
std,866.169729,,534781.8,310.258508,207.114514


In [25]:
# Summary statistics (count, mean, quartiles, ...) for the Author table.
authors.describe()

Unnamed: 0,id,isEnterprise
count,787.0,787.0
mean,394.0,0.049555
std,227.331623,0.217162
min,1.0,0.0
25%,197.5,0.0
50%,394.0,0.0
75%,590.5,0.0
max,787.0,1.0


# 数据预处理

In [28]:
def low_quality_criteria(models, likes_threshold, download_threshold, Modified_threshold):
    """Return a boolean mask that is True for the low-quality rows of ``models``.

    A row is low quality when ANY of these holds: ``lastModified`` is earlier
    than ``Modified_threshold``, ``downloads`` is below ``download_threshold``,
    or ``likes`` is below ``likes_threshold``.
    """
    stale = models['lastModified'] < Modified_threshold
    rarely_downloaded = models['downloads'] < download_threshold
    rarely_liked = models['likes'] < likes_threshold
    return stale | rarely_downloaded | rarely_liked


def split_models(models, likes_threshold, download_threshold, Modified_threshold):
    """Split ``models`` into ``(high_quality_models, low_quality_models)``.

    Rows flagged by ``low_quality_criteria`` go to the second frame and all
    other rows to the first. Both frames keep the original index labels.
    """
    low_quality = low_quality_criteria(
        models, likes_threshold, download_threshold, Modified_threshold
    )
    # Return independent copies: callers add columns to the results, which
    # on a plain boolean-indexed slice triggers SettingWithCopyWarning.
    high_quality_models = models[~low_quality].copy()
    low_quality_models = models[low_quality].copy()
    return high_quality_models, low_quality_models

In [30]:
# Thresholds: lower quartile of likes and of downloads, plus a fixed
# last-modified cutoff.
summary = models.describe()
likes_threshold = summary.loc['25%', 'likes']
download_threshold = summary.loc['25%', 'downloads']
Modifiled_threshold = '2024-01-01 00:00:00'
high_quality_models, low_quality_models = split_models(
    models,
    likes_threshold=likes_threshold,
    download_threshold=download_threshold,
    Modified_threshold=Modifiled_threshold,
)
high_quality_models.head()

Unnamed: 0,id,name,lastModified,downloads,likes,author_id
0,1,meta-llama/Meta-Llama-3-8B,2024-09-27 15:52:33,634445,5914,1
2,3,mistralai/Mixtral-8x7B-Instruct-v0.1,2024-08-19 13:18:42,1627931,4226,3
4,5,meta-llama/Llama-2-7b-chat-hf,2024-04-17 08:40:48,1048193,4066,1
5,6,meta-llama/Meta-Llama-3-8B-Instruct,2024-09-27 15:52:39,2392030,3692,1
6,7,mistralai/Mistral-7B-v0.1,2024-07-24 14:04:08,1763962,3489,3


In [52]:
import re
# Size token: an integer or decimal number, optionally in "8x7B" form, ending
# in B or b. The lookbehind stops a match from starting in the middle of a
# number (so "0.5B" is not read as "5B"); the lookahead rejects tokens such
# as "4bit" where the b begins a word.
_MODEL_SIZE_PATTERN = re.compile(
    r'(?<![\d.])(\d+(?:\.\d+)?(?:x\d+(?:\.\d+)?)?[Bb])(?![A-Za-z])'
)


def extract_model_size(model_name):
    """Extract the parameter-size token from a model name.

    Args:
        model_name: Model name such as ``meta-llama/Meta-Llama-3-8B``.

    Returns:
        The first size token found, with its original case (``'8B'``,
        ``'7b'``, ``'0.5B'``, ``'8x7B'``), or ``None`` when the name has none.
    """
    match = _MODEL_SIZE_PATTERN.search(model_name)
    return match.group(1) if match else None

In [54]:
# Add a model_size column holding the size token parsed from each model name
# (None where the name contains no such token), then preview the result.
high_quality_models.loc[:,'model_size'] = high_quality_models['name'].apply(extract_model_size)
high_quality_models.head()

Unnamed: 0,id,name,lastModified,downloads,likes,author_id,model_size
0,1,meta-llama/Meta-Llama-3-8B,2024-09-27 15:52:33,634445,5914,1,8B
2,3,mistralai/Mixtral-8x7B-Instruct-v0.1,2024-08-19 13:18:42,1627931,4226,3,8x7B
4,5,meta-llama/Llama-2-7b-chat-hf,2024-04-17 08:40:48,1048193,4066,1,7b
5,6,meta-llama/Meta-Llama-3-8B-Instruct,2024-09-27 15:52:39,2392030,3692,1,8B
6,7,mistralai/Mistral-7B-v0.1,2024-07-24 14:04:08,1763962,3489,3,7B


# 数据可视化

## 模型参数量

## 时间分布

## 作者类型