# 对 HuggingFace 公开大模型的数据分析

最受用户喜欢的文本生成模型是怎样的？

# 数据爬取

Hugging Face 仅提供 100 页（每页30个模型）的数据供浏览，也即 3000 个模型，但存放的数据远比这个多（截至2024.12.15有 1,201,412 个模型）。

因此，爬取全站的模型是不可行的：哪怕将范围缩小到最近 7 天内的文本生成模型，也有超过 3000 个。于是，我们将爬取策略调整为

- 爬取前 3000 个最受用户喜欢的文本生成模型

对其进行数据分析。

In [115]:
import requests
import json
import os
from typing import List
# Listing endpoint queried by the crawler. It was found by reverse engineering
# the site, so treat it as unofficial -- presumably it can change without notice.
base_url = "https://huggingface.co/models-json"

def get_single_page_models(p) -> List[dict]:
    """Return the raw model records of one listing page, using a disk cache.

    The request asks for models sorted by likes and restricted to the
    ``text-generation`` pipeline tag. A page that was fetched before is read
    back from ``cache/models_page_<p>.json`` instead of hitting the network.

    Args:
        p: Zero-based page index, from 0 to 99 inclusive.

    Returns:
        The list found under the ``models`` key of the JSON response (or of
        the cached copy). An empty list is returned when the key is missing.

    Raises:
        ValueError: If ``p`` is outside the 0..99 range.
        requests.HTTPError: If the server answers with an error status.
    """
    # Explicit check instead of ``assert``: asserts are stripped under -O.
    if not 0 <= p <= 99:
        raise ValueError(f'page must be between 0 and 99 inclusive, got {p!r}')

    cache_path = f'cache/models_page_{p}.json'
    # Serve from the cache when this page was already downloaded.
    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    resp = requests.get(
        base_url,
        params={
            "p": p,
            "sort": "likes",
            "pipeline_tag": "text-generation",
            "withCount": True,  # default true
        },
        timeout=30,  # never hang forever on a stalled connection
    )
    # Fail loudly on 4xx/5xx so an error page is never cached as "no models".
    resp.raise_for_status()
    models = resp.json().get('models', [])

    # Only persist real results; an empty answer would otherwise poison the
    # cache and be returned on every later run.
    if models:
        os.makedirs('cache', exist_ok=True)
        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump(models, f)

    return models

# Example record: the first model on page index 1 (page indices start at 0),
# displayed to show which fields a raw model entry carries.
get_single_page_models(1)[0]

{'author': '01-ai',
 'authorData': {'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/6536187279f1de44b5e02d0f/-T8Xw0mX67_R73b7Re1y-.png',
  'fullname': '01-ai',
  'name': '01-ai',
  'type': 'org',
  'isHf': False,
  'isMod': False,
  'isEnterprise': False,
  'followerCount': 570},
 'downloads': 4572,
 'gated': False,
 'id': '01-ai/Yi-34B',
 'inference': 'not-popular-enough',
 'lastModified': '2024-11-11T03:31:31.000Z',
 'likes': 1287,
 'pipeline_tag': 'text-generation',
 'private': False,
 'repoType': 'model',
 'isLikedByUser': False,
 'widgetOutputUrls': []}

## SQLite ORM

定义数据模型，并将其映射到数据库表；数据存储在本地 SQLite 数据库中。

In [116]:
import datetime

from sqlalchemy import Integer, String, DateTime, ForeignKey
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped, mapped_column, relationship

class Base(DeclarativeBase):
    """Declarative base class that the ORM models in this notebook inherit from."""


class Author(Base):
    """ORM mapping for the ``Author`` table: the account that owns models.

    Each author has a unique ``name`` and a ``type`` string (for example
    ``'org'``). The ``models`` relationship lists the author's ``Model`` rows;
    its cascade setting makes them follow the author on save and delete.
    """

    __tablename__ = 'Author'

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(40), unique=True)
    type: Mapped[str] = mapped_column(String(40))
    # Annotated as bool in Python but persisted in an INTEGER column.
    isEnterprise: Mapped[bool] = mapped_column(Integer)

    models: Mapped[List["Model"]] = relationship(
        "Model",
        back_populates="author",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return the author name wrapped in angle brackets."""
        return "<{}>".format(self.name)

    def to_dict(self):
        """Return the column values of this row as a plain dict."""
        columns = ("id", "name", "type", "isEnterprise")
        return {column: getattr(self, column) for column in columns}


class Model(Base):
    """ORM mapping for the ``Model`` table: one crawled model record.

    Stores the unique model ``name``, its last-modified timestamp, download
    and like counts, and a foreign key to the owning ``Author`` row.
    """

    __tablename__ = 'Model'

    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(40), unique=True)
    lastModified: Mapped[datetime.datetime] = mapped_column(DateTime)
    downloads: Mapped[int] = mapped_column(Integer)
    likes: Mapped[int] = mapped_column(Integer)

    # Many-to-one link back to the owning author (mirror of Author.models).
    author_id: Mapped[int] = mapped_column(Integer, ForeignKey('Author.id'))
    author: Mapped[Author] = relationship("Author", back_populates="models")

    def __repr__(self):
        """Return the model name wrapped in angle brackets."""
        return "<{}>".format(self.name)

    def to_dict(self):
        """Return the column values of this row as a plain dict."""
        columns = (
            "id",
            "name",
            "lastModified",
            "downloads",
            "likes",
            "author_id",
        )
        return {column: getattr(self, column) for column in columns}

模型序列化器：给一个 dict，返回一个对象

In [117]:
def author_seralizer(author_data) -> Author:
    """Build an unsaved ``Author`` from an author mapping.

    ``name`` and ``type`` are required keys; ``isEnterprise`` falls back to
    ``False`` when the key is absent.
    """
    fields = {
        'name': author_data['name'],
        'type': author_data['type'],
        'isEnterprise': author_data.get('isEnterprise', False),
    }
    return Author(**fields)

def model_serializer(model_data) -> Model:
    """Build an unsaved ``Model`` (with a fresh ``Author``) from a raw record.

    The record's ``id`` becomes the model name. ``lastModified`` is parsed
    with a format whose trailing ``Z`` is matched literally, so the resulting
    datetime is naive. The nested ``authorData`` mapping is converted with
    ``author_seralizer``.
    """
    attrs = {}
    attrs['name'] = model_data['id']
    attrs['lastModified'] = datetime.datetime.strptime(
        model_data['lastModified'], '%Y-%m-%dT%H:%M:%S.%fZ'
    )
    attrs['downloads'] = model_data['downloads']
    attrs['likes'] = model_data['likes']
    attrs['author'] = author_seralizer(model_data['authorData'])
    return Model(**attrs)

## 数据库操纵

### 创建数据库

In [118]:
from sqlalchemy import create_engine
# Engine for the local SQLite file data.db; echo=True logs every SQL statement.
engine = create_engine('sqlite:///data.db', echo=True)
# No migrations are used: drop all mapped tables and recreate them, so the
# schema starts empty on every run. Any rows stored earlier are discarded.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

2024-12-15 15:32:58,800 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-12-15 15:32:58,801 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Author")
2024-12-15 15:32:58,801 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-15 15:32:58,803 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Model")
2024-12-15 15:32:58,803 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-15 15:32:58,805 INFO sqlalchemy.engine.Engine 
DROP TABLE "Model"
2024-12-15 15:32:58,805 INFO sqlalchemy.engine.Engine [no key 0.00072s] ()


2024-12-15 15:32:58,834 INFO sqlalchemy.engine.Engine 
DROP TABLE "Author"
2024-12-15 15:32:58,834 INFO sqlalchemy.engine.Engine [no key 0.00083s] ()
2024-12-15 15:32:58,849 INFO sqlalchemy.engine.Engine COMMIT
2024-12-15 15:32:58,850 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-12-15 15:32:58,851 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Author")
2024-12-15 15:32:58,852 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-15 15:32:58,853 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("Author")
2024-12-15 15:32:58,853 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-15 15:32:58,854 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Model")
2024-12-15 15:32:58,855 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-15 15:32:58,856 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("Model")
2024-12-15 15:32:58,856 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-15 15:32:58,858 INFO sqlalchemy.engine.Engine 
CREATE TABLE "Author" (
	id INTEGER NOT NULL, 
	n

### 爬取与存放数据

In [119]:
from sqlalchemy.orm import Session

mx_page = 99
with Session(engine) as session:
    # Walk every page from 0 to mx_page and stage each model for insertion.
    for page in range(mx_page + 1):
        models = get_single_page_models(page)
        for model in models:
            model_obj = model_serializer(model)
            # Reuse the stored Author when one with this name already exists;
            # inserting a second one would violate the unique name constraint.
            existing_author = (
                session.query(Author)
                .filter(Author.name == model_obj.author.name)
                .first()
            )
            if existing_author is not None:
                model_obj.author = existing_author
            session.add(model_obj)

    # Everything is committed in a single transaction after the last page.
    session.commit()

2024-12-15 15:33:00,330 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-12-15 15:33:00,333 INFO sqlalchemy.engine.Engine SELECT "Author".id AS "Author_id", "Author".name AS "Author_name", "Author".type AS "Author_type", "Author"."isEnterprise" AS "Author_isEnterprise" 
FROM "Author" 
WHERE "Author".name = ?
 LIMIT ? OFFSET ?
2024-12-15 15:33:00,333 INFO sqlalchemy.engine.Engine [generated in 0.00079s] ('meta-llama', 1, 0)
2024-12-15 15:33:00,336 INFO sqlalchemy.engine.Engine INSERT INTO "Author" (name, type, "isEnterprise") VALUES (?, ?, ?)
2024-12-15 15:33:00,337 INFO sqlalchemy.engine.Engine [generated in 0.00064s] ('meta-llama', 'org', True)
2024-12-15 15:33:00,339 INFO sqlalchemy.engine.Engine INSERT INTO "Model" (name, "lastModified", downloads, likes, author_id) VALUES (?, ?, ?, ?, ?)
2024-12-15 15:33:00,340 INFO sqlalchemy.engine.Engine [generated in 0.00052s] ('meta-llama/Meta-Llama-3-8B', '2024-09-27 15:52:33.000000', 658207, 5914, 1)
2024-12-15 15:33:00,341 INFO sqlalchem

## 转换为 DataFrame

In [120]:
import pandas as pd

# Load the whole Model table into a DataFrame and preview the first rows.
models = pd.read_sql_table(table_name='Model', con='sqlite:///data.db')
models.head()

Unnamed: 0,id,name,lastModified,downloads,likes,author_id
0,1,meta-llama/Meta-Llama-3-8B,2024-09-27 15:52:33,658207,5914,1
1,2,bigscience/bloom,2023-07-28 17:50:20,14597,4794,2
2,3,mistralai/Mixtral-8x7B-Instruct-v0.1,2024-08-19 13:18:42,1290346,4226,3
3,4,meta-llama/Llama-2-7b,2024-04-17 08:12:44,0,4187,1
4,5,meta-llama/Llama-2-7b-chat-hf,2024-04-17 08:40:48,1048193,4066,1


In [121]:
# Load the whole Author table into a DataFrame and preview the first rows.
authors = pd.read_sql_table(table_name='Author', con='sqlite:///data.db')
authors.head()

Unnamed: 0,id,name,type,isEnterprise
0,1,meta-llama,org,1
1,2,bigscience,org,0
2,3,mistralai,org,1
3,4,microsoft,org,0
4,5,google,org,0


In [122]:
# Summary statistics for the columns of the Model table.
models.describe()

Unnamed: 0,id,lastModified,downloads,likes,author_id
count,3000.0,3000,3000.0,3000.0,3000.0
mean,1500.5,2024-02-22 14:44:39.612333312,54315.1,119.225667,200.764333
min,1.0,2021-03-03 01:44:59,0.0,19.0,1.0
25%,750.75,2023-09-29 20:49:27.750000128,237.75,27.0,25.0
50%,1500.5,2024-03-14 18:41:07,1344.5,43.0,119.0
75%,2250.25,2024-07-29 06:11:10.750000128,5713.75,92.0,310.25
max,3000.0,2024-12-15 01:55:41,18395960.0,5914.0,787.0
std,866.169729,,541475.0,310.25295,207.117189


In [123]:
# Summary statistics for the numeric columns of the Author table.
authors.describe()

Unnamed: 0,id,isEnterprise
count,787.0,787.0
mean,394.0,0.049555
std,227.331623,0.217162
min,1.0,0.0
25%,197.5,0.0
50%,394.0,0.0
75%,590.5,0.0
max,787.0,1.0


# 数据预处理

# 数据可视化

## 模型参数量

## 时间分布

## 作者类型