# 对 HuggingFace 公开大模型的数据分析

2024 年开源大模型年度回顾

# 数据爬取

In [38]:
import requests
from typing import List
# this url was reverse engineered
base_url = "https://huggingface.co/models-json"

def get_single_page_models(p) -> List[dict]:
    '''30 models per page'''
    assert 1 <= p <= 99, 'page must be positive and less than or equal to 99'
    resp = requests.get(base_url, params={"p": p})
    models = resp.json().get('models', [])
    
    return models

# example of model data
get_single_page_models(1)[0]

{'author': 't-tech',
 'authorData': {'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/674ea07d320a043daeb2d98b/IwSCMolFY4Otk7sFXzWhi.jpeg',
  'fullname': 'T-Tech',
  'name': 't-tech',
  'type': 'org',
  'isHf': False,
  'isMod': False,
  'isEnterprise': False,
  'followerCount': 168},
 'downloads': 3125,
 'gated': False,
 'id': 't-tech/T-lite-it-1.0',
 'inference': 'library-not-detected',
 'lastModified': '2024-12-13T15:26:19.000Z',
 'likes': 48,
 'private': False,
 'repoType': 'model',
 'isLikedByUser': False}

## Sqlite ORM

In [39]:
import datetime

from sqlalchemy import Integer, String, DateTime, ForeignKey
from sqlalchemy.orm import DeclarativeBase
from sqlalchemy.orm import Mapped, mapped_column, relationship

class Base(DeclarativeBase):
    pass


class Author(Base):
    __tablename__ = 'Author'
    
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(40))
    isEnterprise: Mapped[bool] = mapped_column(Integer)
    
    models: Mapped[List["Model"]] = relationship(
        "Model", back_populates="author", cascade="all, delete-orphan"
    )
    
    def __repr__(self):
        return f"<{self.name}>"


class Model(Base):
    __tablename__ = 'Model'
    
    id: Mapped[int] = mapped_column(primary_key=True)
    name: Mapped[str] = mapped_column(String(40))
    lastModified: Mapped[datetime.datetime] = mapped_column(DateTime)
    downloads: Mapped[int] = mapped_column(Integer)
    likes: Mapped[int] = mapped_column(Integer)
    
    author_id: Mapped[int] = mapped_column(Integer, ForeignKey('Author.id'))
    author: Mapped[Author] = relationship("Author", back_populates="models")
    
    def __repr__(self):
        return f"<{self.name}>"

In [40]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///data.db', echo=True)
Base.metadata.create_all(engine)

2024-12-14 23:41:20,120 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-12-14 23:41:20,121 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Author")
2024-12-14 23:41:20,122 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-14 23:41:20,123 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("Author")
2024-12-14 23:41:20,124 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-14 23:41:20,124 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Model")
2024-12-14 23:41:20,125 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-14 23:41:20,126 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("Model")
2024-12-14 23:41:20,127 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-12-14 23:41:20,129 INFO sqlalchemy.engine.Engine 
CREATE TABLE "Author" (
	id INTEGER NOT NULL, 
	name VARCHAR(40) NOT NULL, 
	"isEnterprise" INTEGER NOT NULL, 
	PRIMARY KEY (id)
)


2024-12-14 23:41:20,129 INFO sqlalchemy.engine.Engine [no key 0.00064s] ()
2024-12-14 23:41:20,150 INFO sqlalchemy.engine.Engin

In [41]:
def author_seralizer(author_data) -> Author:
    # print(author_data)
    author = Author(
        name=author_data['name'],
        isEnterprise=author_data.get('isEnterprise', False),
    )
    return author

def model_serializer(model_data) -> Model:
    model = Model(
        name=model_data['id'],
        lastModified=datetime.datetime.strptime(model_data['lastModified'], '%Y-%m-%dT%H:%M:%S.%fZ'),
        downloads=model_data['downloads'],
        likes=model_data['likes'],
        author=author_seralizer(model_data['authorData']),
    )
    return model

In [42]:
from sqlalchemy.orm import Session

# edit this to adjust the number of pages to scrape
mx_page = 2

with Session(engine) as session:
    for page in range(1, mx_page + 1):
        models = get_single_page_models(page)
        for model in models:
            model_obj = model_serializer(model)
            session.add(model_obj)
    
    session.commit()

{'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/674ea07d320a043daeb2d98b/IwSCMolFY4Otk7sFXzWhi.jpeg', 'fullname': 'T-Tech', 'name': 't-tech', 'type': 'org', 'isHf': False, 'isMod': False, 'isEnterprise': False, 'followerCount': 168}
{'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/62cfefa74b3e8dc1e32e38bf/GgkglHn3sIo6C5XGTtZSs.png', 'fullname': 'Black Forest Labs', 'name': 'black-forest-labs', 'type': 'org', 'isHf': False, 'isMod': False, 'isEnterprise': False, 'followerCount': 4977}
{'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/62cfefa74b3e8dc1e32e38bf/GgkglHn3sIo6C5XGTtZSs.png', 'fullname': 'Black Forest Labs', 'name': 'black-forest-labs', 'type': 'org', 'isHf': False, 'isMod': False, 'isEnterprise': False, 'followerCount': 4977}
{'avatarUrl': 'https://cdn-avatars.huggingface.co/v1/production/uploads/62af434f457691d789c36aeb/ZeVYADUPsv_20G3FkybFf.jpeg', 'fullname': 'Dnotitia Inc.', 'name': 'dnotitia', 'type': 'o

In [43]:
import numpy as np
import pandas as pd

# 数据存储

# 数据预处理

# 数据可视化