In [30]:
import transformers
import pprint
import torch

TypeError: pformat() missing 1 required positional argument: 'object'

# §4.1 `Auto*`类

对于特定的`checkpoint: str`，我们有以下三种使用的方法：

1. 使用`transformers.pipeline`
2. 使用`transformers`的特定`<MODEL>Tokenizer`/`<MODEL>For<TASK>`
3. 使用`transformers`的通用`AutoTokenizer`/`AutoModelFor<TASK>`

In [23]:
checkpoint: str = "camembert-base"
raw_data = "The capital of France is the <mask>."

# 方法1
pipeline = transformers.pipeline("fill-mask", checkpoint)
pprint.pprint(pipeline(raw_data))

# 方法2
tokenizer = transformers.CamembertTokenizer.from_pretrained(checkpoint)
model = transformers.CamembertForMaskedLM.from_pretrained(checkpoint)
inputs = tokenizer(raw_data, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero()[0]
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
pprint.pprint(tokenizer.decode(predicted_token_id))

# 方法3
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForMaskedLM.from_pretrained(checkpoint)
inputs = tokenizer(raw_data, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero()[0]
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
pprint.pprint(tokenizer.decode(predicted_token_id))

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.13117577135562897,
  'sequence': 'The capital of France is the city.',
  'token': 23151,
  'token_str': 'city'},
 {'score': 0.09899164736270905,
  'sequence': 'The capital of France is the City.',
  'token': 6383,
  'token_str': 'City'},
 {'score': 0.04552188888192177,
  'sequence': 'The capital of France is the French.',
  'token': 11098,
  'token_str': 'French'},
 {'score': 0.038993123918771744,
  'sequence': 'The capital of France is the London.',
  'token': 15970,
  'token_str': 'London'},
 {'score': 0.026898596435785294,
  'sequence': 'The capital of France is the world.',
  'token': 18909,
  'token_str': 'world'}]


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'city'


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'city'


In [18]:
(inputs.input_ids == tokenizer.mask_token_id)[0].nonzero()[0]

tensor([6])

# §4.2 上传HuggingFace Hub

In [24]:
import huggingface_hub

huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

```python
training_args = transformers.TrainingArguments(
    "bert-finetuned-mrpc", 
    save_strategy="epoch", 
    push_to_hub=True, # 开启上传开关，每个Trainer的Epoch就上传一次
    hub_model_id="组织名称/仓库名称"
)
trainer = transformers.Trainer(
    model=model
)
# trainer.train()
# trainer.push_to_hub() # 最后上传一次
```

In [27]:
# 上传演示

checkpoint = "camembert-base"

model = transformers.AutoModelForMaskedLM.from_pretrained(checkpoint)
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)

# 执行之后访问huggingface.co/<USERNAME>/<REPONAME>
model.push_to_hub("test-model")
tokenizer.push_to_hub("test-tokenizer") 
# tokenizer/model.push_to_hub(
#     repo_id: str = "...",
#     organization: typing.Optional[str] = "...",
#     use_auth_key: typing.Optional[str] = "<TOKEN>"
# )

model.save_pretrained("本地路径")
tokenizer.save_pretrained("本地路径")

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [31]:
# 查看账户信息

# 账户登录与注销
# huggingface_hub.login()
# huggingface_hub.logout()
pprint.pprint(huggingface_hub.whoami())

# 写入仓库
# huggingface_hub.create_repo()
# huggingface_hub.delete_repo()
# huggingface_hub.update_repo_visibility()

# 读取所有公开仓库
pprint.pprint(huggingface_hub.list_models())
pprint.pprint(huggingface_hub.list_datasets())
pprint.pprint(huggingface_hub.list_metrics())
# huggingface_hub.list_repo_files()

{'auth': {'accessToken': {'createdAt': '2024-09-06T09:09:44.796Z',
                          'displayName': 'VSCode',
                          'fineGrained': {'canReadGatedRepos': True,
                                          'global': ['inference.serverless.write',
                                                     'discussion.write',
                                                     'post.write'],
                                          'scoped': [{'entity': {'_id': '63fe1ef80c1bbe8e29d40edc',
                                                                 'name': 'NoComment',
                                                                 'type': 'user'},
                                                      'permissions': ['user.webhooks.read',
                                                                      'repo.content.read',
                                                                      'repo.write',
                                                      

In [None]:
# Repo本身也可以视为一个对象

repo = huggingface_hub.Repository(
    local_dir="本地目录",
    clone_from="<USERNAME>/<REPONAME>"
)

repo.git_pull()
repo.git_add()
repo.git_commit()
repo.git_push()