In [1]:
# Text classification: sentiment analysis on Chinese text.
from transformers import pipeline

# Directory of the sentiment checkpoint on the local volume.
local_model_path = "/mnt/new_volume/hf/IDEA-CCNL/Erlangshen-Roberta-330M-Sentiment"

# Build the pipeline once; the following cells reuse `pipe`.
pipe = pipeline(task="sentiment-analysis", model=local_model_path)

# Single-sentence smoke test; the prediction is shown as the cell output.
pipe("今儿上海可真冷啊")

[{'label': 'Negative', 'score': 0.9974936246871948}]

In [2]:
# Score a lukewarm restaurant review with the sentiment pipeline; the result is the cell output.
pipe("我觉得这家店蒜泥白肉的味道一般")

[{'label': 'Negative', 'score': 0.999907374382019}]

In [3]:
# Score a compliment about learning quickly; the result is the cell output.
pipe("你学东西真的好快，理论课一讲就明白了")

[{'label': 'Positive', 'score': 0.9999562501907349}]

In [4]:
# Batch inference: passing a list of sentences returns one prediction per sentence,
# in the same order as the inputs.
text_list = [
    "今儿上海可真冷啊。",
    "我觉得这家店蒜泥白肉的味道一般。",
    "你学东西真的好快，理论课一讲就明白了"
]

# The list of predictions is displayed as the cell output.
pipe(text_list)

[{'label': 'Negative', 'score': 0.9983922839164734},
 {'label': 'Negative', 'score': 0.9998031258583069},
 {'label': 'Positive', 'score': 0.9999562501907349}]

In [5]:
# Token Classification
import os

# The hub settings below are read when `transformers` / `huggingface_hub` is first
# imported, so they have to be exported BEFORE that import. If `transformers` was
# already imported in this kernel, restart the kernel for them to take effect.
os.environ['HF_HOME'] = '/mnt/new_volume/hf'
os.environ['HF_HUB_CACHE'] = '/mnt/new_volume/hf/hub'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from transformers import pipeline  # noqa: E402  (must follow the env setup above)

# Alternative checkpoint:
# classifier = pipeline("token-classification", model="dbmdz/bert-large-cased-finetuned-conll03-english")
classifier = pipeline(task="ner", model="dslim/bert-base-NER")

# Without an aggregation strategy the pipeline reports one prediction per token.
preds = classifier("Hugging Face is a French company based in New York City.")

# Keep a fixed set of fields and round the score so the printed rows are easy to read.
preds = [
    {
        "entity": pred["entity"],
        "score": round(pred["score"], 4),
        "index": pred["index"],
        "word": pred["word"],
        "start": pred["start"],
        "end": pred["end"],
    }
    for pred in preds
]
print(*preds, sep="\n")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'entity': 'B-ORG', 'score': 0.8935, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}
{'entity': 'I-ORG', 'score': 0.915, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}
{'entity': 'I-ORG', 'score': 0.9777, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}
{'entity': 'B-MISC', 'score': 0.9996, 'index': 6, 'word': 'French', 'start': 18, 'end': 24}
{'entity': 'B-LOC', 'score': 0.9995, 'index': 10, 'word': 'New', 'start': 42, 'end': 45}
{'entity': 'I-LOC', 'score': 0.9994, 'index': 11, 'word': 'York', 'start': 46, 'end': 50}
{'entity': 'I-LOC', 'score': 0.9996, 'index': 12, 'word': 'City', 'start': 51, 'end': 55}


In [6]:
# Merge token-level predictions into whole entities.
# `grouped_entities=True` is deprecated; `aggregation_strategy="simple"` is the
# replacement the library maps the old flag to, so the grouped output is unchanged.
classifier = pipeline(task="ner", model="dslim/bert-base-NER", aggregation_strategy="simple")
classifier("Hugging Face is a French company based in New York City.")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'ORG',
  'score': 0.928741,
  'word': 'Hugging Face',
  'start': 0,
  'end': 12},
 {'entity_group': 'MISC',
  'score': 0.9996295,
  'word': 'French',
  'start': 18,
  'end': 24},
 {'entity_group': 'LOC',
  'score': 0.9994915,
  'word': 'New York City',
  'start': 42,
  'end': 55}]

In [8]:
# Question answering: the pipeline returns a score, start/end offsets and the answer text.
from transformers import pipeline

question_answerer = pipeline(
    task="question-answering",
    model="consciousAI/question-answering-roberta-base-s-v2",
)

# Question and supporting context, passed to the pipeline as keyword arguments.
qa_inputs = {
    "question": "What is the name of the repository?",
    "context": "The name of the repository is huggingface/transformers",
}
preds = question_answerer(**qa_inputs)

# One-line report: rounded score, span offsets and the answer itself.
print(
    f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}"
)

score: 0.9998, start: 30, end: 54, answer: huggingface/transformers


In [9]:
# Second QA example: ask about the capital using a historical sentence as context.
qa_inputs = {
    "question": "What is the capital of China?",
    "context": "On 1 October 1949, CCP Chairman Mao Zedong formally proclaimed the People's Republic of China in Tiananmen Square, Beijing.",
}
preds = question_answerer(**qa_inputs)

# Same one-line report format as the previous QA example.
print(
    f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}"
)

score: 0.9688, start: 115, end: 122, answer: Beijing


In [16]:
# Summarization
import os

# The hub settings below are read when `transformers` / `huggingface_hub` is first
# imported, so they have to be exported BEFORE that import. If `transformers` was
# already imported in this kernel, restart the kernel for them to take effect.
os.environ['HF_HOME'] = '/mnt/new_volume/hf'
os.environ['HF_HUB_CACHE'] = '/mnt/new_volume/hf/hub'
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from transformers import pipeline  # noqa: E402  (must follow the env setup above)

# `min_length` / `max_length` are given at construction time, so they apply to
# every call made through `summarizer`.
summarizer = pipeline(
    task="summarization",
    model="/mnt/new_volume/hf/medical_summarization",
    min_length=8,
    max_length=32,
)

In [17]:
# Summarize a paragraph describing the Transformer model; the summary is the cell output.
summarizer(
    """
    In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, 
    replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. 
    For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. 
    On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. 
    In the former task our best model outperforms even all previously reported ensembles.
    """
)

[{'summary_text': 'in this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in'}]

In [18]:
# Summarize a paragraph about large language models; the summary is the cell output.
summarizer(
    '''
    Large language models (LLM) are very large deep learning models that are pre-trained on vast amounts of data. 
    The underlying transformer is a set of neural networks that consist of an encoder and a decoder with self-attention capabilities. 
    The encoder and decoder extract meanings from a sequence of text and understand the relationships between words and phrases in it.
    Transformer LLMs are capable of unsupervised training, although a more precise explanation is that transformers perform self-learning. 
    It is through this process that transformers learn to understand basic grammar, languages, and knowledge.
    Unlike earlier recurrent neural networks (RNN) that sequentially process inputs, transformers process entire sequences in parallel. 
    This allows the data scientists to use GPUs for training transformer-based LLMs, significantly reducing the training time.
    '''
)


[{'summary_text': 'large language models ( LLMs) are very large deep learning models that are pre-trained on vast amounts of data . the underlying'}]

In [4]:
# Audio classification with a HuBERT checkpoint stored on the local volume.
from transformers import pipeline

classifier = pipeline(
    task="audio-classification",
    model="/mnt/new_volume/hf/superb/hubert-base-superb-er",
)
# Alternative checkpoint (requires trusting the repository's remote code):
# classifier = pipeline("audio-classification", model="/mnt/new_volume/hf/m-a-p/MERT-v1-95M", trust_remote_code=True)

Some weights of the model checkpoint at /mnt/new_volume/hf/superb/hubert-base-superb-er were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at /mnt/new_volume/hf/superb/hubert-base-superb-er and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.e

In [None]:
# Use a test file hosted in a Hugging Face dataset repository.
# The URL has to use `/resolve/` so the raw audio bytes are downloaded; the
# `/blob/` form points at the HTML file-viewer page, which cannot be decoded as audio.
preds = classifier("https://hf-mirror.com/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")

# Keep only score and label, rounding the score for readability.
preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
preds

In [5]:
# Run the audio classifier on a file from the local disk.
preds = classifier("/data/audio/mlk.flac")

# Reduce each prediction to a rounded score plus its label.
preds = [
    dict(score=round(item["score"], 4), label=item["label"])
    for item in preds
]
preds

[{'score': 0.4532, 'label': 'hap'},
 {'score': 0.3622, 'label': 'sad'},
 {'score': 0.0943, 'label': 'neu'},
 {'score': 0.0903, 'label': 'ang'}]

In [12]:
# Automatic speech recognition (ASR)
from transformers import pipeline

# The checkpoint is selected through the `model` argument (a local directory here).
transcriber = pipeline(
    model="/mnt/new_volume/hf/openai/whisper-small",
    task="automatic-speech-recognition",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Transcribe the local audio file; `text` holds the pipeline's result, which is
# then displayed as the cell output.
text = transcriber("/data/audio/mlk.flac")
text

{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}

In [1]:
# Computer Vision
# Image classification with a ViT checkpoint stored on the local volume.
from transformers import pipeline

classifier = pipeline(
    model="/mnt/new_volume/hf/vit-base-patch16-224",
    task="image-classification",
)

W0726 01:23:18.819000 18200 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [None]:
# Classify an image referenced by URL.
preds = classifier("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")

# Reduce each prediction to a rounded score plus its label.
preds = [
    dict(score=round(item["score"], 4), label=item["label"])
    for item in preds
]

# One prediction per output line.
print("\n".join(str(item) for item in preds))

In [2]:
# Classify a local image (cat).
preds = classifier("/data/image/cat-chonk.jpg")

# Reduce each prediction to a rounded score plus its label.
preds = [
    dict(score=round(item["score"], 4), label=item["label"])
    for item in preds
]

# One prediction per output line.
print("\n".join(str(item) for item in preds))

{'score': 0.8329, 'label': 'Egyptian cat'}
{'score': 0.0553, 'label': 'toy terrier'}
{'score': 0.0344, 'label': 'Siamese cat, Siamese'}
{'score': 0.0104, 'label': 'tabby, tabby cat'}
{'score': 0.0088, 'label': 'Chihuahua'}


In [3]:
# Classify a local image (panda).
preds = classifier("/data/image/panda.png")

# Reduce each prediction to a rounded score plus its label.
preds = [
    dict(score=round(item["score"], 4), label=item["label"])
    for item in preds
]

# One prediction per output line.
print("\n".join(str(item) for item in preds))

{'score': 0.99, 'label': 'giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca'}
{'score': 0.0027, 'label': 'sloth bear, Melursus ursinus, Ursus ursinus'}
{'score': 0.0018, 'label': 'American black bear, black bear, Ursus americanus, Euarctos americanus'}
{'score': 0.0015, 'label': 'ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus'}
{'score': 0.0012, 'label': 'brown bear, bruin, Ursus arctos'}


In [None]:
# Object detection pipeline backed by a checkpoint on the local volume.
from transformers import pipeline

# NOTE(review): judging by its name this checkpoint targets table detection, while the
# cells that use `detector` feed it cat/dog photos - confirm this is the intended model.
model_path = "/mnt/new_volume/hf/microsoft/table-transformer-detection"
detector = pipeline(model=model_path, task="object-detection")
# Alternative general-purpose detector:
# detector = pipeline(task="object-detection", model = "facebook/detr-resnet-50")

In [None]:
# Detect objects in an image referenced by URL.
preds = detector("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")

# Keep the rounded score, the label and the bounding box of every detection.
preds = [
    dict(score=round(item["score"], 4), label=item["label"], box=item["box"])
    for item in preds
]
preds

In [None]:
# 使用本地图片
preds = detector(
    "/data/image/cat_dog.jpg"
)
preds = [{"score": round(pred["score"], 4), "label": pred["label"], "box": pred["box"]} for pred in preds]
preds