pipeline

In [None]:
from transformers import pipeline

# Use the sentiment-analysis pipeline. No `model=` is passed, so the library
# chooses the checkpoint for this task.
classifier = pipeline('sentiment-analysis')
# Classify a single sentence; this is the last expression of the cell.
classifier('We are very happy to introduce pipeline to the transformers repository.')

In [None]:
from transformers import pipeline

# Use the question-answering pipeline. No `model=` is passed, so the library
# chooses the checkpoint for this task.
question_answerer = pipeline('question-answering')
# The question and the context to search are passed together as one dict.
question_answerer({
    'question': 'What is the name of the repository ?',
    'context': 'Pipeline has been included in the huggingface/transformers repository'
})
# Sample output, kept as a comment so that the cell displays the real result of
# the call above instead of a hard-coded dict literal:
# {'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}


下载和使用预训练模型

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the base model from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

# return_tensors="pt" requests PyTorch tensors from the tokenizer; the result is
# unpacked as keyword arguments into the model call.
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs)

In [None]:
from transformers import pipeline

# Rebuild the sentiment-analysis pipeline, again without naming a model.
classifier = pipeline("sentiment-analysis")

# Single-string input.
classifier("We are very happy to show you the 🤗 Transformers library.")

In [None]:
# Classify two sentences in one call, then report each prediction's label and
# its score rounded to four decimal places.
sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
results = classifier(sentences)
for result in results:
    label = result["label"]
    score = round(result["score"], 4)
    print(f"label: {label}, with score: {score}")

自动语音识别

In [None]:
import torch
from transformers import pipeline

# Build a speech-recognition pipeline with an explicitly named checkpoint.
speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

In [None]:
from datasets import load_dataset, Audio

# Load the "train" split of the "en-US" configuration of PolyAI/minds14.
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")

In [None]:
# Re-cast the "audio" column to the sampling rate reported by the pipeline's
# feature extractor, so that both use the same rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))

In [None]:
# Transcribe the "audio" entries of the first four rows and print only the
# "text" field of each returned item.
result = speech_recognizer(dataset[:4]["audio"])
print([d["text"] for d in result])

在 pipeline 中使用另一个模型和分词器：针对法语文本的情感分析

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Load the model and the tokenizer from the same checkpoint name.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pass the loaded objects to the pipeline instead of a checkpoint name.
# NOTE: `pipeline` is not imported in this cell; it relies on an earlier cell
# having run `from transformers import pipeline`.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")

# 使用pipeline运行推理

## pipeline用法

In [None]:
from transformers import pipeline

# Only the task is given here; no checkpoint is named.
transcriber = pipeline(task="automatic-speech-recognition")

# The input is passed as a URL string pointing at a .flac file.
transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")

In [None]:
# Only the checkpoint is given here; no `task=` argument is passed.
transcriber = pipeline(model="openai/whisper-large-v2")
transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")

如果有多个输入，可以将输入作为列表传递：

In [None]:
# Several inputs are passed in a single call as a list of URL strings.
transcriber(
    [
        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac",
        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
    ]
)

## 参数

In [None]:
# NOTE(review): `my_parameter` and the literal `...` arguments look like
# placeholders that illustrate the override pattern rather than a real option
# and real inputs — confirm before running this cell as-is.
transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1)

out = transcriber(...)  # This will use `my_parameter=1`.
out = transcriber(..., my_parameter=2)  # This will override and use `my_parameter=2`.
out = transcriber(...)  # This will go back to using `my_parameter=1`.

## 设备

In [None]:
# device=0 selects the device with index 0 (presumably the first GPU — confirm
# for your setup).
transcriber = pipeline(model="openai/whisper-large-v2", device=0)

In [None]:
# device_map="auto" is passed instead of naming a device; presumably the
# library then decides the placement itself — confirm against the pipeline docs.
transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")

## 批次大小

In [None]:
# batch_size=2 is set once, when the pipeline is created.
transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
# Four URLs: 1.flac through 4.flac.
audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
texts = transcriber(audio_filenames)

## 任务特定参数

In [None]:
# return_timestamps=True is a task-specific option; presumably it adds
# timestamps to the transcription output — confirm against the pipeline docs.
transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True)
transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")

In [None]:
# chunk_length_s=30: presumably the audio is processed in 30-second chunks —
# confirm against the pipeline docs.
transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30)
transcriber("https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/ted_60.wav")

## 在数据集上使用pipeline

In [None]:
from transformers import pipeline
def data():
    """Lazily yield the 1000 prompts "My example 0" through "My example 999"."""
    yield from (f"My example {index}" for index in range(1000))


# The generator is handed to the pipeline directly and the outputs are consumed
# one at a time while the total length of the generated text is accumulated.
pipe = pipeline(model="openai-community/gpt2", device=0)
generated_characters = 0
for out in pipe(data()):
    # Each output is indexed with [0] to reach its "generated_text" string.
    generated_characters += len(out[0]["generated_text"])

In [None]:
import gradio as gr
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Module-level placeholders. `greet` never rebinds them: its `device` is a
# parameter, so whatever it assigns stays local to that call.
is_GPU = None
device = None

# FashionMNIST training split stored under ./data (download=True fetches it
# when needed); ToTensor() is applied to every image.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

# Matching test split (train=False), with the same storage folder and transform.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)


class NeuralNetwork(nn.Module):
    """Fully connected classifier: 28*28 inputs -> 512 -> 512 -> 10 outputs."""

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        hidden = 512
        # Layers are created in the same order as they are applied.
        layers = [
            nn.Linear(28 * 28, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and return the raw outputs (logits) of the layer stack."""
        return self.linear_relu_stack(self.flatten(x))

# Training hyperparameters shared by the code below.
learning_rate = 0.1  # step size handed to the SGD optimizer
batch_size = 64  # samples per mini-batch
epochs = 10  # number of train + test passes over the data

def train_loop(dataloader, model, loss_fn, optimizer, text):
    """Train ``model`` for one pass over ``dataloader`` and log progress.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that exposes a sized
            ``dataset`` attribute.
        model: The ``nn.Module`` to optimize; it is switched to training mode.
        loss_fn: Callable mapping ``(pred, y)`` to a scalar loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        text: List of log lines; a progress line is appended every 100 batches.

    Returns:
        The same ``text`` list, mutated in place.
    """
    size = len(dataloader.dataset)
    # Send each batch to wherever the model's parameters live instead of
    # relying on a module-level ``device`` that the caller may never have set;
    # a mismatch makes the forward pass fail once the model sits on a GPU.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None
    # Training mode matters for layers such as batch normalization and dropout.
    model.train()
    seen = 0  # samples processed so far; exact for any batch size
    for batch, (X, y) in enumerate(dataloader):
        if target is not None:
            X, y = X.to(target), y.to(target)
        seen += len(X)

        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared after the step so that the
        # next batch starts from zero.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            text.append(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")
    return text


def test_loop(dataloader, model, loss_fn, text):
    """Evaluate ``model`` on ``dataloader`` and append a summary line to ``text``.

    Args:
        dataloader: Sized iterable of ``(X, y)`` batches that exposes a sized
            ``dataset`` attribute.
        model: The ``nn.Module`` to evaluate; it is switched to evaluation mode.
        loss_fn: Callable mapping ``(pred, y)`` to a scalar loss tensor.
        text: List of log lines; one accuracy / average-loss line is appended.

    Returns:
        The same ``text`` list, mutated in place.

    Raises:
        ZeroDivisionError: If the dataloader or its dataset is empty.
    """
    # Evaluation mode matters for layers such as batch normalization and dropout.
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # Send each batch to wherever the model's parameters live instead of
    # relying on a module-level ``device`` that the caller may never have set;
    # a mismatch makes the forward pass fail once the model sits on a GPU.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else None
    test_loss, correct = 0, 0

    # No gradients are needed for evaluation; skipping them saves work and memory.
    with torch.no_grad():
        for X, y in dataloader:
            if target is not None:
                X, y = X.to(target), y.to(target)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            # argmax over dimension 1 picks the predicted class of each sample.
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches  # average loss per batch
    correct /= size  # fraction of correctly classified samples
    text.append(f"Test Error-----Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")
    return text

def greet(device):
    """Train and evaluate a fresh ``NeuralNetwork`` on the requested device.

    Args:
        device: Device name typed by the user, e.g. ``"cpu"``, ``"cuda"`` or
            ``"cuda:0"``. Surrounding whitespace and letter case are ignored,
            and an empty value means ``"cpu"``.

    Returns:
        The training log as one newline-joined string.

    Raises:
        gr.Error: If the device name is not recognised, or CUDA is requested
            but not available. Gradio shows the message to the user.
    """
    # The value comes from a free-text box, so normalize it before use.
    requested = (device or "").strip().lower() or "cpu"
    try:
        target = torch.device(requested)
    except RuntimeError as err:
        raise gr.Error(f"Unknown device {device!r}; use cpu or cuda.") from err
    if target.type == "cuda" and not torch.cuda.is_available():
        raise gr.Error("cuda was requested but is not available on this machine.")

    # Decide from the parsed device type so that "cuda:0" also counts as GPU.
    is_gpu = target.type == "cuda"
    text = [f"is_GPU: {is_gpu}"]
    # pin_memory is only enabled when the batches will be copied to a GPU.
    train_dataloader = DataLoader(training_data, batch_size=batch_size, pin_memory=is_gpu)
    test_dataloader = DataLoader(test_data, batch_size=batch_size, pin_memory=is_gpu)
    model = NeuralNetwork().to(target)
    # Initialize the loss function and the optimizer
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    for t in range(epochs):
        text.append(f"Epoch {t+1}-------------------------------")
        train_loop(train_dataloader, model, loss_fn, optimizer, text)
        test_loop(test_dataloader, model, loss_fn, text)
    text.append("Done!")
    return "\n".join(text)
  
  
# Gradio UI: one free-text box for the device name feeds `greet`, and the
# string it returns is shown in the output box.
demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(label="请输入计算设备(cpu or cuda)"),
    outputs=gr.Textbox(label="训练结果")
)

# Start the web app.
demo.launch()