# Import Package

In [1]:
import numpy as np
import torch
import tensorflow as tf 

from pprint import pprint 
from datasets import load_dataset, load_metric, Audio
from tqdm.auto import tqdm

from torch import nn
from torch.utils.data import DataLoader 
from torch.optim import AdamW

from transformers import pipeline
from transformers import TrainingArguments
from transformers import Trainer
from transformers import DefaultDataCollator
from transformers import get_scheduler

from transformers import AutoTokenizer 
from transformers import AutoFeatureExtractor
from transformers import AutoConfig
from transformers import AutoProcessor

from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForTokenClassification
from transformers import AutoModelForCausalLM


from transformers import TFAutoModel
from transformers import TFAutoModelForSequenceClassification
from transformers import TFAutoModelForTokenClassification


print(tf.__version__)
print(torch.__version__) 



2.9.1
1.12.1+cu102


# Get Started

## Installation

https://huggingface.co/docs/transformers/installation

In [2]:
%%bash

# One-time environment setup. Every install command below is left commented
# out; uncomment the ones needed on a fresh machine.
#apt update
# pip install -U pip
#pip install -U transformers
#pip install -U torch
#pip install -U torchvision 
#pip install -U tensorflow
#pip install -U datasets 

# Audio I/O dependencies (system library plus Python packages).
#apt-get install -y libsndfile1
#pip install -U soundfile
#pip install -U librosa

# The only live command in this cell: print the installed ffmpeg version.
# apt install -y ffmpeg
ffmpeg -version 
# ffmpeg -encoders
# ffmpeg -decoders

ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)
configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enabl

In [None]:
%%bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
2022-08-10 05:48:07.484559: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7368 MB memory:  -> device: 0, name: GeForce GTX 1070 Ti, pci bus id: 0000:05:00.0, compute capability: 6.1


## Quick tour

https://huggingface.co/docs/transformers/quicktour

### Pipeline

#### Sentiment Classification

In [None]:
classifier = pipeline("sentiment-analysis")
classifier("We are very happy to show you the 🤗 Transformers library.")

In [None]:
results = classifier(["We are very happy to show you the 🤗 Transformers library.", 
                      "We hope you don't hate it."])
for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

#### Speech Recognition

In [None]:
speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

In [None]:
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")

In [None]:
print(speech_recognizer.feature_extractor.sampling_rate) 

In [None]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))

In [None]:
result = speech_recognizer(dataset[:4]["audio"])
print([d["text"] for d in result])

#### Use another model and tokenizer in the pipeline

In [None]:
# Load a sequence-classification checkpoint and its matching tokenizer by name.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The TensorFlow variant below raises an error (original author's note), so it is left disabled.
# model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")

In [None]:
classifier(["你们的服务实在太差了", "爱死你们了"])

### AutoClass

#### AutoTokenizer

In [None]:
encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

In [None]:
encoding = tokenizer("台湾是中国不可分割的领土")
print(encoding)

In [None]:
pt_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

print(type(pt_batch)) 

In [None]:
tf_batch = tokenizer(
    ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="tf",
)

print(type(tf_batch)) 

#### AutoModel

In [None]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
pt_outputs = pt_model(**pt_batch)
pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

In [None]:
# Disabled: the TensorFlow equivalent of the PyTorch inference above raises an
# error (original author's note).
# model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
# tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# tf_outputs = tf_model(tf_batch)
# tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
# tf_predictions

### Save a model

In [None]:
# Single source of truth for the output folder: it is used for both saving and
# reloading so the two paths cannot drift apart.
pt_save_directory = "./pt_save_pretrained"

# Write the tokenizer and the model into the same directory.
tokenizer.save_pretrained(pt_save_directory)
pt_model.save_pretrained(pt_save_directory)

# Reload the model from the directory that was just written.
pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory)

In [None]:
# tf_save_directory = "./tf_save_pretrained"
# tokenizer.save_pretrained(tf_save_directory)
# tf_model.save_pretrained(tf_save_directory)
# tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained")

### Custom model builds

In [None]:
# Start from the distilbert-base-uncased configuration and override n_heads to 12.
my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
# Instantiate a PyTorch model from the configuration object.
my_model = AutoModel.from_config(my_config)
# NOTE(review): this rebinds `my_model`, so the PyTorch instance created on the
# previous line is discarded and only the TensorFlow model remains in scope.
my_model = TFAutoModel.from_config(my_config)

# Tutorials

## Pipelines for inference

https://huggingface.co/docs/transformers/pipeline_tutorial

### Pipeline usage

In [None]:
generator = pipeline(task="text-generation")
generator(
    "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone"
)  # doctest: +SKIP

In [None]:
generator(
    [
        "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone",
        "Nine for Mortal Men, doomed to die, One for the Dark Lord on his dark throne",
    ]
)

In [None]:
generator(
    "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone",
    num_return_sequences=2,
)  # doctest: +SKIP

#### Choose a model and tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

In [None]:
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)
generator(
    "Three Rings for the Elven-kings under the sky, Seven for the Dwarf-lords in their halls of stone"
)

### Audio pipeline

In [None]:
torch.manual_seed(42)
ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
audio_file = ds[0]["audio"]["path"]

In [None]:
audio_classifier = pipeline(
    task="audio-classification", model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
)

In [None]:
preds = audio_classifier(audio_file)
preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
preds

### Vision pipeline

![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg)

In [None]:
vision_classifier = pipeline(task="image-classification")
preds = vision_classifier(
    images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
)
preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
preds

## Load pretrained instances with an AutoClass

### AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
sequence = "Let's try to tokenize!"

# 1) One-call API: calling the tokenizer directly returns an encoding object
#    that already carries the input ids.
print('-'*50)
tokens = tokenizer(sequence)
print(tokens, type(tokens))

print(tokenizer.decode(tokens.input_ids))

# 2) Manual, step-by-step path. `convert_tokens_to_ids` expects token strings,
#    so the sequence is split with `tokenize` first. Passing the encoding
#    object from step 1 (as the earlier version of this cell did) iterates over
#    its keys instead of the tokens, which is why the ids looked wrong.
print('-'*50)
token_strings = tokenizer.tokenize(sequence)
input_ids = tokenizer.convert_tokens_to_ids(token_strings)
print(input_ids, type(input_ids))

# 3) Turn the raw ids into model-ready inputs.
print('-'*50)
final_inputs = tokenizer.prepare_for_model(input_ids)
print(final_inputs, type(final_inputs))

### AutoFeatureExtractor

For audio and vision tasks, a feature extractor processes the audio signal or image into the correct input format.

In [None]:
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
)

### AutoProcessor

Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the LayoutLMV2 model requires a feature extractor to handle images and a tokenizer to handle text; a processor combines both of them.

In [None]:
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")

### AutoModel

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

In [None]:
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")

In [None]:
# Disabled: loading the TensorFlow model classes raises an error (original author's note).
# model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
# model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")

## Preprocess

https://huggingface.co/docs/transformers/preprocessing

### NLP

#### Tokenize

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
pprint(encoded_input)

In [None]:
tokenizer.decode(encoded_input["input_ids"])

In [None]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_inputs = tokenizer(batch_sentences)
pprint(encoded_inputs)

#### Pad

In [None]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True)
pprint(encoded_input)  

#### Truncation

In [None]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
pprint(encoded_input) 

#### Build tensors

In [None]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?",
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded_input, type(encoded_input))

In [None]:
batch_sentences = [
    "But what about second breakfast?",
    "Don't think he knows about second breakfast, Pip.",
    "What about elevensies?", 
]
encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
print(encoded_input, type(encoded_input))

### Audio

In [None]:
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
print(len(dataset))

In [None]:
dataset[0]["audio"]

- `array` is the speech signal, loaded (and potentially resampled) as a 1D array.
- `path` points to the location of the audio file.
- `sampling_rate` is the number of data points in the speech signal measured per second (采样率).

#### Resample

In [None]:
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
dataset[0]["audio"]

In [None]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))

In [None]:
dataset[0]["audio"]

#### Feature extractor

In [None]:
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

In [None]:
audio_input = [dataset[0]["audio"]["array"], dataset[1]["audio"]["array"]]
feature_extractor(audio_input, sampling_rate=16000)

#### Pad and truncate

In [None]:
print(dataset[0]["audio"]["array"].shape, dataset[1]["audio"]["array"].shape)

In [None]:
def preprocess_function(examples, sampling_rate=16000, max_length=100000):
    """Run the notebook-level ``feature_extractor`` over a batch of audio examples.

    Args:
        examples: Mapping whose ``"audio"`` entry iterates over dicts, each
            holding a waveform under the ``"array"`` key.
        sampling_rate: Sampling rate forwarded to the feature extractor.
            Defaults to 16000, the rate the dataset is resampled to earlier in
            this notebook.
        max_length: Forwarded as ``max_length`` together with
            ``truncation=True``. Defaults to 100000.

    Returns:
        The feature extractor's output for the batch, called with
        ``padding=True`` and ``truncation=True``.
    """
    audio_arrays = [sample["array"] for sample in examples["audio"]]
    return feature_extractor(
        audio_arrays,
        sampling_rate=sampling_rate,
        padding=True,
        max_length=max_length,
        truncation=True,
    )

In [None]:
processed_dataset = preprocess_function(dataset[:5])

In [None]:
# Compare the first TWO processed examples (indices 0 and 1), mirroring the
# pre-processing shape check above; printing index 0 twice would hide any mismatch.
print(processed_dataset["input_values"][0].shape, processed_dataset["input_values"][1].shape)

### Vision

In [None]:
dataset = load_dataset("food101", split="train[:100]")

In [None]:
dataset[0]["image"]

#### Feature extractor

In [None]:
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

#### Data augmentation

In [None]:
from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ColorJitter, ToTensor

# Normalisation uses the per-channel statistics stored on the feature extractor.
normalize = Normalize(
    mean=feature_extractor.image_mean,
    std=feature_extractor.image_std,
)

# Augmentation pipeline, applied in order to each image.
_transforms = Compose(
    [
        RandomResizedCrop(feature_extractor.size),
        ColorJitter(brightness=0.5, hue=0.5),
        ToTensor(),
        normalize,
    ]
)

In [None]:
def transforms(examples):
    """Add a ``"pixel_values"`` entry holding the augmented version of each image.

    Every item of ``examples["image"]`` is converted to RGB and passed through
    the notebook-level ``_transforms`` pipeline. The batch is modified in place
    and also returned.
    """
    pixel_values = []
    for image in examples["image"]:
        rgb_image = image.convert("RGB")
        pixel_values.append(_transforms(rgb_image))
    examples["pixel_values"] = pixel_values
    return examples

In [None]:
dataset.set_transform(transforms)

In [None]:
dataset[0]["image"]

In [None]:
# NOTE(review): numpy is already imported in the first cell; this re-import is redundant.
import numpy as np
import matplotlib.pyplot as plt

# Fetch the transformed tensor for the first example.
img = dataset[0]["pixel_values"] 
# permute(1, 2, 0) moves the first axis to the end so imshow receives the
# layout it expects (presumably channels-first -> channels-last; confirm).
# NOTE(review): the tensor has been through `normalize`, so displayed colours
# may differ from the source image.
plt.imshow(img.permute(1, 2, 0))
plt.show()

### Multimodal

In [None]:
lj_speech = load_dataset("lj_speech", split="train")

In [None]:
lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"])
print(lj_speech[0]["audio"])
print(lj_speech[0]["text"])

In [None]:
lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000))

#### Processor
A processor combines a feature extractor and a tokenizer.


In [None]:
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")

In [None]:
def prepare_dataset(example):
    """Add model inputs and label ids to a single example.

    ``example["audio"]["array"]`` is passed to the notebook-level ``processor``
    with a 16000 sampling rate and the result is stored under
    ``"input_values"``. ``example["text"]`` is then processed inside
    ``processor.as_target_processor()`` and its ``input_ids`` are stored under
    ``"labels"``. The example is modified in place and returned.

    NOTE(review): ``"input_values"`` receives the processor's whole return
    value, not a field extracted from it — confirm this is what downstream
    code expects.
    """
    waveform = example["audio"]["array"]
    example["input_values"] = processor(waveform, sampling_rate=16000)

    # Inside this context the processor is switched to handle the target text.
    with processor.as_target_processor():
        encoded_text = processor(example["text"])
    example["labels"] = encoded_text.input_ids
    return example

In [None]:
prepare_dataset(lj_speech[0])

## Fine-tune a pretrained model
https://huggingface.co/docs/transformers/training

### Prepare a dataset

In [4]:
dataset = load_dataset("yelp_review_full")
dataset["train"][100]

Reusing dataset yelp_review_full (/root/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)


  0%|          | 0/2 [00:00<?, ?it/s]

{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [6]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level tokenizer.

    The tokenizer is called with ``padding="max_length"`` and
    ``truncation=True``; its output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded
 
tokenized_datasets = dataset.map(tokenize_function, batched=True)



  0%|          | 0/650 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [7]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

### Train

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) 

In [None]:
training_args = TrainingArguments(output_dir="test_trainer")

#### Metrics

In [None]:
metric = load_metric("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class is
    the argmax over the last axis of the logits, and the result of
    ``metric.compute`` is returned.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")

#### Trainer

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

#### Convert dataset to TensorFlow format

In [None]:
data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
# Settings shared by both splits; only `shuffle` differs between them.
tf_dataset_kwargs = dict(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    collate_fn=data_collator,
    batch_size=8,
)

# The training split is shuffled.
tf_train_dataset = small_train_dataset.to_tf_dataset(shuffle=True, **tf_dataset_kwargs)

# The validation split keeps its order.
tf_validation_dataset = small_eval_dataset.to_tf_dataset(shuffle=False, **tf_dataset_kwargs)

#### Compile and fit

In [None]:
# Disabled: loading the TensorFlow classification model raises an error (original author's note).
# model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

In [None]:
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#     metrics=tf.metrics.SparseCategoricalAccuracy(),
# )

In [None]:
# model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)

### Train in native PyTorch

In [2]:
# Optional cleanup before the native PyTorch run: uncomment the `del` lines to
# drop references left over from the Trainer section first.
# del model
# del pytorch_model
# del trainer
# Ask PyTorch to release cached GPU memory it is no longer using.
torch.cuda.empty_cache()

In [8]:
# Reduce the tokenized dataset to the columns the model accepts. Each step is
# guarded so the cell can be re-run: without the checks a second execution
# fails, because "text" is already removed and "label" already renamed.
if "text" in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])
if "label" in tokenized_datasets["train"].column_names:
    # The native PyTorch loops below read the targets from batch["labels"].
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Switch the dataset's output format to "torch".
tokenized_datasets.set_format("torch")

# Fixed-seed 1000-example subsets of each split.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

#### DataLoader

In [9]:
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

#### Optimizer and learning rate scheduler

In [11]:
optimizer = AdamW(model.parameters(), lr=5e-5)

In [12]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [13]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

#### Training loop

In [14]:
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before the loop.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the device the model lives on.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The loss is read from the model output and back-propagated.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Order matters: apply the update, advance the learning-rate schedule,
        # then clear the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/375 [00:00<?, ?it/s]

In [15]:
# Accumulate accuracy over the evaluation set, batch by batch.
metric = load_metric("accuracy")
model.eval()

# Gradients are not needed anywhere in evaluation, so the whole loop runs
# under no_grad.
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.578}

### Additional resources

- [Transformers Examples](https://github.com/huggingface/transformers/tree/main/examples) includes scripts to train common NLP tasks in PyTorch and TensorFlow.

- [Transformers Notebooks](https://huggingface.co/docs/transformers/notebooks) contains various notebooks on how to fine-tune a model for specific tasks in PyTorch and TensorFlow.