# Pipelines
 1. Text Based Tasks
 2. Translations Tasks
 3. Speech and Audio Tasks
 4. Image-Based Tasks
 5. Multimodal (Text + Image) Tasks

In [3]:
!pip install -q datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/485.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m481.3/485.4 kB[0m [31m70.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Imports

import torch
from google.colab import userdata
from huggingface_hub import login
from transformers import pipeline
from diffusers import DiffusionPipeline
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio

In [5]:
hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

## Text-Based Tasks

- Sentiment Analysis

In [12]:
classifier = pipeline("sentiment-analysis", device="cuda",model="finiteautomata/bertweet-base-sentiment-analysis")
result = classifier("I love this movie!")
print(result)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Device set to use cuda


[{'label': 'POS', 'score': 0.9926134347915649}]


- Classification

In [14]:
classifier = pipeline("zero-shot-classification", device="cuda")
result = classifier("This chicago bulls won the match!", candidate_labels=["technology", "sports", "politics"])
print(result)

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda


{'sequence': 'This chicago bulls won the match!', 'labels': ['sports', 'technology', 'politics'], 'scores': [0.9841887354850769, 0.008856053464114666, 0.006955244112759829]}


- Named Entity Recognition

In [17]:
ner = pipeline("ner", device="cuda",model="dslim/bert-large-NER")
result = ner("John Doe works at the Farmer's Market. He is very kind")
print(result)

config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda


[{'entity': 'B-PER', 'score': 0.9988716, 'index': 1, 'word': 'John', 'start': 0, 'end': 4}, {'entity': 'I-PER', 'score': 0.99767023, 'index': 2, 'word': 'Do', 'start': 5, 'end': 7}, {'entity': 'I-PER', 'score': 0.9745281, 'index': 3, 'word': '##e', 'start': 7, 'end': 8}, {'entity': 'B-ORG', 'score': 0.5902574, 'index': 7, 'word': 'Farmer', 'start': 22, 'end': 28}, {'entity': 'I-ORG', 'score': 0.58116484, 'index': 8, 'word': "'", 'start': 28, 'end': 29}, {'entity': 'I-ORG', 'score': 0.5939311, 'index': 9, 'word': 's', 'start': 29, 'end': 30}, {'entity': 'I-ORG', 'score': 0.62898105, 'index': 10, 'word': 'Market', 'start': 31, 'end': 37}]


- Question Answering

In [25]:
qa = pipeline("question-answering", device="cuda")
result = qa(question="Where is the Eiffel tower?", context="The Eiffel tower is in Paris")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda


{'score': 0.9839022159576416, 'start': 23, 'end': 28, 'answer': 'Paris'}


In [26]:
qa = pipeline("question-answering", device="cuda")
result = qa(question="Who runs the world?", context="Girls run the world")
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda


{'score': 0.9971631169319153, 'start': 0, 'end': 5, 'answer': 'Girls'}


- Summarisation

In [29]:
summarizer = pipeline("summarization", device="cuda")
text = "The Eiffel Tower is one of the most famous landmarks in the world. It was built in 1889 and attracts millions of visitors every year."
summary = summarizer(text, max_length=35, min_length=15,)
print(summary)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda
Your max_length is set to 35, but your input_length is only 31. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=15)


[{'summary_text': ' The Eiffel Tower is one of the most famous landmarks in the world . It was built in 1889 and attracts millions of visitors every year .'}]


- Text Generation

In [30]:
generator = pipeline("text-generation", model="gpt2", device="cuda")
text = generator("Once upon a time", max_length=50, num_return_sequences=1)
print(text)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Once upon a time, I remember feeling more awake and alive. I knew it was all a dream, and this wasn't any dream at all; it was merely a dream I remembered telling myself. Perhaps the dream, as I explained it to myself"}]


## Translation Tasks

- English to French

In [35]:
translator = pipeline("translation_en_to_fr", device="cuda")
result = translator("Hello, how are you?")
print(result)

No model was supplied, defaulted to google-t5/t5-base and revision a9723ea (https://huggingface.co/google-t5/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda


[{'translation_text': 'Bonjour, comment êtes-vous?'}]


- English to Urdu

In [6]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

In [7]:
# Load the fine-tuned model
model_name = "abdulwaheed1/english-to-urdu-translation-mbart"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="en_XX", tgt_lang="ur_PK")
model = MBartForConditionalGeneration.from_pretrained(model_name)

# Function to translate English to Urdu
def translate_english_to_urdu(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    output_tokens = model.generate(**inputs)
    translated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
    return translated_text

tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/226 [00:00<?, ?B/s]

In [12]:
english_text = "How are you?"
urdu_translation = translate_english_to_urdu(english_text)
print(urdu_translation)

آپ کہاں ہیں ؟


In [9]:
english_text = "I woke up feeling refreshed"
urdu_translation = translate_english_to_urdu(english_text)
print(urdu_translation)

مجھے نئی توانائی سے بیدار ہونے کا احساس ہوا


## Speech and Audio

- Speech Recognition (Automatic Speech to Text)

In [17]:
import numpy as np

In [18]:
ds = load_dataset("etechgrid/28.5k_wavfiles_dataset", split="train", streaming=True)
one_row = next(iter(ds))
audio_array = np.array(one_row['audio']['array'])

asr = pipeline("automatic-speech-recognition", device="cuda")
result = asr(audio_array)
print(result)

Resolving data files:   0%|          | 0/163 [00:00<?, ?it/s]

No model was supplied, defaulted to facebook/wav2vec2-base-960h and revision 22aad52 (https://huggingface.co/facebook/wav2vec2-base-960h).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda


{'text': 'CHAPTER ONE MISSUS RACHEL LYNDE IS SURPRISED MISSUS RACHEL LYNDE LIVED JUST WHERE THE AVONLEA MAIN ROAD DIPPED DOWN INTO A LITTLE HOLLOW FRINGED WITH ALDERS AND LADIES EARDROPS AND TRAVERSED BY A BROOK'}


- Text to Speech

In [21]:
import IPython.display as ipd

In [26]:
synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts", device='cuda')

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = synthesiser("Hi to an artificial intelligence engineer, on the way to mastery!", forward_params={"speaker_embeddings": speaker_embedding})

sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
Audio("speech.wav")

Device set to use cuda


- Text to Audio

In [30]:
synthesiser = pipeline("text-to-audio","lysandre/text-to-speech-pipeline",device="cuda")

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

speech = synthesiser("Hi to an artificial intelligence engineer, working hard!", forward_params={"speaker_embeddings": speaker_embedding})

sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
Audio("speech.wav")

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/585M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Device set to use cuda


- Audio Classification

In [31]:
ds = load_dataset("etechgrid/28.5k_wavfiles_dataset", split="train", streaming=True)
one_row = next(iter(ds))
audio_array = np.array(one_row['audio']['array'])

audio_classifier = pipeline("audio-classification", device="cuda")
result = audio_classifier(audio_array)
print(result)

Resolving data files:   0%|          | 0/163 [00:00<?, ?it/s]

No model was supplied, defaulted to superb/wav2vec2-base-superb-ks and revision 372e048 (https://huggingface.co/superb/wav2vec2-base-superb-ks).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/2.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Device set to use cuda


[{'score': 0.9830453395843506, 'label': '_unknown_'}, {'score': 0.010987850837409496, 'label': 'left'}, {'score': 0.0032079832162708044, 'label': 'right'}, {'score': 0.0010433406569063663, 'label': 'down'}, {'score': 0.0007334855617955327, 'label': 'yes'}]


## Image-Based Tasks

- Image Classification

In [32]:
from PIL import Image

In [33]:
classifier = pipeline("image-classification", device="cuda")
image = Image.open("tiger.jpg")
result = classifier(image)
print(result)

No model was supplied, defaulted to google/vit-base-patch16-224 and revision 3f49326 (https://huggingface.co/google/vit-base-patch16-224).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Device set to use cuda


[{'label': 'tiger, Panthera tigris', 'score': 0.9432807564735413}, {'label': 'tiger cat', 'score': 0.05001961067318916}, {'label': 'jaguar, panther, Panthera onca, Felis onca', 'score': 0.000979915144853294}, {'label': 'leopard, Panthera pardus', 'score': 0.000515370222274214}, {'label': 'lion, king of beasts, Panthera leo', 'score': 0.00045476187369786203}]


In [35]:
classifier = pipeline("image-classification", device="cuda")
image = Image.open("perrytheplatypus.webp")
result = classifier(image)
print(result)

No model was supplied, defaulted to google/vit-base-patch16-224 and revision 3f49326 (https://huggingface.co/google/vit-base-patch16-224).
Using a pipeline without specifying a model name and revision in production is not recommended.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Device set to use cuda


[{'label': 'cowboy hat, ten-gallon hat', 'score': 0.6768121719360352}, {'label': 'sombrero', 'score': 0.19119733572006226}, {'label': 'bolo tie, bolo, bola tie, bola', 'score': 0.014991486445069313}, {'label': "plunger, plumber's helper", 'score': 0.008419524878263474}, {'label': 'pick, plectrum, plectron', 'score': 0.005832729861140251}]


- Object Detection

In [37]:
detector = pipeline("object-detection", device="cuda",model="jcm-art/hf_object_detection_DETR_CPPE_5_pipeline")
image = Image.open("tiger.jpg")
result = detector(image)
print(result)

config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/167M [00:00<?, ?B/s]

Some weights of the model checkpoint at jcm-art/hf_object_detection_DETR_CPPE_5_pipeline were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


preprocessor_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

Device set to use cuda


[]


In [36]:
detector = pipeline("object-detection", device="cuda")
image = Image.open("perrytheplatypus.webp")
result = detector(image)
print(result)

No model was supplied, defaulted to facebook/detr-resnet-50 and revision 1d5f47b (https://huggingface.co/facebook/detr-resnet-50).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a 

[{'score': 0.5128208994865417, 'label': 'kite', 'box': {'xmin': 11, 'ymin': 48, 'xmax': 229, 'ymax': 209}}, {'score': 0.538276195526123, 'label': 'kite', 'box': {'xmin': 29, 'ymin': 48, 'xmax': 229, 'ymax': 210}}]


- Image Segmentation

In [38]:
segmenter = pipeline("image-segmentation", device="cuda")
image = Image.open("tiger.jpg")
result = segmenter(image)
print(result)

No model was supplied, defaulted to facebook/detr-resnet-50-panoptic and revision d53b52a (https://huggingface.co/facebook/detr-resnet-50-panoptic).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/172M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50-panoptic were not used when initializing DetrForSegmentation: ['detr.model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'detr.model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForSegmentation from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForSegmentation from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


preprocessor_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Device set to use cuda


model.safetensors:   0%|          | 0.00/172M [00:00<?, ?B/s]

`label_ids_to_fuse` unset. No instance will be fused.


[{'score': 0.99958, 'label': 'LABEL_193', 'mask': <PIL.Image.Image image mode=L size=198x148 at 0x7F4DC0948D90>}, {'score': 0.958553, 'label': 'zebra', 'mask': <PIL.Image.Image image mode=L size=198x148 at 0x7F4DC0E0CB50>}, {'score': 0.979986, 'label': 'zebra', 'mask': <PIL.Image.Image image mode=L size=198x148 at 0x7F4E26A75810>}]


## Multi-modal Tasks

- Visual Question Answering (VQA)

In [39]:
vqa = pipeline("vqa", device="cuda")
image = Image.open("car.jpg")
result = vqa(image, question="What color is the car?")
print(result)

No model was supplied, defaulted to dandelin/vilt-b32-finetuned-vqa and revision d0a1f6a (https://huggingface.co/dandelin/vilt-b32-finetuned-vqa).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Device set to use cuda


[{'score': 0.8614336252212524, 'answer': 'orange'}, {'score': 0.577265202999115, 'answer': 'yellow'}, {'score': 0.0019292350625619292, 'answer': 'red'}, {'score': 0.001509473193436861, 'answer': 'orange and black'}, {'score': 0.0010303161107003689, 'answer': 'black'}]
