### https://huggingface.co/transformers has pipelines, pretrained models, tokenizers etc for several tasks.

In [None]:
!pip install gdown



In [None]:
!pip install datasets



In [None]:
!pip install transformers



In [None]:
!pip install 'transformers[torch]'



In [None]:
import torch
from transformers import pipeline

### Sentiment analysis: recall we used it in our introductory lecture

In [None]:
classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
result = classifier("I hate washing dishes")[0]
print("Sentiment: {} Score: {}".format(result['label'],result['score']))

result = classifier("I love visiting Paris")[0]
print("Sentiment: {} Score: {}".format(result['label'],result['score']))

Sentiment: NEGATIVE Score: 0.9992689490318298
Sentiment: POSITIVE Score: 0.999553382396698


### You can also fine tune any pretrained model. For instance, let's fine tune a model for sentiment analysis

### Load a dataset for fine-tuning.

In [None]:
!gdown 'https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-'

Downloading...
From: https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
To: /content/imdbs.csv
  0% 0.00/132k [00:00<?, ?B/s]100% 132k/132k [00:00<00:00, 100MB/s]


In [None]:
import datasets
from datasets import load_dataset

dataset = load_dataset('csv', data_files='./imdbs.csv', split='train')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-23dfa85546200de7/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-23dfa85546200de7/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


In [None]:
# split dataset into train and test

dataset   = dataset.train_test_split(test_size=0.1)
train_set = dataset['train']
test_set  = dataset['test']

### Load the tokenizer and preprocess the training and test sets with the tokenizer -- it already converts tokens into ids and sets attention masks

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# preprocess the dataset

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_set = train_set.map(tokenize_function, batched=True)
test_set  = test_set.map(tokenize_function, batched=True)

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

### Load the model for sequence classification

In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-cased"

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Since we want to report the accuracy of the model, we can add the following function.

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


### Now set training parameters and arguments, and train the model

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# set training parameters and arguments

batch_size = 8
epochs     = 20
warmup_steps = 100
weight_decay = 0.01

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy="epoch",
    logging_dir='./logs',
)

In [None]:
# define Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=compute_metrics
)

In [None]:
# Train

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.549821,0.8
2,No log,0.594814,0.8
3,No log,0.594156,0.8
4,No log,0.52687,0.8
5,No log,0.45394,0.8
6,No log,0.238038,0.9
7,No log,0.059927,1.0
8,No log,0.782923,0.8
9,No log,0.914551,0.8
10,No log,0.939348,0.8


TrainOutput(global_step=240, training_loss=0.1782382329305013, metrics={'train_runtime': 174.0635, 'train_samples_per_second': 10.341, 'train_steps_per_second': 1.379, 'total_flos': 473599899648000.0, 'train_loss': 0.1782382329305013, 'epoch': 20.0})

In [None]:
# evaluate

trainer.evaluate()

{'eval_loss': 1.1680898666381836,
 'eval_accuracy': 0.8,
 'eval_runtime': 0.3186,
 'eval_samples_per_second': 31.387,
 'eval_steps_per_second': 6.277,
 'epoch': 20.0}

In [None]:
# test

inputs  = tokenizer('High tech companies are growing up', return_tensors="pt").to(device=0)
labels  = torch.tensor([1]).unsqueeze(0).to(device=0)
outputs = model(**inputs, labels=labels)
loss    = outputs.loss
logits  = outputs.logits
answer  = torch.argmax(logits)
if (answer == 0):
    print("Sentiment: NEGATIVE")
else:
    print("Sentiment: POSITIVE")

Sentiment: POSITIVE


### Extractive Question Answering: the task of extracting an answer from a text given a question

In [None]:
question_answerer = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
context = "The immune system is a system of many biological structures and processes \
within an organism that protects against diseases. To function properly the immune system \
must detect a wide variety of agents, called pathogens."

result = question_answerer(question="What are pathogens?", context=context)
print("Answer: {}".format(result['answer']))
print("Score: {}".format(round(result['score'], 4)))
print("Start: {} End: {}".format(result['start'],result['end']))

result = question_answerer(question="How does the immune system work?", context=context)
print("Answer: {}".format(result['answer']))
print("Score: {}".format(round(result['score'], 4)))
print("Start: {} End: {}".format(result['start'],result['end']))

result = question_answerer(question="What is the immune system?", context=context)
print("Answer: {}".format(result['answer']))
print("Score: {}".format(round(result['score'], 4)))
print("Start: {} End: {}".format(result['start'],result['end']))

Answer: a wide variety of agents
Score: 0.4355
Start: 176 End: 200
Answer: must detect a wide variety of agents
Score: 0.058
Start: 164 End: 200
Answer: a system of many biological structures and processes within an organism that protects against diseases
Score: 0.4359
Start: 21 End: 123


### You may also use pretrained models already fine-tuned in some dataset (e.g., SQUAD -- Stanford Question-Answering Dataset).

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer
import torch

In [None]:
# load the fine-tuned model

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading (…)lve/main/config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# load its fine-tuned tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
# add BERT tags to process question and context

question = "[CLS]" + "What are pathogens?" + "[SEP]"
context  = context + "[SEP]"

In [None]:
# get their tokens, combine and convert them into input_ids
question_tokens = tokenizer.tokenize(question)
context_tokens  = tokenizer.tokenize(context)
all_tokens      = question_tokens + context_tokens
input_ids       = tokenizer.convert_tokens_to_ids(all_tokens)

In [None]:
# define segment_ids with zeroes for question tokens and ones for context tokens

segment_ids = [0] * len(question_tokens)
segment_ids = segment_ids + [1] * len(context_tokens)

In [None]:
# convert input and segment ids to tensors and feed them into the model
# to obtain the start and end scores

input_ids              = torch.tensor([input_ids])
segment_ids            = torch.tensor([segment_ids])

In [None]:
result = model(input_ids, token_type_ids = segment_ids)

In [None]:
#tokens with highest start and end scores
answer_start = torch.argmax(result.start_logits)
answer_end   = torch.argmax(result.end_logits)
if answer_end >= answer_start:
    answer = " ".join(all_tokens[answer_start:answer_end+1])
    print("\nQuestion:{}".format(question[5:-5]))
    print("\nAnswer: {}.".format(answer))
else:
    print("I could not find an answer to your question.")


Question:What are pathogens?

Answer: agents.


### Text Generation

In [None]:
text_generator = pipeline("text-generation")

No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
text = text_generator("It is a strong idea", max_length=35, do_sample=False)
print(text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'It is a strong idea that the government should be able to provide a fair and transparent process for the public to make their own decisions about the future of their lives," he said'}]


### Named Entity Recognition

In [None]:
ner_pipe = pipeline("ner")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [None]:
text = "IBM introduces Eagle in USA -- the first processor to surpass 100 qubits."
result = ner_pipe(text)
for entity in result:
    print(entity)

{'entity': 'I-ORG', 'score': 0.9989236, 'index': 1, 'word': 'IBM', 'start': 0, 'end': 3}
{'entity': 'I-MISC', 'score': 0.7671131, 'index': 3, 'word': 'Eagle', 'start': 15, 'end': 20}
{'entity': 'I-LOC', 'score': 0.9996847, 'index': 5, 'word': 'USA', 'start': 24, 'end': 27}


### Text Summarization

In [None]:
summarizer = pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
text = "IBM has unveiled an advanced quantum processor that is part of an effort \
to build super-fast computers. These machines could revolutionise computing, harnessing \
the strange world of quantum physics to solve problems beyond reach for even the most \
advanced classical ones. But the hurdles in building practical, large-scale versions \
have kept quantum computers confined to the lab. The new chip has 127 qubits, \
twice as many as the previous IBM processor. Qubits (quantum bits) are the most basic \
units of information in a quantum computer. The company called its new Eagle processor \
a key milestone on the path towards practical quantum computation."

In [None]:
result = summarizer(text, max_length=50, min_length=30, do_sample=False)

In [None]:
print(result[0]['summary_text'])

 IBM has unveiled an advanced quantum processor that is part of an effort to build super-fast computers . These machines could revolutionise computing, harnessing the strange world of quantum physics to solve problems beyond reach for even the most advanced classical ones
