In [5]:
text = "What a wonderful & film."
a = text.split()
print(a)
b = " ".join(a)
print(b)

['What', 'a', 'wonderful', '&', 'film.']
What a wonderful & film.


In [1]:
from transformers import BertTokenizer

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [3]:
# Transformer's tokenizer - input_ids
sequence = "A Titan RTX has 24GB of VRAM"
print("Original sequence: ",sequence)
tokenized_sequence = tokenizer.tokenize(sequence)
print("Tokenized sequence: ",tokenized_sequence)
encodings = tokenizer(sequence)
encoded_sequence = encodings['input_ids']
print("Encoded sequence: ", encoded_sequence)
decoded_encodings=tokenizer.decode(encoded_sequence)
print("Decoded sequence: ", decoded_encodings)

Original sequence:  A Titan RTX has 24GB of VRAM
Tokenized sequence:  ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
Encoded sequence:  [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
Decoded sequence:  [CLS] A Titan RTX has 24GB of VRAM [SEP]


In [4]:
encodings = tokenizer.encode_plus(
    sequence,
    add_special_tokens=True,
)
print(encodings["input_ids"])
print(tokenizer.decode(encodings["input_ids"]))

[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
[CLS] A Titan RTX has 24GB of VRAM [SEP]


In [5]:
# Transformer's tokenizer - attention_mask
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
print("Sequence a: ",sequence_a)
print("Sequence b: ",sequence_b)
encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
print("A's encoding length={}. \nB's encoding length={}".format(len(encoded_sequence_a),len(encoded_sequence_b)))
padded_sequence_ab = tokenizer([sequence_a,sequence_b],padding=True)
print("Padded sequence(A,B):", padded_sequence_ab["input_ids"])
print("Attention mask(A,B):", padded_sequence_ab["attention_mask"])

Sequence a:  This is a short sequence.
Sequence b:  This is a rather long sequence. It is at least longer than the sequence A.
A's encoding length=8. 
B's encoding length=19
Padded sequence(A,B): [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
Attention mask(A,B): [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


In [6]:
# Transformer's tokenizer - token type id
encodings_ab = tokenizer(sequence_a, sequence_b)
print("Encoded sequence(AB):", encodings_ab["input_ids"])
decoded_ab = tokenizer.decode(encodings_ab["input_ids"])
print("Decoded sequence(AB):", decoded_ab)
print("Token type ids(AB):", encodings_ab["token_type_ids"])

Encoded sequence(AB): [101, 1188, 1110, 170, 1603, 4954, 119, 102, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
Decoded sequence(AB): [CLS] This is a short sequence. [SEP] This is a rather long sequence. It is at least longer than the sequence A. [SEP]
Token type ids(AB): [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
encoded_input = tokenizer("How old are you?", "I'm 6 years old")

In [8]:
print(encoded_input)

{'input_ids': [101, 1731, 1385, 1132, 1128, 136, 102, 146, 112, 182, 127, 1201, 1385, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:
print(tokenizer.decode(encoded_input["input_ids"]))

[CLS] How old are you? [SEP] I'm 6 years old [SEP]


In [43]:
batch_sentences = ["Hello I'm a single sentence",
                   "And another sentence",
                   "And the very very last one"]
batch_of_second_sentences = ["I'm a sentence that goes with the first sentence",
                             "And I should be encoded with the second sentence",
                             "And I go with the very last one"]
encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences)
print(encoded_inputs)

{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102, 146, 112, 182, 170, 5650, 1115, 2947, 1114, 1103, 1148, 5650, 102], [101, 1262, 1330, 5650, 102, 1262, 146, 1431, 1129, 12544, 1114, 1103, 1248, 5650, 102], [101, 1262, 1103, 1304, 1304, 1314, 1141, 102, 1262, 146, 1301, 1114, 1103, 1304, 1314, 1141, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [47]:
encoded_input = tokenizer(["Hello", "I'm", "a", "single", "sentence"],is_split_into_words=True)
print(encoded_input)
print(tokenizer.decode(encoded_input["input_ids"]))

{'input_ids': [101, 8667, 146, 112, 182, 170, 1423, 5650, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] Hello I'm a single sentence [SEP]


In [54]:
batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
                   ["And", "another", "sentence"],
                   ["And", "the", "very", "very", "last", "one"]]
encoded_inputs = tokenizer(batch_sentences, is_split_into_words=True)
print(encoded_inputs)
print(tokenizer.decode(encoded_inputs["input_ids"][1]))


{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102], [101, 1262, 1330, 5650, 102], [101, 1262, 1103, 1304, 1304, 1314, 1141, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}
[CLS] And another sentence [SEP]


In [55]:
batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
                             ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
                             ["And", "I", "go", "with", "the", "very", "last", "one"]]
encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_split_into_words=True)
print(encoded_inputs)
print(tokenizer.decode(encoded_inputs["input_ids"][0]))

{'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102, 146, 112, 182, 170, 5650, 1115, 2947, 1114, 1103, 1148, 5650, 102], [101, 1262, 1330, 5650, 102, 1262, 146, 1431, 1129, 12544, 1114, 1103, 1248, 5650, 102], [101, 1262, 1103, 1304, 1304, 1314, 1141, 102, 1262, 146, 1301, 1114, 1103, 1304, 1314, 1141, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
[CLS] Hello I'm a single sentence [SEP] I'm a sentence that goes with the first sentence [SEP]


In [10]:
from transformers import pipeline

nlp = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


In [11]:
result = nlp("I hate you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

result = nlp("I love you")[0]
print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

label: NEGATIVE, with score: 0.9991
label: POSITIVE, with score: 0.9999


In [66]:
nlp_summary =pipeline("summarization")

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 (https://huggingface.co/sshleifer/distilbart-cnn-12-6)
Downloading: 100%|██████████| 1.76k/1.76k [00:00<00:00, 419kB/s]
Downloading: 100%|██████████| 1.14G/1.14G [01:29<00:00, 13.7MB/s]
Downloading: 100%|██████████| 26.0/26.0 [00:00<00:00, 5.96kB/s]
Downloading: 100%|██████████| 878k/878k [00:13<00:00, 68.6kB/s] 
Downloading: 100%|██████████| 446k/446k [00:05<00:00, 84.7kB/s] 


In [85]:
phrase = """
They manufactured machinery for sale and lease, ranging from commercial scales and industrial time recorders, meat and cheese slicers, to tabulators and punched cards. Thomas J. Watson, Sr., fired from the National Cash Register Company by John Henry Patterson, called on Flint and, in 1914, was offered a position at CTR.[12] Watson joined CTR as general manager then, 11 months later, was made President when court cases relating to his time at NCR were resolved.[13] Having learned Patterson's pioneering business practices, Watson proceeded to put the stamp of NCR onto CTR's companies.[14] He implemented sales conventions, "generous sales incentives, a focus on customer service, an insistence on well-groomed, dark-suited salesmen and had an evangelical fervor for instilling company pride and loyalty in every worker".[15][16] His favorite slogan, "THINK", became a mantra for each company's employees.[15] During Watson's first four years, revenues reached $9 million ($134 million today) and the company's operations expanded to Europe, South America, Asia and Australia.[15] Watson never liked the clumsy hyphenated name "Computing-Tabulating-Recording Company" and on February 14, 1924, chose to replace it with the more expansive title "International Business Machines" which had previously been used as the name of CTR's Canadian Division.[17] By 1933, most of the subsidiaries had been merged into one company, IBM
"""
tokens = tokenizer(phrase)
print("Length of tokens for phrase: ", len(tokens["input_ids"]))
result = nlp_summary(phrase, max_length=63)[0]
tokens = tokenizer(result['summary_text'])
print("Length of tokens for summary: ", len(tokens["input_ids"]))
#print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

Length of tokens for phrase:  330
Length of tokens for summary:  64


In [86]:
print(result['summary_text'])

 Thomas J. Watson, Sr. was fired from the National Cash Register Company by John Henry Patterson in 1914 . Watson was made president of the company after court cases relating to his time at NCR were resolved . During Watson's first four years, revenues reached $9 million ($134 million today) and


In [80]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

classes = ["not paraphrase", "is paraphrase"]

sequence_0 = "The company HuggingFace is based in New York City"
sequence_1 = "Apples are especially bad for your health"
sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

paraphrase_classification_logits = model(**paraphrase).logits
not_paraphrase_classification_logits = model(**not_paraphrase).logits

paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]

# Should be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")

# Should not be paraphrase
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")

Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 5.69kB/s]
Downloading: 100%|██████████| 433/433 [00:00<00:00, 65.4kB/s]
Downloading: 100%|██████████| 208k/208k [00:05<00:00, 42.0kB/s] 
Downloading: 100%|██████████| 426k/426k [00:04<00:00, 92.5kB/s] 
Downloading: 100%|██████████| 413M/413M [00:40<00:00, 10.6MB/s] 


not paraphrase: 10%
is paraphrase: 90%
not paraphrase: 94%
is paraphrase: 6%


In [87]:
nlp = pipeline("question-answering")

context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
"""

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)
Downloading: 100%|██████████| 473/473 [00:00<00:00, 71.7kB/s]
Downloading: 100%|██████████| 249M/249M [00:21<00:00, 11.9MB/s] 
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 5.61kB/s]
Downloading: 100%|██████████| 208k/208k [00:03<00:00, 54.5kB/s] 
Downloading: 100%|██████████| 426k/426k [00:05<00:00, 78.8kB/s] 


In [88]:
result = nlp(question="What is extractive question answering?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

result = nlp(question="What is a good example of a question answering dataset?", context=context)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'the task of extracting an answer from a text given a question', score: 0.6226, start: 34, end: 95
Answer: 'SQuAD dataset', score: 0.5053, start: 147, end: 160


In [89]:
nlp = pipeline("ner")

sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very\
           close to the Manhattan Bridge which is visible from the window."

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english)
Downloading: 100%|██████████| 998/998 [00:00<00:00, 257kB/s]
Downloading: 100%|██████████| 1.24G/1.24G [02:06<00:00, 10.6MB/s]
Downloading: 100%|██████████| 60.0/60.0 [00:00<00:00, 10.9kB/s]
Downloading: 100%|██████████| 208k/208k [00:02<00:00, 103kB/s]  


In [91]:
nlp(sequence)

[{'entity': 'I-ORG',
  'score': 0.9995786,
  'index': 1,
  'word': 'Hu',
  'start': 0,
  'end': 2},
 {'entity': 'I-ORG',
  'score': 0.9909764,
  'index': 2,
  'word': '##gging',
  'start': 2,
  'end': 7},
 {'entity': 'I-ORG',
  'score': 0.9982225,
  'index': 3,
  'word': 'Face',
  'start': 8,
  'end': 12},
 {'entity': 'I-ORG',
  'score': 0.999488,
  'index': 4,
  'word': 'Inc',
  'start': 13,
  'end': 16},
 {'entity': 'I-LOC',
  'score': 0.9994345,
  'index': 11,
  'word': 'New',
  'start': 40,
  'end': 43},
 {'entity': 'I-LOC',
  'score': 0.9993196,
  'index': 12,
  'word': 'York',
  'start': 44,
  'end': 48},
 {'entity': 'I-LOC',
  'score': 0.9993794,
  'index': 13,
  'word': 'City',
  'start': 49,
  'end': 53},
 {'entity': 'I-LOC',
  'score': 0.98625815,
  'index': 19,
  'word': 'D',
  'start': 79,
  'end': 80},
 {'entity': 'I-LOC',
  'score': 0.9514269,
  'index': 20,
  'word': '##UM',
  'start': 80,
  'end': 82},
 {'entity': 'I-LOC',
  'score': 0.9336589,
  'index': 21,
  'word': 

In [102]:
import numpy as np

In [93]:
a = np.arange(6)
a.shape

(6,)

In [94]:
a = torch.tensor(np.expand_dims(a, axis=1))
print(a)


tensor([[0],
        [1],
        [2],
        [3],
        [4],
        [5]])


In [97]:
b = np.array(a)

In [99]:
list(b.squeeze())

[0, 1, 2, 3, 4, 5]

In [100]:
translator = pipeline("translation_en_to_de")
print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))