In [114]:
from transformers import pipeline

In [115]:
# This cell is only needed when working from local environment, due to local SSL security when loading the HuggingFace model and token below. Remove this when working from Colab or Kaggle for example
import os
os.environ['CURL_CA_BUNDLE'] = ''
os.environ['REQUESTS_CA_BUNDLE'] = ''

In [116]:
# Sentiment analysis model
classifier = pipeline('sentiment-analysis')

result = classifier('Nothing has been done !')

print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.9960914254188538}]


In [117]:
# Text generation model

# generator = pipeline('text-generation', model='gpt2')
generator = pipeline('text-generation', model='distilgpt2')

result = generator(
  'When planning for a project, we need first to identify the', 
  max_length=50, do_sample=True,
  truncation=True,
  top_k=50, 
  top_p=0.95, 
  num_return_sequences=3)

print(result)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "When planning for a project, we need first to identify the best fit for you. If a project is not an excellent option, then you have to build it so that we can make it look nicer to users. What's more, you can save"}, {'generated_text': "When planning for a project, we need first to identify the potential value of the project that we are making. For instance, the company's mission to create a high quality home-built for the community is to create a new low profile home-built"}, {'generated_text': 'When planning for a project, we need first to identify the best fit and budget with the company that is running the work.›\n\n\n\n\nFor instance, most of the work is going on at the state level, so the first'}]


In [118]:
# Perform zero-shot classification

# classifier = pipeline('zero-shot-classification', model='distilbert-base-uncased-finetuned-sst-2-english')
classifier = pipeline('zero-shot-classification')

result = classifier(
  "This is a course about geo-politics",
  candidate_labels=["education", "politics", "business"],
  # multi_label=True
)

print(result)

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'sequence': 'This is a course about geo-politics', 'labels': ['politics', 'education', 'business'], 'scores': [0.9454228281974792, 0.029302459210157394, 0.02527468465268612]}


In [119]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [120]:
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

input_seq = "I feel happy about this"

print(f'tokenizer:{tokenizer}')
result = tokenizer(input_seq)
print(f'result:{result}')

tokens = tokenizer.tokenize(input_seq) # Split seq into words; still in string format
print(f'Tokens: {tokens}')

ids = tokenizer.convert_tokens_to_ids(tokens) # Provide the numeric form of the string tokens
print(f'ids: {ids}')

tokenized_sequence = tokenizer.encode(input_seq) # Add BoS (Beginning of Sentence) and EoS (i.e. End of Sentence) token ids to the tokenized sequence; all in numeric form
# print tokenized_sequence
print(f'tokenized_sequence: {tokenized_sequence}')

decoded_string = tokenizer.decode(ids) # Retrieve the text form
print(f'decoded_string: {decoded_string}')

tokenizer:DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
result:{'input_ids': [101, 1045, 2514, 3407, 2055, 2023, 102], 'attention_mask': 

In [121]:
batch = tokenizer(input_seq, padding=True, truncation=True, max_length=10, return_tensors='pt')

In [126]:
# Perform the inference with pytorch
with torch.no_grad():
  # input_ids = torch.tensor(tokenized_sequence).unsqueeze(0)
  # input_ids = torch.tensor(tokenized_sequence).unsqueeze(0)
  print(batch)
  outputs = model(**batch) # Unpack the dictionary batch as separate arguments to the model
  logits = outputs[0]
  probs = F.softmax(logits, dim=1)
  print(f'logits: {logits}')
  print(f'probs: {probs}')
  print(f'probs[0]: {probs[0]}')
  print(f'probs[0][0]: {probs[0][0]}')
  print(f'probs[0][1]: {probs[0][1]}')
  predictions = F.softmax(logits, dim=1)
  print(predictions)
  labels = torch.argmax(predictions, dim=1)
  print(labels)

{'input_ids': tensor([[ 101, 1045, 2514, 3407, 2055, 2023,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}
logits: tensor([[-4.3432,  4.6859]])
probs: tensor([[1.1986e-04, 9.9988e-01]])
probs[0]: tensor([1.1986e-04, 9.9988e-01])
probs[0][0]: 0.0001198602476506494
probs[0][1]: 0.9998800754547119
tensor([[1.1986e-04, 9.9988e-01]])
tensor([1])
