<a href="https://colab.research.google.com/github/yashika-git/NLP/blob/main/Hugging_Face_HandsOn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
!pip -q install transformers

In [2]:
from transformers import pipeline

In [3]:
import torch

In [4]:
import torch.nn.functional as F

In [6]:
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)


Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [17]:
res = classifier("I'm a Face")

In [18]:
print(res)

[{'label': 'POSITIVE', 'score': 0.9994341731071472}]


In [None]:
# without using pipeline()

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification 

In [20]:
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

In [22]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)  #either we can mention the model_name (from Hugging Face) or mention the directory to load the model from

In [24]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [26]:
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [27]:
tokens = tokenizer.tokenize("I'm happy to learn NLP.")

In [29]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [30]:
input_ids = tokenizer("I'm happy to learn NLP.")

In [31]:
print(f'Tokens: {tokens}')
print(f'Token IDs: {token_ids}')
print(f'Input IDs: {input_ids}')

Tokens: ['i', "'", 'm', 'happy', 'to', 'learn', 'nl', '##p', '.']
Token IDs: [1045, 1005, 1049, 3407, 2000, 4553, 17953, 2361, 1012]
Input IDs: {'input_ids': [101, 1045, 1005, 1049, 3407, 2000, 4553, 17953, 2361, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [33]:
X_train = ['Excited to learn NLP', 'Exploring Hugging Face Library']

In [35]:
batch = tokenizer(X_train, padding=True, truncation=True, max_length=512, return_tensors='pt')

In [36]:
print(batch)

{'input_ids': tensor([[  101,  7568,  2000,  4553, 17953,  2361,   102],
        [  101, 11131, 17662,  2227,  3075,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0]])}


In [40]:
with torch.no_grad():
  #outputs = model(**batch) #unpacking the dictionary  (without labels, we get loss as None)
  outputs = model(**batch, labels=torch.tensor([1,1]))
  print(outputs)
  predictions = F.softmax(outputs.logits, dim=1)
  print(predictions)
  labels = torch.argmax(predictions, dim=1)
  print(labels)
  labels = [model.config.id2label[label_id] for label_id in labels.tolist()]
  print(labels)

SequenceClassifierOutput(loss=tensor(0.0004), logits=tensor([[-3.8803,  4.1377],
        [-3.6801,  3.9551]]), hidden_states=None, attentions=None)
tensor([[3.2937e-04, 9.9967e-01],
        [4.8287e-04, 9.9952e-01]])
tensor([1, 1])
['POSITIVE', 'POSITIVE']


In [41]:
# To save
save_directory = 'saved'
tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

# To load
tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForSequenceClassification.from_pretrained(save_directory)

In [42]:
# Fine-tuning
# 1. Prepare dataset (load)
# 2. Load pre-trained Tokenizer, call it with dataset -> encoding
# 3. Build PyTorch Dataset with Encodings
# 4. Load pre-trained model
# 5. a. Either load Trainer and train it
#    b. Or use native PyTorch training pipeline