# High-level usage of transformer models
Please check the lecture notes for an introduction to transformers and the attention mechanism, as well as how the various encoder/decoder layers are used to build popular models such as BERT and GPT.

## Requirements:
- transformers
  - will install huggingface_hub and tokenizers

In [None]:
import transformers

## Decide on a task
Here we will utilize a pre-trained model. Some are also fine-tuned on a specific task, sometimes referred to as a *downstream task*.

BERT, for example, is trained as a masked language model: it predicts a word that has been replaced by a [MASK] token within a text, e.g. "Alice went to [MASK]" or "Alice [MASK] to school".
As you'll see as you study this, the masked-word prediction task (like the next-word prediction task used by GPT-style models) can be transformed into a large selection of problems!

# 1) Masked language modeling with BERT

In [None]:
from transformers import pipeline
# Build a fill-mask pipeline on top of the 'bert-base-uncased' checkpoint and
# ask it to fill in the [MASK] position of one example sentence.
unmasker = pipeline('fill-mask', model='bert-base-uncased')
res = unmasker("Alice [MASK] to school")

# Each result is indexed by 'sequence' (the completed sentence) and 'score';
# print one candidate per line.
for candidate in res:
    print(f"{candidate['sequence']}\t\t({candidate['score']})")

# 2) Fine-tuning BERT

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset

# Tiny hand-written sentiment dataset, used only to demonstrate the shape of
# the fine-tuning data: a list of sentences plus a parallel list of labels.
#
# 3-class classification, -1: neg, 0: neut, 1: pos
# NOTE(review): a label of -1 would presumably have to be remapped to a
# non-negative class index before being fed to a classification head — confirm
# before training on this toy set.
_labelled_samples = [
    ("This product is great!", 1),
    ("This product is just terrible...", -1),
    ("I like it", 1),
    ("It's ok", 0),
    ("My mom liked it", 1),
    ("My sister thought it was alright", 0),
]

# Split the pairs into the two parallel lists the rest of the cell expects.
data = [sentence for sentence, _ in _labelled_samples]
labels = [label for _, label in _labelled_samples]

from sklearn.model_selection import train_test_split
# Split the toy sentences and labels into train/test parts; test_size=0.2 and
# the fixed random_state keep the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Show what ended up in the training part.
print(X_train, y_train)

In [None]:
from datasets import load_dataset
# Load the 'imdb' dataset and keep a handle on each of its two splits.
d = load_dataset('imdb')
train_data, test_data = d['train'], d['test']

In [None]:
# Pull the 'text' and 'label' columns out of each split. This rebinds the
# X_*/y_* names that the toy-dataset cell assigned earlier.
X_train = train_data['text']
y_train = train_data['label']
X_test = test_data['text']
y_test = test_data['label']

In [None]:
import random


def _subsample(texts, targets, k, rng):
    """Return ``k`` aligned (texts, targets) items drawn without replacement.

    The same random indices are applied to both sequences so every text keeps
    its own label. If fewer than ``k`` items exist, all of them are returned
    (in shuffled order).
    """
    k = min(k, len(texts))
    picks = rng.sample(range(len(texts)), k)
    return [texts[i] for i in picks], [targets[i] for i in picks]


# Keep the demo small: 1000 training examples and 100 test examples.
#
# NOTE(review): the IMDB splits appear to be stored grouped by label (all of
# one class first) — verify against the dataset card. Taking the *first* rows,
# as this cell used to, would then yield a single class and make both the
# fine-tuning and the reported accuracy meaningless. A seeded random subset
# avoids that while staying reproducible.
_rng = random.Random(42)
X_train, y_train = _subsample(X_train, y_train, 1000, _rng)
X_test, y_test = _subsample(X_test, y_test, 100, _rng)

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published with the 'bert-base-uncased' checkpoint (the
# same checkpoint name the classification model is loaded from further down).
tokenizer = AutoTokenizer.from_pretrained(
    'bert-base-uncased',
)

In [None]:
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset

def _as_tensors(encoding):
    """Return a plain dict with every field of ``encoding`` as a torch tensor."""
    return {field: torch.tensor(values) for field, values in encoding.items()}


# Tokenize each split (truncating and padding the sequences) and convert the
# resulting fields to tensors.
train_enc = _as_tensors(tokenizer(X_train, truncation=True, padding=True))
test_enc = _as_tensors(tokenizer(X_test, truncation=True, padding=True))

# Bundle token ids, attention masks and labels so that one dataset item is an
# (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(
    train_enc['input_ids'],
    train_enc['attention_mask'],
    torch.tensor(y_train),
)
test_dataset = TensorDataset(
    test_enc['input_ids'],
    test_enc['attention_mask'],
    torch.tensor(y_test),
)

# Training batches are drawn in random order, test batches in dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    sampler=RandomSampler(train_dataset),
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    sampler=SequentialSampler(test_dataset),
)

In [None]:
from transformers import BertForSequenceClassification

# Everything in this notebook runs on the CPU.
# NOTE(review): the device is hard-coded; fine-tuning on CPU is presumably
# slow — consider selecting a GPU when one is available.
device = torch.device('cpu')

# Load 'bert-base-uncased' with a sequence-classification head configured for
# two labels, and place it on the chosen device.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
).to(device)

In [None]:
# Imported in this cell so the scheduler does not depend on the earlier
# combined `from transformers import ..., AdamW, ...` line.
# NOTE(review): recent transformers releases appear to no longer export
# `AdamW`, which would make that whole import line fail and leave this name
# undefined — confirm against the installed version.
from transformers import get_linear_schedule_with_warmup

# Fine-tuning hyper-parameters.
batch_size = 16
epochs = 4
learning_rate = 2e-5
adam_epsilon = 1e-8

# Loaders used by the training loop: random order for training, dataset order
# for evaluation.
train_loader = DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=batch_size)

test_loader = DataLoader(test_dataset,
                         sampler=SequentialSampler(test_dataset),
                         batch_size=batch_size)

# AdamW comes from torch itself, so nothing optimizer-related is needed from
# transformers.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=learning_rate,
                              eps=adam_epsilon)

# One scheduler step is taken per training batch, so the schedule spans every
# batch of every epoch.
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

In [None]:
from tqdm import tqdm

# Fine-tune for `epochs` passes over the training data, reporting accuracy on
# the test loader after every pass.
for epoch in range(epochs):
    # ---- training pass ----
    model.train()

    # Wrap the loader itself (not enumerate(loader)) so tqdm can read its
    # length and display a total / ETA instead of a bare counter.
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
        # Each batch is an (input_ids, attention_mask, labels) triple.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous batch.
        model.zero_grad()

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        loss = outputs.loss
        loss.backward()

        # Cap the overall gradient norm at 1.0 before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The scheduler advances once per batch.
        scheduler.step()

    # ---- evaluation pass ----
    model.eval()

    correct = 0
    total = 0

    # No gradients are needed while scoring the test set.
    with torch.no_grad():
        for batch in test_loader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Labels are only needed for the comparison below, so they are not
            # passed to the model and no loss is computed here.
            logits = model(b_input_ids,
                           attention_mask=b_input_mask).logits

            # Predicted class = index of the largest logit in each row.
            predicted = logits.argmax(dim=1)
            total += b_labels.size(0)
            correct += (predicted == b_labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}: Test Accuracy = {accuracy:.2f}%")



# Or just use a fine-tuned model:

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the classifier are loaded from the same checkpoint id.
# Note that this rebinds the `tokenizer` and `model` names used by the
# fine-tuning cells above.
checkpoint = "lvwerra/distilbert-imdb"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
# Two sample inputs: the second assignment overrides the first, so only
# "I love it!" is classified. Remove or reorder a line to try the other one.
s = "This movie was sorta okay, I guess. My brother liked it, but I didn't."
s = "I love it!"

# tokenizing the data
tokenized = tokenizer(s, return_tensors='pt')
# running the data through the model; gradients are not needed for a
# prediction, so skip autograd tracking
with torch.no_grad():
    sentiment_scores = model(**tokenized)
# extracting the sentiment: index of the highest-scoring class for the first
# (and only) input
sentiment = torch.argmax(sentiment_scores.logits, dim=1)[0].item()

print(sentiment)

# 3) Language generation

In [None]:
# Text-generation pipeline backed by the 'distilgpt2' checkpoint.
generator = pipeline('text-generation', model='distilgpt2')

# Sample ten continuations of the prompt; the keyword arguments are the
# sampling settings handed to the pipeline.
generated = generator(
    "What do you",
    max_length=10,
    num_return_sequences=10,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.4,
    repetition_penalty=1.5,
)

# Print each returned text on its own line.
for candidate in generated:
    print(candidate['generated_text'])