In [1]:
import torch


In [2]:
import numpy as np
import random

In [3]:
seed=10
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic=True

In [6]:
#Model Selection
import transformers
from transformers import AutoTokenizer
base_model='bert-base-uncased'

In [8]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
len(tokenizer.vocab)

30522

In [14]:
tokens=tokenizer.tokenize('Welcome to MLDevops Workshop')
print(tokens)

['welcome', 'to', 'ml', '##dev', '##ops', 'workshop']


In [19]:
indexes=tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[6160, 2000, 19875, 24844, 11923, 8395]


In [34]:
init_token=tokenizer.cls_token
eos_token=tokenizer.sep_token
pad_token=tokenizer.pad_token
unk_token=tokenizer.unk_token
print(init_token,cos_token,pad_token,unk_token)

[CLS] [SEP] [PAD] [UNK]


In [35]:
init_token_idx=tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx=tokenizer.convert_tokens_to_ids(cos_token)
pad_token_idx=tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx=tokenizer.convert_tokens_to_ids(unk_token)

In [36]:
max_input_length=10


In [37]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    return tokens

In [38]:
# Data Collection
from torchtext.legacy import data
text = data.Field(batch_first= True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing=tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

In [44]:
from torchtext.legacy import datasets
TEXT=data.Field(lower=True,include_lengths=True,batch_first=True)
LABEL=data.Field(sequential=False)

train_data, test_data=datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data=train_data.split(random_state=random.seed(seed))

downloading aclImdb_v1.tar.gz


C:\Users\welcome\ML_devops_workshop\.data\imdb\aclImdb_v1.tar.gz: 100%|████████████████████████████████████████████████████████████| 84.1M/84.1M [06:28<00:00, 216kB/s]


In [45]:
train_data

<torchtext.legacy.data.dataset.Dataset at 0x26486de71f0>

In [46]:
print(f"number of training examples:{len(train_data)}")
print(f"number of validation examples:{len(valid_data)}")
print(f"number of testing examples:{len(test_data)}")

number of training examples:17500
number of validation examples:7500
number of testing examples:25000


In [47]:
print(vars(train_data.examples[10]))

{'text': ['so,', 'back', 'when', 'herbie', 'made', 'his', 'first', 'appearance,', 'i', 'was', 'perfectly', 'happy', 'watching', 'dean', 'jones', 'mug', 'away.', 'i', 'only', 'wanted', 'to', 'be', 'entertained', 'for', 'a', 'few', 'hours', 'and', 'eat', 'overly', 'buttered', 'popcorn.', 'now,', 'unfortunately,', 'i', 'have', 'expectations', 'of', 'a', 'riveting/delightful', 'story', 'whenever', 'i', 'watch', 'a', 'movie,', 'if', "i'm", 'not', 'on', 'some', 'sort', 'of', 'medication.', 'and', 'this', 'is', 'another', 'good', 'movie', 'for', 'the', 'medicated.', 'there', 'are', 'no', 'major', 'laughs,', 'no', 'complex', 'plot', 'lines,', 'no', 'difficult', 'twists.', 'herbie', 'fully', 'loaded', 'is', 'great', 'for', 'the', 'fully', 'loaded.<br', '/><br', '/>this', 'was', 'the', 'first', 'time', 'i', 'had', 'seen', 'la', 'lohan', 'on', 'the', 'screen', 'since', 'she', 'swapped', 'places', 'with', 'jamie', 'lee', 'curtis', '(i', 'thought', 'she', 'was', 'excellent', 'in', 'that),', 'and', 

In [None]:
#tokens=tokenizer.convert_ids_to_tokens(vars(tain))

In [48]:
LABEL.build_vocab(train_data)

In [49]:
print(LABEL.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x00000264D2FD3DF0>>, {'<unk>': 0, 'neg': 1, 'pos': 2})


In [50]:
batch_size= 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = batch_size, 
    device = device)

In [51]:
# build the model
from transformers import AutoTokenizer, AutoModel

b_model = AutoModel.from_pretrained(base_model)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [52]:
import torch.nn as nn

class SentimentAnalyzer(nn.Module):
    def __init__(self,
                 b_model,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.b_model = b_model
        
        embedding_dim = b_model.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = self.b_model(text)[0]
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

In [53]:
#configuring hyper parameter
hidden_dim = 256
op_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.25

model = SentimentAnalyzer(b_model,
                         hidden_dim,
                         op_dim,
                         n_layers,
                         bidirectional,
                         dropout)

In [54]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [55]:
for name, param in model.named_parameters():                
    if name.startswith('b_model'):
        param.requires_grad = False

In [56]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [57]:
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [58]:
#model training
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [59]:
criterion = nn.BCEWithLogitsLoss()

In [60]:
model = model.to(device)
criterion = criterion.to(device)

In [61]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [62]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [63]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [64]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [66]:
epochs = 3

best_valid_loss = float('inf')

for epoch in range(epochs):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'dev-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

AttributeError: 'Field' object has no attribute 'vocab'

In [34]:


test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.603 | Test Acc: 65.60%


# 10. Inference

In [35]:
def predict_sentiment(model, tokenizer, sentence):
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)
    prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [36]:
hidden_dim = 256
op_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.25

model = SentimentAnalyzer(b_model,
                         hidden_dim,
                         op_dim,
                         n_layers,
                         bidirectional,
                         dropout)

In [39]:
model.load_state_dict(torch.load('dev-model.pt'))
model.to(device)

SentimentAnalyzer(
  (b_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [40]:
predict_sentiment(model, tokenizer, "This workshop is awesome")

0.9093654751777649

In [41]:
predict_sentiment(model, tokenizer, "This workshop is boring")

0.06864137202501297

# 11. Commit your code

# 12. Struturizing code with PyCharm