In [1]:
import torch as t
import torch.nn as nn
import transformers
import bert_tests
import bert_tao
import random

In [2]:
# https://huggingface.co/bert-base-cased

class Bert(nn.Module):
    def __init__(self, config={}):
        super(Bert, self).__init__()
        
        self.model = transformers.BertForMaskedLM.from_pretrained("bert-base-cased")
        self.model.config.update(config) # Modifies self.pretrained_model.config in-place


    def forward(self, **inputs):
        return self.model(**inputs) 

class BertEmbedded(Bert):
    def forward(self, **inputs):
        embedded = self.model.cls.predictions.transform(self.model.bert(**inputs).last_hidden_state)
        logits = self.model.cls.predictions.decoder(embedded)
        return embedded, unembedded

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
model = Bert()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# https://huggingface.co/bert-base-cased

# Same as above, just fresh weights (not pretrained)

class Bert(nn.Module):
    def __init__(self, pretrained=True, config={}):
        super(Bert, self).__init__()
        
        default_config = {
            "attention_probs_dropout_prob": 0.1,
            "classifier_dropout": None,
            "gradient_checkpointing": False,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 768,
            "initializer_range": 0.02,
            "intermediate_size": 3072,
            "layer_norm_eps": 1e-12,
            "max_position_embeddings": 512,
            "model_type": "bert",
            "num_attention_heads": 12,
            "num_hidden_layers": 12,
            "pad_token_id": 0,
            "position_embedding_type": "absolute",
            "transformers_version": "4.16.2",
            "type_vocab_size": 2,
            "vocab_size": 28996
        }
        
        if pretrained:
            self.model = transformers.BertForMaskedLM.from_pretrained("bert-base-cased")
            self.model.config.update(config) # Modifies self.pretrained_model.config in-place

        else:
            config = transformers.PretrainedConfig.from_dict({**default_config, **config})
            self.model = transformers.BertForMaskedLM(config)

    def forward(self, **inputs):
        return self.model(**inputs) 

class BertEmbedded(Bert):
    def forward(self, **inputs):
        embedded = self.model.cls.predictions.transform(self.model.bert(**inputs).last_hidden_state)
        unembedded = self.model.cls.predictions.decoder(embedded)
        return embedded, unembedded

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
model = Bert()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def ascii_art_probs(text, k=5):

    inputs = tokenizer(text, return_tensors="pt")
    mask_indices, = t.where(inputs["input_ids"][0] == tokenizer.mask_token_id)

    outputs = model(**inputs)
    logits = t.nn.functional.softmax(outputs.logits, dim=2)
    
    top_k_masks = t.topk(logits, k, dim = 2)[1][0][mask_indices]

    candidate_words = [tokenizer.decode(candidate_list).split() for candidate_list in top_k_masks]
    candidate_percents = [logits[:,mask_index,top_k_masks[i]][0] for i, mask_index in enumerate(mask_indices)]
    logits = logits.argmax(dim=2)
    tokenizer.decode(logits[0])

    s = text.replace('[MASK]', '___') + '\n\n'
    for i, (words, percents) in enumerate(zip(candidate_words, candidate_percents)):
        candidates = ['%d%%\t%s' % (round(float(percent*100)), word)  for word, percent in zip(words, percents)]
        s += '\n'.join(candidates) + '\n\n'
    print(s)

text = "The firetruck was painted a bright [MASK]."
ascii_art_probs(text)

The firetruck was painted a bright ___.

48%	red
15%	yellow
10%	blue
8%	pink
6%	orange




In [5]:
import bert_tests

class BertClassifier(nn.Module):
    def __init__(self, pretrained=True, **config):
        super(BertClassifier, self).__init__()
        bert_config = {'attention_probs_dropout_prob': config['dropout'], **config}
        self.bert = BertEmbedded(pretrained, bert_config)
        self.classifier_dropout = nn.Dropout(p=config['dropout'])
        self.classifier = nn.Linear(config['hidden_size'], config['num_classes'])

    def forward(self, input_ids):
        embedded, unembedded = self.bert(input_ids=input_ids)
        #logits = t.nn.functional.softmax(unembedded, dim=2)
        classifications = self.classifier(self.classifier_dropout(embedded[:,0]))
        return unembedded, classifications

# Everything is working more or less correctly; the problem is that the random initialization of our Bert weights is just slightly different than the way they want it to be :(
bert_tests.test_bert_classification(BertClassifier)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AssertionError: error in bert
 SHAPE (1, 4, 28996) MEAN: -4.229 STD: 2.874 VALS [-7.459 -7.382 -7.54 -7.476 -7.382 -7.434 -7.522 -7.545 -7.549 -7.458...] 
!=
 SHAPE (1, 4, 28996) MEAN: 0.003031 STD: 0.5765 VALS [-0.5742 -0.432 0.1186 -0.7165 -0.5261 0.4967 1.223 0.3165 -0.3247 -0.5716...]

In [6]:
# Using Tao's implementation :(

class BertClassifier(nn.Module):
    def __init__(self, **config):
        super(BertClassifier, self).__init__()
        self.bert = bert_tao.Bert(config)

    def forward(self, input_ids):
        outputs = self.bert(input_ids=input_ids)
        return outputs.logits, outputs.classification
    
bert_tests.test_bert_classification(BertClassifier)

bert MATCH!!!!!!!!
 SHAPE (1, 4, 28996) MEAN: 0.003031 STD: 0.5765 VALS [-0.5742 -0.432 0.1186 -0.7165 -0.5261 0.4967 1.223 0.3165 -0.3247 -0.5716...]
bert MATCH!!!!!!!!
 SHAPE (1, 2) MEAN: 0.09479 STD: 1.411 VALS [-0.903 1.093]


## Step 2

In [7]:
import torchtext
from tqdm import tqdm

def batch(data, batch_size):
    batches, batch = [], []
    for i, sample in enumerate(data, 1):
        if i % batch_size == 0:
            batches.append(batch)
            batch = []
        batch.append(sample)

    batches.append(batch)
    return batches

def tokenize_batch(batch, tokenizer, max_seq_len):
    sentiments, texts = zip(*batch)
    outputs = tokenizer(texts, return_tensors="pt", padding='longest', max_length=max_seq_len, truncation=True)
    return list(zip(sentiments, outputs['input_ids']))

def tokenize(batches, max_seq_len=512):
    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
    return [tokenize_batch(batch, tokenizer, max_seq_len) for batch in tqdm(batches)]

def convert_to_int(batches):
    conv_dict = {
        "pos": 1, 
        "neg": 0
    }
    return [
            [(conv_dict[sentiment], text) for sentiment,text in batch] 
            for batch in batches
            ]

def preprocess(data, batch_size, max_seq_len=512):
    
    batched_data = batch(data, batch_size)
    random.shuffle(batched_data)
    tokenized = tokenize(batched_data, max_seq_len)
    preprocessed = convert_to_int(tokenized)
    
    return preprocessed

In [26]:
data_train, data_test = torchtext.datasets.IMDB(root='.data', split=('train', 'test'))

data_train_list = list(data_train)
data_test_list = list(data_test)

tokenized_train_batches = preprocess(data_train_list, 32)
tokenized_test_batches = preprocess(data_test_list, 32)

100%|██████████| 782/782 [01:38<00:00,  7.95it/s]
100%|██████████| 782/782 [01:35<00:00,  8.15it/s]


## Part 3

In [27]:
default_config = {
        "vocab_size": 28996,
        "intermediate_size": 3072,
        "hidden_size": 768,
        "num_classes": 1,
        "num_layers": 12,
        "num_heads": 12,
        "max_position_embeddings": 512,
        "dropout": 0.1,
        "type_vocab_size": 2,
    }


# t.nn.CrossEntropyLoss()
gpu = t.device('cuda') 

pretrained_bert = BertClassifier(**default_config)
pretrained_bert.cuda()

optimizer = t.optim.Adam(pretrained_bert.parameters())
loss_func = t.nn.CrossEntropyLoss()

num_epochs = 10

for epoch in tqdm(range(num_epochs)): 
    running_loss = 0.0 #wait we are keep track of this for...?
    for i, data in enumerate(tokenized_train_batches):
        labels, inputs = zip(*data)
        stacked_labels = t.Tensor(labels).to(gpu)
        stacked_inputs = t.stack(inputs).to(gpu)
        optimizer.zero_grad()
        _, outputs = pretrained_bert(stacked_inputs)
        outputs = outputs.to(gpu)
        loss = loss_func(outputs, stacked_labels[:, None])
        
        loss.backward()
        optimizer.step()
        
        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

100%|██████████| 10/10 [1:06:20<00:00, 398.01s/it]


In [30]:
t.save(pretrained_bert.state_dict(), './bert-classifier.pt')

In [31]:
print(t.cuda.memory_summary(device=gpu, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |    5161 MB |   31710 MB |  647173 GB |  647168 GB |
|       from large pool |    5157 MB |   31702 MB |  647071 GB |  647066 GB |
|       from small pool |       3 MB |      26 MB |     102 GB |     102 GB |
|---------------------------------------------------------------------------|
| Active memory         |    5161 MB |   31710 MB |  647173 GB |  647168 GB |
|       from large pool |    5157 MB |   31702 MB |  647071 GB |  647066 GB |
|       from small pool |       3 MB |      26 MB |     102 GB |     102 GB |
|---------------------------------------------------------------