In [1]:
import torch
import platform

mac_info = platform.mac_ver()
print(f"MacOS version: {mac_info[0]} (>=12.3.X)")
print(f"Using: {mac_info[2]} version of python ('arm64')")

print(f"GPU acceleration for torch: {torch.has_mps}")

print(f"Platform: {platform.platform()}")

MacOS version: 12.5.1 (>=12.3.X)
Using: arm64 version of python ('arm64')
GPU acceleration for torch: True
Platform: macOS-12.5.1-arm64-arm-64bit


Hardware acceleration

In [2]:
device = torch.device('mps')

In [3]:
from transformers import AutoTokenizer # pip install transformers
from datasets import load_dataset # pip install datasets

  from .autonotebook import tqdm as notebook_tqdm


Download data to fine-tune the Bert model

In [4]:
# load the first 1K rows of the TREC dataset
trec = load_dataset('trec', split='train[:1000]')
trec

Found cached dataset trec (/Users/yunusskeete/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 1000
})

In [5]:
trecTRAIN = load_dataset('trec', split='train')
trecTRAIN

Found cached dataset trec (/Users/yunusskeete/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 5452
})

In [6]:
trecTEST = load_dataset('trec', split='test')
trecTEST

Found cached dataset trec (/Users/yunusskeete/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 500
})

In [7]:
trec[0]

{'text': 'How did serfdom develop in and then leave Russia ?',
 'coarse_label': 2,
 'fine_label': 26}

In [8]:
test_trec = load_dataset('trec', split='train[1000:1200]')
test_trec

Found cached dataset trec (/Users/yunusskeete/.cache/huggingface/datasets/trec/default/2.0.0/f2469cab1b5fceec7249fda55360dfdbd92a7a5b545e91ea0f78ad108ffac1c2)


Dataset({
    features: ['text', 'coarse_label', 'fine_label'],
    num_rows: 200
})

In [9]:
test_trec[0]

{'text': 'What singer became despondent over the death of Freddie Prinze , quit show business , and then quit the business ?',
 'coarse_label': 3,
 'fine_label': 29}

Use the bert-base-uncased tokenizer from Hugging Face

In [10]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# we have a small dataset so we can tokenize everything at once
# tokenize everything
tokens = tokenizer(
    trec['text'], max_length=512,
    truncation=True, padding='max_length'
)

In [11]:
tokens[:2]

[Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [12]:
tokens[0].ids[:10], tokens[0].ids[10:20], tokens[0].ids[-10:]

([101, 2129, 2106, 14262, 2546, 9527, 4503, 1999, 1998, 2059],
 [2681, 3607, 1029, 102, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
type(tokens)

transformers.tokenization_utils_base.BatchEncoding

One-hot encode labels

In [14]:
import numpy as np

# initialize array to be used
labels = np.zeros(
    (len(trec), max(trec['coarse_label'])+1)
)
# one-hot encode
labels[np.arange(len(trec)), trec['coarse_label']] = 1
labels[:5]

array([[0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.]])

In [15]:
labels = torch.Tensor(labels)

In [16]:
# initialize array to be used
test_labels = np.zeros(
    (len(trec), max(test_trec['coarse_label'])+1)
)
# one-hot encode
test_labels[np.arange(len(test_trec)), test_trec['coarse_label']] = 1
test_labels[:5]

array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.]])

In [17]:
test_labels = torch.Tensor(test_labels)

Create the dataset object

In [18]:
class TrecDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __getitem__(self, idx):
        input_ids = self.tokens[idx].ids
        attention_mask = self.tokens[idx].attention_mask
        labels = self.labels[idx]
        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(labels)
        }

    def __len__(self):
        return len(self.labels)

dataset = TrecDataset(tokens, labels)

Create the data loader

In [19]:
loader = torch.utils.data.DataLoader(
    dataset, batch_size=64
)

In [20]:
test_loader = torch.utils.data.DataLoader(
    dataset, batch_size=64
)

In [21]:
max(trec['coarse_label'])+1

6

In [22]:
from transformers import BertForSequenceClassification, BertConfig

config = BertConfig.from_pretrained('bert-base-uncased')
config.num_labels = max(trec['coarse_label'])+1
model = BertForSequenceClassification(config).to(device)

Train

In [23]:
"""
Fine-tune the classification head only:
Freeze all BERT layer parameters, leaving just final few classification layers.
"""

for param in model.bert.parameters():
    param.requires_grad = False

In [24]:
from transformers import AdamW
from tqdm.auto import tqdm

# activate training mode of model
model.train()

# initialize adam optimizer with weight decay
optim = AdamW(model.parameters(), lr=5e-5)



In [25]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,614 trainable parameters


In [26]:
device

device(type='mps')

In [27]:
from time import time
from tqdm.auto import tqdm

loop_time = []

# setup loop (using tqdm for the progress bar)
loop = tqdm(loader, leave=True)
for batch in loop:
    batch_mps = {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
        'labels': batch['labels'].to(device)
    }
    t0 = time()
    # initialize calculated gradients (from prev step)
    optim.zero_grad()
    # train model on batch and return outputs (incl. loss)
    outputs = model(**batch_mps)
    # extract loss
    loss = outputs[0]
    # calculate loss for every parameter that needs grad update
    loss.backward()
    # update parameters
    optim.step()
    loop_time.append(time()-t0)
    # print relevant info to progress bar
    loop.set_postfix(loss=loss.item())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  'labels': torch.tensor(labels)
100%|██████████| 16/16 [05:37<00:00, 21.07s/it, loss=0.623]


In [28]:
loop_time

[20.102845907211304,
 17.658621788024902,
 17.239214181900024,
 18.1284282207489,
 16.0555682182312,
 16.633388996124268,
 16.169923067092896,
 17.234124660491943,
 20.20867419242859,
 15.843982934951782,
 16.44768500328064,
 16.141244888305664,
 17.065425872802734,
 17.40112614631653,
 17.688358068466187,
 11.527345895767212]

In [29]:
# Put model into inference mode
model.eval()

predictions = torch.Tensor([]).to(device)

test_loop = tqdm(test_loader, leave=True)
with torch.inference_mode():
    for batch in loop:
        batch_mps = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device)
        }
        outputs = model(**batch_mps)
        preds = outputs['logits']
        predictions = torch.cat((predictions, preds), 0)

  'labels': torch.tensor(labels)


In [30]:
predictions.shape, predictions[0]

  nonzero_finite_vals = torch.masked_select(


(torch.Size([1000, 6]),
 tensor([-0.2157, -0.2370, -0.5152, -0.2934, -0.3427, -0.3002], device='mps:0'))

In [31]:
# import os

# predictions_path = "./predictions"
# if not os.path.isdir(predictions_path):
#     os.mkdir(predictions_path)

In [32]:
# import shutil

# if predictions_path is not None:
#     np.save(predictions_path, predictions.cpu())

#     # np.save() automatically adds a ".npy" to the end of the file.
#     # We rename the file produced by removing the ".npy" suffix, to make sure that
#     # predictions_path is the actual file name.
#     shutil.move(str(predictions_path) + ".npy", predictions_path)

In [33]:
criterion = torch.nn.CrossEntropyLoss().to(device)

In [34]:
batch_mps

{'input_ids': tensor([[ 101, 2043, 2024,  ...,    0,    0,    0],
         [ 101, 2054, 4368,  ...,    0,    0,    0],
         [ 101, 2054, 1005,  ...,    0,    0,    0],
         ...,
         [ 101, 2054, 2003,  ...,    0,    0,    0],
         [ 101, 2040, 2001,  ...,    0,    0,    0],
         [ 101, 2040, 6791,  ...,    0,    0,    0]], device='mps:0'),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]], device='mps:0'),
 'labels': tensor([[0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0.],
         [0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1.],
         [0., 1., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0.],
         [0., 1.,

In [58]:
# outputs = model(**batch_mps)

In [None]:
loss1 = outputs.loss
loss1

tensor(0.6254, device='mps:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)

In [None]:
outputs.logits

tensor([[-0.0620, -0.3503, -0.4045, -0.5189, -0.1923, -0.2770],
        [-0.0178, -0.3267, -0.3746, -0.4427, -0.0427, -0.2308],
        [-0.0063, -0.3166, -0.3543, -0.5956, -0.1608, -0.2279],
        [-0.0798, -0.2898, -0.4046, -0.5222, -0.1880, -0.1868],
        [-0.0315, -0.3010, -0.4243, -0.5649, -0.1909, -0.2784],
        [-0.0820, -0.3003, -0.4113, -0.5401, -0.2080, -0.2435],
        [-0.0761, -0.2901, -0.3044, -0.4977, -0.1462, -0.1609],
        [ 0.0726, -0.3795, -0.3404, -0.4921, -0.1503, -0.2429],
        [ 0.0619, -0.4194, -0.3907, -0.5288, -0.1095, -0.2095],
        [-0.0349, -0.3293, -0.3984, -0.5432, -0.1490, -0.2525],
        [ 0.0435, -0.3904, -0.4238, -0.4881, -0.1842, -0.2638],
        [-0.1001, -0.3654, -0.3491, -0.4803, -0.0830, -0.2231],
        [-0.0947, -0.3699, -0.5199, -0.4874, -0.2117, -0.2825],
        [-0.0128, -0.3408, -0.4053, -0.4398, -0.1687, -0.3006],
        [-0.0511, -0.3068, -0.2931, -0.5864, -0.1528, -0.2455],
        [-0.0140, -0.3418, -0.3584, -0.5

In [36]:
loss2 = criterion(outputs.logits, batch_mps['labels'])
loss2

tensor(1.8004, device='mps:0')

```python
def accuracy(datasamples, predictions_path):

    outputs = np.load(predictions_path)
    labels = datasamples["labels"]

    preds = np.argmax(outputs, axis=1)

    # I don't think we need to one-hot encode labels AT ALL

    targets = np.argmax(labels, axis=1)
    correct = preds == targets
    acc = sum(correct) / len(correct)
    
    return acc
```

In [37]:
def accuracy(outputs, labels):

    preds = np.argmax(outputs, axis=1)

    # I don't think we need to one-hot encode labels AT ALL

    targets = np.argmax(labels, axis=1)
    correct = preds == targets
    acc = sum(correct) / len(correct)
    
    return acc

In [53]:
preds = np.argmax(outputs.logits.cpu().detach(), axis=1)
preds

tensor([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [47]:
targets = np.argmax(batch_mps['labels'].cpu().detach(), axis=1)
targets

tensor([5, 1, 4, 2, 5, 1, 5, 1, 1, 1, 1, 4, 2, 3, 3, 3, 5, 1, 3, 5, 0, 2, 4, 2,
        1, 5, 3, 1, 3, 2, 4, 1, 1, 1, 1, 2, 4, 2, 3, 3])

In [54]:
acc = accuracy(outputs.logits.cpu().detach(), batch_mps['labels'].cpu().detach())
acc

tensor(0.0250)

In [56]:
# %%
# Evaluating function
# *******************
@torch.no_grad()
def evaluate(model, loader, criterion):
    
    epoch_loss, epoch_acc = 0, 0
    
    # activate evaluation mode of model
    model.eval()
    
    for batch in tqdm(test_loader, leave=None):

        batch_mps = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device)
        }
                
        outputs = model(**batch_mps)
        
        loss = outputs.loss
        # loss = criterion(outputs.logits, batch_mps["labels"])
        acc = accuracy(outputs.logits.cpu().detach(), batch_mps['labels'].cpu().detach())

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)