In [1]:
# import os
# from pathlib import Path
# from google.colab import drive

# PATH = "/content/drive"
# drive.mount(PATH)

# folder_path = Path('MyDrive/models/abs_bert')

In [2]:
# !pip install --quiet transformers datasets
# !pip install --quiet tensorboard
# tensorboard --logdir=runs

In [3]:
import torch

# Preferred GPU ordinal. The original hard-coded 'cuda:5' whenever *any* GPU was
# present, which raises an invalid-device-ordinal error on hosts exposing fewer
# than six GPUs. Fall back to the first GPU in that case, and to CPU otherwise.
preferred_gpu_index = 5
if torch.cuda.is_available():
  gpu_index = preferred_gpu_index if preferred_gpu_index < torch.cuda.device_count() else 0
  device = f'cuda:{gpu_index}'
else:
  device = 'cpu'
torch.cuda.is_available()

False

In [4]:
from collections import defaultdict
import os
import datetime
from pathlib import Path
import json
from tqdm import tqdm

from torch import nn
from torch.utils.data import DataLoader
import torch.optim.lr_scheduler as lr_scheduler

# from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import BertConfig, BertForMaskedLM

import importlib

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from utils import load_corpus

In [6]:
# Colab variant (kept for reference; relies on PATH / folder_path from the Colab cell):
# folder_path = Path(folder_path)
# os.makedirs(PATH/folder_path, exist_ok=True)

# 59  -- NOTE(review): presumably an identifier for the machine this variant runs on; confirm.
# Root directory for all artefacts, and the sub-directory for this model family.
PATH = "data"
folder_path = Path("abs_bert")
# Create <PATH>/<folder_path> (and any missing parents); no error if it already exists.
Path(PATH, folder_path).mkdir(parents=True, exist_ok=True)

In [7]:
# Name of the pretrained tokenizer to load; also written to parameters.json later on.
tokenizer_type = 'bert-base-uncased'

# `tokenizer` is reused below for the corpus loader, the model's vocab size and the benchmark writer.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_type)
# Smoke test: tokenize two short sentences, padded to a common length, and show the ids.
text = ["I love bird.", "Hi, I'm Bob."]
tokenized = tokenizer(text, return_tensors='pt', padding=True)
tokenized['input_ids']

tensor([[ 101, 1045, 2293, 4743, 1012,  102,    0,    0,    0],
        [ 101, 7632, 1010, 1045, 1005, 1049, 3960, 1012,  102]])

In [13]:
%%time
corpus_name = 'bookcorpus'
# corpus_name = 'msmarco'
# corpus_name = 'wiki'

batch_size = 48
# NOTE(review): masking_probability is written to parameters.json but is not passed
# to the corpus loader in this cell -- confirm where the masking rate is actually set.
masking_probability = .15
max_length = 256
num_chunks = 1
# Single source of truth for shuffling. The original declared `shuffle = True` but
# passed a literal `shuffle=False` to the loader, so parameters.json recorded a value
# that did not match the run. The effective value (False) is kept here; flip this one
# variable to enable shuffling.
shuffle = False
# cache_dir = PATH/Path('MyDrive/data')
cache_dir='~/data1-0756727/cache/huggingface'

corpus = load_corpus.Corpus_for_transformer(corpus_name,
                       tokenizer,
                       cache_dir,
                       batch_size=batch_size,
                       max_length=max_length,
                       num_chunks=num_chunks,
                       shuffle=shuffle)

train_loader = corpus.loader()
# Pull one batch as a sanity check and display the shape of every tensor in it.
n = next(iter(train_loader))
# print(tokenizer.decode(n['data']))
{key: n[key].shape for key in n}

CPU times: total: 2.8 s
Wall time: 3.27 s


{'input_ids': torch.Size([48, 256]),
 'token_type_ids': torch.Size([48, 256]),
 'attention_mask': torch.Size([48, 256]),
 'labels': torch.Size([48, 256])}

In [8]:
import numpy as np

class Sampled_softmax_cross_entropy(nn.Module):
  """Sampled-softmax cross entropy using one shared set of uniformly drawn negatives.

  Reference: https://douglasorr.github.io/2021-10-training-objectives/3-sampled/article.html

  `model` must provide `word_embeddings()` returning the output projection of
  shape (n_classes, embedding_size) and a 1-D `bias` indexable by class id.
  """

  def __init__(self, model, num_sampling=100):
    super().__init__()
    self.num_sampling = num_sampling
    self.model = model

  def forward(self, predictions, labels):
    """
    predictions: float tensor, shape [..., embedding_size]
    labels: integer class ids, shape [...] (same leading dims as predictions);
            positions equal to -100 are ignored by the loss.
    returns: scalar loss (cross entropy over [label score, noise scores],
             averaged over non-ignored positions).
    """
    projection = self.model.word_embeddings()  # shape (n_classes x embedding_size)
    n_classes = projection.shape[0]

    # Ignored positions carry -100. Indexing with that value only works through
    # negative-index wraparound (and fails when n_classes < 100), so gather with a
    # safe index instead; those positions are dropped from the loss in step 5.
    gather_labels = labels.masked_fill(labels == -100, 0)

    # 2. Get target label scores, paired_inner_product(pred_emb, label_emb)
    label_scores = (predictions * projection[gather_labels, :]).sum(-1) + self.model.bias[gather_labels]

    # 3. Sample shared noise & get scores
    samples = torch.randint(high=n_classes, size=[self.num_sampling]).to(labels.device)
    noise_scores = predictions @ projection[samples, :].T + self.model.bias[None, samples]
    noise_scores += np.log(n_classes - 1)

    # 4. Reject samples matching target label & correct for remaining samples.
    # The second term excludes positions the collator filled with -100 (not a prediction target).
    reject_samples = (labels[..., None] == samples[None, :]) & (labels[..., None] != -100)
    noise_scores -= 1e6 * reject_samples
    # If every sample collides with the label the remaining count is 0 and log(0)
    # would turn the noise scores (and the loss) into +inf; clamp to 1 so the
    # already-suppressed (-1e6) noise scores stay finite.
    remaining = (self.num_sampling - reject_samples.sum(-1, keepdims=True)).clamp(min=1)
    noise_scores -= torch.log(remaining.float())

    # 5. Apply regular softmax cross entropy: the true class always sits at index 0,
    # ignored positions keep -100 so cross_entropy skips them.
    scores = torch.cat([label_scores[..., None], noise_scores], dim=-1)
    pseudo_label = torch.masked_fill(labels.clone(), labels != -100, 0).view(-1)
    loss = torch.nn.functional.cross_entropy(scores.view(-1, scores.shape[-1]), pseudo_label)

    return loss

In [9]:
learning_rate = 4e-5 # for fine tuning

# Architecture hyper-parameters forwarded verbatim to BertConfig.
model_params = dict(
    hidden_size=512,
    vocab_size=tokenizer.vocab_size,
    num_hidden_layers=8,
    num_attention_heads=8,
    intermediate_size=3072,
    max_position_embeddings=256,
)
# Settings for the (currently disabled) sampled-softmax loss; still logged with the run.
loss_params = dict(sampling_word_size=100)

# Build a freshly initialised masked-LM from the config and move it to the chosen device.
config = BertConfig(**model_params)
model = BertForMaskedLM(config=config)
model = model.to(device)
print(model.num_parameters())

# using_loss = Sampled_softmax_cross_entropy(model, num_sampling=loss_params['sampling_word_size'])

# Alternatives tried: torch.optim.Adam(model.parameters(), lr=learning_rate)
#                     lr_scheduler.LinearLR(optimizer, total_iters=10, start_factor=1)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-5)
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=2500, eta_min=1e-6)
optimizer

49670458


AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-05
    foreach: None
    fused: None
    initial_lr: 4e-05
    lr: 4e-05
    maximize: False
    weight_decay: 0.01
)

In [10]:
# Timestamp (UTC+8) that names this run's output folder.
# NOTE(review): the ':' characters in the format are not valid in directory names on
# every filesystem (e.g. Windows) -- confirm the target platform before changing it.
utc_plus_8 = datetime.timezone(datetime.timedelta(hours=8))
time_string = datetime.datetime.now(tz=utc_plus_8).strftime('%Y%m%d.%H:%M:%S')
subfolder_path = Path(f"bert-{time_string}")

os.makedirs(PATH/folder_path/subfolder_path, exist_ok=True)

# Persist the run configuration next to the checkpoints.
with open(PATH/folder_path/subfolder_path/'parameters.json', 'w') as f:
  json.dump({
    'tokenizer_type': tokenizer_type,
    'model': str(model),
    'model_params': str(model_params),
    'learning_rate': learning_rate,
    'batch_size': batch_size,
    'masking_probability': masking_probability,
    'shuffle': shuffle,
    # 'loss_type': str(using_loss),
    'loss_params': loss_params,
  }, f)
print(subfolder_path)

bert-20240711.22:03:17


In [11]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard events are written into this run's folder (the explicit log_dir
# overrides SummaryWriter's default ./runs/ location).
writer = SummaryWriter(log_dir=PATH/folder_path/subfolder_path)
# NOTE(review): `benchmark` is not imported anywhere in the visible notebook, so this
# line raises NameError on a fresh kernel -- presumably a project module; add the import.
benchmark_writer = benchmark.Benchmarking(writer, tokenizer)

  0%|          | 0/5183 [00:00<?, ?it/s]

In [None]:
num_epochs = 2
save_every_n_batches = 5000
clip_loss = None

multiloss = 0

# Additional batch indices (within each epoch) at which a checkpoint is written,
# on top of the regular every-`save_every_n_batches` schedule.
extra_checkpoint_batches = {500, 1000, 1500, 2000, 2500, 7500, 12500, 17500}


def save_checkpoint(model, run_dir, epoch_num, batch_num, global_step, loss_history, benchmark_writer):
  """Write a model checkpoint, the loss history and a benchmark entry for the current step.

  model: the model being trained; pickled as a whole with torch.save.
  run_dir: directory of the current run.
  epoch_num, batch_num: position in training, used only to build file names.
  global_step: step value passed to the benchmark writer.
  loss_history: JSON-serialisable history, rewritten in full on every call.
  benchmark_writer: object exposing predict_and_write(embeddings, step).
  """
  # `prefix` already ends with '-', so files are named '...-batch_N--model.pt';
  # kept unchanged so existing checkpoints keep the same naming scheme.
  prefix = f"epoch_{epoch_num}-batch_{batch_num}-"
  torch.save(model, run_dir/f'{prefix}-model.pt')

  with open(run_dir/f'epoch_{epoch_num}-history.json', 'w') as f:
    f.write(json.dumps(loss_history))

  benchmark_writer.predict_and_write(model.bert.embeddings.word_embeddings.weight.detach().cpu(), global_step)


print(time_string)

run_dir = PATH/folder_path/subfolder_path

# Training mode (switch to model.eval() for evaluation-only runs).
model.train()

global_step = 0

loss_history = []
for epoch_num in range(num_epochs):
  bar = tqdm(train_loader)

  for batch_num, batch in enumerate(bar):

    optimizer.zero_grad()

    batch = {key: batch[key].to(device) for key in batch}
    loss = model(**batch).loss

    # Optional upper bound on the loss value used for the backward pass.
    if clip_loss is not None:
      final_loss = torch.clip(loss, max=clip_loss)
    else:
      final_loss = loss

    loss_dict = {
      'loss': f"{loss.item(): .6f}",
      'final_loss': f"{final_loss.item(): .6f}",
    }

    # Baseline entry before the first optimizer update.
    if global_step == 0:
      writer.add_scalar('Loss/loss', loss.item(), global_step)
      benchmark_writer.predict_and_write(model.bert.embeddings.word_embeddings.weight.detach().cpu(), global_step)

    # NOTE(review): `losses` is not defined anywhere in this cell; enabling
    # `multiloss` raises NameError until the per-component losses are provided.
    if multiloss:
      loss_dict |= {key: f"{losses[key].item(): .6f}" for key in losses}

    bar.set_postfix(loss_dict)

    final_loss.backward()
    optimizer.step()
    global_step += 1

    if batch_num % 100 == 0:
      loss_history.append([epoch_num, batch_num, loss_dict])

    if global_step % 100 == 0:
      writer.add_scalar('Loss/loss', loss.item(), global_step)

    # The scheduler advances once every 1000 batches, not once per batch.
    if batch_num % 1000 == 999:
      scheduler.step()

    # Regular checkpoints and the extra ones are separate checks on purpose:
    # if the two schedules ever coincide, both fire, exactly as before.
    if batch_num % save_every_n_batches == 0:
      save_checkpoint(model, run_dir, epoch_num, batch_num, global_step, loss_history, benchmark_writer)

    if batch_num in extra_checkpoint_batches:
      save_checkpoint(model, run_dir, epoch_num, batch_num, global_step, loss_history, benchmark_writer)

# Make sure buffered TensorBoard events reach disk once training finishes.
writer.flush()

20240711.22:03:17


  0%|          | 0/1541755 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
 15%|█▌        | 233395/1541755 [4:59:41<25:34:02, 14.21it/s, loss=2.912432, final_loss=2.912432]  