In [1]:
import collections
import gc
import os
import time

import numpy as np
import torch
import transformers
from datasets import load_from_disk
from transformers import default_data_collator

In [2]:
# from huggingface_hub import notebook_login
# notebook_login()

In [3]:
# Load the previously saved dataset from the "chunked_magnetics_ds" directory
# (relative path, so it is resolved against the current working directory).
chunked_magnetics_ds=load_from_disk("chunked_magnetics_ds")

In [4]:
# Display the loaded DatasetDict: its splits, column names and row counts.
chunked_magnetics_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1084394
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 123827
    })
})

In [5]:
class cfg:
    """Static configuration namespace for this masked-LM fine-tuning run.

    Values are read straight off the class (e.g. ``cfg.batch_size``); the
    class is not instantiated in the cells shown here.
    """

    # --- training hyper-parameters ---
    batch_size=32
    num_train_epochs = 3
    wwm_probability = 0.15   # per-word probability used by the whole-word-masking collator
    DEBUG=True               # True -> the DEBUG cell replaces the data with a small subset

    # --- run identity ---
    scheduler='linear'
    model_checkpoint="bert-base-cased"
    # Run name; reused for the log file, checkpoint files and the local/Hub repo directory.
    KERNEL_TYPE=f'{model_checkpoint}_model_{scheduler}_sched_{num_train_epochs}_epochs_{batch_size}_BS'

    # --- output locations ---
    LOG_DIR='./logs'
    MODEL_DIR='./weights'

    # --- resuming ---
    load_model = False       # True -> restore model/optimizer/scheduler state from `load_path`
    epoch_cont=0             # first epoch to run; overwritten from the checkpoint when resuming
    # Path of the "best" checkpoint written by the training loop.
    # (Previously written as `load_path = model_file = ...`, which also created
    # an unused `cfg.model_file` attribute by accident.)
    load_path = os.path.join(MODEL_DIR, f'{KERNEL_TYPE}_best.pth')
    

In [6]:
# Make sure the log and checkpoint directories exist. `exist_ok=True` keeps the
# cell safe to re-run and avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
for directory in (cfg.LOG_DIR, cfg.MODEL_DIR):
    os.makedirs(directory, exist_ok=True)

# Echo the run name so it is visible in the notebook output.
cfg.KERNEL_TYPE

'bert-base-cased_model_linear_sched_3_epochs_32_BS'

In [7]:
# Tokenizer matching the checkpoint named in cfg; the masking collator reads
# `tokenizer.mask_token_id` from it and it is saved next to the model later on.
from transformers import AutoTokenizer
tokenizer=AutoTokenizer.from_pretrained(cfg.model_checkpoint)

In [8]:
def whole_word_masking_data_collator(features):
    """Apply random whole-word masking to ``features`` and collate them.

    For every feature, the token positions are grouped by word (using the
    feature's ``word_ids``), each word is selected independently with
    probability ``cfg.wwm_probability``, and every token of a selected word is
    replaced by ``tokenizer.mask_token_id``. Labels keep the original label
    value at masked positions and are ``-100`` everywhere else.

    The input dicts are NOT modified: masking is applied to copies. The
    original implementation popped ``word_ids`` and overwrote ``input_ids`` in
    place, so calling it twice on the same samples raised ``KeyError`` and
    accumulated masks.

    Args:
        features: iterable of dicts holding at least ``word_ids``,
            ``input_ids`` and ``labels`` (assumed to be plain Python
            sequences, as returned by an unformatted dataset -- TODO confirm
            if a tensor/numpy format is ever set on the dataset).

    Returns:
        Whatever ``default_data_collator`` returns for the masked features
        (``word_ids`` is dropped before collation).

    Raises:
        KeyError: if a feature lacks ``word_ids``, ``input_ids`` or ``labels``.
    """
    masked_features=[]

    for feature in features:
        word_ids=feature["word_ids"]

        # Map a running word index -> token positions belonging to that word.
        # Positions whose word id is None are left out and never masked.
        mapping=collections.defaultdict(list)
        current_word_index=-1
        current_word=None

        for idx,word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id!=current_word:
                    current_word=word_id
                    current_word_index+=1
                mapping[current_word_index].append(idx)

        # One Bernoulli draw per word decides whether the whole word is masked.
        mask=np.random.binomial(1,cfg.wwm_probability,(len(mapping),))

        # Work on a copy so the caller's token ids stay untouched.
        input_ids=list(feature["input_ids"])
        labels=feature["labels"]
        new_labels=[-100]*len(labels)

        for word_index in np.where(mask)[0]:
            for idx in mapping[word_index.item()]:
                new_labels[idx]=labels[idx]
                input_ids[idx]=tokenizer.mask_token_id

        # Same keys, in the same order, as the original feature minus `word_ids`.
        masked_feature={key:value for key,value in feature.items() if key!="word_ids"}
        masked_feature["input_ids"]=input_ids
        masked_feature["labels"]=new_labels
        masked_features.append(masked_feature)

    return default_data_collator(masked_features)


In [9]:

# samples=[chunked_magnetics_ds['train'][i] for i in range(2)]
# batch=whole_word_masking_data_collator(samples)
# for chunk in batch["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

In [10]:
def insert_random_mask(batch):
    """Mask a column-oriented batch and return it under ``masked_*`` keys.

    ``batch`` maps column names to equally long sequences of values. It is
    turned into one dict per row, passed through
    ``whole_word_masking_data_collator``, and every entry of the collated
    result is converted with ``.numpy()`` and stored under
    ``"masked_" + <original key>``.
    """
    column_names = list(batch)
    rows = zip(*batch.values())
    features = [dict(zip(column_names, row)) for row in rows]

    masked_inputs = whole_word_masking_data_collator(features)

    # One new "masked_" column per entry of the collated batch.
    masked_columns = {}
    for name, values in masked_inputs.items():
        masked_columns["masked_" + name] = values.numpy()
    return masked_columns

In [11]:
# Debug mode: replace the full dataset with a small, seeded sample drawn from
# the original train split (500 train rows, 10% of that as test rows).
# NOTE(review): this rebinds `chunked_magnetics_ds`, so re-running the cell
# would split the already-reduced train split again -- presumably not what is
# wanted; restart the kernel (or reload the dataset) before re-running.
if cfg.DEBUG:
    train_size = 500
    test_size = int(0.1 * train_size)

    chunked_magnetics_ds = chunked_magnetics_ds["train"].train_test_split(train_size=train_size, test_size=test_size, seed=42)


In [12]:
# Display the dataset again to check the split sizes now in effect.
chunked_magnetics_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 50
    })
})

In [13]:
# Mask the test split once, up front, so that every evaluation pass in this
# session scores the same masked inputs. The original columns are dropped and
# only the "masked_*" columns produced by `insert_random_mask` remain.
# NOTE(review): no random seed is set in the cells shown, so the masks
# presumably differ between notebook runs -- confirm if comparability matters.
eval_dataset=chunked_magnetics_ds['test'].map(insert_random_mask,batched=True,num_proc=8,\
                                              remove_columns=chunked_magnetics_ds['test'].column_names)



Map (num_proc=8):   0%|          | 0/50 [00:00<?, ? examples/s]

In [14]:
# Display the masked evaluation dataset (column names and row count).
eval_dataset

Dataset({
    features: ['masked_input_ids', 'masked_token_type_ids', 'masked_attention_mask', 'masked_labels'],
    num_rows: 50
})

In [15]:
# Strip the "masked_" prefix so the columns carry the names the model's
# forward pass is called with (the batch is unpacked as keyword arguments).
# NOTE(review): rebinding `eval_dataset` means a second run of this cell will
# look for "masked_*" columns that no longer exist.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
        "masked_token_type_ids": "token_type_ids"
    }
)

In [16]:
from torch.utils.data import DataLoader


# Training batches go through the whole-word-masking collator, so a new random
# mask is drawn every time a batch is built; the order is shuffled.
train_dataloader=DataLoader(chunked_magnetics_ds['train'],
                            shuffle=True,
                            batch_size=cfg.batch_size,
                            collate_fn=whole_word_masking_data_collator)

# Evaluation rows were already masked when `eval_dataset` was built, so the
# default collator is used here and no further masking is applied.
eval_dataloader=DataLoader(eval_dataset,
                           batch_size=cfg.batch_size,
                           collate_fn=default_data_collator)




In [17]:
# Load the checkpoint named in cfg with a masked-language-modelling head.
from transformers import AutoModelForMaskedLM
model=AutoModelForMaskedLM.from_pretrained(cfg.model_checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['cls.predictions.decoder.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Optional resume: restore the model weights from the saved "best" checkpoint
# and continue with the epoch after the one stored in it.
if cfg.load_model:
    # map_location="cpu": the model has not been moved to a device yet, and
    # this lets a checkpoint written on a GPU machine load on a CPU-only one.
    checkpoint = torch.load(cfg.load_path, map_location="cpu")
    model.load_state_dict(checkpoint['model'])
    cfg.epoch_cont=checkpoint['epoch']+1

In [19]:
from torch.optim import AdamW

# AdamW over all model parameters with a fixed initial learning rate; the
# scheduler created afterwards adjusts it during training.
optimizer = AdamW(model.parameters(), lr=5e-5)

if cfg.load_model:
    # Restore the optimizer state saved with the checkpoint. map_location="cpu"
    # keeps the load independent of the device the checkpoint was written on.
    checkpoint = torch.load(cfg.load_path, map_location="cpu")
    optimizer.load_state_dict(checkpoint['optimizer'])


In [20]:
from transformers import get_scheduler

# One optimizer/scheduler step is taken per training batch, so the schedule
# spans (number of epochs) x (batches per epoch) steps.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = cfg.num_train_epochs * num_update_steps_per_epoch

# Use the scheduler type from cfg instead of a hard-coded "linear":
# `cfg.scheduler` is baked into the run name (cfg.KERNEL_TYPE), so the two
# must not be able to diverge.
lr_scheduler = get_scheduler(
    cfg.scheduler,
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

if cfg.load_model:
    # Restore the scheduler position saved with the checkpoint.
    checkpoint = torch.load(cfg.load_path, map_location="cpu")
    lr_scheduler.load_state_dict(checkpoint['lr_sched'])

In [21]:
import os 
import torch

# Uncomment to restrict which GPU is visible to this process.
# os.environ['CUDA_VISIBLE_DEVICES'] = '4'

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [22]:
from huggingface_hub import create_repo,get_full_repo_name,Repository

# Repository creation is kept commented out; the captured output of this cell
# shows the local directory is already a clone of the Hub repo.
# create_repo(f"nlp-magnets/{cfg.KERNEL_TYPE}",private=True,repo_type="model")

# Full repo name for this run under the "nlp-magnets" organization.
repo_name=get_full_repo_name(model_id=cfg.KERNEL_TYPE,organization="nlp-magnets")
# Bare expression: not displayed, because it is not the last statement of the cell.
repo_name
# The local clone lives in a directory named after the run; the training loop
# saves the model and tokenizer into that same directory before pushing.
output_dir=cfg.KERNEL_TYPE
# NOTE(review): `Repository` is, as far as I know, deprecated in recent
# huggingface_hub releases -- confirm against the installed version.
repo = Repository(output_dir, clone_from=repo_name)
repo.git_pull()

/home/huseyin.ucar/NLP_magnets/bert-base-cased_model_linear_sched_3_epochs_32_BS is already a clone of https://huggingface.co/nlp-magnets/bert-base-cased_model_linear_sched_3_epochs_32_BS. Make sure you pull the latest changes with `repo.git_pull()`.


In [23]:
from tqdm.auto import tqdm
import math


def _save_training_checkpoint(path, epoch):
    """Write model, optimizer and scheduler state plus the epoch index to ``path``."""
    torch.save(
        {
            'epoch': epoch,
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'lr_sched': lr_scheduler.state_dict(),
        },
        path,
    )


model.to(device)

# When resuming, the optimizer state was loaded while the parameters were
# still on the CPU; after `model.to(device)` it has to follow the parameters,
# otherwise the first `optimizer.step()` on a GPU fails with a device
# mismatch. On a fresh run the state is empty and this is a no-op. The "step"
# counter is left wherever `load_state_dict` placed it.
for param_state in optimizer.state.values():
    for key, value in param_state.items():
        if key != "step" and torch.is_tensor(value):
            param_state[key] = value.to(device)

# Start the bar at the number of steps already completed in a resumed run.
steps_already_done=min(cfg.epoch_cont*len(train_dataloader), num_training_steps)
progress_bar=tqdm(total=num_training_steps, initial=steps_already_done)

# NOTE(review): the best score is not stored in the checkpoint, so a resumed
# run starts from +inf and its first epoch always overwrites the "best" file.
best_score=float("inf")

# Last epoch that has actually finished. Defined before the loop so the final
# checkpoint below also works when no epoch runs (epoch_cont >= num_train_epochs);
# previously that case hit an undefined (or stale) `epoch`.
last_epoch=cfg.epoch_cont-1

for epoch in range(cfg.epoch_cont,cfg.num_train_epochs):
    # ---- training pass ----
    model.train()

    for batch in train_dataloader:
        batch={k:v.to(device) for k,v in batch.items()}
        outputs=model(**batch)
        loss=outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # ---- evaluation pass ----
    model.eval()
    losses=[]

    with torch.no_grad():
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs=model(**batch)
            # Repeat the batch loss once per (nominal) example so that the
            # truncation below weights a short final batch correctly.
            losses.append(outputs.loss.repeat(cfg.batch_size))

    losses=torch.cat(losses)
    losses=losses[:len(eval_dataset)]

    try:
        perplexity=math.exp(torch.mean(losses).item())
    except OverflowError:
        perplexity=float("inf")

    content=time.ctime()+' '+f'Epoch {epoch}, Perplexity: {perplexity}'
    print(content)

    with open(os.path.join(cfg.LOG_DIR, f'log_{cfg.KERNEL_TYPE}.txt'),'a') as appender:
        appender.write(content + '\n')

    # ---- keep the best model (lowest perplexity so far) ----
    model_file=os.path.join(cfg.MODEL_DIR, f'{cfg.KERNEL_TYPE}_best.pth')
    if best_score > perplexity:
        print('score ({:.5f} --> {:.5f}). Saving model ...'.format(best_score, perplexity))
        best_score = perplexity

        _save_training_checkpoint(model_file, epoch)

        model.save_pretrained(f"./{cfg.KERNEL_TYPE}")
        tokenizer.save_pretrained(f"./{cfg.KERNEL_TYPE}")
        repo.push_to_hub("best model commit")

        # model.push_to_hub(f"nlp-magnets/{cfg.KERNEL_TYPE}",private=True,commit_message="Best model BERT")

    last_epoch=epoch

# ---- final state ----
# NOTE(review): this saves into the same directory as the best model and
# pushes it, so the Hub repo ends up holding the last-epoch weights even if an
# earlier epoch scored better -- confirm this is intended.
model_file=os.path.join(cfg.MODEL_DIR, f'{cfg.KERNEL_TYPE}_last.pth')
_save_training_checkpoint(model_file, last_epoch)

model.save_pretrained(f"./{cfg.KERNEL_TYPE}",)
tokenizer.save_pretrained(f"./{cfg.KERNEL_TYPE}")
repo.push_to_hub("final model commit")



  0%|          | 0/48 [00:00<?, ?it/s]

Tue Nov  7 13:44:37 2023 Epoch 0, Perplexity: 76.52265798046578
score (inf --> 76.52266). Saving model ...


Upload file pytorch_model.bin:   0%|          | 1.00/413M [00:00<?, ?B/s]

To https://huggingface.co/nlp-magnets/bert-base-cased_model_linear_sched_3_epochs_32_BS
   f431a02..8e3f4a6  main -> main



KeyboardInterrupt: 