In [1]:
# import the necessary libraries
import argparse
import datetime
import time
import logging
import math
import os
import sys
import random
import datasets
import torch
from torch.optim import AdamW
from datasets import load_dataset, load_from_disk, concatenate_datasets
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed, broadcast_object_list
from transformers import (
    BertConfig,
    BertTokenizerFast,
    BertForPreTraining,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    SchedulerType,
    get_scheduler,
)
from transformers.utils.versions import require_version
from selectionstrategies import SubmodStrategy
from accelerate import InitProcessGroupKwargs
from selectionstrategies.helper_fns import taylor_softmax_v1
import numpy as np
import pickle
import faiss

  from .autonotebook import tqdm as notebook_tqdm
2024-07-31 16:48:41.615966: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-31 16:48:41.755495: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-31 16:48:41.828297: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-31 16:48:41.828452: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-31 16:48:41.

In [2]:
import logging

# Module-level logger for this notebook.
logger = logging.getLogger(__name__)

# Record INFO and above.
logger.setLevel(logging.INFO)

# Every record is prefixed with timestamp, logger name and level.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# logging.getLogger returns the same object for the same name, so re-running this
# cell used to attach one more FileHandler each time and every message ended up
# in logfile.log once per run of the cell. Reuse the handler that already targets
# the file and only create one when none is attached yet.
_log_path = os.path.abspath('logfile.log')
file_handler = next(
    (
        handler
        for handler in logger.handlers
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler('logfile.log')
    logger.addHandler(file_handler)

# (Re)apply the formatter so edits to the format string take effect on re-run.
file_handler.setFormatter(formatter)

# Log a message
logger.info('Logger created')

In [3]:
# Variables
# Dataset to load.
dataset_name = "Salesforce/wikitext"
dataset_config_name = "wikitext-2-raw-v1"
# NOTE(review): this is used as the test_size of the fallback train/validation
# split, so 80 would hold out 80% of train for validation — confirm this is intended.
validation_split_percentage = 80
# Model configuration and tokenizer checkpoints.
model_config_name = "google-bert/bert-base-uncased"
tokenizer_name = "bert-base-uncased"
use_slow_tokenizer = False  # Bool
# Passed as num_proc to the dataset map/filter calls (None = no multiprocessing).
num_workers = None # (int)
# Arguments forwarded to group_texts.
max_seq_len = 128
short_seq_prob = 0.1
nsp_probability = 0.1
# Batch size of the batched grouping map (not the training batch size).
batch_size = 1000

# Seed every RNG the notebook draws from (python `random`, torch, numpy) so that
# the random subset, DataLoader shuffling and MLM masking are reproducible after
# Restart Kernel -> Run All.
seed = 42
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

In [4]:
# Get and preprocess the dataset for the task.
# Load by the names from the configuration cell; they were previously repeated
# here as string literals, so editing the configuration had no effect.
raw_datasets = load_dataset(dataset_name, dataset_config_name)

# Datasets without a validation split get one taken from train, without
# shuffling. A separate name is used for the intermediate split so that
# `raw_datasets` is only ever bound to the final DatasetDict.
if 'validation' not in raw_datasets:
    train_val_split = raw_datasets["train"].train_test_split(
        test_size=(validation_split_percentage / 100), shuffle=False
    )
    raw_datasets = datasets.DatasetDict(
        {"train": train_val_split["train"], "validation": train_val_split["test"]}
    )

In [5]:
# Show the available splits with their columns and row counts.
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [6]:
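# Uncomment to use a custom config. vocab_size, hidden_size, num_hidden_layers,
# num_attention_heads and intermediate_size are not defined in the cells above
# and must be set first.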

# config = BertConfig(
#     vocab_size=vocab_size,
#     hidden_size=hidden_size,
#     num_hidden_layers=num_hidden_layers,
#     num_attention_heads=num_attention_heads,
#     intermediate_size=intermediate_size,
#     hidden_act="gelu",
#     hidden_dropout_prob=0.1,
#     attention_probs_dropout_prob=0.1,
#     max_position_embeddings=512,
#     type_vocab_size=2,
#     initializer_range=0.02,
#     layer_norm_eps=1e-12,
#     position_embedding_type="absolute",
# )

In [7]:
# Create an instance of the model along with its tokenizer

# Tokenizer. `use_fast` is only interpreted by AutoTokenizer; passing it to
# BertTokenizerFast.from_pretrained always produced the fast tokenizer, so the
# `use_slow_tokenizer` switch had no effect. With the current setting
# (use_slow_tokenizer = False) this still resolves to the fast BERT tokenizer.
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name, use_fast=not use_slow_tokenizer)

# Load the model configuration (this does not load any weights)
config = BertConfig.from_pretrained(model_config_name)

# Instantiate the model from the configuration only
model = BertForPreTraining(config)

# Resize the token embeddings to fit the tokenizer
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [8]:
# Tokenize every split of the raw dataset.

column_names = raw_datasets['train'].column_names
# Use the "text" column when there is one, otherwise the first column.
if "text" in column_names:
    text_column_name = "text"
else:
    text_column_name = column_names[0]


def tokenize_function(examples):
    """Run the tokenizer over the text column of a batch of examples."""
    texts = examples[text_column_name]
    return tokenizer(texts)


# The original columns are removed, leaving only the tokenizer outputs.
tokenized_dataset = raw_datasets.map(
    tokenize_function,
    desc="Running tokenizer on every text in dataset",
    remove_columns=column_names,
    num_proc=num_workers,
    batched=True,
)

# Grouping the data
from experiment_utils import group_texts


def _group_split(split_name, label):
    """Apply group_texts to one tokenized split.

    `split_name` selects the split and is forwarded to group_texts as `split`;
    `label` is only used in the progress-bar description.
    """
    grouping_kwargs = {
        'split': split_name,
        'tokenizer': tokenizer,
        'max_seq_length': max_seq_len,
        'short_seq_prob': short_seq_prob,
        'nsp_probability': nsp_probability,
        'tokenized_datasets': tokenized_dataset,
    }
    return tokenized_dataset[split_name].map(
        group_texts,
        fn_kwargs=grouping_kwargs,
        batched=True,
        batch_size=batch_size,
        num_proc=num_workers,
        with_indices=True,
        desc=f"Grouping {label} texts into chucks of {max_seq_len}",
    )


train_dataset = _group_split('train', 'Train')
eval_dataset = _group_split('validation', 'Validation')

In [9]:
# Summary of the grouped validation split (columns and row count).
eval_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'next_sentence_label'],
    num_rows: 1195
})

In [10]:
# Decode one grouped validation example back to text to eyeball the result.
tokenizer.decode(eval_dataset[9]['input_ids'])

'[CLS] pots, although lines baited with octopus or cuttlefish sometimes succeed in tempting them out, to allow them to be caught in a net or by hand. in 2008, 4 @, @ 386 t of h. gammarus were caught across europe and north africa, of which 3 @, @ 462 t ( 79 % ) was caught in the british isles ( including the channel islands ). the minimum landing size for h. gammarus is a carapace length of 87 mm ( 3 @. @ 4 in [SEP] aquaculture systems for h. gammarus are under development, and production rates are still very low. [SEP]'

In [11]:
# Prepare the data
# Bundle the grouped splits; `dataset` is the training split that the rest of
# this cell filters and truncates.
prepared_data = datasets.DatasetDict({"train": train_dataset, "validation": eval_dataset})
dataset=prepared_data['train']

def extract_first_sentences(examples):
    """Cut every example of a batch down to its first segment, in place.

    Each sequence is truncated right after the first occurrence of the
    tokenizer's separator id (the separator itself is kept), and the same cut
    is applied to the parallel mask / type-id columns.

    Returns the same `examples` object that was passed in. Raises ValueError
    when a sequence contains no separator token.
    """
    sep_id = tokenizer.sep_token_id
    truncated_columns = ("input_ids", "attention_mask", "token_type_ids", "special_tokens_mask")
    for row, token_ids in enumerate(examples["input_ids"]):
        end = token_ids.index(sep_id) + 1
        for column in truncated_columns:
            examples[column][row] = examples[column][row][:end]
    return examples

# Split the training examples by the value of their next-sentence label (0 or 1),
# then truncate every example in each group to its first segment.
def _select_label(label):
    """Rows of `dataset` whose next_sentence_label equals `label`."""
    def has_label(examples):
        return [value == label for value in examples["next_sentence_label"]]

    return dataset.filter(has_label, batched=True, num_proc=num_workers, keep_in_memory=True)


def _first_segments(labelled_rows):
    """Apply extract_first_sentences and drop the label and special-tokens-mask columns."""
    return labelled_rows.map(
        extract_first_sentences,
        batched=True,
        num_proc=num_workers,
        remove_columns=["next_sentence_label", "special_tokens_mask"],
        keep_in_memory=True,
    )


nsp_zero = _select_label(0)
nsp_one = _select_label(1)

# Extract the first sentences from both datasets
first_sent_nsp_zero = _first_segments(nsp_zero)
first_sent_nsp_one = _first_segments(nsp_one)

Filter: 100%|██████████| 11789/11789 [00:00<00:00, 532296.83 examples/s]
Filter: 100%|██████████| 11789/11789 [00:00<00:00, 1277394.14 examples/s]
Map: 100%|██████████| 6555/6555 [00:00<00:00, 6778.78 examples/s]
Map: 100%|██████████| 5234/5234 [00:00<00:00, 6378.93 examples/s]


In [12]:
# Decode the first truncated example; it should end at its first [SEP].
tokenizer.decode(first_sent_nsp_one[0]['input_ids'])

"[CLS] 2010, carrying over a large portion of the work done on valkyria chronicles ii. while it retained the standard features of the series, it also underwent multiple adjustments, such as making the game more forgiving for series newcomers. character designer raita honjou and composer hitoshi sakimoto both returned from previous entries, along with valkyria chronicles ii director takeshi ozawa. a large team of writers handled the script. the game's opening theme was sung [SEP]"

In [13]:
subset_fraction = 0.25

# Draw the initial training subset: a uniform random sample, without
# replacement, of `subset_fraction` of the training rows.
full_dataset = train_dataset
num_samples = round(len(train_dataset) * subset_fraction)
# Kept as a list of index lists; only the first entry is used here.
init_subset_indices = [random.sample(range(len(train_dataset)), num_samples)]
subset_dataset = full_dataset.select(init_subset_indices[0])

In [14]:
# Summary of the truncated first segments for next-sentence label 0.
first_sent_nsp_zero

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6555
})

In [15]:
# Summary of the full grouped training split.
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'next_sentence_label'],
    num_rows: 11789
})

In [16]:
# Summary of the randomly sampled training subset.
subset_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'next_sentence_label'],
    num_rows: 2947
})

In [17]:
mlm_probability = 0.15
train_batch_size = 4
eval_batch_size = 4

# Collators: one that applies masked-language-model masking (training and
# evaluation batches) and one that only pads (first-sentence batches).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_probability)
data_collator_embd = DataCollatorWithPadding(tokenizer=tokenizer)


def _mlm_loader(split, per_batch, shuffle=False):
    """DataLoader over `split` without 'special_tokens_mask', collated by the MLM collator."""
    return DataLoader(
        split.remove_columns(['special_tokens_mask']),
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=per_batch,
    )


def _padded_loader(split):
    """Unshuffled DataLoader over `split`, collated by the padding-only collator."""
    return DataLoader(split, shuffle=False, collate_fn=data_collator_embd, batch_size=eval_batch_size)


# warmstart dataloader (train on all the train dataset during warmup)
warmstart_dataloader = _mlm_loader(train_dataset, train_batch_size, shuffle=True)

# first-sentence dataloaders, one per next-sentence label
first_sent_nsp_zero_dataloader = _padded_loader(first_sent_nsp_zero)
first_sent_nsp_one_dataloader = _padded_loader(first_sent_nsp_one)

# subset dataloader (train)
subset_dataloader = _mlm_loader(subset_dataset, train_batch_size, shuffle=True)

# eval dataloader (validation & testing)
eval_dataloader = _mlm_loader(eval_dataset, eval_batch_size)

In [20]:
# Optimisation hyper-parameters.
learning_rate = 1e-4
scheduler_name = 'linear'
# NOTE(review): warm-up and total steps are equal, so the schedule would appear
# to spend the whole run warming up — confirm this is intended.
num_warmup_steps = 100
num_training_steps = 100

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning-rate scheduler driving that optimizer.
lr_scheduler = get_scheduler(
    scheduler_name,
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [23]:
# Pull one collated batch from the eval dataloader and show the shape of its
# input_ids tensor as a sanity check.
warmer = iter(eval_dataloader)
X= next(warmer)
X['input_ids'].shape

torch.Size([4, 128])

In [25]:
warmstart_epochs = 1
completed_steps = 0

# Warmstart the model: train on the warmstart data for `warmstart_epochs` epochs
# (stopping the pass once `num_warmup_steps` optimizer steps have been taken),
# then evaluate on the validation data after each epoch.
for epoch in range(warmstart_epochs):
    if epoch==0:
        print("Begining warmstart")
    model.train() # Setup the model for training
    for step, batch in enumerate(warmstart_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        print(f"Completed Steps: {1+completed_steps}; Loss: {loss.detach().float()}; lr: {lr_scheduler.get_last_lr()};")
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
        if completed_steps >= num_warmup_steps:
            break

    model.eval()
    losses=[]
    for step, batch in enumerate(eval_dataloader):
        with torch.inference_mode(): # Setup the model for inference (evaluation)
            outputs=model(**batch)
            # outputs.loss is a zero-dimensional tensor and torch.cat rejects
            # zero-dimensional inputs, so the concatenation below used to raise.
            # Repeat the batch loss once per example: this gives a 1-D tensor
            # and weights a smaller final batch by its actual size.
            loss=outputs.loss.repeat(batch["input_ids"].shape[0])

        losses.append(loss)

    # One entry per evaluation example, so no truncation is needed afterwards.
    losses=torch.cat(losses)
    try:
        perplexity=math.exp(torch.mean(losses))
    except OverflowError:
        perplexity=float("inf")

# Plot both training & perplexity


Begining warmstart
Completed Steps: 1; Loss: 7.953183650970459; lr: [0.0];
Completed Steps: 2; Loss: 7.613645553588867; lr: [0.0];
Completed Steps: 3; Loss: 8.01107406616211; lr: [0.0];


KeyboardInterrupt: 

In [None]:
# Name of the subset-selection strategy to use.
selection_strategy = 'fl'
# NOTE(review): the SubmodStrategy setup below is still commented out. If it is
# re-enabled, `num_partitions` is assigned a DataLoader here, which looks
# unintended — confirm the expected value first.
# num_partitions = first_sent_nsp_one_dataloader
# partition_strategy = None
# submod_optimizer = 'LazyGreedy'

# # Define subset selection strategies
# if selection_strategy in ['fl', 'logdet', 'gc', 'disparity-sum']:
#     subset_strategy = SubmodStrategy(logger, selection_strategy,
#         num_partitions=num_partitions, partition_strategy=partition_strategy,
#         optimizer=submod_optimizer, similarity_criterion='feature', 
#         metric='cosine', eta=1, stopIfZeroGain=False, 
#         stopIfNegativeGain=False, verbose=False, lambdaVal=1)

In [None]:
selection_strategy = 'fl'
# num_samples has already been defined when creating subset.

# Result accumulators, one per next-sentence label group.
probs_nsp_zero, probs_nsp_one = [], []
greedyList_nsp_zero, greedyList_nsp_one = [], []
gains_nsp_zero, gains_nsp_one = [], []

# Begin subset selection
if selection_strategy == 'Random-Online':
    # Split the sample budget between the two groups: floor for label 0 and
    # ceil for label 1, so the two halves add up to num_samples.
    subset_indices_nsp_zero = [
        random.sample(range(len(first_sent_nsp_zero)), math.floor(num_samples/2))
    ]
    subset_indices_nsp_one = [
        random.sample(range(len(first_sent_nsp_one)), math.ceil(num_samples/2))
    ]
elif selection_strategy in ('fl', 'logdet', 'gc', 'disparity-sum'):
    # Choose a selection strategy
    model.eval() # Set the model in evaluation mode
    representations_nsp_zero = []
    batch_indices_nsp_zero = []
# Remaining steps (not implemented yet):
# Unwrap the model and set it in evaluation mode.

# Go through the model specific dataset

# Get the embedding and save them in a list.

# Using the list and the selection strategy, get the indices and the gains of each data point in the list.

In [None]:
# Put all the data into a dataset called subset_dataset

# add the data to the subset dataloader

In [None]:
# Train with importance re-sampling

# Train on the entire dataset once

# Sample using the indices and gains

# Train the model on the sampled dataset

# Evaluate the model 

# Save the model 

In [None]:
# Personal Addition: Inference on the model.