In [1]:
# import the necessary libraries
import argparse
import datetime
import time
import logging
import math
import os
import sys
import random
import datasets
import torch
from torch.optim import AdamW
from datasets import load_dataset, load_from_disk, concatenate_datasets
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed, broadcast_object_list
from transformers import (
    BertConfig,
    BertTokenizerFast,
    BertForPreTraining,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    SchedulerType,
    get_scheduler,
)
from transformers.utils.versions import require_version
from selectionstrategies import SubmodStrategy
from accelerate import InitProcessGroupKwargs
from selectionstrategies.helper_fns import taylor_softmax_v1
import numpy as np
import pickle
import faiss

# Set the TOKENIZERS_PARALLELISM environment flag for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Use the GPU when torch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
# Create (or fetch) the notebook-level logger and make it log at INFO level.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Formatter shared by the file handler.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# `logging.getLogger` hands back the same logger object for as long as the
# kernel lives, so unconditionally adding a FileHandler would attach one more
# handler every time this cell is re-run and each message would then be written
# to logfile.log once per re-run. Reuse the handler if it is already attached.
_log_path = os.path.abspath('logfile.log')
_attached = [
    handler for handler in logger.handlers
    if isinstance(handler, logging.FileHandler) and handler.baseFilename == _log_path
]
if _attached:
    file_handler = _attached[0]
else:
    file_handler = logging.FileHandler('logfile.log')
    logger.addHandler(file_handler)

# (Re)apply the formatter so edits to the format string take effect on re-run.
file_handler.setFormatter(formatter)

# Log a message
logger.info('Logger created')

In [3]:
# Variables: dataset, tokenizer and preprocessing settings used by the cells below.
dataset_name = "Salesforce/wikitext"
dataset_config_name = "wikitext-2-raw-v1"
# Divided by 100 and passed as `test_size` when a validation split has to be
# carved out of "train" (only done when the loaded dataset has no "validation"
# split).
# NOTE(review): as written, 80 sends 80% of "train" to validation — confirm this
# is intended and that it was not meant to be the training share.
validation_split_percentage = 80
model_config_name = "google-bert/bert-base-uncased"
tokenizer_name = "bert-base-uncased"
use_slow_tokenizer = False  # Bool
# Forwarded as `num_proc` to the datasets map/filter calls.
num_workers = None # (int)
# The next three values are forwarded to `group_texts` (as max_seq_length,
# short_seq_prob and nsp_probability).
max_seq_len = 128
short_seq_prob = 0.1
nsp_probability = 0.1
# Batch size of the batched `group_texts` map calls; the training and evaluation
# batch sizes are defined separately.
batch_size = 1000

In [4]:
# Get and Preprocess the dataset for the task.
# Load the dataset named in the configuration cell (instead of repeating the
# literals here) so that changing `dataset_name` / `dataset_config_name`
# actually changes what is loaded.
raw_datasets = load_dataset(dataset_name, dataset_config_name)

# If the dataset has no "validation" split, carve one out of "train" without
# shuffling; `validation_split_percentage` percent of the rows become validation.
if 'validation' not in raw_datasets.keys():
    raw_datasets=raw_datasets["train"].train_test_split(test_size=(validation_split_percentage/100), shuffle=False)
    raw_datasets=datasets.DatasetDict({"train": raw_datasets["train"], "validation": raw_datasets["test"]})

In [5]:
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [6]:
# # Uncomment to use custom config

# config = BertConfig(
#     vocab_size=vocab_size,
#     hidden_size=hidden_size,
#     num_hidden_layers=num_hidden_layers,
#     num_attention_heads=num_attention_heads,
#     intermediate_size=intermediate_size,
#     hidden_act="gelu",
#     hidden_dropout_prob=0.1,
#     attention_probs_dropout_prob=0.1,
#     max_position_embeddings=512,
#     type_vocab_size=2,
#     initializer_range=0.02,
#     layer_norm_eps=1e-12,
#     position_embedding_type="absolute",
# )

In [7]:
# Create an instance of the model along with its tokenizer

# Tokenizer
prefer_fast = not use_slow_tokenizer
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name, use_fast=prefer_fast)

# Load the configuration, then build the model from that configuration
# (the model itself is constructed from `config`, not via `from_pretrained`).
config = BertConfig.from_pretrained(model_config_name)
model = BertForPreTraining(config)

# Resize the token embeddings to the tokenizer's length
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [8]:
# Tokenize the text column of every split.

column_names = raw_datasets['train'].column_names
# Use the column called "text" when present, otherwise the first column.
if "text" in column_names:
    text_column_name = "text"
else:
    text_column_name = column_names[0]


def tokenize_function(examples):
    """Run the tokenizer over the text column of a batch of examples."""
    texts = examples[text_column_name]
    return tokenizer(texts)


# The original columns are dropped; only the tokenizer outputs are kept.
tokenize_map_kwargs = dict(
    batched=True,
    num_proc=num_workers,
    remove_columns=column_names,
    desc="Running tokenizer on every text in dataset",
)
tokenized_dataset = raw_datasets.map(tokenize_function, **tokenize_map_kwargs)

# Grouping the data
from experiment_utils import group_texts


def _group_split(split_name, split_label):
    """Apply `group_texts` to one tokenized split with the shared settings.

    Both splits previously had their own copy of this call; keeping a single
    definition guarantees that train and validation are preprocessed with the
    same arguments.

    Args:
        split_name: key of the split in `tokenized_dataset`; also forwarded to
            `group_texts` as its `split` argument.
        split_label: human-readable name used in the progress-bar description.

    Returns:
        The dataset returned by `Dataset.map` for that split.
    """
    return tokenized_dataset[split_name].map(
        group_texts,
        fn_kwargs={
            'split': split_name,
            'tokenizer': tokenizer,
            'max_seq_length': max_seq_len,
            'short_seq_prob': short_seq_prob,
            'nsp_probability': nsp_probability,
            'tokenized_datasets': tokenized_dataset,
        },
        batched=True,
        batch_size=batch_size,
        num_proc=num_workers,
        with_indices=True,
        desc=f"Grouping {split_label} texts into chucks of {max_seq_len}",
    )


train_dataset = _group_split("train", "Train")
eval_dataset = _group_split("validation", "Validation")

In [9]:
eval_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'next_sentence_label'],
    num_rows: 1196
})

In [10]:
tokenizer.decode(eval_dataset[9]['input_ids'])

'[CLS] lobster pots, although lines baited with octopus or cuttlefish sometimes succeed in tempting them out, to allow them to be caught in a net or by hand. in 2008, 4 @, @ 386 t of h. gammarus were caught across europe and north africa, of which 3 @, @ 462 t ( 79 % ) was caught in the british isles ( including the channel islands ). the minimum landing size for h. gammarus is a carapace length of 87 mm ( 3 @. @ 4 [SEP] aquaculture systems for h. gammarus are under development, and production rates are still very low. [SEP]'

In [11]:
# Bundle the grouped splits into one DatasetDict and take its training split
# as the working dataset.
grouped_splits = {"train": train_dataset, "validation": eval_dataset}
prepared_data = datasets.DatasetDict(grouped_splits)
dataset = prepared_data["train"]

def extract_first_sentences(examples):
    """Truncate every example of a batch right after its first separator token.

    For each row, the position of the first `tokenizer.sep_token_id` in
    `input_ids` is located and `input_ids`, `attention_mask`, `token_type_ids`
    and `special_tokens_mask` are cut so that they end on (and include) that
    position. The batch is modified in place and the same object is returned.
    """
    truncated_columns = ("input_ids", "attention_mask", "token_type_ids", "special_tokens_mask")
    for row, token_ids in enumerate(examples["input_ids"]):
        # Slice end: one past the first separator, so the separator is kept.
        cut = token_ids.index(tokenizer.sep_token_id) + 1
        for column in truncated_columns:
            examples[column][row] = examples[column][row][:cut]
    return examples

# Split the working dataset by the value of its next-sentence label (0 vs. 1).
def _has_nsp_label(wanted):
    """Build a batched filter predicate that keeps rows whose label equals `wanted`."""
    def predicate(examples):
        return [label == wanted for label in examples["next_sentence_label"]]
    return predicate


_filter_options = dict(batched=True, num_proc=num_workers, keep_in_memory=True)
nsp_zero = dataset.filter(_has_nsp_label(0), **_filter_options)
nsp_one = dataset.filter(_has_nsp_label(1), **_filter_options)

# Cut every example of both subsets down to its first sentence and drop the
# label / special-token-mask columns from the result.
_first_sentence_options = dict(
    batched=True,
    num_proc=num_workers,
    remove_columns=["next_sentence_label", "special_tokens_mask"],
    keep_in_memory=True,
)
first_sent_nsp_zero = nsp_zero.map(extract_first_sentences, **_first_sentence_options)
first_sent_nsp_one = nsp_one.map(extract_first_sentences, **_first_sentence_options)

Filter:   0%|          | 0/12198 [00:00<?, ? examples/s]

Filter:   0%|          | 0/12198 [00:00<?, ? examples/s]

Map:   0%|          | 0/6467 [00:00<?, ? examples/s]

Map:   0%|          | 0/5731 [00:00<?, ? examples/s]

In [12]:
tokenizer.decode(first_sent_nsp_one[0]['input_ids'])

'[CLS] of movement limited by their action gauge. up to nine characters can be assigned to a single mission. during gameplay, characters will call out if something happens to them, such as their health points ( hp ) getting low or being knocked out by enemy attacks. each character has specific " potentials ", skills unique to each character. they are divided into " personal potential ", which are innate skills that remain unaltered unless otherwise dictated by the story and can either help or impede a character, and " battle potentials ", which are grown throughout the game and always grant boons to a character. to learn battle [SEP]'

In [13]:
subset_fraction = 0.25

# Draw an initial random subset of the training set:
# `subset_fraction` of its rows, sampled without replacement.
full_dataset = train_dataset
num_samples = int(round(len(train_dataset) * subset_fraction, 0))
all_train_indices = list(range(len(train_dataset)))
init_subset_indices = [random.sample(all_train_indices, num_samples)]
subset_dataset = full_dataset.select(init_subset_indices[0])

In [14]:
first_sent_nsp_zero

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6467
})

In [15]:
train_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'next_sentence_label'],
    num_rows: 12198
})

In [16]:
subset_dataset

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'next_sentence_label'],
    num_rows: 3050
})

In [17]:
mlm_probability = 0.15
train_batch_size = 4
eval_batch_size = 4

# Collators: one for masked-language-model batches, one that only pads
# (used when computing embeddings of the first sentences).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=mlm_probability)
data_collator_embd = DataCollatorWithPadding(tokenizer=tokenizer)


def _mlm_loader(split, **loader_kwargs):
    """DataLoader over `split` minus its 'special_tokens_mask' column, using `data_collator`."""
    return DataLoader(
        split.remove_columns(['special_tokens_mask']),
        collate_fn=data_collator,
        **loader_kwargs,
    )


def _embedding_loader(split):
    """Unshuffled DataLoader over `split` using the padding-only collator."""
    return DataLoader(split, shuffle=False, collate_fn=data_collator_embd, batch_size=eval_batch_size)


# warmstart dataloader (train on all the train dataset during warmup)
warmstart_dataloader = _mlm_loader(train_dataset, shuffle=True, batch_size=train_batch_size)

# first-sentence dataloaders, one per NSP class
first_sent_nsp_zero_dataloader = _embedding_loader(first_sent_nsp_zero)
first_sent_nsp_one_dataloader = _embedding_loader(first_sent_nsp_one)

# subset dataloader (train)
subset_dataloader = _mlm_loader(subset_dataset, shuffle=True, batch_size=train_batch_size)

# eval dataloader (validation & testing)
eval_dataloader = _mlm_loader(eval_dataset, batch_size=eval_batch_size)

In [18]:
learning_rate = 1e-4
scheduler_name = 'linear'
num_warmup_steps = 10
# Number of optimisation steps run by the main training loop after the warm start.
max_train_steps = 10
# The same scheduler is stepped during the warm start AND during the main
# training loop, so its horizon has to cover both phases. With the horizon equal
# to `num_warmup_steps`, the linear schedule hits a learning rate of 0 as soon
# as the warm-up ends and every later step trains with lr = 0.
num_training_steps = num_warmup_steps + max_train_steps

# Optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Learning-rate scheduler: warm-up for `num_warmup_steps`, then the
# `scheduler_name` schedule until `num_training_steps`.
lr_scheduler = get_scheduler(
    name=scheduler_name,
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

In [19]:
# Pull one collated batch from the evaluation dataloader and display the shape
# of its input ids (despite its name, `warmer` iterates over `eval_dataloader`).
warmer = iter(eval_dataloader)
X= next(warmer)
X['input_ids'].shape

torch.Size([4, 128])

In [20]:
warmstart_epochs = 1
completed_steps = 0

model.to(device)
# Warmstart the model: train on the warmstart data for up to `warmstart_epochs`
# epochs, capped at `num_warmup_steps` optimisation steps in total, and print
# the evaluation perplexity after each epoch.
for epoch in range(warmstart_epochs):
    # The step budget is shared by all epochs. Without this check, every epoch
    # that starts after the budget is spent would still run one more training
    # step before the inner `break` fires.
    if completed_steps >= num_warmup_steps:
        break
    if epoch==0:
        print("Begining warmstart")
    model.train() # Setup the model for training
    for step, batch in enumerate(warmstart_dataloader):
        outputs = model(**batch.to(device))
        loss = outputs.loss
        print(f"Completed Steps: {1+completed_steps}; Loss: {loss.detach().float()}; lr: {lr_scheduler.get_last_lr()};")
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
        if completed_steps >= num_warmup_steps:
            break

    # Evaluation pass: collect one scalar loss per evaluation batch.
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch.to(device))

        loss = outputs.loss
        losses.append(loss.view(1))  # Add the loss as a 1-dimensional tensor

    # Perplexity is reported as exp(mean evaluation loss); infinite when there
    # are no evaluation batches or the exponential overflows.
    if losses:
        losses = torch.cat(losses)
        losses = losses[:len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses).item())
        except OverflowError:
            perplexity = float("inf")
    else:
        perplexity = float("inf")

    print(f"Epoch {epoch + 1}: Perplexity: {perplexity}")

# Plot both training & perplexity


Begining warmstart
Completed Steps: 1; Loss: 11.128046035766602; lr: [0.0];
Completed Steps: 2; Loss: 11.283427238464355; lr: [1e-05];
Completed Steps: 3; Loss: 11.376888275146484; lr: [2e-05];
Completed Steps: 4; Loss: 11.207054138183594; lr: [3e-05];
Completed Steps: 5; Loss: 11.090845108032227; lr: [4e-05];
Completed Steps: 6; Loss: 10.360278129577637; lr: [5e-05];
Completed Steps: 7; Loss: 10.226296424865723; lr: [6e-05];
Completed Steps: 8; Loss: 10.5401611328125; lr: [7e-05];
Completed Steps: 9; Loss: 9.90046215057373; lr: [8e-05];
Completed Steps: 10; Loss: 10.229503631591797; lr: [9e-05];
Epoch 1: Perplexity: 47207.50299634538


In [21]:
selection_strategy = 'fl'
num_partitions = 2000 # Default is 5000
partition_strategy = 'random'
submod_optimizer = 'LazyGreedy'

# Build the subset selection strategy from the settings above.
submod_settings = dict(
    num_partitions=num_partitions,
    partition_strategy=partition_strategy,
    optimizer=submod_optimizer,
    similarity_criterion='feature',
    metric='cosine',
    eta=1,
    stopIfZeroGain=False,
    stopIfNegativeGain=False,
    verbose=False,
    lambdaVal=1,
)
subset_strategy = SubmodStrategy(logger, selection_strategy, **submod_settings)

In [22]:
selection_strategy = 'fl'
layer_for_similarity_computation = 9
temperature = 0.5
seed = 23
parallel_processes = 3
# num_samples has already been defined when creating subset.
probs_nsp_zero=[]
greedyList_nsp_zero=[]
gains_nsp_zero=[]


# Begin subset selection for first_sent_nsp_zero
if selection_strategy == 'Random-Online':
    # Uniform random baseline: the budget is split between the two NSP classes
    # (floor for class 0, ceil for class 1, so the halves add up to num_samples).
    subset_indices_nsp_zero = [random.sample(list(range(len(first_sent_nsp_zero))), math.floor(num_samples/2))]
    subset_indices_nsp_one = [random.sample(list(range(len(first_sent_nsp_one))), math.ceil(num_samples/2))]
elif selection_strategy in ['fl', 'logdet', 'gc', 'disparity-sum']:
    model.eval() # Representations are computed with the model in evaluation mode
    representations_nsp_zero=[]
    batch_indices_nsp_zero=[]
    total_cnt=0
    total_storage=0
    print("Performing Subset selection for NSP class 0")
    for step, batch in enumerate(first_sent_nsp_zero_dataloader):
        with torch.no_grad():
            output = model(**batch.to(device), output_hidden_states=True)
        # Hidden states of the layer chosen for the similarity computation.
        embeddings=output["hidden_states"][layer_for_similarity_computation]
        # Mean-pool over the sequence dimension, counting only positions that
        # are attended to AND whose token type id is 0.
        mask=(batch['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float())
        mask1=((batch['token_type_ids'].unsqueeze(-1).expand(embeddings.size()).float())==0)
        mask=mask*mask1
        # clamp avoids a division by zero when no position survives the mask.
        mean_pooled=torch.sum(embeddings*mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
        total_cnt += mean_pooled.size(0)
        mean_pooled = mean_pooled.cpu()
        total_storage += sys.getsizeof(mean_pooled.storage())
        representations_nsp_zero.append(mean_pooled)

    # This prints the number of collected batches (list length before concatenation).
    print(f"Final number of representations: {len(representations_nsp_zero)}")
    representations_nsp_zero=torch.cat(representations_nsp_zero, dim=0)
    representations_nsp_zero=representations_nsp_zero[:len(first_sent_nsp_zero)]
    total_storage += sys.getsizeof(representations_nsp_zero.storage())
    representations_nsp_zero=representations_nsp_zero.numpy()
    print('Representations(NSP Class 0) Size: {}, Total number of samples: {}'.format(total_storage/(1024 * 1024), total_cnt))
    batch_indices_nsp_zero=list(range(len(first_sent_nsp_zero)))
    print('Length of indices: {}'.format(len(batch_indices_nsp_zero)))
    print('Representations(NSP Class 0) gathered. Shape of representations: {}. Length of indices: {}'.format(representations_nsp_zero.shape, len(batch_indices_nsp_zero)))

    partition_indices_nsp_zero, greedyIdx_nsp_zero, gains_nsp_zero = subset_strategy.select(len(batch_indices_nsp_zero)-1,
                                                                                            batch_indices_nsp_zero, representations_nsp_zero,
                                                                                            parallel_processes=parallel_processes, return_gains=True)
    subset_indices_nsp_zero = [[]]
    # `greedyIdx_nsp_zero` is consumed as one flat list: cut it into one list per
    # partition, using the length of each partition's gains as the slice width.
    i=0
    for p in gains_nsp_zero:
        greedyList_nsp_zero.append(greedyIdx_nsp_zero[i:i+len(p)])
        i+=len(p)
    # Per partition: temperature-scaled gains -> sampling probabilities.
    probs_nsp_zero=[taylor_softmax_v1(torch.from_numpy(np.array([partition_gains])/temperature)).numpy()[0] for partition_gains in gains_nsp_zero]
    print(f"Taylor Softmax Prop: {probs_nsp_zero}")
    rng=np.random.default_rng(seed+completed_steps)
    for i, partition_prob in enumerate(probs_nsp_zero):
        print(f"{i}: Partition probablity :{partition_prob}")
        # Skip empty partitions: their budget would come out as min(0, -1) = -1,
        # which is not a valid sample size (the NSP class 1 cell has the same guard).
        if len(partition_prob) > 0:
            # Share of the class-0 budget proportional to the partition size,
            # capped at one less than the partition size.
            partition_budget=min(math.ceil((len(partition_prob)/len(batch_indices_nsp_zero)) * math.floor(num_samples/2)), len(partition_prob)-1)
            subset_indices_nsp_zero[0].extend(rng.choice(greedyList_nsp_zero[i], size=partition_budget, replace=False, p=partition_prob).tolist())
else:
    # Fail here with a clear message instead of a NameError on the select below.
    raise ValueError(f"Unsupported selection strategy: {selection_strategy!r}")

nsp_zero_subset_dataset=nsp_zero.select(subset_indices_nsp_zero[0])

# Using the list and the selection strategy, get the indices and the gains of each data point in the list.

Performing Subset selection for NSP class 0
Final number of representations: 1617
Representations(NSP Class 0) Size: 37.966644287109375, Total number of samples: 6467
Length of indices: 6467
Representations(NSP Class 0) gathered. Shape of representations: (6467, 768). Length of indices: 6467


100%|██████████| 1617/1617 [00:00<00:00, 2569.41it/s]21 of  of 3]3]eration 32 of  of [33||||||]]              ]33% [Iteration 1 of ] of 3]]


Taylor Softmax Prop: [array([0.94346924, 0.03044052, 0.02609024]), array([0.95101907, 0.02511758, 0.02386335]), array([0.9495491, 0.0262492, 0.0242017]), array([0.94943522, 0.02630682, 0.02425796]), array([0.95280096, 0.02360637, 0.02359267]), array([0.94933376, 0.02650001, 0.02416623]), array([0.93694263, 0.03555475, 0.02750263]), array([0.9371136 , 0.03748967, 0.02539673]), array([0.93789851, 0.03603871, 0.02606279]), array([0.94511686, 0.0289472 , 0.02593594]), array([0.94320037, 0.03040485, 0.02639478]), array([0.94492755, 0.02946654, 0.02560591]), array([0.9459327 , 0.02976568, 0.02430162]), array([0.95088796, 0.02517933, 0.02393271]), array([0.94320009, 0.03030286, 0.02649706]), array([0.94469522, 0.02870705, 0.02659773]), array([0.95061802, 0.02536846, 0.02401353]), array([0.95072329, 0.02511726, 0.02415946]), array([0.95028571, 0.02524622, 0.02446807]), array([0.94499368, 0.02900847, 0.02599785]), array([0.95047257, 0.02546029, 0.02406714]), array([0.94999651, 0.02594529, 0.024

In [23]:
gains_nsp_one=[]
probs_nsp_one=[]
greedyList_nsp_one=[]

# Subset selection for first_sent_nsp_one. For 'Random-Online' the class-1
# indices are already drawn together with the class-0 ones, so only the
# submodular strategies are handled here.
if selection_strategy in ['fl', 'logdet', 'gc', 'disparity-sum']:
    model.eval() # Representations are computed with the model in evaluation mode
    representations_nsp_one=[]
    batch_indices_nsp_one=[]
    total_cnt=0
    total_storage=0
    print("Performing Subset selection for NSP class 1")
    for step, batch in enumerate(first_sent_nsp_one_dataloader):
        with torch.no_grad():
            output = model(**batch.to(device), output_hidden_states=True)
        # Hidden states of the layer chosen for the similarity computation.
        embeddings=output["hidden_states"][layer_for_similarity_computation]
        # Mean-pool over the sequence dimension, counting only positions that
        # are attended to AND whose token type id is 0.
        mask=(batch['attention_mask'].unsqueeze(-1).expand(embeddings.size()).float())
        mask1=((batch['token_type_ids'].unsqueeze(-1).expand(embeddings.size()).float())==0)
        mask=mask*mask1
        # clamp avoids a division by zero when no position survives the mask.
        mean_pooled=torch.sum(embeddings*mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
        total_cnt += mean_pooled.size(0)
        mean_pooled = mean_pooled.cpu()
        total_storage += sys.getsizeof(mean_pooled.storage())
        representations_nsp_one.append(mean_pooled)

    # This prints the number of collected batches (list length before concatenation).
    print(f"Final number of representations: {len(representations_nsp_one)}")
    representations_nsp_one=torch.cat(representations_nsp_one, dim=0)
    representations_nsp_one=representations_nsp_one[:len(first_sent_nsp_one)]
    total_storage += sys.getsizeof(representations_nsp_one.storage())
    representations_nsp_one=representations_nsp_one.numpy()
    print('Representations(NSP Class 1) Size: {}, Total number of samples: {}'.format(total_storage/(1024 * 1024), total_cnt))
    batch_indices_nsp_one=list(range(len(first_sent_nsp_one)))
    print('Length of indices: {}'.format(len(batch_indices_nsp_one)))
    # This summary is about class 1; the message used to say "NSP Class 0".
    print('Representations(NSP Class 1) gathered. Shape of representations: {}. Length of indices: {}'.format(representations_nsp_one.shape, len(batch_indices_nsp_one)))

    partition_indices_nsp_one, greedyIdx_nsp_one, gains_nsp_one = subset_strategy.select(len(batch_indices_nsp_one)-1,
                                                                                            batch_indices_nsp_one, representations_nsp_one,
                                                                                            parallel_processes=parallel_processes, return_gains=True)
    subset_indices_nsp_one = [[]]
    # `greedyIdx_nsp_one` is consumed as one flat list: cut it into one list per
    # partition, using the length of each partition's gains as the slice width.
    i=0
    for p in gains_nsp_one:
        greedyList_nsp_one.append(greedyIdx_nsp_one[i:i+len(p)])
        i+=len(p)
    # Per partition: temperature-scaled gains -> sampling probabilities.
    probs_nsp_one=[taylor_softmax_v1(torch.from_numpy(np.array([partition_gains])/temperature)).numpy()[0] for partition_gains in gains_nsp_one]
    print(f"Taylor Softmax Prop: {probs_nsp_one}")
    rng=np.random.default_rng(seed+completed_steps)
    for i, partition_prob in enumerate(probs_nsp_one):
        print(f"{i}: Partition probablity :{partition_prob}")
        if len(partition_prob) > 0:
            # Class 1 receives the ceil half of the budget (class 0 the floor
            # half), matching the 'Random-Online' split and the re-sampling done
            # in the main training loop, so the halves add up to num_samples.
            partition_budget=min(math.ceil((len(partition_prob)/len(batch_indices_nsp_one)) * math.ceil(num_samples/2)), len(partition_prob)-1)
            print(f"Partition Budget: {partition_budget}")
            subset_indices_nsp_one[0].extend(rng.choice(greedyList_nsp_one[i], size=partition_budget, replace=False, p=partition_prob).tolist())

nsp_one_subset_dataset=nsp_one.select(subset_indices_nsp_one[0])

Performing Subset selection for NSP class 1
Final number of representations: 1433
Representations(NSP Class 1) Size: 33.645721435546875, Total number of samples: 5731
Length of indices: 5731
Representations(NSP Class 0) gathered. Shape of representations: (5731, 768). Length of indices: 5731


100%|██████████| 1911/1911 [00:00<00:00, 2416.51it/s]ration 250 of % [Iteration 12 of ]2]]2]2][||||||||||          ]50% [Iteration 1 of 2]2]2]]50% [Iteration 1 of 2]


Taylor Softmax Prop: [array([0.96128612, 0.03871388]), array([0.96126387, 0.03873613]), array([0.96117934, 0.03882066]), array([0.95856987, 0.04143013]), array([0.961275, 0.038725]), array([0.9500566, 0.0499434]), array([0.96134168, 0.03865832]), array([0.96127466, 0.03872534]), array([0.96131223, 0.03868777]), array([0.96137092, 0.03862908]), array([0.96126284, 0.03873716]), array([0.96125198, 0.03874802]), array([0.95670663, 0.04329337]), array([0.96093512, 0.03906488]), array([0.96121006, 0.03878994]), array([0.95017604, 0.04982396]), array([0.96126152, 0.03873848]), array([0.96131212, 0.03868788]), array([0.96122397, 0.03877603]), array([0.96122421, 0.03877579]), array([0.95841607, 0.04158393]), array([0.96129051, 0.03870949]), array([0.9563422, 0.0436578]), array([0.96135444, 0.03864556]), array([0.95034771, 0.04965229]), array([0.9598686, 0.0401314]), array([0.96113815, 0.03886185]), array([0.95025494, 0.04974506]), array([0.96127327, 0.03872673]), array([0.95777306, 0.04222694])

In [24]:
subset_indices_nsp_one

[[833,
  351,
  2458,
  3303,
  5063,
  5271,
  1272,
  3533,
  1076,
  4304,
  2392,
  3462,
  3334,
  479,
  4037,
  3613,
  2851,
  1485,
  5369,
  389,
  691,
  826,
  3113,
  4276,
  2009,
  1126,
  4584,
  1026,
  828,
  3472,
  5368,
  3036,
  7,
  3417,
  2907,
  3645,
  280,
  1706,
  2990,
  2994,
  3793,
  2517,
  861,
  613,
  864,
  3075,
  5313,
  3649,
  924,
  132,
  4284,
  5473,
  3801,
  1426,
  1138,
  5077,
  4069,
  5334,
  2756,
  3160,
  346,
  425,
  4370,
  5520,
  4031,
  1403,
  944,
  3283,
  5445,
  2575,
  1057,
  751,
  4003,
  707,
  3222,
  2791,
  4054,
  1352,
  2227,
  4420,
  1629,
  76,
  1584,
  4004,
  5102,
  515,
  1555,
  28,
  5702,
  1271,
  555,
  1102,
  4474,
  221,
  2540,
  5459,
  4194,
  3859,
  2495,
  1392,
  3402,
  2232,
  3410,
  3658,
  2010,
  1952,
  849,
  4588,
  482,
  1696,
  5431,
  622,
  1971,
  1232,
  5595,
  928,
  1041,
  652,
  3870,
  2931,
  5382,
  5168,
  3491,
  388,
  4024,
  1,
  4544,
  1535,
  302,
  4435

In [25]:
partition_budget

1

In [26]:
greedyList_nsp_zero

[[81, 5841, 2306],
 [3506, 3030, 1276],
 [41, 2879, 1297],
 [2270, 6018, 4651],
 [1025, 3028, 5777],
 [4344, 6465, 1936],
 [2923, 2889, 3475],
 [2577, 6319, 1761],
 [6463, 2206, 258],
 [1627, 1057, 5899],
 [4424, 3356, 5228],
 [4027, 2056, 57],
 [5832, 5492, 2816],
 [4892, 5097, 6011],
 [4358, 6446, 2297],
 [5702, 3867, 2707],
 [3722, 1940, 4318],
 [4207, 4851, 526],
 [5939, 1610, 4228],
 [3449, 6402, 6152],
 [1848, 494, 5116],
 [5644, 373, 2811],
 [1170, 5139, 5022],
 [408, 6316, 876],
 [3353, 679, 1921],
 [2485, 1266, 1752],
 [4373, 591, 4309],
 [4663, 6309, 2953],
 [1168, 1974, 2172],
 [4911, 1199, 4552],
 [3500, 4445, 4179],
 [3675, 4706, 1525],
 [6102, 5810, 6362],
 [1047, 1778, 3467],
 [3804, 3451, 1211],
 [4054, 5593, 2373],
 [3638, 503, 5246],
 [1910, 5464, 5085],
 [2698, 4629, 1341],
 [999, 374, 5079],
 [5136, 2866, 4549],
 [3060, 4339, 396],
 [614, 5656, 4479],
 [886, 1971, 2050],
 [1668, 1961, 4842],
 [110, 5786, 5189],
 [4752, 2774, 4839],
 [2114, 2677, 3873],
 [2151, 3473,

In [27]:
# Merge the two per-class selections into a single dataset called subset_dataset.
selected_parts = [nsp_zero_subset_dataset, nsp_one_subset_dataset]
subset_dataset = concatenate_datasets(selected_parts)

# Rebuild the training dataloader over the merged subset
# (without the 'special_tokens_mask' column, shuffled, MLM collator).
subset_mlm_columns = subset_dataset.remove_columns(['special_tokens_mask'])
subset_dataloader = DataLoader(
    subset_mlm_columns,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=train_batch_size,
)

In [28]:
subset_dataset.remove_columns(['special_tokens_mask'])

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label'],
    num_rows: 3527
})

In [29]:
max_train_steps = 10
completed_steps = 0 # I needed to reset this!
select_every = 5 # This is needed to breaking the training and set the subset selection back in motion.
# Cumulative wall-clock seconds spent on training steps / on re-sampling.
# Initialised once, outside the loops, so the totals are not wiped on every step.
train_time=0
subset_time=0
print("Begin the main training loop with importance re-sampling, after warm-start")
# NOTE: only the step counter is reset above; `optimizer` and `lr_scheduler` are
# the same objects that were stepped during the warm start.
while completed_steps<max_train_steps:
    model.train()
    select_subset=False
    for step, batch in enumerate(subset_dataloader):
        start_time=time.time()
        outputs=model(**batch.to(device))
        loss=outputs.loss
        logger.info(f"Completed Steps: {1+completed_steps}; Loss: {loss.detach().float()}; lr: {lr_scheduler.get_last_lr()};")
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps+=1

        train_time += (time.time() - start_time)

        if completed_steps>=max_train_steps:
            break

        # Every `select_every` steps: leave the batch loop to draw a new subset.
        if (completed_steps)%select_every==0:
            select_subset=True
            break

    if select_subset:
        start_time = time.time()
        num_samples = int(round(len(full_dataset) * subset_fraction, 0))
        if selection_strategy == 'Random-Online':
            subset_indices_nsp_zero = [random.sample(list(range(len(first_sent_nsp_zero))), math.floor(num_samples/2))]
            subset_indices_nsp_one = [random.sample(list(range(len(first_sent_nsp_one))), math.ceil(num_samples/2))]

        # Same strategy names as the initial selection cells ('disparity-sum';
        # this list used to say 'disparity-min', so a 'disparity-sum' run would
        # silently never re-sample).
        elif selection_strategy in ["fl", "logdet", "gc", "disparity-sum"]:
            # Re-sample from the per-partition probabilities computed during the
            # initial selection, with a generator seeded by the current step.
            print(f"Performing Subset selection for NSP class 0")
            sampling_start_time=time.time()

            subset_indices_nsp_zero=[[]]
            rng=np.random.default_rng(seed+completed_steps)
            for i, partition_prob in enumerate(probs_nsp_zero):
                # Skip empty partitions (budget would be -1), as done for class 1.
                if len(partition_prob) > 0:
                    partition_budget=min(math.ceil((len(partition_prob)/len(batch_indices_nsp_zero)) * math.floor(num_samples/2)), len(partition_prob)-1)
                    subset_indices_nsp_zero[0].extend(rng.choice(greedyList_nsp_zero[i], size=partition_budget, replace=False, p=partition_prob).tolist())

            print("Sampling time(NSP Class 0): {}".format(time.time()-sampling_start_time))

            # Printed (not only logged) so both classes report progress the same way.
            print(f"Performing Subset selection for NSP class 1")
            sampling_start_time=time.time()

            subset_indices_nsp_one=[[]]
            rng=np.random.default_rng(seed+completed_steps)
            for i, partition_prob in enumerate(probs_nsp_one):
                if len(partition_prob) > 0:
                    partition_budget=min(math.ceil((len(partition_prob)/len(batch_indices_nsp_one)) * math.ceil(num_samples/2)), len(partition_prob)-1)
                    subset_indices_nsp_one[0].extend(rng.choice(greedyList_nsp_one[i], size=partition_budget, replace=False, p=partition_prob).tolist())

            print("Sampling time(NSP Class 1): {}".format(time.time()-sampling_start_time))

        nsp_zero_subset_dataset=nsp_zero.select(subset_indices_nsp_zero[0])
        nsp_one_subset_dataset=nsp_one.select(subset_indices_nsp_one[0])

        # Concatenate the two datasets
        subset_dataset = concatenate_datasets([nsp_zero_subset_dataset, nsp_one_subset_dataset])
        subset_dataloader=DataLoader(
            subset_dataset.remove_columns(['special_tokens_mask']), shuffle=True, collate_fn=data_collator, batch_size=train_batch_size)

        select_subset=False # Setting the up for training after re-sampling
        subset_time += (time.time() - start_time)

        print(f"Subset selection Completed")

    # Evaluation pass after every training segment: one scalar loss per batch.
    model.eval()
    losses=[]
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs=model(**batch.to(device))

        loss=outputs.loss
        losses.append(loss.view(1))

    losses=torch.cat(losses)
    losses=losses[:len(eval_dataset)]
    try:
        # `.item()` hands math.exp a plain Python float, as in the warm-start loop.
        perplexity=math.exp(torch.mean(losses).item())
    except OverflowError:
        perplexity=float("inf")

    print(f"Steps {completed_steps}: perplexity: {perplexity}")

# NOTE(review): the message below announces a save, but no save call is made in this cell.
print(f"Saving the final model after {completed_steps} steps.")
print(f"Training completed successfully!")

Begin the main training loop with importance re-sampling, after warm-start
Performing Subset selection for NSP class 0
Sampling time(NSP Class 0): 0.056417226791381836
Sampling time(NSP Class 1): 0.06260108947753906
Subset selection Completed
Steps 5: perplexity: 46889.875655296644
Steps 10: perplexity: 46801.19504473503
Saving the final model after 10 steps.
Training completed successfully!


In [30]:
# Train with importance re-sampling

# Train on the entire dataset once

# Sample using the indices and gains

# Train the model on the sampled dataset

# Evaluate the model 

# Save the model 

In [31]:
# Personal Addition: Inference on the model.
