In [None]:
import sys
import math
import datasets
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, GPT2Config, DataCollatorForLanguageModeling, DataCollatorWithPadding, get_scheduler, SchedulerType
from itertools import chain
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from selectionstrategies.helper_fns import taylor_softmax_v1
from selectionstrategies import SubmodStrategy
import random
import logging
import numpy as np

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# Notebook-wide logger that appends progress messages to `logfile.log`.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Every record is prefixed with a timestamp, the logger name and the severity.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# `logging.getLogger` returns the same logger object each time this cell runs,
# so unconditionally calling `addHandler` would attach one more file handler
# per re-run and every message would be written several times. Reuse the file
# handler that is already attached, and only create one when none exists.
file_handler = next(
    (handler for handler in logger.handlers if isinstance(handler, logging.FileHandler)),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler('logfile.log')
    logger.addHandler(file_handler)
file_handler.setFormatter(formatter)

# Log a first message so the file shows when the logger was set up.
logger.info('Logger created')

##### DATA PREPROCESSING

In [None]:
# Fetch the `stas/openwebtext-10k` dataset through `datasets.load_dataset`.
ds = load_dataset(path="stas/openwebtext-10k")

In [None]:
# Build the train / validation splits and tokenize both of them.

# Hold out 5% of the rows; shuffle=False keeps the rows in their stored order.
dataset = ds["train"].train_test_split(test_size=0.05, shuffle=False)
# `train_test_split` names the held-out part "test"; re-expose it as "validation".
dataset = datasets.DatasetDict({"train": dataset["train"], "validation": dataset["test"]})

column_names = dataset["train"].column_names
# Tokenize the "text" column when present, otherwise fall back to the first column.
if "text" in column_names:
    text_column_name = "text"
else:
    text_column_name = column_names[0]

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def tokenize_function(examples):
    """Run the GPT-2 tokenizer over the text column of a batch of examples."""
    texts = examples[text_column_name]
    return tokenizer(texts)


tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=column_names,  # drop the raw columns, keep the tokenizer output
    num_proc=None,
    load_from_cache_file=True,
    desc="Running tokenizer on dataset",
)

In [None]:
# Show the tokenized DatasetDict as the cell output.
tokenized_datasets

In [None]:
# Regroup tokenized examples into fixed-length blocks.

block_size = 1024  # Length of every grouped block; set to the model context size


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-cut it into blocks.

    Every column in `examples` (a mapping from column name to a list of token
    lists) is flattened into one long list and sliced into consecutive pieces
    of `block_size` items. When the batch holds at least `block_size` tokens,
    the trailing remainder that does not fill a block is dropped; a shorter
    batch is returned as one short block. A "labels" column is added as a
    shallow copy of the resulting "input_ids" column.
    """
    # Flatten each column into a single token stream.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain(*examples[column]))

    # The first column's length stands in for the length of every column.
    first_column = list(examples.keys())[0]
    usable_length = len(flattened[first_column])
    if usable_length >= block_size:
        # Trim to a whole number of blocks (drops the small remainder).
        usable_length -= usable_length % block_size

    block_starts = range(0, usable_length, block_size)
    result = {}
    for column, tokens in flattened.items():
        result[column] = [tokens[start : start + block_size] for start in block_starts]

    result["labels"] = result["input_ids"].copy()
    return result

# Apply `group_texts` batch-wise to every split of the tokenized dataset.
# NOTE(review): num_proc=96 is hard-coded; confirm it suits the machine this runs on.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    load_from_cache_file=True,
    num_proc=96,
    desc=f"Grouping texts in chunks of {block_size}",
)

In [None]:
# Pull the two grouped splits out of the DatasetDict and display both summaries.
train_dataset, eval_dataset = (lm_datasets[split] for split in ('train', 'validation'))
train_dataset, eval_dataset

In [None]:
# Token count of the first grouped training example (shown as the cell output).
len(train_dataset[0]['input_ids'])

##### MODEL SETUP

In [None]:
# Build a GPT-2 language-model head from a default GPT2Config. The model is
# constructed from the configuration only; no pretrained checkpoint is loaded.
model = GPT2LMHeadModel(GPT2Config())

# Keep a handle on the configuration object held by the model.
configuration = model.config

In [None]:
subset_fraction = 0.25

# Number of training rows that make up the initial subset.
num_samples = round(len(train_dataset) * subset_fraction)

# Draw that many distinct row indices uniformly at random and materialise the
# matching subset. The index list is kept wrapped in an outer list.
# NOTE(review): the `random` module is not seeded in the visible code, so this
# draw differs from run to run.
full_dataset = train_dataset
init_subset_indices = [random.sample(range(len(full_dataset)), num_samples)]
subset_dataset = full_dataset.select(init_subset_indices[0])

In [None]:
# Data loaders for warmstart, full-dataset passes, training and evaluation.
batch_size = 1

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Each loader shares the collator and batch size; only the dataset and the
# shuffle flag vary.
# NOTE(review): `subset_dataset` is not used by any loader in this cell --
# `train_dataloader` iterates the whole `train_dataset`; confirm that is intended.
warmstart_dataloader, full_dataloader, train_dataloader, eval_dataloader = (
    DataLoader(split, shuffle=shuffled, collate_fn=data_collator, batch_size=batch_size)
    for split, shuffled in (
        (train_dataset, True),
        (full_dataset, False),
        (train_dataset, True),
        (eval_dataset, False),
    )
)

In [None]:
# Pull a single batch from the warmstart loader and show the shape of its
# `input_ids` tensor as the cell output.
warmer = iter(warmstart_dataloader)
X= next(warmer)
X['input_ids'].shape

##### Setting up the model for training

In [None]:
# Hyperparameters for the optimizer and the learning-rate schedule.
# NOTE(review): num_training_steps equals num_warmup_steps, so a linear
# schedule has no decay phase left once warmup ends -- confirm this is intended
# if the same scheduler is used after the warmstart.
learning_rate = 1e-4
scheduler_name = 'linear'
num_warmup_steps = 10
num_training_steps = 10

# AdamW over every parameter of the model.
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

# Learning-rate scheduler wrapped around that optimizer.
lr_scheduler = get_scheduler(
    scheduler_name,
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

##### Warmstart the model with the entire dataset

In [None]:
warmstart_epochs = 1
completed_steps = 0

model.to(device)
# Warmstart: run optimizer steps on batches from `warmstart_dataloader` until
# `num_warmup_steps` steps are done or `warmstart_epochs` epochs have passed,
# whichever comes first.
for epoch in range(warmstart_epochs):
    if epoch == 0:
        print("Beginning Warmstart")
    model.train()  # Put the model in training mode
    for step, batch in enumerate(warmstart_dataloader):
        outputs = model(**batch.to(device))
        loss = outputs.loss
        print(f"Completed Steps: {1+completed_steps}; Loss: {loss.detach().float()}; lr: {lr_scheduler.get_last_lr()};")
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
        if completed_steps >= num_warmup_steps:
            break
    # The `break` above only leaves the batch loop. Without this second check,
    # every remaining epoch would still run one extra step beyond the budget
    # whenever `warmstart_epochs` is greater than 1.
    if completed_steps >= num_warmup_steps:
        break

In [None]:
# Shell escape: run `nvidia-smi` and show its report as the cell output.
!nvidia-smi

##### Subset selection

In [None]:
# Settings handed to the submodular subset-selection strategy.
selection_strategy = 'fl'
num_partitions = 2000 # Default is 5000
partition_strategy = 'random'
submod_optimizer = 'LazyGreedy'

# Strategy object that performs the subset selection; it reports through the
# notebook logger.
subset_strategy = SubmodStrategy(
    logger,
    selection_strategy,
    num_partitions=num_partitions,
    partition_strategy=partition_strategy,
    optimizer=submod_optimizer,
    similarity_criterion='feature',
    metric='cosine',
    eta=1,
    stopIfZeroGain=False,
    stopIfNegativeGain=False,
    verbose=False,
    lambdaVal=1,
)

In [None]:
# Subset selection: embed every training block with the current model, run the
# submodular selection strategy over those embeddings, then sample the new
# subset indices partition by partition into `init_subset_indices[0]`.
selection_strategy = 'fl'
layer_for_similarity_computation = 9  # index into the model's `hidden_states` tuple
temperature = 0.5  # divides the gains before the Taylor softmax
seed = 23
parallel_processes = 3
# num_samples has already been defined when creating subset.
probs_nsp_zero=[]
greedyList_nsp_zero=[]
gains_nsp_zero=[]


# Begin subset selection. (The `nsp_zero` suffix on the names below is kept
# as-is; nothing in this cell distinguishes more than one class.)
if selection_strategy == 'Random-Online':
    # Uniform random draw of floor(num_samples/2) row indices.
    # NOTE(review): this branch fills `subset_indices_nsp_zero` but leaves
    # `init_subset_indices` untouched, unlike the branch below -- confirm.
    subset_indices_nsp_zero = [random.sample(list(range(len(train_dataset))), math.floor(num_samples/2))]
elif selection_strategy in ['fl', 'logdet', 'gc', 'disparity-sum']:
    # Embedding-based selection with the configured `subset_strategy`.
    model.eval() # Set the model in evaluation mode
    representations=[]
    total_cnt=0
    total_storage=0
    print("Performing Subset selection for entire dataset")
    for step, batch in enumerate(full_dataloader):
        # Forward pass without gradients, asking for all hidden states.
        with torch.no_grad():
            output = model(**batch.to(device), output_hidden_states=True)
        embeddings=output["hidden_states"][layer_for_similarity_computation]
        attention_mask=batch['attention_mask']
        total_cnt+=embeddings.size(0)
        embeddings=embeddings.cpu()
        attention_mask=attention_mask.cpu()
        # Append a zero column so every row contains a 0, take the position of
        # the first minimum, and step back by one: the last position whose mask
        # is 1 (assumes each mask is a run of 1s followed by 0s -- TODO confirm).
        last_token_indices=torch.cat((attention_mask, torch.zeros((embeddings.shape[0],1))), dim=1).argmin(axis=1)-1
        # Keep one vector per sample: the hidden state at that last position.
        embeddings=torch.cat([embeddings[i][last_token_indices[i]].reshape((1,-1)) for i in range(embeddings.shape[0])], dim=0)
        # Diagnostic only: running total of the reported storage sizes.
        total_storage+=sys.getsizeof(embeddings.storage())
        representations.append(embeddings)
        
    # NOTE(review): `len(representations)` counts per-batch tensors; it matches
    # the number of samples only while `batch_size` is 1.
    print(f"Final number of representations: {len(representations)}")
    representations=torch.cat(representations, dim=0)
    representations_nsp_zero=representations[:len(full_dataset)]
    total_storage += sys.getsizeof(representations_nsp_zero.storage())
    representations_nsp_zero=representations_nsp_zero.numpy()
    print('Representations(NSP Class 0) Size: {}, Total number of samples: {}'.format(total_storage/(1024 * 1024), total_cnt))
    batch_indices=list(range(len(full_dataset)))
    print('Length of indices: {}'.format(len(batch_indices)))
    print('Representations(NSP Class 0) gathered. Shape of representations: {}. Length of indices: {}'.format(representations_nsp_zero.shape, len(batch_indices)))

    # Run the selection with a budget of all-but-one sample and ask for gains.
    # NOTE(review): the torch tensor `representations` is passed here, while the
    # numpy copy `representations_nsp_zero` is only used in the prints above --
    # confirm which one `SubmodStrategy.select` expects.
    partition_indices_nsp_zero, greedyIdx_nsp_zero, gains_nsp_zero = subset_strategy.select(len(batch_indices)-1, 
                                                                                            batch_indices, representations, 
                                                                                            parallel_processes=parallel_processes, return_gains=True)
    init_subset_indices = [[]]
    # Cut the flat index list into consecutive slices whose lengths follow the
    # per-partition gain lists (assumes `greedyIdx_nsp_zero` is ordered
    # partition by partition, matching `gains_nsp_zero` -- TODO confirm).
    i=0
    for p in gains_nsp_zero:
        greedyList_nsp_zero.append(greedyIdx_nsp_zero[i:i+len(p)])         
        i+=len(p)
    # Turn each partition's gains (divided by `temperature`) into sampling
    # probabilities with `taylor_softmax_v1`.
    probs_nsp_zero=[taylor_softmax_v1(torch.from_numpy(np.array([partition_gains])/temperature)).numpy()[0] for partition_gains in gains_nsp_zero]
    print(f"Taylor Softmax Prop: {probs_nsp_zero}")
    # The generator seed shifts with the number of completed training steps.
    rng=np.random.default_rng(seed+completed_steps)
    for i, partition_prob in enumerate(probs_nsp_zero):
        print(f"{i}: Partition probablity :{partition_prob}")
        # Budget proportional to the partition's share of the data, capped at
        # one less than the partition size.
        # NOTE(review): the total budget is floor(num_samples/2), i.e. half of
        # `num_samples` -- confirm the halving is intended here.
        partition_budget=min(math.ceil((len(partition_prob)/len(batch_indices)) * math.floor(num_samples/2)), len(partition_prob)-1)
        # Sample without replacement from this partition's indices, weighted by
        # the probabilities above, and add the picks to the new subset.
        init_subset_indices[0].extend(rng.choice(greedyList_nsp_zero[i], size=partition_budget, replace=False, p=partition_prob).tolist())