In [None]:
# import the necessary libraries
import argparse
import datetime
import time
import logging
import math
import os
import sys
import random
import datasets
import torch
from torch.optim import AdamW
from datasets import load_dataset, load_from_disk, concatenate_datasets
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import transformers
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed, broadcast_object_list
from transformers import (
    BertConfig,
    BertTokenizerFast,
    BertForPreTraining,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    SchedulerType,
    get_scheduler,
)
from transformers.utils.versions import require_version
from selectionstrategies import SubmodStrategy
from accelerate import InitProcessGroupKwargs
from selectionstrategies.helper_fns import taylor_softmax_v1
import numpy as np
import pickle
import faiss

In [None]:
# Notebook configuration: dataset, model/tokenizer and preprocessing settings.

# Dataset name and configuration handed to `load_dataset`.
dataset_name, dataset_config_name = "wikitext", "wikitext-2-raw-v1"

# Percentage used as `test_size` when a validation split must be carved out of "train".
# NOTE(review): 80 holds out 80% of the training data for validation -- confirm this is intended.
validation_split_percentage = 80

# Model configuration name (None = not specified) and tokenizer checkpoint.
model_config_name, tokenizer_name = None, "bert-base-uncased"
use_slow_tokenizer = False  # bool

# Forwarded as `num_proc` to the tokenization `map` call.
num_workers = None  # int or None

In [None]:
# Load the raw dataset and make sure it exposes a "validation" split.
raw_datasets = load_dataset(dataset_name, dataset_config_name)

has_validation_split = "validation" in raw_datasets.keys()
if not has_validation_split:
    # No validation split is provided: hold out a fraction of "train" without shuffling.
    holdout_fraction = validation_split_percentage / 100
    train_holdout = raw_datasets["train"].train_test_split(
        test_size=holdout_fraction,
        shuffle=False,
    )
    raw_datasets = datasets.DatasetDict(
        {"train": train_holdout["train"], "validation": train_holdout["test"]}
    )

In [None]:
# Create an instance of the model along with its tokenizer

# Model configuration: load the named configuration when one is given; otherwise
# fall back to the library-default BertConfig, because `from_pretrained` needs a
# real name/path and `model_config_name` may be None.
if model_config_name:
    config = BertConfig.from_pretrained(model_config_name)
else:
    config = BertConfig()

# Tokenizer
# NOTE(review): `use_fast` is an AutoTokenizer option; BertTokenizerFast is already
# the fast implementation, so this flag presumably has no effect here -- confirm.
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name, use_fast=not use_slow_tokenizer)

# Instantiate the model from the configuration. The class is `BertForPreTraining`
# (capital T) and its constructor takes a config object, not a checkpoint name.
# NOTE(review): this builds a model from `config` only; if pretrained weights were
# intended instead, use `BertForPreTraining.from_pretrained(...)` -- confirm.
model = BertForPreTraining(config)

# Resize the token embeddings to match the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Tokenize the raw text of every split

# Column holding the raw text: prefer "text", otherwise fall back to the first column.
column_names = raw_datasets["train"].column_names
text_column_name = "text" if "text" in column_names else column_names[0]


def tokenize_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: Batch mapping passed by `map(batched=True)`; the entry under
            `text_column_name` is handed to the tokenizer.

    Returns:
        The tokenizer output for the batch, with the special-tokens mask
        requested via `return_special_tokens_mask=True`.
    """
    # The keyword is `return_special_tokens_mask` (plural "tokens").
    return tokenizer(examples[text_column_name], return_special_tokens_mask=True)


# Apply the tokenizer to every split of `raw_datasets`, dropping the original columns.
tokenized_dataset = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=num_workers,
    remove_columns=column_names,
    desc="Running tokenizer on every text in dataset",
)

In [None]:
# Create data collators and dataloaders

# warmstart dataloader

# first sent nsp zero dataloader

# first sent nsp one dataloader

# subset dataloader (train)

# eval dataloader (validation & testing)

In [None]:
# Initialize the model and training instance

# Optimizer

# learning-rate scheduler

In [None]:
# Warmstart the model: Train the model with the warmstart data for warmstart epochs

# Plot both training & perplexity

In [None]:
# Define subset selection strategies

In [None]:
# Begin subset selection 

# Choose a selection strategy

# Unwrap the model and set it in evaluation mode.

# Go through the model specific dataset

# Get the embedding and save them in a list.

# Using the list and the selection strategy, get the indices and the gains of each data point in the list.

In [None]:
# Put all the data into a dataset called subset_dataset

# add the data to the subset dataloader

In [None]:
# Train with importance re-sampling

# Train on the entire dataset once

# Sample using the indices and gains

# Train the model on the sampled dataset

# Evaluate the model 

# Save the model 

In [None]:
# Personal Addition: Inference on the model.