In [3]:
# Import the necessary libraries (standard library, third-party, then project-local)
import argparse
import datetime
import logging
import math
import os
import pickle
import random
import sys
import time

import datasets
import torch
import transformers
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.utils import set_seed, broadcast_object_list
from datasets import load_dataset, load_from_disk, concatenate_datasets
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    BertConfig,
    BertTokenizerFast,
    BertForPreTraining,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    SchedulerType,
    get_scheduler,
)

from experiment_utils import group_texts

# from transformers.utils.versions import require_version
# from selectionstrategies import SubmodStrategy
# from accelerate import InitProcessGroupKwargs
# from selectionstrategies.helper_fns import taylor_softmax_v1
# import numpy as np
# import faiss

In [4]:
# Configuration: every value a reader might tune, read by the cells below.

# Dataset identifier and configuration name for the corpus.
dataset_name = "Salesforce/wikitext"
dataset_config_name = "wikitext-2-raw-v1"
# Percentage of the train split held out as validation when the loaded dataset
# has no "validation" split (used as test_size = value / 100).
# NOTE(review): 80 holds out 80% of the training rows -- confirm this is intended.
validation_split_percentage = 80
# Name passed to BertConfig.from_pretrained.
model_config_name = "google-bert/bert-base-uncased"
# Name passed to the tokenizer's from_pretrained.
tokenizer_name = "bert-base-uncased"
# Negated and passed as use_fast when loading the tokenizer.
use_slow_tokenizer = False  # bool
# Passed as num_proc to the Dataset.map calls.
num_workers = None # int or None
# The next three values are forwarded unchanged to group_texts via fn_kwargs.
# NOTE(review): still None placeholders -- presumably they must be set before the
# grouping cell is run; confirm how group_texts treats None.
max_seq_len = None
short_seq_prob = None
nsp_probability = None
# Passed as batch_size to the grouping Dataset.map calls.
batch_size = None

In [5]:
# Get and preprocess the dataset for the task.
# The dataset is selected through dataset_name / dataset_config_name from the
# configuration cell, so changing them there changes what is loaded here.
raw_datasets = load_dataset(dataset_name, dataset_config_name)

# Only carve out a validation split when the dataset does not provide one.
if "validation" not in raw_datasets:
    # validation_split_percentage is a percentage; train_test_split takes a fraction.
    # shuffle=False splits without shuffling the rows first.
    train_validation_split = raw_datasets["train"].train_test_split(
        test_size=validation_split_percentage / 100,
        shuffle=False,
    )
    # The "test" half of the split becomes the validation set; as before, only
    # "train" and "validation" are kept in the resulting DatasetDict.
    raw_datasets = datasets.DatasetDict(
        {
            "train": train_validation_split["train"],
            "validation": train_validation_split["test"],
        }
    )

Downloading readme: 100%|██████████| 10.5k/10.5k [00:00<00:00, 32.1MB/s]
Downloading data: 100%|██████████| 733k/733k [00:00<00:00, 2.41MB/s]
Downloading data: 100%|██████████| 6.36M/6.36M [00:00<00:00, 25.7MB/s]
Downloading data: 100%|██████████| 657k/657k [00:00<00:00, 6.59MB/s]
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 136318.24 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 1084613.60 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 844656.58 examples/s]


In [6]:
# Create an instance of the model along with its tokenizer

# Tokenizer
# AutoTokenizer honours use_fast, so use_slow_tokenizer really selects between the
# slow and fast implementation. Loading through BertTokenizerFast directly always
# produced the fast tokenizer regardless of the flag.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    tokenizer_name,
    use_fast=not use_slow_tokenizer,
)

# Load the model configuration
config = BertConfig.from_pretrained(model_config_name)

# Instantiate the model from the configuration only (no from_pretrained call for
# the weights here).
model = BertForPreTraining(config)

# Resize the token embeddings to fit the tokenizer's vocabulary size.
# Left as the last expression so the resulting embedding layer is displayed.
model.resize_token_embeddings(len(tokenizer))



Embedding(30522, 768, padding_idx=0)

In [7]:
# Display the loaded splits with their features and row counts.
raw_datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [10]:
# Tokenize and group the data based on the kind of model

column_names = raw_datasets["train"].column_names
# Prefer the "text" column; otherwise fall back to the first column available.
text_column_name = "text" if "text" in column_names else column_names[0]


def tokenize_function(examples):
    """Tokenize the text column of a batch of examples with the notebook's tokenizer."""
    return tokenizer(examples[text_column_name])


# Tokenize every split; the original columns are removed so that only the
# tokenizer's outputs remain.
tokenized_dataset = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=num_workers,
    remove_columns=column_names,
    desc="Running tokenizer on every text in dataset",
)


def group_split(split_dataset, split, label):
    """Run group_texts over one tokenized split and return the mapped dataset.

    Args:
        split_dataset: Tokenized split to group.
        split: Value forwarded to group_texts as its ``split`` keyword.
        label: Human-readable split name shown in the progress-bar description.

    Returns:
        The dataset produced by ``split_dataset.map(group_texts, ...)``.
    """
    return split_dataset.map(
        group_texts,
        fn_kwargs={
            "split": split,
            "tokenizer": tokenizer,
            "max_seq_length": max_seq_len,
            "short_seq_prob": short_seq_prob,
            "nsp_probability": nsp_probability,
            "tokenized_datasets": tokenized_dataset,
        },
        batched=True,
        batch_size=batch_size,
        num_proc=num_workers,
        with_indices=True,
        desc=f"Grouping {label} texts into chunks of {max_seq_len}",
    )


# Grouping the data
train_dataset = group_split(tokenized_dataset["train"], "train", "Train")
# NOTE(review): the split value is capitalised here ('Validation') but lower-case
# for the train split; kept exactly as before -- confirm which spelling
# group_texts expects.
eval_dataset = group_split(tokenized_dataset["validation"], "Validation", "Validation")

Running tokenizer on every text in dataset:   0%|          | 0/4358 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Running tokenizer on every text in dataset: 100%|██████████| 4358/4358 [00:00<00:00, 33145.97 examples/s]
Running tokenizer on every text in dataset: 100%|██████████| 36718/36718 [00:01<00:00, 21929.23 examples/s]
Running tokenizer on every text in dataset: 100%|██████████| 3760/3760 [00:00<00:00, 31723.77 examples/s]


In [11]:
# Display the tokenized splits with their features and row counts.
tokenized_dataset

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [None]:
# Create data collators and dataloaders

# warmstart dataloader

# first sent nsp zero dataloader

# first sent nsp one dataloader

# subset dataloader (train)

# eval dataloader (validation & testing)

In [None]:
# Initialize the model and training instance

# Optimizer

# learning scheduler

In [None]:
# Warmstart the model: Train the model with the warmstart data for warmstart epochs

# Plot both training & perplexity

In [None]:
# Define subset selection strategies

In [None]:
# Begin subset selection 

# Choose a selection strategy

# Unwrap the model and set it in evaluation mode.

# Go through the model specific dataset

# Get the embedding and save them in a list.

# Using the list and the selection strategy, get the indices and the gains of each data point in the list.

In [None]:
# Put all the data into a dataset called subset_dataset

# add the data to the subset dataloader

In [None]:
# Train with importance re-sampling

# Train on the entire dataset once

# Sample using the indices and gains

# Train the model on the sampled dataset

# Evaluate the model 

# Save the model 

In [None]:
# Personal Addition: Inference on the model.