In [None]:
# !unzip openwebtext_gpt2.zip

In [None]:
# !pip install zstandard
# !pip install datasets
# !pip install jsonlines
# !pip install tiktoken
# !pip install torchsummaryX
# !pip install wandb

In [2]:
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch

In [3]:
config = {
    'encoding_base': "gpt2",
    "block_size": 1024,
    "batch_size": 12,
    "train_loader_workers": 4,
    "val_loader_workers": 2,

    "vocab_size": 50257,
    "embedding_size": 64,  # should be divisible by num_attention_heads
    "block_num": 6,
    "num_attention_heads": 8,
    "expand_num": 4,

    "epochs": 100,
    "lr": 0.001,
    "dropout": 0.2,

    'scheduler_threshold': 0.005,
    'scheduler_factor': 0.6,
    'scheduler_patience': 1,

    'resume_from': "best_checkpoint.pth",
}

In [4]:
import tiktoken
import os

import lzma
import tarfile
import torch
import pickle
import numpy as np
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device: ", device)

gpt2_base = tiktoken.get_encoding(config['encoding_base'])

encoder = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="gpt2_im",
    pat_str=gpt2_base._pat_str,
    mergeable_ranks=gpt2_base._mergeable_ranks,
    special_tokens={
        "<|eos|>": config['vocab_size'] - 1,
    },
)

ALLOWED_SPECIAL = set(encoder._special_tokens.keys())

print(encoder._special_tokens)

CXT_ENC = encoder.encode("<content>:")
Q_ENC = encoder.encode("<question>:")
A_ENC = encoder.encode("<answer>:")
SUM_ENC = encoder.encode("<summary>:")
EOS_ENC = encoder.encode("<|eos|>", allowed_special=ALLOWED_SPECIAL)
PAD_ENC = encoder.encode(" ")

print(encoder.decode([50255]))

print(f"context: {CXT_ENC}")
print(f"question: {Q_ENC}")
print(f"answer: {A_ENC}")
print(f"summary: {SUM_ENC}")
print(f"eos: {EOS_ENC}")
print(f"pad: {PAD_ENC}")

Device:  cuda
{'<|eos|>': 50256}
 gazed
context: [27, 11299, 31175]
question: [27, 25652, 31175]
answer: [27, 41484, 31175]
summary: [27, 49736, 31175]
eos: [50256]
pad: [220]


In [5]:
DATA_DIR = "openwebtext"

SAVE_THRESH = 100

class Tokenizer():
    """
    This is used to tokenize the data and save it to disk.
    Only run this once. Load the saved data afterwards.
    """
    def __init__(self, data_dir, partition, encoder, sample_rate=1):

        self.data_dir = data_dir
        self.encoder = encoder
        self.texts = []

        data_files = [f"{file}" for file in os.listdir(data_dir) if file.endswith(".xz")]
        data_files.sort()

        cut_off = int(len(data_files) * 0.9)

        if partition == 'train':
            data_files = data_files[:cut_off]
        elif partition == 'val':
            data_files = data_files[cut_off:]
        elif partition == 'all':
            pass
        else:
            raise ValueError(f"Invalid partition {partition}")

        data_files = data_files[:int(len(data_files) * sample_rate)]

        count = 0
        save_count = 0

        token_save_dir = f"{self.data_dir}_{config['encoding_base']}"
        self.token_save_dir = token_save_dir

        if not os.path.exists(token_save_dir):
            os.makedirs(token_save_dir)

        for xz_file_path in tqdm(data_files, dynamic_ncols=True):

            count += 1

            x = []
            for encoding in self.iter_xz_file(os.path.join(data_dir, xz_file_path)):
                # save the encoding to file
                x.append(encoding)

            if count > SAVE_THRESH:
                self.save(x, save_count)
                save_count += 1
                count = 0
                x = []

        # save the remaining
        if x:
            self.save(x, save_count)

    def iter_xz_file(self, xz_file_path):
        with lzma.open(xz_file_path, 'rb') as xz_file:
        # Open the xz file as a tar archive
            with tarfile.open(fileobj=xz_file, mode="r") as tar:
                # Iterate through each file in the tar archive
                for tarinfo in tar:
                    # Check if the file is a regular file (not a directory)
                    if tarinfo.isreg():
                        # Extract the file's contents into memory
                        file_content = tar.extractfile(tarinfo)
                        # Read and process the content of the file (example: print the first 500 characters)
                        content = file_content.read().decode('utf-8')  # Assuming text content
                        # encoding = encoder.encode(content)
                        # return file name and encoding
                        yield self.encoder.encode(content)

    def save(self, x, save_count):
        with open(f"{self.token_save_dir}/{save_count}.pkl", 'wb') as f:
            pickle.dump(x, f)


# train_dataset = Tokenizer(DATA_DIR, 'all', encoder=encoder)

In [None]:
import tiktoken
import numpy as np
encoder = tiktoken.get_encoding(config['encoding_base'])

import torch

DATA_DIR = "openwebtext_gpt2"

class OpenWebText(torch.utils.data.DataLoader):
    def __init__(self, data_dir, partition, batch_size, block_size, shuffle, sample_rate = 1.0):

        self.dataset = []
        self.total_text_len = 0

        self.batch_size = batch_size
        self.block_size = block_size
        self.shuffle = shuffle

        data_files = [f"{file}" for file in os.listdir(data_dir) if file.endswith(".pkl")]
        data_files.sort()

        cut_off = int(len(data_files) * 0.95)

        if partition == 'train':
            data_files = data_files[:cut_off]
        elif partition == 'val':
            data_files = data_files[cut_off:]

        data_files = data_files[:int(len(data_files) * sample_rate)]

        for fpath in tqdm(data_files, dynamic_ncols=True):
            articles = pickle.load(open(f"{data_dir}/{fpath}", 'rb'))
            # add <eos> at the end of each article
            for article in articles:
                self.total_text_len += len(article)
                article += EOS_ENC
            self.dataset.extend(articles)

        # calculate the batch size
        self.num_batches = self.total_text_len // (block_size * batch_size)

    def __len__(self):
        return self.num_batches

    def __iter__(self):
        # TODOs:
        # 1. Shuffle data if shuffle is True
        # 2. Concatenate articles and drop extra words
        # 3. Divide the concetenated dataset into inputs and targets. How do they vary?
        # 4. Reshape the inputs and targets into batches (think about the final shape)
        # 5. Loop though the batches and yield the input and target according to the sequence length

        if self.shuffle:
            # TODO
            np.random.shuffle(self.dataset)

        data = np.concatenate(self.dataset)

        num_batches = self.num_batches

        inputs, targets = [], []

        for batch_idx in range(num_batches):
            batch_start = batch_idx * self.batch_size * self.block_size
            batch_end = batch_start + self.batch_size * self.block_size

            batch_data = data[batch_start:batch_end+1]

            inputs.append(np.array(batch_data[:-1]).reshape(self.batch_size, self.block_size))
            targets.append(np.array(batch_data[1:]).reshape(self.batch_size, self.block_size))

        batch_idx = 0
        while batch_idx < num_batches:
            yield torch.tensor(inputs[batch_idx]), torch.tensor(targets[batch_idx])
            batch_idx += 1

import gc
gc.collect()

train_loader = OpenWebText(DATA_DIR, 'train', batch_size=config['batch_size'], block_size=config['block_size'], shuffle=True, sample_rate=1)
val_loader = OpenWebText(DATA_DIR, 'val', batch_size=config['batch_size'], block_size=config['block_size'], shuffle=False, sample_rate=1)

100%|██████████| 194/194 [00:05<00:00, 35.88it/s]
100%|██████████| 11/11 [00:00<00:00, 40.90it/s]


In [None]:
import gc
gc.collect()

print("Batch size: ", config['batch_size'])
print("Train dataset batches = {}".format(len(train_loader)))
print("Val dataset batches = {}".format(len(val_loader)))

# sanity check
for data in train_loader:
    x, y = data
    print(x.shape, y.shape)
    break

Batch size:  12
Train dataset batches = 6877
Val dataset batches = 397
torch.Size([12, 1024]) torch.Size([12, 1024])


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchsummaryX import summary
import math

class Attention(torch.nn.Module):

    def __init__(self, embedding_size, headsize, weights_keys = None, weights_queries = None, weights_values = None):

        """
        Initialize instance variables. Refer to writeup for notation.
        input_dim = D, key_dim = query_dim = D_k, value_dim = D_v

        Argument(s)
        -----------

        weights_keys (torch.tensor, dim = (D X D_k)): weight matrix for keys
        weights_queries (torch.tensor, dim = (D X D_k)): weight matrix for queries
        weights_values (torch.tensor, dim = (D X D_v)): weight matrix for values

        """

        super().__init__()

        self.embedding_size = embedding_size
        self.headsize = headsize

        # Store the given weights as parameters of the class.
        if weights_keys == None:
            self.W_k = torch.nn.Linear(embedding_size, headsize, bias=False) #nn.Linear(D_x, headsize)
        else:
            self.W_k = weights_keys

        if weights_queries == None:
            self.W_q = torch.nn.Linear(embedding_size, headsize, bias=False)
        else:
            self.W_q = weights_queries

        if weights_values == None:
            self.W_v = torch.nn.Linear(embedding_size, headsize, bias=False)
        else:
            self.W_v = weights_values

        # Use this object to perform softmax related operations.
        # It performs softmax over the last dimension which is what you'll need.
        self.softmax = F.softmax

    def forward(self, X, mask):

        """
        Compute outputs of the self-attention layer.
        Stores keys, queries, values, raw and normalized attention weights.
        Refer to writeup for notation.
        batch_size = B, seq_len = T, input_dim = D, value_dim = D_v

        Note that input to this method is a batch not a single sequence, so doing a transpose using .T can yield unexpected results.
        You should permute only the required axes.

        Input
        -----
        X (torch.tensor, dim = (B, T, D)): Input batch

        Return
        ------
        X_new (torch.tensor, dim = (B, T, D_v)): Output batch

        """

        self.X = X

        # Compute the values of Key, Query and Value


        self.Q = self.W_q(X)
        self.K = self.W_k(X)
        self.V = self.W_v(X)

        # Calculate Attention weights

        self.A_w    = self.Q @ self.K.transpose(-2, -1) / math.sqrt(self.headsize)


        # Mask
        if mask is None:
          mask = torch.tril(torch.ones_like(self.A_w))
        self.A_w = self.A_w.masked_fill(mask == 0, float('-inf'))


        # Softmax

        self.A_sig   = self.softmax(self.A_w, dim=-1)


        # Calculate Attention context

        X_new         = self.A_sig @ self.V


        return X_new


class MultiHeadAttention(torch.nn.Module):

    def __init__(self, num_heads, headsize, embedding_size, dropout):
        super().__init__()
        self.heads = torch.nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(Attention(embedding_size, headsize))

        # Add projection layer to match with output of other blocks
        self.linear = torch.nn.Linear(num_heads * headsize, embedding_size)

        # Add dropout layer
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, X, mask):
        B, T, _ = X.shape
        out = torch.empty((B, T, 0))
        out = torch.cat([head(X, mask) for head in self.heads], dim = -1)
        # Pass projection layer later
        out = self.linear(out)

        # Pass dropout layer later
        out = self.dropout(out)

        return out

class FeedForwardBlock(torch.nn.Module):
    def __init__(self, embedding_size, expand_num, dropout):

        super().__init__()

        self.linear_1 = torch.nn.Linear(embedding_size, expand_num * embedding_size)
        self.gelu = torch.nn.GELU()
        self.linear_2 = torch.nn.Linear(expand_num * embedding_size, embedding_size)
        # Optional dropout
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, X):
        X = self.linear_1(X)
        X = self.gelu(X)
        X = self.linear_2(X)
        X = self.dropout(X)

        return X

class ResidualBlock(torch.nn.Module):

    def __init__(self, num_heads, embedding_size, expand_num, dropout):
        super().__init__()


        self.layerNorm_1 = torch.nn.LayerNorm(embedding_size)
        self.layerNorm_2 = torch.nn.LayerNorm(embedding_size)
        self.multihead = MultiHeadAttention(num_heads, embedding_size//num_heads, embedding_size, dropout)
        self.feedforward = FeedForwardBlock(embedding_size, expand_num, dropout)

    def forward(self, X, mask):

        # Adding original input after passing multi head attention

        out_1 = self.layerNorm_1(X)

        out_1 = self.multihead(out_1, mask)

        out_1 = out_1 + X

        # Adding result after first residual to final output
        out_2 = self.layerNorm_2(out_1)

        out_2 = self.feedforward(out_2)

        out_2 = out_2 + out_1

        return out_2

class SimpleGPTModel(torch.nn.Module):

    def __init__(self, vocab_size, block_size, block_num, num_heads, embedding_size, expand_num, dropout):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(vocab_size, embedding_size) #这俩我不太确定是不是这样写 不知道用tiktoken有没有影响？
        self.position_embedding = torch.nn.Embedding(block_size, embedding_size)

        self.blocks = torch.nn.Sequential()
        for _ in range(block_num):
            self.blocks.append(ResidualBlock(num_heads, embedding_size, expand_num, dropout))

        # Final layer norm
        self.linear = torch.nn.LayerNorm(embedding_size)

        # Projection layer (应该是映射回去吧？)
        self.proj = torch.nn.Linear(embedding_size, vocab_size)

    def _init_weights(self, module):
        if isinstance(module, torch.nn.Embedding): #我没找到embedding这个有啥专有的initialization function
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

        if isinstance(module, torch.nn.Linear):
            # Initialize the linear layer weights with Xavier/Glorot initialization
            torch.nn.init.xavier_uniform_(module.weight)
            # Set biases to zero
            if module.bias is not None:
                torch.nn.init.constant_(module.bias, 0)

    def forward(self, X, mask):
        B, T = X.shape

        token_embedding = self.token_embedding(X)
        position_embedding = self.position_embedding(torch.arange(T, device=device)) #这个我不太懂arrange是在干啥 照着写的

        X = token_embedding + position_embedding

        for block in self.blocks:
          X = block(X, mask)

        # X = self.blocks((X, mask))

        X = self.linear(X)

        logits = self.proj(X)


        return logits

    # def generate(self, X, max_new_tokens): 这个好像是用来生成decode文本的，我没太看懂逻辑 训练应该暂时用不上？

# Define a simple sequence-to-sequence model using MultiheadAttention
class Demo(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_heads):
        super(Demo, self).__init__()

        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.multihead_attn = nn.MultiheadAttention(hidden_dim, num_heads)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, trg):
        embedded_src = self.embedding(src)
        embedded_trg = self.embedding(trg)

        # Permute the dimensions to fit the MultiheadAttention input shape
        embedded_src = embedded_src.permute(1, 0, 2)
        embedded_trg = embedded_trg.permute(1, 0, 2)

        attn_output, _ = self.multihead_attn(embedded_trg, embedded_src, embedded_src)

        # Permute the dimensions back to the original shape
        attn_output = attn_output.permute(1, 0, 2)

        logits = self.fc(attn_output)

        return logits


In [7]:
torch.cuda.empty_cache()

model = SimpleGPTModel(
    vocab_size=config['vocab_size'],
    block_size=config['block_size'],
    block_num=config['block_num'],
    num_heads=config['num_attention_heads'],
    embedding_size=config['embedding_size'],
    expand_num=config['expand_num'],
    dropout=config['dropout'],
)

# model = torch.nn.DataParallel(model)

model = model.to(device)

# enable multiple gpus
if torch.cuda.device_count() > 1:
    print("gpu counts: ", torch.cuda.device_count())
    # Wrap the model with DataParallel
    model = nn.DataParallel(model)

criterion = nn.CrossEntropyLoss()

optimizer =  torch.optim.AdamW(model.parameters(), lr=config['lr'], weight_decay=0.05)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.6,
    patience=1,
    threshold=0.005,
    min_lr=0,
    cooldown=1,
    verbose=True)

# Mixed Precision, if you need it
scaler = torch.cuda.amp.GradScaler()

# test code to check shapes
# model.eval()


# print(model)

# for i, data in enumerate(val_loader, 0):
#     x, y = data
#     x, y = x.to(device), y.to(device)

#     # summary(model, x)

#     logits = model(x)

#     loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))

#     print(loss)

#     break



gpu counts:  2


In [8]:
from tqdm import tqdm

def train_model(model, train_loader, criterion, optimizer, finetune=False):

    model.train()
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    total_loss = 0

    for i, data in enumerate(train_loader):
        optimizer.zero_grad()

        if finetune:
          x, y, mask = data
          x, y, mask = x.type(torch.LongTensor).to(device), y.type(torch.LongTensor).to(device), mask.to(device)
        else:
          x, y = data
          x, y = x.type(torch.LongTensor).to(device), y.type(torch.LongTensor).to(device)
          mask = None

        with torch.cuda.amp.autocast():
            logits = model(x, mask)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))

        total_loss += loss.item()

        batch_bar.set_postfix(
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            lr="{:.06f}".format(float(optimizer.param_groups[0]['lr'])))

        batch_bar.update() # Update tqdm bar

        # Another couple things you need for FP16.
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16

        del x, y, loss
        torch.cuda.empty_cache()

    batch_bar.close() # You need this to close the tqdm bar

    return total_loss / len(train_loader)


def validate_model(model, val_loader, criterion, finetune=False):

    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')

    total_loss = 0

    for i, data in enumerate(val_loader):

        if finetune:
          x, y, mask = data
          x, y, mask = x.type(torch.LongTensor).to(device), y.type(torch.LongTensor).to(device), mask.to(device)
        else:
          x, y = data
          x, y = x.type(torch.LongTensor).to(device), y.type(torch.LongTensor).to(device)
          mask = None

        with torch.cuda.amp.autocast():
            logits = model(x, mask)
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))

        total_loss += float(loss)

        batch_bar.set_postfix(loss="{:.04f}".format(float(total_loss / (i + 1))))

        batch_bar.update()

        del x, y, loss
        torch.cuda.empty_cache()

    batch_bar.close()
    total_loss = total_loss/len(val_loader)

    return total_loss

In [9]:
def save_model(model, optimizer, scheduler, loss, epoch, path):
    torch.save(
        {'model_state_dict'         : model.state_dict(),
         'optimizer_state_dict'     : optimizer.state_dict(),
         'scheduler_state_dict'     : scheduler.state_dict(),
         'epoch'                    : epoch,
         'loss'                     : loss,
         },
         path
    )

def load_model(path, model, optimizer= None, scheduler= None):

    checkpoint = torch.load(path)

    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer != None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler != None:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch   = checkpoint['epoch']
    loss  = checkpoint['loss']

    return [model, optimizer, scheduler, epoch, loss]

In [None]:
import wandb
wandb.login(key="a79b18083e01ff03f06bcc22e0524de4fb499b24")
run = wandb.init(
    name = "early", ## Wandb creates random run names if you skip this field
    reinit = True, ### Allows reinitalizing runs when you re-run this cell
    # run_id = ### Insert specific run id here if you want to resume a previous run
    # resume = "must" ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw5-final-ablations", ### Project should be created in your wandb account
    config = config ### Wandb Config for your run
)

[34m[1mwandb[0m: Currently logged in as: [33mmingyang2457[0m ([33mmse[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [10]:
torch.cuda.empty_cache()

In [None]:
torch.cuda.empty_cache()
gc.collect()

last_epoch_completed = 0
start = last_epoch_completed
end = config["epochs"]
best_model_path = 'best_checkpoint.pth' # set best model path
best_loss = float('inf')

if 'resume_from' in config and config['resume_from']:
  _, _, _, last_epoch_completed, best_loss = load_model(config['resume_from'], model)

#TODO: Please complete the training loop

for epoch in range(last_epoch_completed, end):

    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))

    curr_lr = optimizer.param_groups[0]['lr']

    train_loss              = train_model(model, train_loader, criterion, optimizer)
    valid_loss              = validate_model(model, val_loader, criterion)
    scheduler.step(valid_loss)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    print("\tVal Loss {:.04f}".format(valid_loss))


    wandb.log({
        'train_loss': train_loss,
        'valid_loss': valid_loss,
        'lr'        : curr_lr
    })
    epoch_model_path = f'checkpoint-{epoch}.pth'
    save_model(model, optimizer, scheduler, valid_loss, epoch, epoch_model_path)
    # wandb.save(epoch_model_path)
    print("Saved epoch model")

    if valid_loss <= best_loss:
        best_loss = valid_loss
        save_model(model, optimizer, scheduler,valid_loss, epoch, best_model_path)
        wandb.save(best_model_path)
        print("Saved best model")
      # You may find it interesting to exlplore Wandb Artifcats to version your models

run.finish()


Epoch: 1/100




	Train Loss 5.4778	 Learning Rate 0.0010000
	Val Loss 5.2857
Saved epoch model
Saved best model

Epoch: 2/100




	Train Loss 5.3178	 Learning Rate 0.0010000
	Val Loss 5.1663
Saved epoch model
Saved best model

Epoch: 3/100




	Train Loss 5.2189	 Learning Rate 0.0010000
	Val Loss 5.0777
Saved epoch model
Saved best model

Epoch: 4/100




	Train Loss 5.1516	 Learning Rate 0.0010000
	Val Loss 5.0592
Saved epoch model
Saved best model

Epoch: 5/100


Train:  75%|███████▌  | 5190/6877 [1:03:52<20:35,  1.37it/s, loss=5.1014, lr=0.001000]

Buffered data was truncated after reaching the output size limit.

In [None]:
# !unzip cnn_dm_tiktoken_gpt2.zip
# !unzip squad_tiktoken_gpt2.zip

Archive:  cnn_dm_tiktoken_gpt2.zip
   creating: cnn_dm_tiktoken_gpt2/
   creating: cnn_dm_tiktoken_gpt2/train/
   creating: cnn_dm_tiktoken_gpt2/val/
  inflating: cnn_dm_tiktoken_gpt2/train/summaries.pkl  
  inflating: cnn_dm_tiktoken_gpt2/train/articles.pkl  
  inflating: cnn_dm_tiktoken_gpt2/val/summaries.pkl  
  inflating: cnn_dm_tiktoken_gpt2/val/articles.pkl  
Archive:  squad_tiktoken_gpt2.zip
   creating: squad_tiktoken_gpt2/
   creating: squad_tiktoken_gpt2/train/
   creating: squad_tiktoken_gpt2/val/
  inflating: squad_tiktoken_gpt2/train/questions.pkl  
  inflating: squad_tiktoken_gpt2/train/contexts.pkl  
  inflating: squad_tiktoken_gpt2/train/answers.pkl  
  inflating: squad_tiktoken_gpt2/val/questions.pkl  
  inflating: squad_tiktoken_gpt2/val/contexts.pkl  
  inflating: squad_tiktoken_gpt2/val/answers.pkl  


In [11]:
import json
import pickle
from tqdm import tqdm


class SQaADDataset():

    def __init__(self, data_path, block_size, load_dir=None):

        self.contexts = []
        self.questions = []
        self.answers = []

        if load_dir:
            with open(f"{load_dir}/contexts.pkl", 'rb') as f:
                self.contexts = pickle.load(f)
            with open(f"{load_dir}/questions.pkl", 'rb') as f:
                self.questions = pickle.load(f)
            with open(f"{load_dir}/answers.pkl", 'rb') as f:
                self.answers = pickle.load(f)
        else:
            with open(data_path, 'r') as f:
                raw_data = json.load(f)
            for article in tqdm(raw_data['data']):
                for paragraph in article['paragraphs']:
                    context = paragraph['context']
                    context_encoding = encoder.encode(context)
                    for qa in paragraph['qas']:
                        question = qa['question']
                        # strip leading and trailing spaces
                        question = question.strip()
                        question_encoding = encoder.encode(question)
                        if qa['is_impossible']:
                            answer = ""
                        else:
                            answer = qa['answers'][0]['text']
                        answer_encoding = encoder.encode(answer)
                        self.contexts.append(context_encoding)
                        self.questions.append(question_encoding)
                        self.answers.append(answer_encoding)


        self.X = []
        self.Y = []

        for l in [self.contexts, self.questions, self.answers]:
          for i in range(len(l)):
              l[i] = np.array(l[i], dtype="ushort")

        additional_token_len = len(CXT_ENC) + len(Q_ENC) + len(A_ENC) + len(EOS_ENC) * 2

        print("Preprocessing contexts, questions and answers")
        for i in tqdm(range(len(self.contexts))):
            context, question, answer = self.contexts[i].tolist(), self.questions[i].tolist(), self.answers[i].tolist()
            if additional_token_len + len(context) + len(question) + len(answer) > block_size:
                # truncate the context
                print("Context too long")
                context = context[:block_size - additional_token_len - len(question) - len(answer)]

            x, y = CXT_ENC + context + EOS_ENC + Q_ENC + question + EOS_ENC + A_ENC, answer + EOS_ENC
            self.X.append(np.array(x, dtype="ushort"))
            self.Y.append(np.array(y, dtype="ushort"))

        if load_dir:
          del self.contexts
          del self.questions
          del self.answers

    def save(self, dir_path):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(f"{dir_path}/contexts.pkl", 'wb') as f:
            pickle.dump(self.contexts, f)
        with open(f"{dir_path}/questions.pkl", 'wb') as f:
            pickle.dump(self.questions, f)
        with open(f"{dir_path}/answers.pkl", 'wb') as f:
            pickle.dump(self.answers, f)


class CNN_DailyMail():

    def __init__(self, dataset, access, block_size, load_dir=None):

        self.articles = []
        self.summaries = []

        if load_dir:
            with open(f"{load_dir}/articles.pkl", 'rb') as f:
                self.articles = pickle.load(f)
            with open(f"{load_dir}/summaries.pkl", 'rb') as f:
                self.summaries = pickle.load(f)
        else:
            for i in tqdm(range(len(dataset[access]))):
                article = dataset[access][i]['article']
                summary = dataset[access][i]['highlights']

                article_encoding = encoder.encode(article)
                summary_encoding = encoder.encode(summary)

                self.articles.append(article_encoding)
                self.summaries.append(summary_encoding)

        for l in [self.articles, self.summaries]:
          for i in range(len(l)):
              l[i] = np.array(l[i], dtype="ushort")


        self.X = []
        self.Y = []

        add_token_len = len(CXT_ENC) + len(SUM_ENC) + len(EOS_ENC)

        assert len(self.articles) == len(self.summaries)

        print("Preprocessing artices and summaries")
        for i in tqdm(range(len(self.articles))):
            article, summary = self.articles[i].tolist(), self.summaries[i].tolist()
            if add_token_len + len(article) + len(summary) > block_size:
                # take the first block_size and average on article and summary
                article_summary_len_limit = block_size - add_token_len
                chunked_article = article[:article_summary_len_limit * len(article) // (len(article) + len(summary))]
                chunked_summary = summary[:article_summary_len_limit * len(summary) // (len(article) + len(summary))]
                x, y = CXT_ENC + chunked_article + EOS_ENC + SUM_ENC, chunked_summary + EOS_ENC
            else:
                x, y = CXT_ENC + article + EOS_ENC + SUM_ENC, summary + EOS_ENC

            assert len(x) + len(y) - 1 <= block_size

            self.X.append(np.array(x, dtype="ushort"))
            self.Y.append(np.array(y, dtype="ushort"))

        if load_dir:
          del self.articles
          del self.summaries

    def save(self, dir_path):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        with open(f"{dir_path}/articles.pkl", 'wb') as f:
            pickle.dump(self.articles, f)
        with open(f"{dir_path}/summaries.pkl", 'wb') as f:
            pickle.dump(self.summaries, f)

In [12]:
squad_val = SQaADDataset("dev-v2.0.json", block_size=config['block_size'], load_dir="squad_tiktoken_gpt2/val")

Preprocessing contexts, questions and answers


100%|██████████| 11873/11873 [00:00<00:00, 58802.66it/s]


In [13]:
cnn_val = CNN_DailyMail(None, 'validation', block_size=config['block_size'], load_dir="cnn_dm_tiktoken_gpt2/val")

Preprocessing artices and summaries


100%|██████████| 13368/13368 [00:00<00:00, 17738.81it/s]


In [14]:
squad_train = SQaADDataset("train-v2.0.json", block_size=config['block_size'], load_dir="squad_tiktoken_gpt2/train")

Preprocessing contexts, questions and answers


100%|██████████| 130319/130319 [00:02<00:00, 60736.32it/s]


In [15]:
cnn_train = CNN_DailyMail(None, 'train', block_size=config['block_size'], load_dir="cnn_dm_tiktoken_gpt2/train")

Preprocessing artices and summaries


100%|██████████| 287113/287113 [00:17<00:00, 16720.58it/s]


In [16]:
class FineTuneDataset(torch.utils.data.Dataset):

    def __init__(self, squad, cnn, block_size):

        self.block_size = block_size

        if not squad:
          squad_X = []
          squad_Y = []
        else:
          squad_X = squad.X.copy()
          squad_Y = squad.Y.copy()

        if not cnn:
          cnn_X = []
          cnn_Y = []
        else:
          cnn_X = cnn.X.copy()
          cnn_Y = cnn.Y.copy()

        if squad and cnn:
          # scale squad
          factor = 1
          squad_X *= factor
          squad_Y *= factor

        total_len = len(squad_X) + len(cnn_X)

        self.X = squad_X + cnn_X
        self.Y = squad_Y + cnn_Y
        self.X_l = []
        self.Y_l = []


        assert len(self.X) == len(self.Y)

        self.inputs = []
        self.outputs = []

        print("Generating inputs and outputs")
        for i in tqdm(range(len(self.X))):

            x, y = self.X[i], self.Y[i]

            valid_input_len = len(x) + len(y) - 1

            if valid_input_len > block_size:
                print(f"Input length {valid_input_len} exceeds block size {block_size}")
                continue

            # pad the input
            input = np.full(block_size, PAD_ENC, dtype='int32')
            input[:valid_input_len] = np.concatenate((x, y))[:-1]
            # input = input + PAD_ENC * (block_size - len(input))

            output = np.tile(y, (block_size // len(y) + 1))[:block_size].astype(np.int32)

            assert len(input) == len(output)
            self.inputs.append(input)
            self.outputs.append(output)
            self.X_l.append(len(x))
            self.Y_l.append(len(y))

    def _gen_mask(self, l_x, l_y):
            mask = np.tril(np.full((l_y, self.block_size), 1, dtype=bool), l_x-1)  # (len(y), block_size)

            # for i in range(0, len(y)):
            #     mask.append([1]*(len(x)+i) + [0]*(block_size - len(x) - i))

            # extend mask to block size
            mask = np.tile(mask, (self.block_size // l_y + 1, 1))[:self.block_size]
            # mask = (mask * (block_size // len(y) + 1)) [:block_size]  # (block_size, block_size)

            assert mask.shape[0]== self.block_size
            assert mask.shape[1]== self.block_size
            return mask

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return torch.tensor(self.inputs[idx]), torch.tensor(self.outputs[idx]), torch.tensor(self._gen_mask(self.X_l[idx], self.Y_l[idx]))

In [17]:
squad_val_dataset = FineTuneDataset(squad_val, None, block_size=config['block_size'])
cnn_val_dataset = FineTuneDataset(None, cnn_val, block_size=config['block_size'])

Generating inputs and outputs


100%|██████████| 11873/11873 [00:00<00:00, 65896.15it/s]


Generating inputs and outputs


100%|██████████| 13368/13368 [00:00<00:00, 69913.26it/s]


In [18]:
train_dataset = FineTuneDataset(squad_train, cnn_train, block_size=config['block_size'])

Generating inputs and outputs


100%|██████████| 417432/417432 [00:07<00:00, 57780.41it/s]


In [None]:
class MockDataset():

    def __init__(self):
        x_decoded = ["<context>:this is the context.<question>:when is it today<|eos|><answer>:",
                     "<context>:this is the article.<eos><summary:>"]
        y_decoded = ["today is December 5th.<|eos|>","is article<|eos|>"]
        self.X = [encoder.encode(x, allowed_special=ALLOWED_SPECIAL) for x in x_decoded]
        self.Y = [encoder.encode(y, allowed_special=ALLOWED_SPECIAL) for y in y_decoded]


mock_dataset = MockDataset()
print(mock_dataset.X)
mock_fine_tune_dataset = FineTuneDataset(mock_dataset, mock_dataset, 30)
print(mock_dataset.X)


for mock_input, mock_output, mock_mask in mock_fine_tune_dataset:
    for i in range(len(mock_mask)):
        context = []
        for j in range(len(mock_mask[i])):
            if mock_mask[i][j] == 0:
                break
            context.append(mock_input[j])
        print(f"{encoder.decode(context)} => {encoder.decode([mock_output[i]])}")

In [19]:
import gc
gc.collect()

train_loader = torch.utils.data.DataLoader(
    dataset = train_dataset,
    num_workers = 0,
    batch_size = config['batch_size'],
    shuffle = True,
    pin_memory = True
)

squad_val_loader = torch.utils.data.DataLoader(
    dataset = squad_val_dataset,
    num_workers = 0,
    batch_size = config['batch_size'],
    shuffle = False,
    pin_memory = True
)

cnn_val_loader = torch.utils.data.DataLoader(
    dataset = cnn_val_dataset,
    num_workers = 0,
    batch_size = config['batch_size'],
    shuffle = False,
    pin_memory = True
)

In [20]:
torch.cuda.empty_cache()
gc.collect()

last_epoch_completed = 0
start = last_epoch_completed
end = config["epochs"]
best_model_path = 'best_finetune.pth' # set best model path
best_loss = float('inf')

if 'resume_from' in config and config['resume_from']:
  _, _, _, last_epoch_completed, best_loss = load_model(config['resume_from'], model)

#TODO: Please complete the training loop

for epoch in range(last_epoch_completed, end):

    print("\nEpoch: {}/{}".format(epoch+1, config['epochs']))

    curr_lr = optimizer.param_groups[0]['lr']

    train_loss                = train_model(model, train_loader, criterion, optimizer, finetune=True)
    squad_val_loss            = validate_model(model, squad_val_loader, criterion, finetune=True)
    cnn_val_loss              = validate_model(model, cnn_val_loader, criterion, finetune=True)

    valid_loss = (squad_val_loss + cnn_val_loss) / 2

    scheduler.step(valid_loss)

    print("\tTrain Loss {:.04f}\t Learning Rate {:.07f}".format(train_loss, curr_lr))
    print("\tVal Loss {:.04f} \t Squad {:.04f} \t CNN {:.04f}".format(valid_loss, squad_val_loss, cnn_val_loss))


    wandb.log({
        'train_loss': train_loss,
        'valid_loss': valid_loss,
        'squad_loss': squad_val_loss,
        'cnn_loss'  : cnn_val_loss,
        'lr'        : curr_lr,
    })
    epoch_model_path = f'finetune-checkpoint-{epoch}.pth'
    save_model(model, optimizer, scheduler, valid_loss, epoch, epoch_model_path)
    # wandb.save(epoch_model_path)
    print("Saved epoch model")

    if valid_loss <= best_loss:
        best_loss = valid_loss
        save_model(model, optimizer, scheduler,valid_loss, epoch, best_model_path)
        wandb.save(best_model_path)
        print("Saved best model")
      # You may find it interesting to exlplore Wandb Artifcats to version your models

run.finish()


Epoch: 25/100


Train:   0%|          | 3/34786 [00:04<11:59:35,  1.24s/it, loss=1.1496, lr=0.001000]

KeyboardInterrupt: ignored

In [None]:
save_model(model, optimizer, scheduler,valid_loss, epoch, "best_finetune.pth")

In [53]:
_, _, _, last_epoch_completed, best_loss = load_model("best_finetune.pth", model)


In [68]:
# infer squad

model.eval()

max_output = 20
block_size = config['block_size']

X = cnn_val.X
Y = cnn_val.Y

for i in tqdm(range(1)):
  x, y = X[i], Y[i]
  x = np.array(encoder.encode("""Carnegie Mellon University isn’t just one of the world’s most renowned educational institutions
– it’s also a hub for some of the most talented doers, dreamers and difference-makers on the
planet. Carnegie Mellon is a private, internationally ranked research university with programs in
areas ranging from science, technology and business to public policy, the humanities and the
arts. Our community includes more than 14,000 students in the university’s seven schools and
colleges who benefit from a small student-to-faculty ratio and an education characterized by its
focus on innovation, interdisciplinary collaboration, and implementing solutions for real world
problems. Our commitment to diversity, equity and inclusion is woven into all aspects of our
campus culture, and we cultivate a vibrant, welcoming environment where everyone is valued.
  """))

  # infer from x
  valid_input_len = len(x) + len(y) - 1
  t = 0
  mask = torch.full((block_size, block_size), 0)
  mask[:, 0:len(x)] = 1

  while t < 30:
    t += 1
    input = torch.full((block_size, ), PAD_ENC[0]).type(torch.LongTensor)
    if len(x) <= block_size:
      input[:len(x)] = torch.tensor(x.astype("int32"))
    mask[:, 0:len(x)] = 1

    logits = model(input.unsqueeze(0).to(device), mask.unsqueeze(0).to(device))

    new_encode = torch.argmax(logits[0][1], dim=-1)

    x = np.append(x, new_encode.item())
    print(encoder.decode(input.tolist()))
    # if new_encode == EOS_ENC[0]:
    #   break

  0%|          | 0/1 [00:00<?, ?it/s]

Carnegie Mellon University isn’t just one of the world’s most renowned educational institutions
– it’s also a hub for some of the most talented doers, dreamers and difference-makers on the
planet. Carnegie Mellon is a private, internationally ranked research university with programs in
areas ranging from science, technology and business to public policy, the humanities and the
arts. Our community includes more than 14,000 students in the university’s seven schools and
colleges who benefit from a small student-to-faculty ratio and an education characterized by its
focus on innovation, interdisciplinary collaboration, and implementing solutions for real world
problems. Our commitment to diversity, equity and inclusion is woven into all aspects of our
campus culture, and we cultivate a vibrant, welcoming environment where everyone is valued.
                                                                                                                                                     

100%|██████████| 1/1 [00:01<00:00,  1.05s/it]

Carnegie Mellon University isn’t just one of the world’s most renowned educational institutions
– it’s also a hub for some of the most talented doers, dreamers and difference-makers on the
planet. Carnegie Mellon is a private, internationally ranked research university with programs in
areas ranging from science, technology and business to public policy, the humanities and the
arts. Our community includes more than 14,000 students in the university’s seven schools and
colleges who benefit from a small student-to-faculty ratio and an education characterized by its
focus on innovation, interdisciplinary collaboration, and implementing solutions for real world
problems. Our commitment to diversity, equity and inclusion is woven into all aspects of our
campus culture, and we cultivate a vibrant, welcoming environment where everyone is valued.
  <|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|><|eos|>




In [None]:
for x, y, mask in squad_val_loader:
  l = len(x)
  for j in range(l):
    mock_input, mock_output, mock_mask = x[j].tolist(), y[j].tolist(), mask[j].tolist()
    for i in range(len(mock_mask)):
        context = []
        for j in range(len(mock_mask[i])):
            if mock_mask[i][j] == 0:
                break
            context.append(mock_input[j])
        print(f"{encoder.decode(context)} => {encoder.decode([mock_output[i]])}")

  break

In [54]:
pip install nltk



In [55]:
from nltk.translate import rouge

# calculate ROGUE scores

for x, y, mask in squad_val_loader:

  # Example reference and generated summaries
  reference = encoder.decode(y.tolist())
  print(y)
  # Split the sentences into tokens
  reference_tokens = reference.split()
  generated_tokens = []

  # Calculate ROUGE scores
  rouge_calculator = rouge.ROUGE()
  scores = rouge_calculator.evaluate([reference_tokens], [generated_tokens])

  # Access the ROUGE scores
  rouge_1_score = scores["ROUGE-1"][0]["r"]
  rouge_2_score = scores["ROUGE-2"][0]["r"]
  rouge_l_score = scores["ROUGE-L"][0]["r"]

  print("ROUGE-1 Score:", rouge_1_score)
  print("ROUGE-2 Score:", rouge_2_score)
  print("ROUGE-L Score:", rouge_l_score)
  break



ImportError: ignored

In [56]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=38476a3393804e69d2c36114b13db8e8bcb480cb4fb9e3d9479eb6effed4aa7a
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [66]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
scores = scorer.score('hello world',
                      'taylow swhell')
print(scores)

{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.0, recall=0.0, fmeasure=0.0)}
