In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import numpy as np
import os
from torch.nn import init
from torchtext.vocab import Vectors
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import  train_test_split
import math
import logging
import random
from torch.utils.data import Dataset
from tqdm import tqdm
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [3]:
TEXT = data.Field(tokenize='spacy', init_token='<BOS>', eos_token='<EOS>', batch_first=True)
LABEL = data.LabelField(dtype=torch.float)

In [4]:
# all_data = pd.read_csv('./data/all_data_0930.tsv', sep='\t') 
# train_data, valid_data = train_test_split(all_data, test_size = 0.2)
# train_data.to_csv("./data/train_data.csv", index=False)
# valid_data.to_csv("./data/valid_data.csv", index=False)

In [13]:
train_data, valid_data = data.TabularDataset.splits(
    path='./data', train='train_data.csv', test='valid_data.csv', format = 'csv',
    fields=[('text', TEXT), ('label', LABEL)]
)

In [14]:
glove_words = data.TabularDataset(
    path='./data/glove_words.tsv', format = 'tsv',
    fields=[('word', TEXT)]
)

In [15]:
# if not os.path.exists('.vector_cache'):
#     os.mkdir('.vector_cache')
vectors = Vectors(name='glove.6B.100d.txt')
TEXT.build_vocab(train_data, glove_words, vectors=vectors, unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data, valid_data)
len(TEXT.vocab)

09/30/2020 17:36:19 - Loading vectors from .vector_cache/glove.6B.100d.txt.pt


830001

In [16]:
print(TEXT.vocab.freqs.most_common(10))
print(TEXT.vocab.stoi['love'])
# TEXT.vocab.vectors[4,:]

[(',', 3301551), ('.', 2955019), ('the', 2847861), ('to', 1773434), ('and', 1580005), ('of', 1394399), ('a', 1303079), ('`', 1272890), ('in', 1089998), ("'s", 651553)]
267


In [17]:
train_iterator = data.BucketIterator(
        train_data, 
        batch_size=32,
        device=device,
        sort_key=lambda x: len(x.text), 
        sort_within_batch=False,
        repeat=False,
        shuffle=True
)

In [18]:
valid_iterator = data.BucketIterator(
        valid_data, 
        batch_size=32,
        device=device,
        sort_key=lambda x: len(x.text), 
        sort_within_batch=False,
        repeat=False,
        shuffle=True
)

In [19]:
print(len(train_iterator))
batch = next(iter(train_iterator))
batch

4223



[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 32x1178 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 32 (GPU 0)]

In [20]:
# print([TEXT.vocab.itos[int(i)] for i in batch.text[2,:512].tolist()])

In [21]:
class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head

    def forward(self, x, layer_past=None):
        B, T, C = x.size()

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y

In [22]:
class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

In [23]:
# class Positional_Encoding(nn.Module):
#     def __init__(self, embed, pad_size, device):
#         super(Positional_Encoding, self).__init__()
#         self.device = device
#         self.pe = torch.tensor([[pos / (10000.0 ** (i // 2 * 2.0 / embed)) for i in range(embed)] for pos in range(pad_size)])
#         self.pe[:, 0::2] = np.sin(self.pe[:, 0::2])
#         self.pe[:, 1::2] = np.cos(self.pe[:, 1::2])

#     def forward(self, x):
#         out = x + nn.Parameter(self.pe, requires_grad=False).to(self.device)
#         return out

In [24]:
class GPT(nn.Module):
    """  the full GPT language model, with a context size of block_size """

    def __init__(self, config):
        super().__init__()
        
        # input embedding stem
        self.pad_size = config.pad_size
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd, padding_idx=config.pad_index)
#         self.pos_emb = Positional_Encoding(config.n_embd, config.pad_size, config.device)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.pad_size, config.n_embd))
        self.drop = nn.Dropout(config.embd_pdrop)
        # transformer
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        # decoder head
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.fc = nn.Linear(config.pad_size * config.n_embd, config.output_size, bias=False)

        self.apply(self._init_weights)

        print("number of parameters:", sum(p.numel() for p in self.parameters()))


    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def configure_optimizers(self, train_config):
        
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name

                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        b, t = idx.size()
#         assert t <= self.block_size, "Cannot forward, model block size is exhausted."

        # forward the GPT model
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
        x = self.drop(token_embeddings + position_embeddings)
        x = self.drop(x)
        x = self.blocks(x)
        x = self.ln_f(x).view(x.shape[0],-1)
        y_pred = self.fc(x).squeeze(-1)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.binary_cross_entropy_with_logits(y_pred, targets)

        return y_pred, loss


In [25]:
class Trainer:

    def __init__(self, model, train_loader, test_loader, config):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.config = config

        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self):
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", self.config.ckpt_path)
        torch.save(raw_model.state_dict(), self.config.ckpt_path)
        
    def binary_accuracy(self, preds, y):
        rounded_preds = torch.round(torch.sigmoid(preds))
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)
        return acc

    def train(self):
        model, config = self.model, self.config
        raw_model = model.module if hasattr(self.model, "module") else model
        optimizer = raw_model.configure_optimizers(config)

        def run_epoch(split):
            is_train = split == 'train'
            model.train(is_train)
            loader = self.train_loader if is_train else self.test_loader
            
            losses = []
            all_y = []
            all_y_pred = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (x, y) in pbar:
                if x.shape[1]<config.pad_size :
                    padding = torch.zeros((x.shape[0],config.pad_size-x.shape[1]),dtype=torch.long).to(device)
                    x = torch.cat([x, padding], dim=1)
                x = x[:,:config.pad_size]
                # place data on the correct device
                x = x.to(self.device)
                y = y.to(self.device)
                # forward the model
                with torch.set_grad_enabled(is_train):
                    y_pred, loss = model(x, y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())
                    step_score = self.binary_accuracy(y_pred, y)
                    all_y.extend(y)
                    all_y_pred.extend(y_pred)
                
                if is_train:

                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate

                    # report progress
                    pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. score {step_score:.5f}. lr {lr:e}")

            if not is_train:
                test_loss = float(np.mean(losses))
                all_y = torch.stack(all_y, dim=0)
                all_y_pred = torch.stack(all_y_pred, dim=0)
                test_score = self.binary_accuracy(all_y_pred, all_y)
                logger.info("test loss: %f", test_loss)
                logger.info("test score: %f", test_score)
                return test_loss

        best_loss = float('inf')
        self.tokens = 0 # counter used for learning rate decay
        run_epoch('test')
        for epoch in range(config.max_epochs):

            run_epoch('train')
            if self.test_loader is not None:
                test_loss = run_epoch('test')

            # supports early stopping based on the test loss, or just save always if no test set is provided
            good_model = self.test_loader is None or test_loss < best_loss
            if self.config.ckpt_path is not None and good_model:
                best_loss = test_loss
                self.save_checkpoint()

In [26]:
class GPTConfig:
    """ base GPT config, params common to all GPT versions """
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1

    def __init__(self, vocab_size, output_size, pad_index, pad_size, n_embd, **kwargs):
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.pad_index = pad_index
        self.pad_size = pad_size
        self.n_embd = n_embd
        for k,v in kwargs.items():
            setattr(self, k, v)

In [27]:
class TrainerConfig:
    # optimization parameters
    max_epochs = 10
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # only applied on matmul weights
    # learning rate decay params: linear warmup followed by cosine decay to 10% of original
    lr_decay = False
    warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere
    final_tokens = 260e9 # (at what point we reach 10% of original LR)
    # checkpoint settings
    ckpt_path = None
    num_workers = 0 # for DataLoader

    def __init__(self, **kwargs):
        for k,v in kwargs.items():
            print(k,v)
            setattr(self, k, v)

In [28]:
VOCAB_SIZE = len(TEXT.vocab)
OUTPUT_SIZE = 1
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
PAD_SIZE = 512
pretrained_embedding = TEXT.vocab.vectors
EMBEDDING_SIZE = pretrained_embedding.shape[1]

In [29]:
mconf = GPTConfig(VOCAB_SIZE, OUTPUT_SIZE, PAD_IDX, PAD_SIZE, 
                  EMBEDDING_SIZE, n_layer=4, n_head=4, device=device)

In [30]:
model = GPT(mconf)

number of parameters: 83587900


In [31]:
# model

In [32]:
model.tok_emb.weight.data.copy_(pretrained_embedding)
model.tok_emb.weight.requires_grad = False
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# model.tok_emb.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
# model.tok_emb.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
print(model.tok_emb.weight.data.shape)

torch.Size([830001, 100])


In [33]:
print('{} : all params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters()) / 1000 / 1000))
print('{} : need grad params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000 / 1000))

GPT : all params: 83.587900M
GPT : need grad params: 0.587800M


In [34]:
tconf = TrainerConfig(max_epochs=4, learning_rate=6e-4, pad_size=PAD_SIZE,
                      lr_decay=True, warmup_tokens=512*20, final_tokens=2*len(train_data),
                      num_workers=1)

max_epochs 4
learning_rate 0.0006
pad_size 512
lr_decay True
warmup_tokens 10240
final_tokens 270224
num_workers 1


In [35]:
trainer = Trainer(model, train_iterator, valid_iterator, tconf)

In [36]:
trainer.train()

09/30/2020 17:37:56 - test loss: 2.019748
09/30/2020 17:37:56 - test score: 0.411557
epoch 1 iter 4222: train loss 0.22040. score 0.87500. lr 3.185489e-04: 100%|██████████| 4223/4223 [05:29<00:00, 12.80it/s]
09/30/2020 17:43:58 - test loss: 0.309276
09/30/2020 17:43:58 - test score: 0.878031
epoch 2 iter 4222: train loss 0.70201. score 0.75000. lr 6.000000e-05: 100%|██████████| 4223/4223 [05:35<00:00, 12.61it/s]
09/30/2020 17:50:05 - test loss: 0.231109
09/30/2020 17:50:05 - test score: 0.912549
epoch 3 iter 4222: train loss 0.21148. score 0.90625. lr 3.185489e-04: 100%|██████████| 4223/4223 [05:31<00:00, 12.74it/s]
09/30/2020 17:56:12 - test loss: 0.239344
09/30/2020 17:56:12 - test score: 0.912312
epoch 4 iter 4222: train loss 0.32363. score 0.84375. lr 5.977063e-04: 100%|██████████| 4223/4223 [05:31<00:00, 12.76it/s]
09/30/2020 18:02:16 - test loss: 0.255336
09/30/2020 18:02:16 - test score: 0.903490
