In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import numpy as np
import os
from torch.nn import init
from torchtext.vocab import Vectors
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import  train_test_split
import math
import logging
import random
from torch.utils.data import Dataset
from tqdm import tqdm
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader
import spacy
import csv
nlp = spacy.load('en')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
# set up logging
import logging
logging.basicConfig(
        format="%(asctime)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [3]:
pad_size = 512
max_vocal_size = 400000
TEXT = data.Field(tokenize='spacy', init_token='<BOS>', eos_token='<EOS>', batch_first=True, fix_length=pad_size)
LABEL = data.LabelField(use_vocab=False, sequential=False, dtype=torch.float)

In [4]:
# all_data = pd.read_csv('data/all_data_1014.tsv', sep='\t') 
# train_data, valid_data = train_test_split(all_data, test_size = 0.2)
# train_data.to_csv("./data/train_data.csv", index=False)
# valid_data.to_csv("./data/valid_data.csv", index=False)

In [5]:
train_data, valid_data = data.TabularDataset.splits(
    path='./data', train='train_data.csv', test='valid_data.csv', format = 'csv',
    fields=[('text', TEXT), ('label', LABEL)]
)

In [6]:
vectors = Vectors(name='glove.6B.100d.txt')
TEXT.build_vocab(train_data, max_size=max_vocal_size, vectors=vectors, unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)
print("Vocab size: ", len(TEXT.vocab))

10/14/2020 18:58:25 - Loading vectors from .vector_cache/glove.6B.100d.txt.pt


Vocab size:  400004


In [7]:
# print(TEXT.vocab.freqs.most_common(30))
# print(TEXT.vocab.stoi['locve'])
# index = 30005
# print(TEXT.vocab.itos[index])
# print(TEXT.vocab.vectors[index,:10])

In [8]:
train_iterator = data.BucketIterator(
        train_data, 
        batch_size=32,
        sort_key=lambda x: len(x.text), 
        sort_within_batch=False,
        repeat=False,
        shuffle=True,
        device=device
)

In [9]:
valid_iterator = data.BucketIterator(
        valid_data, 
        batch_size=32,
        sort_key=lambda x: len(x.text), 
        sort_within_batch=False,
        repeat=False,
        shuffle=False,
        device=device
)

In [10]:
# print(len(train_iterator))
# print(len(valid_iterator))
# batch = next(iter(valid_iterator))
# x,y = batch
# print(sum(y==1))
# print(x)

In [11]:
class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head

    def forward(self, x, layer_past=None):
        B, T, C = x.size()
        # B = 32, T = 512, C = 100
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y

In [12]:
class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

In [13]:
class GPT(nn.Module):
    """  the full GPT language model, with a context size of block_size """

    def __init__(self, config):
        super().__init__()
        
        # input embedding stem
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd, padding_idx=config.pad_index)
#         self.pos_emb = Positional_Encoding(config.n_embd, config.pad_size, config.device)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.pad_size, config.n_embd))
        self.drop = nn.Dropout(config.embd_pdrop)
        # transformer
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        self.avgpool = torch.nn.AvgPool1d(config.pad_size)
        # decoder head
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.fc = nn.Linear(config.n_embd, config.output_size, bias=False)

        self.apply(self._init_weights)

        print("number of parameters:", sum(p.numel() for p in self.parameters()))


    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def configure_optimizers(self, train_config):
        
        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name

                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # special case the position embedding parameter in the root GPT module as not decayed
        no_decay.add('pos_emb')

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        b, t = idx.size()
#         assert t <= self.block_size, "Cannot forward, model block size is exhausted."

        # forward the GPT model
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb[:, :t, :] # each position maps to a (learnable) vector
        x = self.drop(token_embeddings + position_embeddings)
        x = self.blocks(x).transpose(1,2)
        # (B, T, C) -> (B, C, T)
        x = self.avgpool(x).squeeze(-1)
        # (B, C)
        x = self.ln_f(x)
        y_pred = self.fc(x).squeeze(-1)
        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.binary_cross_entropy_with_logits(y_pred, targets)

        return y_pred, loss


In [14]:
class Trainer:

    def __init__(self, model, train_loader, test_loader, config):
        self.model = model
        self.train_loader = train_loader
        self.test_loader = test_loader
        self.config = config

        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self):
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", self.config.ckpt_path)
        torch.save(raw_model.state_dict(), self.config.ckpt_path)
        
    def binary_accuracy(self, preds, y):
        rounded_preds = torch.round(torch.sigmoid(preds))
        correct = (rounded_preds == y).float()
        acc = correct.sum() / len(correct)
        return acc

    def train(self):
        model, config = self.model, self.config
        raw_model = model.module if hasattr(self.model, "module") else model
        optimizer = raw_model.configure_optimizers(config)

        def run_epoch(split):
            is_train = split == 'train'
            model.train(is_train)
            loader = self.train_loader if is_train else self.test_loader
            
            losses = []
            all_y = []
            all_y_pred = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (x, y) in pbar:
                # place data on the correct device
                x = x.to(self.device)
                y = y.to(self.device)
                # forward the model
                with torch.set_grad_enabled(is_train):
                    y_pred, loss = model(x, y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())
                    step_score = self.binary_accuracy(y_pred, y)
                    all_y.extend(y)
                    all_y_pred.extend(y_pred)
                
                if is_train:

                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate

                    # report progress
                    pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. score {step_score:.5f}. lr {lr:e}")

            if not is_train:
                test_loss = float(np.mean(losses))
                all_y = torch.stack(all_y, dim=0)
                all_y_pred = torch.stack(all_y_pred, dim=0)
                test_score = self.binary_accuracy(all_y_pred, all_y)
                logger.info("test loss: %f", test_loss)
                logger.info("test score: %f", test_score)
                return test_loss

        self.tokens = 0 # counter used for learning rate decay
                best_loss = float('inf')
#         best_loss = run_epoch('test')
        for epoch in range(config.max_epochs):

            run_epoch('train')
            if self.test_loader is not None:
                test_loss = run_epoch('test')

            # supports early stopping based on the test loss, or just save always if no test set is provided
            good_model = self.test_loader is None or test_loss < best_loss
            if self.config.ckpt_path is not None and good_model:
                best_loss = test_loss
                self.save_checkpoint()

In [15]:
class GPTConfig:
    """ base GPT config, params common to all GPT versions """
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1

    def __init__(self, vocab_size, output_size, pad_index, pad_size, n_embd, **kwargs):
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.pad_index = pad_index
        self.pad_size = pad_size
        self.n_embd = n_embd
        for k,v in kwargs.items():
            setattr(self, k, v)

In [16]:
class TrainerConfig:
    # optimization parameters
    max_epochs = 10
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # only applied on matmul weights
    # learning rate decay params: linear warmup followed by cosine decay to 10% of original
    lr_decay = False
    warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere
    final_tokens = 260e9 # (at what point we reach 10% of original LR)
    # checkpoint settings
    ckpt_path = 'gpt-model.pt'
    num_workers = 0 # for DataLoader

    def __init__(self, **kwargs):
        for k,v in kwargs.items():
            print(k,v)
            setattr(self, k, v)

In [27]:
# vocab_size = len(TEXT.vocab)
vocab_size = max_vocal_size + 4
output_size = 1
# pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
pad_idx = 1
# embbding_size  = TEXT.vocab.vectors.shape[1]
embbding_size = 100

In [28]:
mconf = GPTConfig(vocab_size, output_size, pad_idx, pad_size, 
                  embbding_size, n_layer=4, n_head=4, device=device)

In [29]:
model = GPT(mconf).to(device)

number of parameters: 40537100


In [30]:
# model

In [31]:
model.tok_emb.weight.data.copy_(TEXT.vocab.vectors)
model.tok_emb.weight.requires_grad = False
# UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
# model.tok_emb.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_SIZE)
# model.tok_emb.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_SIZE)
print(model.tok_emb.weight.data.shape)

torch.Size([400004, 100])


In [32]:
# model.tok_emb.weight[6666]

In [33]:
print('{} : all params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters()) / 1000 / 1000))
print('{} : need grad params: {:4f}M'.format(model._get_name(), sum(p.numel() for p in model.parameters() if p.requires_grad) / 1000 / 1000))

GPT : all params: 40.537100M
GPT : need grad params: 0.536700M


In [34]:
tconf = TrainerConfig(max_epochs=2, learning_rate=6e-4,lr_decay=True, 
                      warmup_tokens=32*200, final_tokens=4*len(train_data),
                      num_workers=1)

max_epochs 2
learning_rate 0.0006
lr_decay True
warmup_tokens 6400
final_tokens 2786308
num_workers 1


In [35]:
trainer = Trainer(model, train_iterator, valid_iterator, tconf)

In [36]:
trainer.train()

10/14/2020 19:56:14 - test loss: 0.657418
10/14/2020 19:56:14 - test score: 0.662961
epoch 1 iter 21768: train loss 0.14445. score 0.87500. lr 5.132796e-04: 100%|██████████| 21769/21769 [24:52<00:00, 14.59it/s]   
10/14/2020 20:23:12 - test loss: 0.213118
10/14/2020 20:23:12 - test score: 0.921945
10/14/2020 20:23:12 - saving gpt-model.pt
epoch 2 iter 21768: train loss 0.06823. score 0.96875. lr 3.010849e-04: 100%|██████████| 21769/21769 [24:32<00:00, 14.79it/s]   
10/14/2020 20:49:52 - test loss: 0.179157
10/14/2020 20:49:52 - test score: 0.933056
10/14/2020 20:49:52 - saving gpt-model.pt


In [37]:
class Predict:
    
    def __init__(self, model):
        self.model = model
    
    def predict(self, text):
        tokenized = [tok.text for tok in nlp.tokenizer(text)]
        indexed = [TEXT.vocab.stoi[t] for t in tokenized]
        x = torch.LongTensor(indexed).to(device) # seq_len
        x = x.unsqueeze(0) # seq_len * batch_size(1)
        if x.shape[1] < pad_size :
            padding = torch.zeros((x.shape[0],pad_size-x.shape[1]),dtype=torch.long).to(device)
            x = torch.cat([x, padding], dim=1)
        x = x[:,:pad_size]
        self.model.eval()
        pred = torch.sigmoid(self.model(x)[0])
        return pred.item()
    
    def count_acc(self, text_list, local):
        result = []
        for text in text_list:
            result.append(self.predict(text))
        result = torch.tensor(result, dtype = torch.float)
        if local:
            acc = sum(result > 0.5).item()/len(result)
        else:
            acc = sum(result < 0.5).item()/len(result)
        return result, acc
        

In [38]:
# local_test = []
# with open('data/local_test.tsv') as f:
#     reader= csv.reader(f, delimiter='\t')
#     for line in reader:
#         local_test.append(line[0])

In [39]:
# non_local_test = []
# with open('data/non_local_test.tsv') as f:
#     reader= csv.reader(f, delimiter='\t')
#     for line in reader:
#         non_local_test.append(line[0])

In [40]:
model.load_state_dict(torch.load("gpt-model.pt"))

<All keys matched successfully>

In [41]:
predict = Predict(model)

In [42]:
# result, acc = predict.count_acc(local_test, local = True)
# acc

In [43]:
# result_2, acc_2 = predict.count_acc(non_local_test, local = False)
# acc_2

In [44]:
# ((result < 0.5).nonzero()).squeeze()[:20]

In [45]:
# text_1 = local_test[2]
# text_1

In [46]:
# predict.predict(text_1)

In [47]:
# ((result_2 > 0.5).nonzero()).squeeze()[:20]

In [48]:
# text_2 = non_local_test[54]
# text_2

In [49]:
# predict.predict(text_2)

In [50]:
test_text = []
with open('data/test_data.tsv') as f:
    reader= csv.reader(f, delimiter='\t')
    for line in reader:
        test_text.append(line[3])

In [51]:
fout = open('gpt-predict.tsv','w')
for text in test_text:
    prob = predict.predict(text)
    fout.write('{}\n'.format(prob))

In [52]:
test_text[6]

"Swift , Apple 's open source programming language , announces Swift Algorithms . Swift has announced Swift Algorithms . The announcement was made in a blog post on the Swift.org website . The algorithms should help developers fix code and improve app performance faster . Announced on Twitter on Wednesday night , Swift , Apple 's open-source programming language , is bringing new algorithm packages to developers . The announcement was made through the Swift Language account : The new open source Swift Algorithms package was just released , and ready for the community to jump in ! The new open source Swift Algorithms package was just released , and ready for the community to jump in ! To learn more about these new sequence and collection-focused algorithms , head to the https://t.co/5NNXraGyus blog : https://t.co/EsoUq1Q0pU -- Swift Language -LRB- @SwiftLang -RRB- October 8 , 2020 In a blog post on the Swift website , Nate Cook , a member of the Swift standard library team at Apple , sa

In [53]:
predict.predict(test_text[6])

0.5720215439796448