# Memory Information

In [159]:
import psutil
def get_size(bytes, suffix="B"):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        bytes: Size to format (int or float). The parameter name shadows the
            builtin ``bytes``; it is kept so keyword callers keep working.
        suffix: Text appended after the unit prefix (default ``"B"``).

    Returns:
        The value scaled to the first unit in which it is below 1024, with two
        decimals, e.g. ``get_size(1253656) == "1.20MB"``. Values of 1024**6 and
        above are reported with the ``E`` prefix.
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    # The loop used to fall through here and implicitly return None for values
    # of 1024**6 or more. At this point `bytes` has been divided six times, so
    # it is expressed in the next unit up (E).
    return f"{bytes:.2f}E{suffix}"
# Print a short report of system RAM usage as measured by psutil.
print("=" * 40, "Memory Information", "=" * 40)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}")
print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}")
print(f"Percentage: {svmem.percent}%")

Total: 25.51GB
Available: 22.45GB
Used: 5.39GB
Percentage: 12.0%


# GPU Information

In [160]:
# Shell escape: print the GPU status table (driver/CUDA versions, memory usage) for this runtime.
! nvidia-smi

Wed Sep  2 11:24:34 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    34W / 250W |  14641MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [161]:
!pip install -r requirements.txt



In [162]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
from transformers import BertTokenizer, BertModel, BertConfig
import torch
from dataset import SSTDataset
from torch.utils.data import DataLoader
from utils import transformer_params
from utils import evaluation_metrics, save_model, root_and_binary_title
from math import ceil
from loguru import logger
import numpy as np
import os
import time
from datetime import timedelta
from tqdm import tqdm

In [163]:
def add_special_tokens(text):
    """Wrap every string in ``text`` with ``<|endoftext|>`` markers.

    Args:
        text: Iterable of strings (for example a batch of sentences).

    Returns:
        A tuple with one entry per input string, each prefixed with
        ``'<|endoftext|> '`` and suffixed with ``' <|endoftext|>'``.
    """
    return tuple(
        '<|endoftext|> ' + sentence + ' <|endoftext|>'
        for sentence in list(text)
    )

In [164]:
class GPT2ForSequenceClassification(torch.nn.Module):
    """GPT-2 encoder that reduces a padded batch to one 768-d vector per sequence.

    Pipeline: GPT-2 last hidden states -> max over the sequence axis (padded
    positions excluded) -> dropout -> Linear(768, 768) -> tanh.
    """

    def __init__(self, num_labels):
        """Load the pretrained ``gpt2`` checkpoint and build the pooling head.

        Args:
            num_labels: Accepted for interface compatibility only. This module
                emits 768-d features rather than class logits, so the value is
                not used.
        """
        super(GPT2ForSequenceClassification, self).__init__()
        self.model = GPT2Model.from_pretrained('gpt2',
                                               config=GPT2Config.from_pretrained('gpt2'))
        self.dropout = torch.nn.Dropout(p=0.1)
        self.fc_layer = torch.nn.Linear(in_features=768, out_features=768)
        self.tanh = torch.nn.Tanh()

    def forward(self, input_ids, attention_mask):
        """Encode a batch of token ids into pooled sentence features.

        Args:
            input_ids: Token ids, [batch_size, seq_len].
            attention_mask: 1 for real tokens and 0 for padding,
                [batch_size, seq_len]. May be ``None``, in which case every
                position takes part in the pooling. Each row is expected to
                contain at least one attended token.

        Returns:
            Tensor of shape [batch_size, 768] with values in (-1, 1).
        """
        hidden_states = self.model(input_ids, attention_mask=attention_mask)[0]
        # [batch_size, seq_len, 768]

        if attention_mask is not None:
            # GPT-2 still produces hidden states at padded positions. Without
            # this mask they take part in the max, so a sentence's features
            # would depend on how much padding its batch happened to add.
            is_padding = attention_mask.unsqueeze(-1) == 0
            hidden_states = hidden_states.masked_fill(is_padding, float("-inf"))

        # Global max pooling over the sequence axis.
        pooled, _ = torch.max(hidden_states, dim=1)
        pooled = self.dropout(pooled)
        # [batch_size, 768]

        features = self.tanh(self.fc_layer(pooled))
        # [batch_size, 768]

        return features

In [165]:
class BERTGPT2ForSequenceClassification(torch.nn.Module):
    """Sentence classifier that merges GPT-2 and BERT sentence features.

    384 feature columns are taken from the 768-d GPT-2 encoder output and 384
    from the 768-d BERT pooled output. The concatenated 768-d vector goes
    through Linear(768, 1152) -> tanh -> Linear(1152, num_labels).
    """

    def __init__(self, num_labels):
        """Load both pretrained encoders and tokenizers and build the head.

        Args:
            num_labels: Number of output classes.
        """
        super(BERTGPT2ForSequenceClassification, self).__init__()
        self.gpt2_model = GPT2ForSequenceClassification(num_labels)
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        # '.' is registered as the padding token so batches can be padded.
        self.gpt2_tokenizer.add_special_tokens({'pad_token': '.'})

        self.bert_model = BertModel.from_pretrained("bert-base-uncased")
        self.bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        # Columns taken from each encoder. They are drawn once here instead of
        # on every forward pass: re-drawing them per call fed each fc_layer
        # input slot a different feature every time and made evaluation
        # non-deterministic. The draw is still with replacement, as before.
        # Buffers follow the module across devices and are stored in its
        # state_dict, so a reloaded model selects the same columns.
        self.register_buffer('gpt2_feature_idx', torch.randint(0, 768, (384,)))
        self.register_buffer('bert_feature_idx', torch.randint(0, 768, (384,)))

        self.fc_layer = torch.nn.Linear(in_features=768, out_features=1152)
        self.tanh = torch.nn.Tanh()
        self.out_layer = torch.nn.Linear(in_features=1152, out_features=num_labels)
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, text, labels=None):
        """Classify a batch of raw sentences.

        Args:
            text: Sequence of strings (one per example).
            labels: Optional class indices, [batch_size], on the same device as
                the model. When omitted, no loss is computed.

        Returns:
            ``(logits, loss)``: logits of shape [batch_size, num_labels] and
            the cross-entropy loss, or ``None`` for the loss when ``labels`` is
            ``None``.
        """
        # Use the device the module lives on instead of a notebook-level global.
        device = self.fc_layer.weight.device

        gpt_encoding = self.gpt2_tokenizer(add_special_tokens(text), padding=True, return_tensors='pt')
        gpt_encoding = gpt_encoding.to(device)
        gpt2_out = self.gpt2_model(gpt_encoding['input_ids'], gpt_encoding['attention_mask'])
        # [batch_size, 768]
        del gpt_encoding

        bert_encoding = self.bert_tokenizer(text, padding=True, return_tensors='pt')
        bert_encoding = bert_encoding.to(device)
        # Index 1 of the BERT output is taken as the sentence representation.
        bert_out = self.bert_model(bert_encoding['input_ids'], bert_encoding['attention_mask'])[1]
        # [batch_size, 768]

        merge_out = torch.cat((
            gpt2_out[:, self.gpt2_feature_idx],
            bert_out[:, self.bert_feature_idx],
        ), dim=1)
        # [batch_size, 384 + 384] = [*, 768]

        fc_layer_out = self.tanh(self.fc_layer(merge_out))
        # [batch_size, 1152]

        logits = self.out_layer(fc_layer_out)
        # [batch_size, num_labels]

        loss = None if labels is None else self.criterion(logits, labels)

        return logits, loss


In [166]:
def load_transformer(name, binary):
    """Build the combined BERT + GPT-2 classifier for the requested task.

    Args:
        name: Transformer name; accepted but not consulted by this function.
        binary: Truthy for the 2-class task, falsy for the 5-class task.

    Returns:
        A dict with the single key ``'model'`` holding the new model.
    """
    num_classes = 2 if binary else 5
    return {'model': BERTGPT2ForSequenceClassification(num_classes)}

In [167]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [168]:
def train_step(model, inputs, labels, optimizer):
    """Run one optimization step on a single batch.

    Gradients are cleared, the model is evaluated on the batch, the loss is
    back-propagated and the optimizer applies the update.

    Args:
        model: Callable invoked as ``model(inputs, labels=labels)`` that
            returns a ``(logits, loss)`` pair.
        inputs: Batch inputs handed to the model unchanged.
        labels: Batch targets handed to the model unchanged.
        optimizer: Object providing ``zero_grad()`` and ``step()``.

    Returns:
        The ``(logits, loss)`` pair produced by the model for this batch.
    """
    optimizer.zero_grad()

    batch_logits, batch_loss = model(inputs, labels=labels)

    batch_loss.backward()
    optimizer.step()

    return batch_logits, batch_loss

In [169]:
def eval_step(model, inputs, labels):
    """Evaluate the model on a single batch without updating it.

    Args:
        model: Callable invoked as ``model(inputs, labels=labels)`` that
            returns a ``(logits, loss)`` pair.
        inputs: Batch inputs handed to the model unchanged.
        labels: Batch targets handed to the model unchanged.

    Returns:
        The ``(logits, loss)`` pair produced by the model for this batch.
    """
    batch_logits, batch_loss = model(inputs, labels=labels)
    return batch_logits, batch_loss

In [170]:
def train_epoch(model, train_dataset, optimizer, batch_size):
    """Train the model for one pass over ``train_dataset``.

    Args:
        model: Model invoked through ``train_step``; it must return
            ``(logits, loss)`` for a batch.
        train_dataset: Dataset yielding ``(text, sentiment)`` examples.
        optimizer: Optimizer updating the model parameters.
        batch_size: Number of examples per batch.

    Returns:
        ``(accuracy, mean_loss)``: fraction of correctly classified training
        examples and the average loss per example over the epoch.
    """
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True)

    correct_count = 0
    total_loss = 0.0

    model.train()
    # len(train_loader) equals ceil(len(train_dataset) / batch_size), so the
    # progress bar total is the same as when it was computed by hand.
    for text, sentiment in tqdm(train_loader, desc='train', unit='batch'):
        sentiment = sentiment.to(device)

        logits, loss = train_step(model, text, sentiment, optimizer)

        preds = torch.argmax(logits, dim=1)
        correct_count += (preds == sentiment).sum().item()
        # The model's CrossEntropyLoss uses the default 'mean' reduction, so
        # `loss` is already averaged over the batch. Weight it by the batch
        # size before accumulating; otherwise dividing by the number of
        # examples below under-reports the loss by about a factor of
        # batch_size.
        total_loss += loss.item() * sentiment.size(0)

    num_examples = len(train_dataset)
    return correct_count / num_examples, total_loss / num_examples

In [171]:
def eval_epoch(model, eval_dataset, batch_size, split):
    """Evaluate the model on every example of ``eval_dataset``.

    Args:
        model: Model invoked through ``eval_step``; it must return
            ``(logits, loss)`` for a batch.
        eval_dataset: Dataset yielding ``(text, sentiment)`` examples.
        batch_size: Number of examples per batch.
        split: Name of the split; used as the progress-bar label and passed to
            ``evaluation_metrics``.

    Returns:
        ``(accuracy, mean_loss, metrics_score)``: fraction of correct
        predictions, average loss per example, and whatever
        ``evaluation_metrics`` returns for the collected labels/predictions.
    """
    # Evaluation needs no shuffling; a fixed order keeps runs comparable.
    eval_loader = DataLoader(dataset=eval_dataset,
                             batch_size=batch_size,
                             shuffle=False)

    correct_count = 0
    total_loss = 0.0
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for text, sentiment in tqdm(eval_loader, desc=split, unit='batch'):
            sentiment = sentiment.to(device)

            logits, loss = eval_step(model, text, sentiment)

            preds = torch.argmax(logits, dim=1)
            y_pred.extend(preds.cpu().tolist())
            y_true.extend(sentiment.cpu().tolist())

            correct_count += (preds == sentiment).sum().item()
            # The model's CrossEntropyLoss uses the default 'mean' reduction,
            # so weight the batch loss by the batch size before accumulating;
            # otherwise the per-example average below is too small by about a
            # factor of batch_size.
            total_loss += loss.item() * sentiment.size(0)

    metrics_score = evaluation_metrics(y_true, y_pred, split=split)
    num_examples = len(eval_dataset)
    return correct_count / num_examples, total_loss / num_examples, metrics_score

In [172]:
def train(name, root, binary, epochs=25, patience=3, save=False):
    """Train the classifier and evaluate it on dev/test after every epoch.

    Args:
        name: Transformer name. Passed to ``load_transformer`` and
            ``transformer_params`` and used in log lines and checkpoint names.
        root: Passed to ``SSTDataset`` and ``root_and_binary_title``.
        binary: Truthy for the 2-class task, falsy for the 5-class task.
        epochs: Maximum number of training epochs.
        patience: Number of consecutive epochs without a new best test loss
            after which training stops early.
        save: When true, save the model each time test accuracy improves.

    Returns:
        None. Progress and metrics are reported through the logger.
    """
    # Load model and tokenizer.
    try:
        transformer_container = load_transformer(name, binary)
    except ValueError:
        logger.error("Invalid transformer name!")
        # Return instead of os._exit(): killing the process takes the whole
        # notebook kernel (and all of its state) down with it.
        return
    model = transformer_container['model']
    model = model.to(device)

    # Load batch size and learning rate.
    params_container = transformer_params(name)
    batch_size = params_container['batch_size']
    learning_rate = params_container['learning_rate']

    # Load train, dev and test datasets.
    train_dataset = SSTDataset(root=root, binary=binary, split='train')
    dev_dataset = SSTDataset(root=root, binary=binary, split='dev')
    test_dataset = SSTDataset(root=root, binary=binary, split='test')

    # Initialize optimizer.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize training variables.
    best_acc = 0.0
    best_loss = np.inf
    stopping_step = 0
    best_model_name = None

    total_train_seconds = 0
    for epoch in range(epochs):

        start = time.time()
        train_acc, train_loss = train_epoch(model, train_dataset, optimizer, batch_size)
        end = time.time()
        total_train_seconds += (end - start)
        logger.info(f"epoch: {epoch+1}, transformer: {name}, train_loss: {train_loss:.4f}, train_acc: {train_acc*100:.2f}")

        dev_acc, dev_loss, _ = eval_epoch(model, dev_dataset, batch_size, 'dev')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, dev_loss: {dev_loss:.4f}, dev_acc: {dev_acc*100:.2f}")

        test_acc, test_loss, test_evaluation_metrics = eval_epoch(model, test_dataset,
                                                                  batch_size, 'test')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_loss: {test_loss:.4f}, test_acc: {test_acc*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, "
                    f"test_precision: {test_evaluation_metrics['test_precision']*100:.2f}, "
                    f"test_recall: {test_evaluation_metrics['test_recall']*100:.2f}, "
                    f"test_f1_score: {test_evaluation_metrics['test_f1_score']*100:.2f}, "
                    f"test_accuracy_score: {test_evaluation_metrics['test_accuracy']*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_confusion_matrix: \n"
                    f"{test_evaluation_metrics['test_confusion_matrix']}")

        logger.info(f"Total training time elapsed: {timedelta(seconds=total_train_seconds)}")
        logger.info(f"Mean time per train epoch: {timedelta(seconds=total_train_seconds/(epoch+1))}")

        # NOTE(review): both checkpoint selection and early stopping below are
        # driven by the *test* split, while the dev metrics are only logged.
        # Left unchanged here; consider switching to dev_acc/dev_loss.

        # Save the best model so far.
        if save and test_acc > best_acc:
            best_acc = test_acc
            phrase_type, label = root_and_binary_title(root, binary)
            model_name = "{}_{}_{}_{}.pickle".format(name, phrase_type, label, epoch)
            save_model(model, model_name, best_model_name)
            # Remember the checkpoint just written. This was never updated
            # before, so save_model always received None as its third argument
            # (presumably the previous checkpoint to delete - confirm against
            # utils.save_model).
            best_model_name = model_name

        # Early stopping on the test loss.
        if test_loss < best_loss:
            best_loss = test_loss
            stopping_step = 0
        else:
            stopping_step += 1

        if stopping_step >= patience:
            logger.info("EarlyStopping!")
            # Leave the loop normally; os._exit(1) terminated the kernel
            # process with a failure status for what is an expected outcome.
            break


In [173]:
# Fine-grained (non-binary) run. With patience (300) larger than the number of
# epochs (30), early stopping cannot trigger and all 30 epochs are executed.
train('gpt2', root=True, binary=False, epochs=30, patience=300, save=False)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2020-09-02 11:24:49.075 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: train!
2020-09-02 11:24:56.023 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: dev!
2020-09-02 11:25:01.036 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: test!
train: 100%|██████████| 267/267 [01:37<00:00,  2.74batch/s]
2020-09-02 11:26:38.996 | INFO     | __main__:train:38 - e

In [174]:
# Scratch check of random column selection followed by concatenation.
# torch.randint's upper bound is exclusive, so only columns 0 and 1 are drawn.
a = torch.tensor([[1, 2, 3], [4, 5, 6]])
b = torch.tensor([[10, 20, 30], [40, 50, 60]])

out = torch.cat(
    (
        a.index_select(1, torch.randint(0, 2, (2,))),
        b.index_select(1, torch.randint(0, 2, (2,))),
    ),
    dim=1,
)
out

tensor([[ 2,  2, 10, 20],
        [ 5,  5, 40, 50]])