# Memory Information

In [None]:
import psutil
def get_size(bytes, suffix="B"):
    """
    Scale a byte count to a human-readable string using 1024-based units.

    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'

    Args:
        bytes: Number of bytes (int or float).
        suffix: Text appended after the unit prefix (default "B").

    Returns:
        The value formatted with two decimals followed by the unit prefix
        ("", "K", "M", "G", "T" or "P") and the suffix. Values too large for
        the "P" unit are still reported in "P" instead of returning None.
    """
    factor = 1024
    units = ["", "K", "M", "G", "T", "P"]
    # Walk every unit except the last; the last one is the catch-all below.
    for unit in units[:-1]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    # Anything that reaches this point is expressed in the largest known unit.
    return f"{bytes:.2f}{units[-1]}{suffix}"
# Print a banner followed by a snapshot of system RAM usage, one figure per line.
print("=" * 40, "Memory Information", "=" * 40)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}")
print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}")
print(f"Percentage: {svmem.percent}%")

Total: 25.51GB
Available: 24.60GB
Used: 587.51MB
Percentage: 3.6%


# GPU Information

In [None]:
! nvidia-smi

Tue Sep  8 13:27:08 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install -r requirements.txt

Collecting pytreebank
  Downloading https://files.pythonhosted.org/packages/e0/12/626ead6f6c0a0a9617396796b965961e9dfa5e78b36c17a81ea4c43554b1/pytreebank-0.2.7.tar.gz
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
[K     |████████████████████████████████| 890kB 14.4MB/s 
Collecting loguru
[?25l  Downloading https://files.pythonhosted.org/packages/a0/f7/85704c74dd5b616a528aa724d359033f4dbec896b604cf12b65c4f9badb1/loguru-0.5.2-py3-none-any.whl (57kB)
[K     |████████████████████████████████| 61kB 6.3MB/s 
Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/9f/b1/a5f0574fa0d769bf0a5629722c84bb0af015967191a37401fe8508c5fb6a/optuna-2.1.0.tar.gz (232kB)
[K     |████████████████████████████████| 235kB 18.0MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l

In [None]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
import torch
from dataset import SSTDataset
from torch.utils.data import DataLoader
from utils import transformer_params
from utils import evaluation_metrics, save_model, root_and_binary_title
from math import ceil
from loguru import logger
import numpy as np
import os
import time
from datetime import timedelta
from tqdm import tqdm
import optuna

In [None]:
class GPT2ForSequenceClassification(torch.nn.Module):
  """
  Pretrained GPT-2 encoder with a pooling / 1x1-convolution classification head.

  The head applies, in order: local max pooling over the sequence axis,
  dropout, layer normalisation, a kernel-size-1 convolution with tanh,
  a global max over the sequence axis, a tanh-activated linear layer and a
  final linear projection to `num_labels` logits.
  """

  def __init__(self, num_labels):
    """
    Args:
      num_labels: Number of output classes (size of the logits' last axis).
    """
    super(GPT2ForSequenceClassification, self).__init__()
    self.model = GPT2Model.from_pretrained('gpt2',
                                       config=GPT2Config.from_pretrained('gpt2'))
    # Width of the GPT-2 hidden states, read from the loaded config instead of
    # being hard-coded.
    hidden_size = self.model.config.n_embd

    self.max_pool = torch.nn.MaxPool1d(3, 2)
    self.dropout = torch.nn.Dropout(p=0.1)
    self.layer_norm = torch.nn.LayerNorm(hidden_size)
    self.conv1d_1 = torch.nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size,
                                    kernel_size=1, stride=1)
    self.fc_layer = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.tanh = torch.nn.Tanh()
    self.out_layer = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)
    self.criterion = torch.nn.CrossEntropyLoss()

  def forward(self, input_ids, attention_mask, labels=None):
    """
    Compute class logits and, when labels are given, the cross-entropy loss.

    Args:
      input_ids: Token ids, [batch_size, seq_len].
      attention_mask: Mask forwarded to GPT-2 as `attention_mask`.
      labels: Optional class indices, [batch_size]. When omitted no loss is
        computed, which allows plain inference.

    Returns:
      Tuple `(logits, loss)`; `logits` is [batch_size, num_labels] and `loss`
      is the criterion output, or None when `labels` is None.
    """
    gpt_last_layer = self.model(input_ids, attention_mask=attention_mask)[0]
    #[batch_size, seq_len, hidden_size]

    gpt_last_layer = gpt_last_layer.permute(0, 2, 1)
    #[batch_size, hidden_size, seq_len]

    max_pool_out = self.max_pool(gpt_last_layer)
    #[batch_size, hidden_size, pooled_len]; kernel 3 / stride 2 shortens the
    #sequence axis, so the input needs at least 3 positions.

    max_pool_out = self.dropout(max_pool_out)
    max_pool_out = max_pool_out.permute(0, 2, 1)
    #[batch_size, pooled_len, hidden_size] -- LayerNorm normalises the last axis

    layer_norm_out = self.layer_norm(max_pool_out)
    layer_norm_out = layer_norm_out.permute(0, 2, 1)
    #[batch_size, hidden_size, pooled_len]

    conv1d_1_out = self.conv1d_1(layer_norm_out)
    conv1d_1_out = self.tanh(conv1d_1_out)
    #[batch_size, hidden_size, pooled_len]

    # Global max pooling: keep the values, discard the argmax indices.
    global_max_pooling_out, _ = torch.max(conv1d_1_out, dim=2)
    global_max_pooling_out = self.dropout(global_max_pooling_out)
    #[batch_size, hidden_size]

    fc_layer_out = self.fc_layer(global_max_pooling_out)
    fc_layer_out = self.tanh(fc_layer_out)
    #[batch_size, hidden_size]

    fc_layer_out = self.dropout(fc_layer_out)
    logits = self.out_layer(fc_layer_out)
    #[batch_size, num_labels]

    loss = None
    if labels is not None:
      loss = self.criterion(logits, labels)

    return logits, loss


In [None]:

# Load the pretrained 'gpt2' tokenizer and register '.' as its padding token.
# NOTE(review): this module-level instance does not appear to be used by the
# functions visible in this notebook (load_transformer builds its own
# tokenizer) -- confirm before removing.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Last expression of the cell: the return value of add_special_tokens is what
# the notebook displays as the cell result.
gpt2_tokenizer.add_special_tokens({'pad_token': '.'})

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




0

In [None]:
def load_transformer(name, binary):
  """
  Build the GPT-2 classifier together with a matching tokenizer.

  Args:
    name: Transformer name. Currently unused: the 'gpt2' checkpoint is
      always loaded.
    binary: Truthy for a 2-class head, falsy for a 5-class head.

  Returns:
    Dict with the keys 'model' and 'tokenizer'.
  """
  label_count = 2 if binary else 5
  classifier = GPT2ForSequenceClassification(label_count)

  gpt2_tok = GPT2Tokenizer.from_pretrained('gpt2')
  # Use '.' as the padding token so batches can be padded.
  gpt2_tok.add_special_tokens({'pad_token': '.'})

  container = {}
  container['model'] = classifier
  container['tokenizer'] = gpt2_tok
  return container

In [None]:
# Global compute device used by train/eval code below: the first CUDA GPU when
# torch reports CUDA as available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
def train_step(model, inputs, labels, optimizer):
    """
    Run one optimisation step on a single batch.

    Args:
        model: Callable returning `(logits, loss)` when given input ids plus
            `attention_mask` and `labels` keyword arguments.
        inputs: Mapping providing 'input_ids' and 'attention_mask'.
        labels: Target labels for the batch.
        optimizer: Optimizer whose gradients are reset and then stepped.

    Returns:
        Tuple `(logits, loss)` as produced by the model.
    """
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    token_ids = inputs['input_ids']
    mask = inputs['attention_mask']
    batch_logits, batch_loss = model(token_ids, attention_mask=mask, labels=labels)

    # Back-propagate and apply the parameter update.
    batch_loss.backward()
    optimizer.step()

    return batch_logits, batch_loss

In [None]:
def eval_step(model, inputs, labels):
    """
    Run a forward pass on a single batch without touching any optimizer.

    Args:
        model: Callable returning `(logits, loss)` when given input ids plus
            `attention_mask` and `labels` keyword arguments.
        inputs: Mapping providing 'input_ids' and 'attention_mask'.
        labels: Target labels for the batch.

    Returns:
        Tuple `(logits, loss)` as produced by the model.
    """
    token_ids = inputs['input_ids']
    mask = inputs['attention_mask']
    batch_logits, batch_loss = model(token_ids, attention_mask=mask, labels=labels)
    return batch_logits, batch_loss

In [None]:
def train_epoch(model, tokenizer, train_dataset, optimizer, batch_size):
    """
    Train the model for one full pass over `train_dataset`.

    Args:
        model: Model passed to `train_step`; switched to training mode here.
        tokenizer: Callable that turns a batch of texts into padded 'pt'
            tensors (called with `padding=True, return_tensors='pt'`).
        train_dataset: Dataset yielding `(text, sentiment)` pairs.
        optimizer: Optimizer forwarded to `train_step`.
        batch_size: Number of samples per batch.

    Returns:
        Tuple `(accuracy, mean_loss)`, both averaged over the samples of the
        dataset.
    """
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True)

    correct_count = 0
    total_loss = 0.0

    model.train()
    with tqdm(total=ceil(len(train_dataset)/batch_size), desc='train', unit='batch') as pbar:
        for text, sentiment in train_loader:
            encoded = tokenizer(text, padding=True, return_tensors='pt').to(device)
            sentiment = sentiment.to(device)

            logits, loss = train_step(model, encoded, sentiment, optimizer)

            preds = torch.argmax(logits, dim=1)
            correct_count += (preds == sentiment).sum().item()
            # The classifier in this notebook uses CrossEntropyLoss with its
            # default reduction, i.e. `loss` is a per-batch mean. Weight it by
            # the batch size so that dividing by the dataset size below gives a
            # true per-sample mean (the last batch may be smaller).
            total_loss += loss.item() * sentiment.size(0)
            pbar.update(1)

    return correct_count / len(train_dataset), total_loss / len(train_dataset)

In [None]:
def eval_epoch(model, tokenizer, eval_dataset, batch_size, split):
    """
    Evaluate the model on every sample of `eval_dataset` without gradients.

    Args:
        model: Model passed to `eval_step`; switched to evaluation mode here.
        tokenizer: Callable that turns a batch of texts into padded 'pt'
            tensors (called with `padding=True, return_tensors='pt'`).
        eval_dataset: Dataset yielding `(text, sentiment)` pairs.
        batch_size: Number of samples per batch.
        split: Split name; used as the progress-bar label and forwarded to
            `evaluation_metrics`.

    Returns:
        Tuple `(accuracy, mean_loss, metrics)` where accuracy and loss are
        averaged over the samples of the dataset and `metrics` is whatever
        `evaluation_metrics(y_true, y_pred, split=split)` returns.
    """
    # Order is irrelevant for evaluation, so the data is not shuffled.
    eval_loader = DataLoader(dataset=eval_dataset,
                             batch_size=batch_size,
                             shuffle=False)

    correct_count = 0
    total_loss = 0.0
    y_pred = list()
    y_true = list()

    model.eval()
    with torch.no_grad():
        with tqdm(total=ceil(len(eval_dataset)/batch_size), desc=split, unit='batch') as pbar:
            for text, sentiment in eval_loader:
                encoded = tokenizer(text, padding=True, return_tensors='pt').to(device)
                sentiment = sentiment.to(device)

                logits, loss = eval_step(model, encoded, sentiment)

                preds = torch.argmax(logits, dim=1)
                y_pred.extend(preds.cpu().tolist())
                y_true.extend(sentiment.cpu().tolist())

                correct_count += (preds == sentiment).sum().item()
                # The classifier in this notebook uses CrossEntropyLoss with its
                # default reduction, i.e. `loss` is a per-batch mean. Weight it
                # by the batch size so the division below yields a per-sample
                # mean instead of a value shrunk by the batch size.
                total_loss += loss.item() * sentiment.size(0)
                pbar.update(1)

    metrics_score = evaluation_metrics(y_true, y_pred, split=split)
    return correct_count / len(eval_dataset), total_loss / len(eval_dataset), metrics_score

In [None]:
def train(name, root, binary, epochs=25, patience=3, save=False, trial=None):
    """
    Fine-tune the classifier and evaluate it on dev and test after every epoch.

    Args:
        name: Transformer name; forwarded to `load_transformer` and
            `transformer_params` and used in log messages.
        root: Forwarded to `SSTDataset` as `root`.
        binary: Forwarded to `load_transformer` and `SSTDataset`.
        epochs: Number of epochs when no `trial` is given.
        patience: Currently unused (early stopping is not active).
        save: Currently unused (model saving is not active).
        trial: Optional Optuna trial. When given, it suggests `batch_size`,
            `lr` and `epochs` (overriding the values above), receives the test
            F1 score after every epoch and may prune the run. When None, the
            values from `transformer_params(name)` and `epochs` are used.

    Returns:
        The test F1 score of the last completed epoch (0.0 if no epoch ran).

    Raises:
        ValueError: Re-raised from `load_transformer` after being logged.
        optuna.TrialPruned: When the trial reports that it should be pruned.
    """
    #load model and tokenizer..
    try:
        transformer_container = load_transformer(name, binary)
    except ValueError:
        logger.error("Invalid transformer name!")
        # Propagate the error instead of killing the whole process/kernel with
        # a success exit status.
        raise
    model = transformer_container['model']
    model = model.to(device)
    tokenizer = transformer_container['tokenizer']

    #load batch_size and learning rate..
    params_container = transformer_params(name)
    batch_size = params_container['batch_size']
    learning_rate = params_container['learning_rate']

    #Hyper-parameters suggested by Optuna take precedence when a trial is given.
    if trial is not None:
        batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
        learning_rate = trial.suggest_float("lr", 1e-7, 1e-4, log=True)
        epochs = trial.suggest_int("epochs", 1, 5)

    #load train, dev and test datasets..
    train_dataset = SSTDataset(root=root, binary=binary, split='train')
    dev_dataset = SSTDataset(root=root, binary=binary, split='dev')
    test_dataset = SSTDataset(root=root, binary=binary, split='test')

    #Initialize optimizer..
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # NOTE(review): the value reported to Optuna and returned below is the
    # *test* F1 score, so hyper-parameters are selected on the test split.
    # Consider optimising a dev-split metric instead -- confirm intent.
    test_f1 = 0.0

    total_train_seconds = 0
    for epoch in range(epochs):

        start = time.time()
        train_acc, train_loss = train_epoch(model, tokenizer, train_dataset, optimizer, batch_size)
        end = time.time()
        total_train_seconds += (end - start)
        logger.info(f"epoch: {epoch+1}, transformer: {name}, train_loss: {train_loss:.4f}, train_acc: {train_acc*100:.2f}")

        dev_acc, dev_loss, _ = eval_epoch(model, tokenizer, dev_dataset, batch_size, 'dev')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, dev_loss: {dev_loss:.4f}, dev_acc: {dev_acc*100:.2f}")

        test_acc, test_loss, test_evaluation_metrics = eval_epoch(model, tokenizer, test_dataset,
                                                                  batch_size, 'test')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_loss: {test_loss:.4f}, test_acc: {test_acc*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, "
                    f"test_precision: {test_evaluation_metrics['test_precision']*100:.2f}, "
                    f"test_recall: {test_evaluation_metrics['test_recall']*100:.2f}, "
                    f"test_f1_score: {test_evaluation_metrics['test_f1_score']*100:.2f}, "
                    f"test_accuracy_score: {test_evaluation_metrics['test_accuracy']*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_confusion_matrix: \n"
                    f"{test_evaluation_metrics['test_confusion_matrix']}")

        logger.info(f"Total training time elapsed: {timedelta(seconds=total_train_seconds)}")
        logger.info(f"Mean time per train epoch: {timedelta(seconds=total_train_seconds/(epoch+1))}")

        test_f1 = test_evaluation_metrics['test_f1_score']

        # Model saving and early stopping are intentionally not performed here;
        # `save` and `patience` therefore have no effect.

        # Let Optuna see the intermediate score and stop unpromising trials.
        if trial is not None:
            trial.report(test_f1, epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()

    return test_f1


In [None]:
def hpoptimize_run(trial):
  """
  Optuna objective: run one `train` call for the given trial.

  Args:
    trial: Optuna trial handed to `train`, which draws its hyper-parameters
      from it.

  Returns:
    Whatever `train` returns (the score Optuna optimises).
  """
  score = train(name='gpt2',
                root=True,
                binary=False,
                epochs=100,
                patience=300,
                save=False,
                trial=trial)
  return score

In [None]:
# Create an Optuna study whose objective value (the score returned by
# hpoptimize_run) is to be maximised. No storage is passed, and the recorded
# output reports the study as created in memory.
study = optuna.create_study(direction='maximize')
# Keep running trials of hpoptimize_run until the timeout is reached
# (timeout=1200; Optuna's optimize API takes this value in seconds).
study.optimize(hpoptimize_run, timeout=1200)

[I 2020-09-08 13:28:59,245] A new study created in memory with name: no-name-20f8303e-89a6-4b17-ace5-e3d40591231a


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2020-09-08 13:29:50.402 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: train!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


2020-09-08 13:30:01.292 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: dev!
2020-09-08 13:30:06.949 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: test!
train: 100%|██████████| 534/534 [01:01<00:00,  8.70batch/s]
2020-09-08 13:31:09.033 | INFO     | __main__:train:50 - epoch: 1, transformer: gpt2, train_loss: 0.0991, train_acc: 25.81
dev: 100%|██████████| 69/69 [00:02<00:00, 30.01batch/s]

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

2020-09-08 13:31:11.363 | INFO     | __main__:train:53 - epoch: 1, transformer: gpt2, dev_loss: 0.0986, dev_acc: 26.16
test: 100%|██████████| 139/139 [00:04<00:00, 29.76batch/s]
2020-09-08 13:31:16.057 | INFO     | __main__:train:57 - epoch: 1, transformer: gpt2, test_loss: 0.0991, test_acc: 28.64
2020-09-08 13:31:16.058 | INFO     | __main__:train:58 - epoch: 1, transfo

In [None]:
# Show the best trial found by the study run above; requires that
# study.optimize has completed at least one trial.
print(study.best_trial)