# Memory Information

In [None]:
import psutil
def get_size(bytes, suffix="B"):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        bytes: Size in bytes (int or float).
        suffix: Text appended after the unit prefix (default "B").

    Returns:
        A string with two decimals, e.g. "1.50KB" or "25.51GB". Values of
        1024 PB or more are reported in "P" units instead of running off the
        end of the unit list (which previously returned None).
    """
    factor = 1024
    units = ["", "K", "M", "G", "T", "P"]
    for unit in units[:-1]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    # Largest known unit reached: report whatever is left in it.
    return f"{bytes:.2f}{units[-1]}{suffix}"
# Banner first, then one line per headline figure of psutil's virtual-memory snapshot.
banner = "=" * 40
print(banner, "Memory Information", banner)
svmem = psutil.virtual_memory()
print(f"Total: {get_size(svmem.total)}")
print(f"Available: {get_size(svmem.available)}")
print(f"Used: {get_size(svmem.used)}")
print(f"Percentage: {svmem.percent}%")

Total: 25.51GB
Available: 24.52GB
Used: 651.97MB
Percentage: 3.9%


# GPU Information

In [None]:
! nvidia-smi

Fri Sep  4 07:16:04 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    35W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Training BERT**

In [None]:
!pip install -r requirements.txt



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
import torch
from dataset import SSTDataset
from torch.utils.data import DataLoader
from utils import transformer_params
from utils import evaluation_metrics, save_model, root_and_binary_title
from math import ceil
from loguru import logger
import numpy as np
import os
import time
from datetime import timedelta
from tqdm import tqdm

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls /content/drive/My\ Drive/

'Colab Notebooks'   Model


In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
def load_transformer(name, binary):
    """Load the pretrained 'bert-base-uncased' classifier and its tokenizer.

    Args:
        name: Transformer identifier. Only 'bert' is handled here; anything
            else is rejected instead of silently loading BERT.
        binary: If falsy, the config is set to 5 labels; otherwise the
            config's default number of labels is kept.

    Returns:
        dict with keys 'model' (BertForSequenceClassification) and
        'tokenizer' (BertTokenizer).

    Raises:
        ValueError: If `name` is not 'bert'.
    """
    if name != 'bert':
        raise ValueError(f"Unsupported transformer name: {name!r}")

    config = BertConfig.from_pretrained('bert-base-uncased')
    if not binary:
        config.num_labels = 5
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    return {'model': model,
            'tokenizer': tokenizer}

In [None]:
def train_step(model, inputs, labels, optimizer):
    """Run one optimisation step on a single batch.

    Clears the optimizer's gradients, calls the model with the batch's
    'input_ids', 'attention_mask' and `labels`, backpropagates the first
    output (the loss) and applies the optimizer update.

    Returns:
        (logits, loss): the second and first model outputs, in that order.
    """
    optimizer.zero_grad()

    outputs = model(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=labels,
    )
    loss, logits = outputs[:2]

    loss.backward()
    optimizer.step()
    return logits, loss

In [None]:
def eval_step(model, inputs, labels):
    """Run the model on one batch and return its first two outputs swapped.

    `labels` gets a leading dimension of size 1 before being passed to the
    model. The caller decides whether gradients are enabled.

    Returns:
        (logits, loss): the second and first model outputs, in that order.
    """
    expanded_labels = labels.unsqueeze(0)
    outputs = model(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=expanded_labels,
    )
    loss, logits = outputs[:2]
    return logits, loss

In [None]:
def train_epoch(model, tokenizer, train_dataset, optimizer, batch_size):
    """Train `model` for one shuffled pass over `train_dataset`.

    Args:
        model: Classifier called through `train_step`.
        tokenizer: Callable turning a batch of raw texts into padded tensors.
        train_dataset: Dataset yielding (text, sentiment) pairs.
        optimizer: Optimizer stepped once per batch.
        batch_size: Number of samples per batch.

    Returns:
        (accuracy, mean_loss): fraction of correctly classified samples and
        the per-sample mean loss over the epoch.
    """
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True)

    correct_count = 0
    total_loss = 0.0

    model.train()
    with tqdm(total=len(train_loader), desc='train', unit='batch') as pbar:
        for text, sentiment in train_loader:
            encoded = tokenizer(text, padding=True, return_tensors='pt').to(device)
            sentiment = sentiment.to(device)

            logits, loss = train_step(model, encoded, sentiment, optimizer)

            preds = torch.argmax(logits, dim=1)
            correct_count += (preds == sentiment).sum().item()
            # Assumes the model returns a batch-mean loss (the default reduction of
            # BertForSequenceClassification). Weight it by the batch size so that
            # dividing by the dataset size below yields a true per-sample mean
            # instead of a value shrunk by roughly 1/batch_size.
            total_loss += loss.item() * sentiment.size(0)
            pbar.update(1)

    return correct_count / len(train_dataset), total_loss / len(train_dataset)

In [None]:
def eval_epoch(model, tokenizer, eval_dataset, batch_size, split):
    """Evaluate `model` on `eval_dataset` without gradient tracking.

    Args:
        model: Classifier called through `eval_step`.
        tokenizer: Callable turning a batch of raw texts into padded tensors.
        eval_dataset: Dataset yielding (text, sentiment) pairs.
        batch_size: Number of samples per batch.
        split: Split name; used as the progress-bar label and passed to
            `evaluation_metrics`.

    Returns:
        (accuracy, mean_loss, metrics): fraction of correct predictions, the
        per-sample mean loss, and whatever `evaluation_metrics` returns for
        the collected labels and predictions.
    """
    # Evaluation results do not depend on sample order, so no shuffling is needed.
    eval_loader = DataLoader(dataset=eval_dataset,
                             batch_size=batch_size,
                             shuffle=False)

    correct_count = 0
    total_loss = 0.0
    y_pred = list()
    y_true = list()

    model.eval()
    with torch.no_grad():
        with tqdm(total=len(eval_loader), desc=split, unit='batch') as pbar:
            for text, sentiment in eval_loader:
                encoded = tokenizer(text, padding=True, return_tensors='pt').to(device)
                sentiment = sentiment.to(device)

                logits, loss = eval_step(model, encoded, sentiment)

                preds = torch.argmax(logits, dim=1)
                y_pred += preds.cpu().numpy().tolist()
                y_true += sentiment.cpu().numpy().tolist()

                correct_count += (preds == sentiment).sum().item()
                # Assumes a batch-mean loss from the model; weight by batch size so
                # the division by the dataset size below gives a per-sample mean.
                total_loss += loss.item() * sentiment.size(0)
                pbar.update(1)

    metrics_score = evaluation_metrics(y_true, y_pred, split=split)
    return correct_count / len(eval_dataset), total_loss / len(eval_dataset), metrics_score

In [None]:
def train(name, root, binary, epochs=25, patience=3, save=False):
    """Fine-tune a transformer on SST with early stopping and optional checkpointing.

    Each epoch trains on the train split, evaluates on dev and test, and logs
    loss/accuracy plus the test precision/recall/F1/confusion matrix.

    Args:
        name: Transformer identifier passed to `load_transformer` and
            `transformer_params`.
        root: Forwarded to `SSTDataset` and `root_and_binary_title`.
        binary: Forwarded to `load_transformer`, `SSTDataset` and
            `root_and_binary_title`.
        epochs: Maximum number of epochs.
        patience: Number of consecutive epochs without a dev-loss improvement
            after which training stops.
        save: If True, save the model whenever dev accuracy improves.

    Raises:
        ValueError: Re-raised from `load_transformer` after logging.
    """
    # Load model and tokenizer.
    try:
        transformer_container = load_transformer(name, binary)
    except ValueError:
        logger.error("Invalid transformer name!")
        # Re-raise instead of os._exit(): a hard exit kills the notebook kernel
        # and skips all cleanup (and used exit status 0 for a failure).
        raise
    model = transformer_container['model']
    model = model.to(device)
    tokenizer = transformer_container['tokenizer']

    # Load batch size and learning rate.
    params_container = transformer_params(name)
    batch_size = params_container['batch_size']
    learning_rate = params_container['learning_rate']

    # Load train, dev and test datasets.
    train_dataset = SSTDataset(root=root, binary=binary, split='train')
    dev_dataset = SSTDataset(root=root, binary=binary, split='dev')
    test_dataset = SSTDataset(root=root, binary=binary, split='test')

    # Initialize optimizer.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Initialize training variables.
    best_acc = 0.0
    best_loss = np.inf
    stopping_step = 0
    best_model_name = None

    total_train_seconds = 0
    for epoch in range(epochs):

        start = time.time()
        train_acc, train_loss = train_epoch(model, tokenizer, train_dataset, optimizer, batch_size)
        end = time.time()
        total_train_seconds += (end - start)
        logger.info(f"epoch: {epoch+1}, transformer: {name}, train_loss: {train_loss:.4f}, train_acc: {train_acc*100:.2f}")

        dev_acc, dev_loss, _ = eval_epoch(model, tokenizer, dev_dataset, batch_size, 'dev')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, dev_loss: {dev_loss:.4f}, dev_acc: {dev_acc*100:.2f}")

        test_acc, test_loss, test_evaluation_metrics = eval_epoch(model, tokenizer, test_dataset,
                                                                  batch_size, 'test')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_loss: {test_loss:.4f}, test_acc: {test_acc*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, "
                    f"test_precision: {test_evaluation_metrics['test_precision']*100:.2f}, "
                    f"test_recall: {test_evaluation_metrics['test_recall']*100:.2f}, "
                    f"test_f1_score: {test_evaluation_metrics['test_f1_score']*100:.2f}, "
                    f"test_accuracy_score: {test_evaluation_metrics['test_accuracy']*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_confusion_matrix: \n"
                    f"{test_evaluation_metrics['test_confusion_matrix']}")

        logger.info(f"Total training time elapsed: {timedelta(seconds=total_train_seconds)}")
        logger.info(f"Mean time per train epoch: {timedelta(seconds=total_train_seconds/(epoch+1))}")

        # Checkpoint on DEV accuracy. Selecting the checkpoint on the test split
        # would leak test data into model selection and make the reported test
        # metrics optimistic.
        if save and dev_acc > best_acc:
            best_acc = dev_acc
            phrase_type, label = root_and_binary_title(root, binary)
            dir_path = '/content/drive/My Drive/Model'
            model_name = os.path.join(dir_path,
                                      "{}_{}_{}_{}.pickle".format(name, phrase_type, label, epoch+1))
            # save_model also receives the previous best path (presumably to
            # remove the older checkpoint -- confirm in utils.save_model).
            save_model(model, model_name, best_model_name)
            best_model_name = model_name

        # Early stopping, also driven by the dev split.
        if dev_loss < best_loss:
            best_loss = dev_loss
            stopping_step = 0
        else:
            stopping_step += 1

        if stopping_step >= patience:
            logger.info("EarlyStopping!")
            # Leave the loop normally; os._exit() would terminate the kernel.
            break


In [None]:
# Fine-grained (5-class) root-level run: at most 10 epochs, patience 3, checkpoints saved.
train(name='bert', root=True, binary=False, epochs=10, patience=3, save=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# **BERT XGBoost**

In [None]:
!pip install xgboost
!pip install optuna
!pip install pytreebank
!pip install tqdm
!pip install loguru
!pip install transformers
!pip install scikit-learn
!pip install nltk

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/06/b0/9a6313c78bca92abfacc08a2ad8b27bfe845256f615786ee2b6452ae1978/optuna-2.0.0.tar.gz (226kB)
[K     |████████████████████████████████| 235kB 3.3MB/s 
[?25hCollecting alembic
[?25l  Downloading https://files.pythonhosted.org/packages/60/1e/cabc75a189de0fbb2841d0975243e59bde8b7822bacbb95008ac6fe9ad47/alembic-1.4.2.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 25.6MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting cliff
[?25l  Downloading https://files.pythonhosted.org/packages/71/06/03b1f92d46546a18eabf33ff7f37ef422c18c93d5a926bf590fee32ebe75/cliff-3.4.0-py3-none-any.whl (76kB)
[K     |████████████████████████████████| 81kB 7.3MB/s 
[?25hCollecting cmaes>=0.5.1
  Downloading https://files.pythonhosted.org/packages/63/88/d5e9b78151dce671d7e78ee4cc8905d832

In [None]:
import torch
from dataset import SSTDataset
from torch.utils.data import DataLoader
import optuna
import os
from transformers import BertTokenizer
import xgboost as xgb
from utils import evaluation_metrics

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
model_path = os.path.join('/content/drive/My Drive/', 'Model/bert_root_fine_2.pickle')
model_path

'/content/drive/My Drive/Model/bert_root_fine_2.pickle'

In [None]:
# NOTE(security): torch.load unpickles arbitrary Python objects -- only load
# checkpoints you produced yourself.
# map_location remaps the stored tensors onto the device that is actually
# available, so a checkpoint written from a GPU session still loads on a
# CPU-only runtime.
model = torch.load(
    model_path,
    map_location=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

In [None]:
class BertFeatures(torch.nn.Module):
  """Exposes the `.bert` sub-module of a wrapped model as a feature extractor."""

  def __init__(self, model):
    super().__init__()
    # Only the `bert` attribute of the wrapped model is kept.
    self.model = model.bert

  def forward(self, inputs):
    """Call the stored module on `inputs['input_ids']` with `inputs['attention_mask']`."""
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    return self.model(input_ids, attention_mask=attention_mask)

In [None]:
features_model = BertFeatures(model)
#print(features_model)

In [None]:
root = True
binary = False
batch_size = 32

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

In [None]:
train_dataset = SSTDataset(root=root, binary=binary, split='train')
dev_dataset = SSTDataset(root=root, binary=binary, split='dev')
test_dataset = SSTDataset(root=root, binary=binary, split='test')

2020-09-05 05:28:43.261 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: train!


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


2020-09-05 05:28:55.012 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: dev!
2020-09-05 05:29:00.933 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: test!


In [None]:
def BERT_forward(text, model, tokenizer):
  """Tokenize `text`, run `model` in eval mode without gradients, and return output element [1].

  NOTE(review): for a Hugging Face BertModel, element [1] is presumably the
  pooled [CLS] vector rather than the last hidden state -- confirm.
  """
  batch = tokenizer(text, padding=True, return_tensors='pt').to(device)

  model.eval()
  with torch.no_grad():
    outputs = model(batch)
    pooled = outputs[1]

  return pooled

In [None]:
# Features are only extracted in the loop below, so batch order is irrelevant.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=False)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

param = {'max_depth': 10, 'eta': 1, 'objective': 'multi:softmax'}
param['nthread'] = 4
param['eval_metric'] = 'mlogloss'
param['num_class'] = 5

num_round = 10

# 1) Run the BERT encoder once over the training split and collect features/labels.
feature_batches = list()
label_batches = list()
batch_no = 0

for text, sentiment in train_loader:
  feature_batches.append(BERT_forward(text, features_model, tokenizer).cpu())
  label_batches.append(sentiment)
  batch_no += 1

features = torch.cat(feature_batches).numpy()
labels = torch.cat(label_batches).numpy()
y_actual = labels.tolist()

# 2) Fit XGBoost once on the full feature matrix.
# The previous version continued boosting with `num_round` new rounds on every
# single mini-batch, so each group of trees only ever saw batch_size samples,
# and the printed "loss" was a running sum over different model states.
dtrain = xgb.DMatrix(data=features, label=labels)
bst = xgb.train(param, dtrain, num_round)

# Booster.eval returns a string ending in '<metric>:<value>'; keep the value.
mlog_loss = float(bst.eval(dtrain, name='mlogloss').split(':')[1])
y_pred = bst.predict(data=dtrain).tolist()

print("Extracted features from {} batches".format(batch_no))
print("Mean loss: {}".format(mlog_loss))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


loss: 0.075301
iteration 0 completed!
loss: 0.16329
iteration 1 completed!
loss: 0.323939
iteration 2 completed!
loss: 0.416531
iteration 3 completed!
loss: 0.516445
iteration 4 completed!
loss: 0.770362
iteration 5 completed!
loss: 0.875898
iteration 6 completed!
loss: 1.1988539999999999
iteration 7 completed!
loss: 1.772706
iteration 8 completed!
loss: 4.738552
iteration 9 completed!
loss: 5.985766
iteration 10 completed!
loss: 6.672102
iteration 11 completed!
loss: 8.00964
iteration 12 completed!
loss: 8.392202999999999
iteration 13 completed!
loss: 9.559890999999999
iteration 14 completed!
loss: 10.678220999999999
iteration 15 completed!
loss: 12.126195
iteration 16 completed!
loss: 14.447215
iteration 17 completed!
loss: 16.881285
iteration 18 completed!
loss: 19.761946
iteration 19 completed!
loss: 22.446106999999998
iteration 20 completed!
loss: 25.902942999999997
iteration 21 completed!
loss: 26.848027999999996
iteration 22 completed!
loss: 30.147035999999996
iteration 23 comp

In [None]:
# (features, labels) per split. The BERT features do not depend on the XGBoost
# hyper-parameters, so they are computed once and shared by all trials.
# Re-run this cell to clear the cache after changing features_model or the datasets.
_BERT_FEATURE_CACHE = {}


def _cached_bert_features(split, dataset):
  """Return (features, labels) numpy arrays for `dataset`, computing them once per `split`."""
  if split not in _BERT_FEATURE_CACHE:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=False)
    feature_batches = list()
    label_batches = list()
    for text, sentiment in loader:
      feature_batches.append(BERT_forward(text, features_model, tokenizer).cpu())
      label_batches.append(sentiment)
    _BERT_FEATURE_CACHE[split] = (torch.cat(feature_batches).numpy(),
                                  torch.cat(label_batches).numpy())
  return _BERT_FEATURE_CACHE[split]


def trial_BERT_XGBoost(trial):
  """Optuna objective: fit XGBoost on BERT features of the train split, score on dev.

  Args:
    trial: Optuna trial used to sample the booster type and its hyper-parameters.

  Returns:
    The 'dev_f1_score' entry of `evaluation_metrics` for the dev predictions.
  """
  param = {'objective': 'multi:softmax',
           "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
           "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
           "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),}

  # Tree-specific parameters are only sampled for tree-based boosters.
  if param["booster"] == "gbtree" or param["booster"] == "dart":
    param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
    param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
    param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
    param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
  if param["booster"] == "dart":
    param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
    param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
    param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
    param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

  param['eval_metric'] = 'mlogloss'
  param['num_class'] = 5

  num_round = 10

  train_features, train_labels = _cached_bert_features('train', train_dataset)
  dev_features, dev_labels = _cached_bert_features('dev', dev_dataset)

  # Fit once on the full training matrix. Continuing boosting on every
  # 32-sample mini-batch (the previous approach) fits each new group of trees
  # to one tiny batch only.
  dtrain = xgb.DMatrix(data=train_features, label=train_labels)
  bst = xgb.train(param, dtrain, num_round)

  ddev = xgb.DMatrix(data=dev_features, label=dev_labels)
  y_true = dev_labels.tolist()
  y_pred = bst.predict(data=ddev).tolist()

  metrics = evaluation_metrics(y_true, y_pred, split='dev')

  return metrics['dev_f1_score']


In [None]:
study = optuna.create_study(direction='maximize')
#study.optimize(trial_BERT_XGBoost, timeout=900)
study.optimize(trial_BERT_XGBoost, n_trials=100)

[I 2020-09-05 06:30:32,901] Trial 0 finished with value: 0.48785366745721515 and parameters: {'booster': 'gbtree', 'lambda': 1.6148183427140605e-05, 'alpha': 0.00026767384302040187, 'max_depth': 8, 'eta': 0.001019615693312664, 'gamma': 1.6859726228584748e-05, 'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.48785366745721515.

Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

[I 2020-09-05 06:30:55,744] Trial 1 finished with value: 0.08316546762589928 and parameters: {'booster': 'gblinear', 'lambda': 2.052367336165611e-08, 'alpha': 0.3742160379699689}. Best is trial 0 with value: 0.48785366745721515.
[I 2020-09-05 06:31:18,786] Trial 2 finished with value: 0.05213270142180095 and parameters: {'booster': 'gblinear', 'lambda': 1.3537270162134945e-05, 'alpha': 0.04611024602630847}. Best is trial 0 with value: 0.48785366745721515.
[I 2020-09-05 06:31:41,789] Trial 3 finished with value: 0.08

KeyboardInterrupt: ignored