Creates BertForSequenceClassification predictions for the Technique Classification (TC) task.
This is where we extract the pre-softmax embeddings for the *base model* of our TC system.

In [0]:
# Mount Google Drive at /content/gdrive (Google Colab only).
# NOTE(review): the output paths built from FILE_PREFIX are relative
# ('gdrive/My Drive/...'), so they presumably rely on the working directory
# being /content -- confirm when running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Notebook shell command: install the packages needed by this notebook.
# pytorch-pretrained-bert provides the BertTokenizer / BertAdam /
# BertForSequenceClassification classes imported in the next cell.
!pip install pytorch-pretrained-bert pytorch-nlp

In [0]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
import pandas as pd
import io
import numpy as np
from sklearn.preprocessing import LabelEncoder
import time

In [0]:
### CONFIG ###
# Locations of the tab-separated train/dev/test files read by get_data().
# NOTE(review): the URLs embed `token=` query parameters -- presumably
# short-lived access tokens for a private repository; verify they are still
# valid and that they are meant to be checked in.
TRAIN_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-train-repetition.tsv?token=AD7GEDPHP327J4M4QTVBV3K6NVUAE'
DEV_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-dev-repetition.tsv?token=AD7GEDK2P4NPZCM32L5M4BK6NVUEA'
TEST_URL = 'https://raw.githubusercontent.com/cicl-iscl/CyberWallE/master/data/tc-test-repetition.tsv?token=AD7GEDIUC2H5665EBOKWRNK6NVUE6'

# Length (in word-piece ids) every input sequence is padded/truncated to.
MAX_LEN = 200
# Number of sequences per DataLoader batch.
BATCH_SIZE = 12
# Learning rate and warm-up proportion handed to the BertAdam optimizer.
LEARNING_RATE = 2e-5
WARMUP = .1
# Number of passes over the training data.
N_EPOCHS = 5  # 2-4 recommended
# Name of the pre-trained checkpoint used for both tokenizer and model.
BERT_MODEL = 'bert-base-uncased'

# Number of decimals the logits are rounded to before being written to file.
ROUNDING_ACC = 9
# Epochs after which dev/test label predictions are written, and epochs for
# which the logits of the train/dev/test instances are saved.
# Can be 'all', 'last' or a list of epoch numbers:
PREDICTION_EPOCHS = [2, 3, 4, 5]
SAVE_LAYER_EPOCHS = []  


# True when the checkpoint name marks it as an uncased model.
UNCASED = 'uncased' in BERT_MODEL
# Root folder (on the mounted Google Drive) for all output files.
FILE_PREFIX = 'gdrive/My Drive/colab_projects/'
# Timestamp of this run; part of every output file name so that runs do not
# overwrite each other.
NOW = time.strftime("%Y%m%d-%H%M%S", time.localtime())
# Hyperparameter / training-loss log of this run.
LOG_FILE = FILE_PREFIX + 'semeval-predictions/log_bert_' + NOW + '.txt'
# Prefixes of the per-epoch logit files ('<prefix><epoch>.tsv').
BERT_TRAIN_PFX = FILE_PREFIX + 'data/tc_train_' + NOW + '_'
BERT_DEV_PFX = FILE_PREFIX + 'data/tc_dev_' + NOW + '_'
BERT_TEST_PFX = FILE_PREFIX + 'data/tc_test_' + NOW + '_'
# Prefixes of the per-epoch predicted-label files ('<prefix><epoch>.txt').
PREDICTIONS_DEV_PFX = FILE_PREFIX + 'semeval-predictions/labels_dev_' + NOW + '_'
PREDICTIONS_TEST_PFX = FILE_PREFIX + 'semeval-predictions/labels_test_' + NOW + '_'
##############

In [0]:
def get_data(url, training=True):
    """Load a TSV file and convert its text spans into BERT input batches.

    Args:
        url: Path or URL of a tab-separated file. Only its first five
            columns are read; it must contain a 'text' column and, when
            `training` is true, a 'label' column.
        training: If true, the labels are integer-encoded, every instance
            carries its label and row index, and batches are drawn in
            random order. Otherwise only input ids and attention masks are
            batched, in file order.

    Returns:
        A tuple `(df, label_encoder, dataloader, spans)`:
        the loaded DataFrame, the fitted LabelEncoder (None unless
        `training`), the DataLoader over the encoded instances, and the
        list of text spans in file order (lower-cased if UNCASED).
    """
    # quoting=3 is csv.QUOTE_NONE: quote characters in the text are kept
    # as ordinary characters.
    df = pd.read_csv(url, sep='\t', quoting=3, usecols=[0, 1, 2, 3, 4])
    labels = None
    label_encoder = None
    if training:
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(df['label'])

    sentences = ["[CLS] " + sentence + " [SEP]" for sentence in df.text.values]
    # The tokenizer has to lower-case its input exactly when the checkpoint
    # is an uncased one (this flag used to be inverted).
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                              do_lower_case=UNCASED)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    # NOTE(review): truncating="post" cuts from the end, so sequences longer
    # than MAX_LEN lose their trailing [SEP] token.
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long",
                              truncating="post", padding="post")

    # pad_sequences pads with 0, so every non-zero id is a real token:
    # mask value 1.0 for tokens, 0.0 for padding.
    attention_masks = [[float(token_id > 0) for token_id in seq]
                       for seq in input_ids]

    # Used for extracting the data in the right order:
    spans = df.text.tolist()
    span_ids = list(range(len(spans)))

    if UNCASED:
        spans = [span.lower() for span in spans]

    if training:
        # The row index travels with each instance so that shuffled batches
        # can be mapped back to their text span.
        data = TensorDataset(torch.tensor(input_ids),
                             torch.tensor(attention_masks),
                             torch.tensor(labels),
                             torch.tensor(span_ids))
        sampler = RandomSampler(data)
    else:
        data = TensorDataset(torch.tensor(input_ids),
                             torch.tensor(attention_masks))
        sampler = SequentialSampler(data)

    dataloader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)

    return df, label_encoder, dataloader, spans

# Load the three data splits. Only the training call returns a fitted label
# encoder; it is used later to map predicted class indices back to labels.
_, label_encoder, train_dataloader, train_spans = get_data(TRAIN_URL)
dev_df, _, dev_dataloader, dev_spans = get_data(DEV_URL, training=False)
# The dev/test frames are written out with the predicted labels later on;
# the text column is dropped here so it is not part of that output.
del dev_df['text']
test_df, _, test_dataloader, test_spans = get_data(TEST_URL, training=False)
del test_df['text']

In [0]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Querying the device name is only valid when CUDA is actually present.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print('cpu')

# Pre-trained BERT encoder with a classification layer for 14 classes.
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=14)
# Move the model to the same device the batches are sent to (previously an
# unconditional model.cuda(), which fails on machines without a GPU).
model.to(device)

In [0]:
# Split the parameters into a group that gets weight decay and a group
# (biases and LayerNorm parameters) that does not.
param_optimizer = list(model.named_parameters())
# 'gamma'/'beta' are the legacy LayerNorm parameter names; the current names
# are 'LayerNorm.weight'/'LayerNorm.bias' ('bias' already covers the latter).
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
# BertAdam reads the per-group key 'weight_decay'. The previously used key
# 'weight_decay_rate' is not read by the optimizer, so its default decay was
# applied to every parameter, including the ones meant to be exempt.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# One optimizer step is taken per training batch, so this is the total
# number of updates. BertAdam needs it (t_total) to apply the warm-up
# schedule; without it the WARMUP setting has no effect.
num_train_steps = len(train_dataloader) * N_EPOCHS

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP,
                     t_total=num_train_steps)

In [0]:
def predict(model, dataloader, save_layer_rep, layer_file_pfx, pred_file_pfx,
            epoch, spans, df):
    """Run the model over `dataloader` and write its predictions to disk.

    Args:
        model: Classifier that returns logits when called with input ids,
            `token_type_ids` and `attention_mask`.
        dataloader: Yields (input_ids, attention_mask) batches.
        save_layer_rep: If true, additionally write the logits of every
            instance to `layer_file_pfx + str(epoch) + '.tsv'`.
        layer_file_pfx: File name prefix for the logits file.
        pred_file_pfx: File name prefix for the predicted-label file
            (`pred_file_pfx + str(epoch) + '.txt'`).
        epoch: Epoch number, used in the output file names.
        spans: Text spans, in the same order as the instances in
            `dataloader`.
        df: DataFrame whose 'label' column is overwritten with the
            predicted labels before it is written out as TSV.

    Uses the module-level `device`, `label_encoder` and `ROUNDING_ACC`.
    The model is switched to evaluation mode and left in that mode.
    """
    model.eval()
    batch_logits = []

    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
            # Collect the logits on the CPU as numpy arrays.
            batch_logits.append(logits.detach().cpu().numpy())

    # One logit vector per instance, in dataloader order.
    predictions = [row for chunk in batch_logits for row in chunk]
    flat_predictions = np.argmax(predictions, axis=1).flatten()

    if save_layer_rep:
        layer_file = layer_file_pfx + str(epoch) + '.tsv'
        with open(layer_file, 'w', encoding='utf-8') as f:
            for logit_row, span in zip(predictions, spans):
                values = [round(x, ROUNDING_ACC) for x in logit_row]
                f.write('1\tclass\t' + span + '\t' + str(values) + '\n')

    # Map the class indices back to the original label strings.
    df['label'] = label_encoder.inverse_transform(flat_predictions)
    df.to_csv(pred_file_pfx + str(epoch) + '.txt', sep='\t',
              header=False, index=False)

In [0]:
def _epoch_selected(selection, epoch, last_epoch):
    """Return True if `epoch` is picked by the epoch setting `selection`.

    `selection` is 'all', 'last' or a collection of epoch numbers. Strings
    are handled separately because `epoch in 'last'` would raise a
    TypeError (an int cannot be tested for membership in a str).
    """
    if isinstance(selection, str):
        return selection == 'all' or (selection == 'last'
                                      and epoch == last_epoch)
    return epoch in selection


# Loss of every optimizer step, and mean loss of every epoch.
train_loss_steps = []
train_loss_epochs = []

for epoch in range(1, N_EPOCHS + 1):
    print('Epoch', epoch, end=' ')
    make_prediction = _epoch_selected(PREDICTION_EPOCHS, epoch, N_EPOCHS)
    save_layer_rep = _epoch_selected(SAVE_LAYER_EPOCHS, epoch, N_EPOCHS)
    # (row index, span, logits-as-string) of every training instance.
    entries = []

    # predict() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)  # Add batch to GPU

        b_input_ids, b_input_mask, b_labels, b_span_ids = batch

        optimizer.zero_grad()  # Clear out the gradients (by default they accumulate)
        # Forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        loss_value = loss.item()
        train_loss_steps.append(loss_value)
        loss.backward()  # Backward pass
        optimizer.step()  # Update parameters via gradient

        # Update tracking variables
        tr_loss += loss_value
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        if save_layer_rep:
            # Save predictions (pre-softmax). They are extracted in eval
            # mode and without gradient tracking, the same way predict()
            # extracts the dev/test logits, so that dropout does not add
            # noise to the saved training representations.
            model.eval()
            with torch.no_grad():
                layers = model(b_input_ids, token_type_ids=None,
                               attention_mask=b_input_mask)
            model.train()
            batch_logits = layers.cpu().numpy()
            for span_id, logit_row in zip(b_span_ids.tolist(), batch_logits):
                values = [round(x, ROUNDING_ACC) for x in logit_row]
                entries.append((span_id, train_spans[span_id], str(values)))

        if step % 50 == 0:
            print('.', end='')

    epoch_loss = tr_loss / nb_tr_steps
    print('\tTraining loss: {}'.format(epoch_loss))
    train_loss_epochs.append(epoch_loss)

    if save_layer_rep:
        # Batches are shuffled; restore the original file order.
        entries = sorted(entries, key=lambda entry: entry[0])
        with open(BERT_TRAIN_PFX + str(epoch) + '.tsv', 'w',
                  encoding='utf-8') as f:
            for entry in entries:
                f.write('1\tclass\t' + entry[1] + '\t' + entry[2] + '\n')

    if make_prediction:
        predict(model, dev_dataloader, save_layer_rep, BERT_DEV_PFX,
                PREDICTIONS_DEV_PFX, epoch, dev_spans, dev_df)
        predict(model, test_dataloader, save_layer_rep, BERT_TEST_PFX,
                PREDICTIONS_TEST_PFX, epoch, test_spans, test_df)

In [0]:
# Record the hyperparameters of this run and the mean training loss of each
# epoch in the run's log file: one 'NAME: value' line per setting, followed
# by the loss list (no trailing newline).
log_settings = (
    ('BERT_MODEL', BERT_MODEL),
    ('MAX_LEN', MAX_LEN),
    ('BATCH_SIZE', BATCH_SIZE),
    ('LEARNING_RATE', LEARNING_RATE),
    ('WARMUP', WARMUP),
    ('N_EPOCHS', N_EPOCHS),
)
log_lines = [name + ': ' + str(value) + '\n' for name, value in log_settings]
log_lines.append('TRAIN LOSS BY EPOCH: ' + str(train_loss_epochs))

with open(LOG_FILE, 'w', encoding='utf-8') as f:
    f.writelines(log_lines)