In [3]:
import csv
import random
import sys

import ipdb
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

sys.path.extend(['/Users/zeerakw/Documents/PhD/projects/active/Multitask-abuse/src'])

from gen.shared.data import GeneralDataset
from gen.shared.batching import Batch, BatchExtractor
from gen.shared.base import Field
from gen.neural import RNNClassifier
from gen.shared.clean import Cleaner

In [4]:
def batch_data(dataset, batch_size):
    """
    Batch a dataset and wrap the batches in an extractor.

    :param dataset: data handed to ``Batch`` as-is
    :param batch_size: batch size handed to ``Batch``
    :return: tuple ``(batches, extractor)`` where ``batches`` is the ``Batch``
    object after ``create_batches()`` has been called and ``extractor`` is a
    ``BatchExtractor`` built over it with the 'encoded' and 'label' keys
    """
    batches = Batch(batch_size, dataset)
    batches.create_batches()
    return batches, BatchExtractor('encoded', 'label', batches)

# Load data

In [5]:
# Davidson et al. (training file).
# Seven fields are declared: the label (ix = 5) and the text (ix = 6) are kept,
# the five leading fields are marked as ignored.
ignore_field = Field('ignore', train = False, label = False, cname = 'ignore', ignore = True)
label_field = Field('label', train = False, label = True, cname = 'label', ignore = False, ix = 5)
text_field = Field('text', train = True, label = False, ignore = False, ix = 6, cname = 'text')

davidson_fields = [ignore_field] * 5 + [label_field, text_field]

davidson = GeneralDataset(
    data_dir = '~/PhD/projects/active/Generalisable_abuse/data/',
    ftype = 'csv',
    fields = davidson_fields,
    train = 'davidson_train.csv',
    dev = None,
    test = None,
    train_labels = None,
    tokenizer = lambda x: x.split(),  # plain whitespace tokenisation
    lower = True,
    preprocessor = None,
    transformations = None,
    label_processor = None,
    sep = ',',
    name = 'Davidson et al.',
)
davidson.load('train')

loading Davidson et al. (train): 887it [00:00, 46776.82it/s]


In [6]:
# Garcia et al. Stormfront data (training file, tab-separated).
# Reuses `ignore_field` from the Davidson cell for the three leading fields.
# NOTE(review): in the Davidson cell each field's `ix` equals its 0-based position
# in the fields list; here the label/text fields sit at positions 3/4 but declare
# ix = 4/5 -- confirm against the TSV column layout.
l_field = Field('label', train = False, label = True, cname = 'label', ignore = False, ix = 4)
t_field = Field('text', train = True, label = False, ignore = False, ix = 5, cname = 'text')

garcia_fields = [ignore_field] * 3 + [l_field, t_field]

garcia = GeneralDataset(
    data_dir = '~/PhD/projects/active/Generalisable_abuse/data/',
    ftype = 'tsv',
    fields = garcia_fields,
    train = 'garcia_stormfront_train.tsv',
    dev = None,
    test = None,
    train_labels = None,
    tokenizer = lambda x: x.split(),  # plain whitespace tokenisation
    lower = True,
    preprocessor = None,
    transformations = None,
    label_processor = lambda x: x[0],  # keep only the first element of the raw label
    sep = '\t',
    name = 'Garcia et al.',
)
garcia.load('train')

loading Garcia et al. (train): 1914it [00:00, 22566.44it/s]


# Process data

In [7]:
# Split 80/10/10 and build the token and label vocabularies from the training split only.
# NOTE(review): the recorded output of this cell shows "0it" for the second encode
# call (dev) -- check that `split` actually yields a non-empty dev set.
train, dev, test = davidson.split(davidson.data, [0.8, 0.1, 0.1])
davidson.build_token_vocab(train)
davidson.build_label_vocab(train)

# Process the labels of every split before encoding it, so train, dev and test
# all go through the same steps in the same order.
davidson.process_labels(train)
davidson.process_labels(dev)
davidson.process_labels(test)

davidson_train = davidson.encode(train, onehot = True)
davidson_dev = davidson.encode(dev, onehot = True)
davidson_test = davidson.encode(test, onehot = True)

Building vocabulary: 100%|██████████| 709/709 [00:00<00:00, 41646.99it/s]
Encoding data: 100%|██████████| 709/709 [00:00<00:00, 866.72it/s]
Encoding data: 0it [00:00, ?it/s]
Encoding data: 100%|██████████| 88/88 [00:00<00:00, 553.25it/s]


In [8]:
# Split 80/10/10 and build the token and label vocabularies from the training split only.
# NOTE(review): the recorded output of this cell shows "0it" for the second encode
# call (dev) -- check that `split` actually yields a non-empty dev set.
garcia_train, garcia_dev, garcia_test = garcia.split(garcia.data, [0.8, 0.1, 0.1])

garcia.build_token_vocab(garcia_train)
garcia.build_label_vocab(garcia_train)

# Process the labels of all three splits before encoding; the test split is
# included so it is treated the same way as train/dev (and as the Davidson test split).
garcia.process_labels(garcia_train)
garcia.process_labels(garcia_dev)
garcia.process_labels(garcia_test)

# The split names are rebound to their encoded versions.
garcia_train = garcia.encode(garcia_train, onehot = True)
garcia_dev = garcia.encode(garcia_dev, onehot = True)
garcia_test = garcia.encode(garcia_test, onehot = True)

Building vocabulary: 100%|██████████| 1531/1531 [00:00<00:00, 124693.76it/s]
Encoding data: 100%|██████████| 1531/1531 [00:24<00:00, 62.13it/s]
Encoding data: 0it [00:00, ?it/s]
Encoding data: 100%|██████████| 191/191 [00:03<00:00, 62.08it/s]


In [None]:
def train_model(model, training_datasets, save_path, optimizer,
                batch_size=64, epochs=30, dev_data=None, clip=None,
                dev_task_id=0,
                patience=10, batches_per_epoch=None, shuffle_data=True,
                loss_weights=None, loss_func=None, task_probs=None):
    """
    Train a (multi-task) model, sampling one task per training batch.

    Besides ``batch_data``, the names ``tqdm``, ``EarlyStopping`` (only when
    ``patience > 0``) and ``eval_model`` (only when ``dev_data`` is given) are
    expected to be available in the notebook namespace.

    :param model: called as ``model(X, task_id)``; must provide ``train()`` and
    ``parameters()``. ``model.binary`` is read when ``patience > 0`` to decide
    whether a low dev score is good (``low_is_good=not model.binary``)
    :param training_datasets: list with one dataset per task; each one is
    passed to ``batch_data`` (assumed to be a sized sequence of encoded
    datapoints -- TODO confirm against ``Batch``)
    :param save_path: path handed to ``EarlyStopping``
    :param optimizer: Pytorch optimizer to train model
    :param batch_size: Training batch size
    :param epochs: Maximum number of epochs (if no early stopping)
    :param dev_data: development dataset; batched as one single batch
    :param clip: maximum gradient norm for gradient clipping (None disables it)
    :param dev_task_id: Task ID for task to use for early stopping, in case of
    multitask learning
    :param patience: value handed to ``EarlyStopping``; early stopping is
    disabled when ``patience <= 0``
    :param batches_per_epoch: set fixed number of batches per epoch. If
    None, it is the total number of training examples divided by the batch size
    :param shuffle_data: currently unused; kept for interface compatibility
    :param loss_weights: array or list of floats, one per task; each batch
    loss is multiplied by the weight of its task. Defaults to all ones
    :param loss_func: callable ``loss_func(preds, y)`` returning a scalar
    tensor. Required
    :param task_probs: sampling probability per task. If None, two tasks are
    sampled with probabilities 0.8 / 0.2 and any other number of tasks
    uniformly
    :raises ValueError: if ``loss_func`` is missing, there are no training
    datasets, ``task_probs`` has the wrong length, or a dataset yields no batches
    :return: None; the model is trained in place
    """
    if loss_func is None:
        raise ValueError("train_model requires a loss_func")

    n_tasks = len(training_datasets)
    if n_tasks == 0:
        raise ValueError("training_datasets must contain at least one dataset")

    if loss_weights is None:
        loss_weights = np.ones(n_tasks)

    if task_probs is None:
        # 80/20 is the split this notebook uses for its two datasets.
        task_probs = [0.8, 0.2] if n_tasks == 2 else [1.0 / n_tasks] * n_tasks
    elif len(task_probs) != n_tasks:
        raise ValueError("task_probs needs one probability per training dataset")

    if batches_per_epoch is None:
        n_examples = sum(len(dataset) for dataset in training_datasets)
        batches_per_epoch = max(1, n_examples // batch_size)

    # Always bound, so the checks below also work when early stopping is disabled.
    early_stopping = None
    if patience > 0:
        early_stopping = EarlyStopping(save_path, patience,
                                       low_is_good=not model.binary)

    extractors = [batch_data(dataset, batch_size)[1]
                  for dataset in training_datasets]
    # One persistent iterator per task: a fresh iter() on every step would
    # keep returning the first batch only.
    batch_iters = [iter(extractor) for extractor in extractors]

    # The dev set does not change between epochs, so batch it once.
    dev_batch = None
    if dev_data is not None:
        _, dev_extractor = batch_data(dev_data, len(dev_data))
        dev_batch = next(iter(dev_extractor))

    for epoch in tqdm(range(epochs)):
        batch_losses = []

        for _ in range(batches_per_epoch):
            task_id = int(np.random.choice(n_tasks, p=task_probs))

            batch = next(batch_iters[task_id], None)
            if batch is None:  # this task's batches are used up: start over
                batch_iters[task_id] = iter(extractors[task_id])
                batch = next(batch_iters[task_id], None)
                if batch is None:
                    raise ValueError(
                        "training dataset %d produced no batches" % task_id)
            X, y = batch

            model.train()
            optimizer.zero_grad()  # gradients accumulate across backward() calls

            preds = model(X, task_id)
            loss = loss_func(preds, y) * float(loss_weights[task_id])
            loss.backward()

            if clip is not None:
                nn.utils.clip_grad_norm_(model.parameters(), clip)  # Prevent exploding gradients

            optimizer.step()

            # .item() stores a plain float instead of a tensor
            batch_losses.append(loss.item())

        print("Epoch train loss:", np.mean(batch_losses))

        if dev_batch is not None:
            X_dev, y_dev = dev_batch
            score, _, _ = eval_model(model, X_dev, y_dev,
                                     task_id=dev_task_id,
                                     batch_size=batch_size)

            if early_stopping is not None and early_stopping(model, score):
                break

    # Restore the best state whether training stopped early or ran out of epochs.
    if early_stopping is not None:
        early_stopping.set_best_state(model)

In [None]:
def JOACHIMS_train_model(model, training_datasets, save_path, optimizer,
                batch_size=64, epochs=30, dev_data=None, clip=None,
                dev_task_id=0,
                patience=10, batches_per_epoch=None, shuffle_data=True,
                loss_weights=None, loss_decay_aux=True):
    """
    Reference multi-task training loop over dense ``(X, y)`` datasets.

    ``EarlyStopping``, ``Batcher``, ``shuffle``, ``tqdm`` and ``eval_model``
    are expected to be defined elsewhere; they are not part of the visible
    notebook cells.

    :param model: called as ``model(d, input_task_id=task_id)``; its output is
    viewed as one value per example. ``model.binary`` selects the loss (binary
    cross-entropy if true, mean squared error otherwise) and is passed to
    ``EarlyStopping`` as ``low_is_good=not model.binary``
    :param training_datasets: list of tuples ``(X, y)`` containing dense matrices
    :param save_path: path handed to ``EarlyStopping``
    :param optimizer: Pytorch optimizer to train model
    :param batch_size: Training batch size
    :param patience: value handed to ``EarlyStopping``; early stopping is
    disabled when ``patience <= 0``
    :param epochs: Maximum number of epochs (if no early stopping)
    :param dev_data: tuple (x, y) of development data
    :param dev_task_id: Task ID for task to use for early stopping, in case of
    multitask learning
    :param clip: maximum gradient norm for gradient clipping (None disables it)
    :param batches_per_epoch: set fixed number of batches per epoch. If
    None, it is the total number of training examples divided by the batch size
    :param shuffle_data: whether to shuffle each dataset once before training
    :param loss_weights: array or list of floats, one per task; each batch
    loss is multiplied by the weight of its task. The caller's object is
    never modified
    :param loss_decay_aux: if true, the weights of all tasks except the first
    are multiplied by 0.9 at the start of every epoch
    :return: None; the model is trained in place
    """
    n_tasks = len(training_datasets)

    # Work on a private float array: the per-epoch decay below must neither
    # alter the caller's weights nor fail when a plain list is passed.
    if loss_weights is None:
        loss_weights = np.ones(n_tasks)
    else:
        loss_weights = np.array(loss_weights, dtype=float)

    if batches_per_epoch is None:
        n_examples = sum(len(dataset[0]) for dataset in training_datasets)
        batches_per_epoch = max(1, n_examples // batch_size)

    early_stopping = None
    if patience > 0:
        early_stopping = EarlyStopping(save_path, patience,
                                       low_is_good=not model.binary)

    # Shuffle (once) and convert to tensors up front, so the shuffled order is
    # the one actually trained on and the conversion is not redone per batch.
    tensors, batchers = [], []
    for X, y in training_datasets:
        if shuffle_data:
            X, y = shuffle(X, y)
        tensors.append((torch.tensor(X).float(), torch.tensor(y).float()))
        batchers.append(Batcher(len(X), batch_size))

    for epoch in tqdm(range(epochs)):
        if loss_decay_aux:
            loss_weights[1:] *= 0.9

        batch_losses = []
        for _ in range(batches_per_epoch):
            task_id = random.choice(range(n_tasks))
            X, y = tensors[task_id]
            size, start, end = batchers[task_id].next_loop()
            d = X[start:end]
            # Same (size, 1) shape as the logits, so the squared-error branch
            # cannot silently broadcast a 1-D target against them.
            gold = y[start:end].view([size, 1])

            model.train()          # training mode (matters for e.g. dropout layers)
            optimizer.zero_grad()  # gradients accumulate across backward() calls
            logits = model(d, input_task_id=task_id).view([size, 1])

            if model.binary:
                loss = torch.nn.functional.binary_cross_entropy(logits, gold)
            else:
                loss = (logits - gold).pow(2).mean()
            loss = loss * float(loss_weights[task_id])
            loss.backward()

            if clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()
            batch_losses.append(loss.item())

        print("Epoch train loss:", np.mean(batch_losses))

        if dev_data is not None:
            X_dev, y_dev = dev_data
            score, _, _ = eval_model(model, X_dev, y_dev,
                                     task_id=dev_task_id,
                                     batch_size=batch_size)

            if early_stopping is not None and early_stopping(model, score):
                break

    # Restore the best state whether training stopped early or ran out of epochs.
    if early_stopping is not None:
        early_stopping.set_best_state(model)
