In [4]:
import ipdb
import sys
import csv
import numpy as np
from tqdm import tqdm

from torch.optim import Adam
from mlearn.base import Field
from mlearn.data import clean
from mlearn.data import loaders
from mlearn.modeling.multitask import OnehotLSTMClassifier
from mlearn.data.dataset import GeneralDataset
from mlearn.utils.early_stopping import EarlyStopping
from mlearn.utils.train import process_and_batch, train_mtl_model

# Load data

In [7]:
# Text cleaner configured with the 'lower', 'url' and 'hashtag' processes.
cl = clean.Cleaner(processes=['lower', 'url', 'hashtag'])

# NOTE(review): `liwc_dir` presumably points at the directory holding the LIWC
# resources; the path is user-specific, so adjust it before re-running.
pr = clean.Preprocessors(liwc_dir='~/PhD/projects/active/MTL_abuse/data/')

In [8]:
## Slow version
# Load the Davidson et al. dataset through the shared cleaner `cl`.
# Labels are passed through unchanged (label_processor=None).
davidson = loaders.davidson(
    cleaners=cl,
    data_path='~/PhD/projects/active/MTL_abuse/data/',
    length=200,
    label_processor=None,
)

Loading Davidson et al. (train): 24783it [01:36, 255.52it/s]


In [12]:
## Slow version
# Load the Hoover et al. dataset through the shared cleaner `cl`, tokenising
# with `pr.word_token`. The label processor keeps only the first
# whitespace-separated token of each raw label string.
hoover = loaders.hoover(
    cleaners=cl,
    data_path='~/PhD/projects/active/MTL_abuse/data/',
    length=200,
    preprocessor=pr.word_token,
    label_processor=lambda x: x.split()[0],
)

Loading Hoover et al. (train): 34987it [02:18, 253.34it/s]


# Process data

In [13]:
# Davidson
# Build the token vocabulary, then the label vocabulary, from the loaded
# examples in `davidson.data`. Both calls run on the dataset object itself.
davidson.build_token_vocab(davidson.data)
davidson.build_label_vocab(davidson.data)

Building vocabulary: 100%|██████████| 24783/24783 [00:00<00:00, 181417.05it/s]
Encoding vocabulary: 100%|██████████| 53683/53683 [00:00<00:00, 1095933.48it/s]
Encode label vocab: 100%|██████████| 3/3 [00:00<00:00, 9467.95it/s]


In [14]:
# Hoover
# Build the token vocabulary, then the label vocabulary, from the loaded
# examples in `hoover.data`. Both calls run on the dataset object itself.
hoover.build_token_vocab(hoover.data)
hoover.build_label_vocab(hoover.data)

Building vocabulary: 100%|██████████| 27989/27989 [00:00<00:00, 147005.56it/s]
Encoding vocabulary: 100%|██████████| 42590/42590 [00:00<00:00, 1031590.72it/s]
Encode label vocab: 100%|██████████| 11/11 [00:00<00:00, 20876.63it/s]


In [None]:
# Sanity-check the processed datasets: label-to-index mappings, vocabulary
# sizes, and the attributes stored on the first example of each dataset.
# (The label mappings were previously printed twice; once is enough.)
print(hoover.ltoi)
print(davidson.ltoi)
print(hoover.vocab_size())
print(davidson.vocab_size())
print(hoover.data[0].__dict__)
print(davidson.data[0].__dict__)

In [None]:
# Multitask classifier with one input size and one output size per dataset.
# The per-task lists are given in (hoover, davidson) order, which is the same
# order in which the datasets are later handed to `train_model`.
model = OnehotLSTMClassifier(
    input_dims=[int(hoover.vocab_size()), int(davidson.vocab_size())],
    shared_dim=150,
    hidden_dims=[128, 128],
    output_dims=[hoover.label_count(), davidson.label_count()],
    no_layers=1,
    dropout=0.2,
)

In [None]:
# Adam over every parameter of the multitask model.
# NOTE(review): lr=0.1 is far above Adam's default of 1e-3 — confirm this is intentional.
optimizer = Adam(params=model.parameters(), lr=0.1)

In [None]:
# NOTE: `train_model` is defined in the cell below this one, so that cell has
# to be executed first on a fresh kernel.
# Hoover is task 0 and Davidson is task 1; Hoover's dev split drives evaluation.
train_model(
    model,
    [hoover, davidson],
    'results/',
    optimizer,
    dev_data=hoover.dev,
)

In [None]:
def train_model(model, training_datasets, save_path, optimizer,
                batch_size=64, epochs=2, dev_data=None, clip=None,
                dev_task_id=0,
                patience=10, batches_per_epoch=None, shuffle_data=True,
                loss_weights=None, loss_func=None, task_probs=None):
    """
    Train a multitask model, sampling one task (dataset) per training batch.

    :param model: model called as ``model(X, task_id)`` to produce predictions
    :param training_datasets: list of dataset objects, one per task. Each must
    expose ``.data`` (the training examples), which is batched with
    ``process_and_batch``
    :param save_path: path handed to ``EarlyStopping`` for saving the model
    :param optimizer: Pytorch optimizer to train model
    :param batch_size: Training batch size
    :param patience: Number of epochs to observe non-improving dev performance
    before early stopping. Early stopping is disabled when ``patience <= 0``
    :param epochs: Maximum number of epochs (if no early stopping)
    :param dev_data: development data; when given it is evaluated after every
    epoch. NOTE(review): evaluation relies on ``batch_data`` and ``eval_model``,
    which are not defined or imported in the visible notebook — confirm they
    exist before passing ``dev_data``
    :param dev_task_id: Task ID for task to use for early stopping, in case of
    multitask learning
    :param clip: maximum gradient norm for gradient clipping (None disables it)
    :param batches_per_epoch: set fixed number of batches per epoch. If
    None, it is the total number of training examples across all datasets
    divided by ``batch_size`` (rounded down)
    :param shuffle_data: kept for interface compatibility; currently unused
    :param loss_weights: array or list of floats, one per task. When using
    multiple input/output functions, these weights determine relative task
    importance. Defaults to 1.0 for every task
    :param loss_func: callable ``loss_func(preds, y)`` returning a scalar
    tensor. Defaults to ``torch.nn.functional.cross_entropy``, which assumes
    the model returns per-class scores of shape (batch, classes) and ``y``
    holds class indices — TODO confirm against the model in use
    :param task_probs: sampling probability for each task, in dataset order.
    Defaults to ``[0.8, 0.2]`` for exactly two tasks (the previous hard-coded
    behaviour) and to a uniform distribution otherwise
    :return: None
    """
    # Local import: the notebook's import cell does not import torch itself.
    import torch

    n_tasks = len(training_datasets)
    if n_tasks == 0:
        raise ValueError("training_datasets must contain at least one dataset")

    if loss_weights is None:
        loss_weights = np.ones(n_tasks)

    if loss_func is None:
        loss_func = torch.nn.functional.cross_entropy

    if task_probs is None:
        task_probs = [0.8, 0.2] if n_tasks == 2 else [1.0 / n_tasks] * n_tasks

    if batches_per_epoch is None:
        # One epoch covers (roughly) every training example once.
        total_examples = sum(len(dataset.data) for dataset in training_datasets)
        batches_per_epoch = total_examples // batch_size

    # Must always be bound: the dev-evaluation branch checks it against None.
    early_stopping = None
    if patience > 0:
        early_stopping = EarlyStopping(save_path, patience,
                                       low_is_good=False)

    batchers = [process_and_batch(dataset, dataset.data, batch_size, 'label')
                for dataset in training_datasets]
    # Persistent iterators so successive steps see successive batches rather
    # than the first batch of a freshly created iterator every time.
    batch_iters = [iter(batcher) for batcher in batchers]

    for epoch in tqdm(range(epochs)):
        # Re-enter training mode each epoch in case evaluation switched it off.
        model.train()
        batch_losses = []

        for _ in range(batches_per_epoch):
            # Plain int so it can index lists and module containers alike.
            task_id = int(np.random.choice(n_tasks, p=task_probs))
            X, y = _next_batch(batchers, batch_iters, task_id)

            optimizer.zero_grad()
            preds = model(X, task_id)
            loss = loss_func(preds, y) * float(loss_weights[task_id])
            loss.backward()

            if clip is not None:
                # Prevent exploding gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

            optimizer.step()
            batch_losses.append(loss.item())

        if batch_losses:
            print("Epoch train loss:", np.mean(batch_losses))

        if dev_data is not None:
            # NOTE(review): `batch_data` and `eval_model` are not defined in
            # the visible notebook; these calls are kept exactly as written.
            _, extractor = batch_data(dev_data, len(dev_data))
            X_dev, y_dev = next(iter(extractor))
            score, _, _ = eval_model(model, X_dev, y_dev,
                                     task_id=dev_task_id,
                                     batch_size=batch_size)

            if early_stopping is not None and early_stopping(model, score):
                early_stopping.set_best_state(model)
                break


def _next_batch(batchers, batch_iters, task_id):
    """Return the next batch for ``task_id``, restarting its batcher once exhausted."""
    try:
        return next(batch_iters[task_id])
    except StopIteration:
        # The task's batches ran out: start another pass over the same batcher.
        batch_iters[task_id] = iter(batchers[task_id])
        try:
            return next(batch_iters[task_id])
        except StopIteration:
            raise ValueError(
                "No batches available for task {}".format(task_id)) from None