In [1]:
import re
import pickle
import os
import sys
import json

import pandas as pd
import warnings
from tqdm import tqdm

from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
import gensim

from transformers import BertTokenizer
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# nltk.download('stopwords')
spw_set = set(stopwords.words('english'))
spw_set.add('url')
tokenizer = TweetTokenizer()
warnings.filterwarnings("ignore")
# set the which GPU to use
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "1"


def preprocess(tweet):
    """
    Preprocess a single tweet
    :param tweet:
    :return:
    """
    global tokenizer

    # lowercase
    tweet = tweet.lower()
    # noinspection PyUnresolvedReferences
    tweet = re.sub(r"https?:\S+", "URL", tweet)  # replace url
    # replace user
    # tweet = re.sub(r'@\w+', 'USER', tweet)
    # replace hashtag
    # tweet = re.sub(r'#\S+', 'HASHTAG', tweet)
    # tokenize
    return [item.strip() for item in tokenizer.tokenize(tweet) if len(item.strip()) > 0]


def label_encoder(raw_label):
    pre_labels = [
        'subversion', 'loyalty', 'care', 'cheating',
        'purity', 'fairness', 'degradation', 'betrayal', 'harm', 'authority'
    ]
    encode_label = [0]*(len(pre_labels) + 1)
    if type(raw_label) != str:
        encode_label[-1] = 1
        return encode_label
    for label in raw_label.split(','):
        if label not in pre_labels:
            encode_label[-1] = 1
        else:
            encode_label[pre_labels.index(label)] = 1
    return encode_label


def micro_f1_average(y_preds, y_truths):
    precisions = []
    recalls = []
    for idx, (y_pred, y_truth) in enumerate(zip(y_preds, y_truths)):
        # noinspection PyUnresolvedReferences
        true_positives = np.sum(np.logical_and(y_truth, y_pred))

        # compute the sum of tp + fp across training examples and labels
        # noinspection PyUnresolvedReferences
        l_prec_den = np.sum(y_pred)
        if l_prec_den != 0:
            # compute micro-averaged precision
            precisions.append(true_positives / l_prec_den)

        # compute sum of tp + fn across training examples and labels
        # noinspection PyUnresolvedReferences
        l_recall_den = np.sum(y_truth)

        # compute mirco-average recall
        if l_recall_den != 0:
            recalls.append(true_positives / l_recall_den)

    precisions = np.average(precisions)
    recalls = np.average(recalls)
    if precisions + recalls == 0:
        return 0
    f1 = 2 * precisions * recalls / (precisions + recalls)
    return f1


def multi_label_f1(y_preds, y_truths, mode='weighted'):
    preds = dict()
    truths = dict()
    for idx in range(len(y_truths)):
        for jdx in range(len(y_truths[idx])):
            if jdx not in preds:
                preds[jdx] = []
                truths[jdx] = []
            preds[jdx].append(y_preds[idx][jdx])
            truths[jdx].append(y_truths[idx][jdx])
    results = []
    for jdx in preds:
        results.append(metrics.f1_score(preds[jdx], truths[jdx], average=mode))
    return np.average(results)


def build_wt(tkn, emb_path, opath):
    """Build weight using word embedding"""
    embed_len = len(tkn.word_index)
    if embed_len > tkn.num_words:
        embed_len = tkn.num_words

    if emb_path.endswith('.bin'):
        embeds = gensim.models.KeyedVectors.load_word2vec_format(
            emb_path, binary=True, unicode_errors='ignore'
        )
        emb_size = embeds.vector_size
        emb_matrix = list(np.zeros((embed_len + 1, emb_size)))
        for pair in zip(embeds.wv.index2word, embeds.wv.syn0):
            if pair[0] in tkn.word_index and \
                    tkn.word_index[pair[0]] < tkn.num_words:
                emb_matrix[tkn.word_index[pair[0]]] = np.asarray([
                    float(item) for item in pair[1]
                ], dtype=np.float32)
    else:
        dfile = open(emb_path)
        line = dfile.readline().strip().split()
        if len(line) < 5:
            line = dfile.readline().strip().split()
        emb_size = len(line[1:])
        emb_matrix = list(np.zeros((embed_len + 1, emb_size)))
        dfile.close()

        with open(emb_path) as dfile:
            for line in dfile:
                line = line.strip().split()
                if line[0] in tkn.word_index and \
                        tkn.word_index[line[0]] < tkn.num_words:
                    emb_matrix[tkn.word_index[line[0]]] = np.asarray([
                        float(item) for item in line[1:]
                    ], dtype=np.float32)
    # emb_matrix = np.array(emb_matrix, dtype=np.float32)
    np.save(opath, emb_matrix)
    return emb_matrix


def build_tok(docs, max_feature, opath):
    if os.path.exists(opath):
        return pickle.load(open(opath, 'rb'))
    else:
        # load corpus
        tkn = Tokenizer(num_words=max_feature)
        tkn.fit_on_texts(docs)

        with open(opath, 'wb') as wfile:
            pickle.dump(tkn, wfile)
        return tkn


class DataEncoder(object):
    def __init__(self, params, mtype='rnn'):
        """

        :param params:
        :param mtype: Model type, rnn or bert
        """
        self.params = params
        self.mtype = mtype
        if self.mtype == 'rnn':
            self.tok = pickle.load(open(
                os.path.join(params['tok_dir'], '{}.tok'.format(params['dname'])), 'rb'))
        elif self.mtype == 'bert':
            self.tok = AutoTokenizer.from_pretrained(params['bert_name'], )
        else:
            raise ValueError('Only support BERT and RNN data encoders')

    def __call__(self, batch):
        docs = []
        labels = []
        domains = []
        for text, label, domain in batch:
            if self.mtype == 'bert':
                text = self.tok.encode_plus(
                    text, padding='max_length', max_length=self.params['max_len'],
                    return_tensors='pt', return_token_type_ids=False,
                    truncation=True,
                )
                docs.append(text['input_ids'][0])
            else:
                docs.append(text)
            labels.append(label)
            domains.append(domain)

        labels = torch.tensor(labels, dtype=torch.float)
        domains = torch.tensor(domains, dtype=torch.long)
        if self.mtype == 'rnn':
            # padding and tokenize
            docs = self.tok.texts_to_sequences(docs)
            docs = pad_sequences(docs)
            docs = torch.Tensor(docs).long()
        else:
            docs = torch.stack(docs).long()
        return docs, labels, domains


class TorchDataset(Dataset):
    def __init__(self, dataset, domain_name):
        self.dataset = dataset
        self.domain_name = domain_name

    def __len__(self):
        return len(self.dataset['docs'])

    def __getitem__(self, idx):
        if self.domain_name in self.dataset:
            return self.dataset['docs'][idx], self.dataset['labels'][idx], self.dataset[self.domain_name][idx]
        else:
            return self.dataset['docs'][idx], self.dataset['labels'][idx], -1


class RegularBERT(nn.Module):
    def __init__(self, params):
        super(RegularBERT, self).__init__()
        self.params = params

        self.bert_model = AutoModel.from_pretrained(self.params['bert_name'])
        self.dropout = nn.Dropout(self.params['dp_rate'])
        
        # gru layer
#         self.doc_net_general = nn.GRU(
#             self.wemb.embedding_dim, self.word_hidden_size,
#             bidirectional=self.params['bidirectional'], dropout=self.params['dp_rate'],
#             batch_first=True
#         )
        # prediction
        self.predictor = nn.Linear(
            self.bert_model.config.hidden_size, self.params['num_label'], bias=False)

    def forward(self, input_docs):
        output_bert = self.bert_model(input_docs)
        # take the outputs pooler layer
        # doc_embs = self.linear(output_bert[1])
        doc_embs = torch.mean(output_bert[0], dim=1)
        doc_embs = torch.squeeze(doc_embs)
        doc_embs = self.dropout(doc_embs)
        # doc_embs = torch.relu(self.linear(doc_embs))

        # prediction
        doc_preds = self.predictor(doc_embs)
        return doc_preds


class AdaptBERT(nn.Module):
    def __init__(self, params):
        super(AdaptBERT, self).__init__()
        self.params = params
        self.bert_model = AutoModel.from_pretrained(self.params['bert_name'])
        self.dropout = nn.Dropout(self.params['dp_rate'])

        # domain prediction
#         self.domain_net = nn.GRU(
#             self.wemb.embedding_dim, self.word_hidden_size,
#             bidirectional=self.params['bidirectional'], dropout=self.params['dp_rate'],
#             batch_first=True
#         )
        # two domains, this domain vs others
        self.domain_clf = nn.Linear(
            self.bert_model.config.hidden_size, 2, bias=False
        )

        # regular prediction
#         self.document_net = nn.GRU(
#             self.wemb.embedding_dim, self.word_hidden_size,
#             bidirectional=self.params['bidirectional'], dropout=self.params['dp_rate'],
#             batch_first=True
#         )
        # prediction
        self.document_predictor = nn.Linear(
            self.bert_model.config.hidden_size, self.params['num_label'], bias=False
        )

    def forward(self, input_docs):
        output_bert = self.bert_model(input_docs)
        # take the outputs pooler layer
        # doc_embs = self.linear(output_bert[1])
        doc_embs = torch.mean(output_bert[0], dim=1)
        doc_embs = torch.squeeze(doc_embs)
        doc_embs = self.dropout(doc_embs)
        
        # prediction
        doc_preds = self.document_predictor(doc_embs)
        return doc_preds

    def discriminator(self, input_docs):
        output_bert = self.bert_model(input_docs)
        # take the outputs pooler layer
        # doc_embs = self.linear(output_bert[1])
        doc_embs = torch.mean(output_bert[0], dim=1)
        doc_embs = torch.squeeze(doc_embs)
        doc_embs = self.dropout(doc_embs)

        # prediction
        domain_preds = self.domain_clf(doc_embs)
        return domain_preds

    def freeze_layer(self, if_train=True):
        for param in self.bert_model.parameters():
            param.requires_grad = if_train


def data_split(data):
    """
    :param data:
    :return:
    """
    data_indices = list(range(len(data['docs'])))
    np.random.seed(33)  # for reproductive results
    np.random.shuffle(data_indices)

    train_indices = data_indices[:int(.8 * len(data_indices))]
    dev_indices = data_indices[int(.8 * len(data_indices)):int(.9 * len(data_indices))]
    test_indices = data_indices[int(.9 * len(data_indices)):]
    return train_indices, dev_indices, test_indices

In [2]:
all_morality = [
    'subversion', 'loyalty', 'care', 'cheating',
    'purity', 'fairness', 'degradation', 'betrayal', 'harm', 'authority'
]

result_dir = '../resource/results/'
if not os.path.exists(result_dir):
    os.mkdir(result_dir)
model_dir = '../resource/model/'
if not os.path.exists(model_dir):
    os.mkdir(model_dir)
model_dir = model_dir + 'adapt_bert/'
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

params = {
    'result_path': os.path.join(result_dir, 'adapt_bert.txt'),
    'model_dir': model_dir,
    'dname': 'all',
    'dpath': '../data/dataset.tsv',
    'max_feature': 15000,
    'over_sample': True,
    'domain_name': 'corpus',
    'epochs': 10,
    'batch_size': 16,
    'lr': 9e-5,
    'max_len': 60,
    'dp_rate': .2,
    'optimizer': 'adam',
    'emb_dim': 200,
    'unique_domains': [],
    'bidirectional': False,
    'device': 'cuda',
    'bert_name': 'bert-base-uncased', # 'bert-base-uncased','vinai/bertweet-base', 'digitalepidemiologylab/covid-twitter-bert'
    'num_label': len(all_morality)+1,  # plus no-moral
}

In [3]:
all_labels = [
    'subversion', 'loyalty', 'care', 'cheating',
    'purity', 'fairness', 'degradation', 'betrayal', 'harm', 'authority'
]
wfile = open(params['result_path'], 'a')

print('Loading Data...')
all_data = pd.read_csv(params['dpath'], sep='\t', dtype=str)
all_data.tid = all_data.tid.apply(lambda x: str(x))
all_data = all_data[~all_data.text.isna()]
all_data = all_data[~all_data.labels.isna()]
# preprocess tweet and remove short tweet
all_data.text = all_data.text.apply(lambda x: preprocess(x))
all_data = all_data[all_data.text.apply(lambda x: len(x) > 3)]
all_data.text = all_data.text.apply(lambda x: ' '.join(x))
all_data.labels = all_data.labels.apply(lambda x: label_encoder(x))
params['unique_domains'] = list(all_data.corpus.unique())
wfile.write(json.dumps(params) + '\n')

if torch.cuda.is_available() and params['device'] != 'cpu':
    device = torch.device(params['device'])
else:
    device = torch.device('cpu')
params['device'] = device

# load the vaccine data and test the classifier on the vaccine data
vaccine_df = pd.read_csv('../data/vaccine_morality.csv', dtype=str)
vaccine_df.text = vaccine_df.text.apply(lambda x: preprocess(x))
# vaccine_df = vaccine_df[vaccine_df.text.apply(lambda x: len(x) > 3)]
vaccine_df.text = vaccine_df.text.apply(lambda x: ' '.join(x))
vaccine_df = vaccine_df.sample(frac=1).reset_index(drop=True)

# domains
domain_encoder = list(all_data.corpus.unique()) + ['vaccine']

# use half of the vaccine as train and half as test
all_corpus = {
    'docs': all_data.text.to_list(),
    'labels': all_data.labels.to_list(),
    'corpus': all_data.corpus.to_list(),
}
all_corpus['corpus'] = [domain_encoder.index(item) for item in all_corpus['corpus']]

Loading Data...


In [6]:
params['lr'] = 9e-6

In [24]:
print('Run over domains...')
for didx, domain in enumerate(tqdm(params['unique_domains'])):
    if domain in ['ALM', ]: #  'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo'
        continue
    wfile.write('Working on Domain {}, Domain index {} \n'.format(domain, didx))
    in_domain_indices = [item for item in range(len(all_corpus['corpus'])) if all_corpus['corpus'][item] == didx]
    out_domain_indices = [item for item in range(len(all_corpus['corpus'])) if all_corpus['corpus'][item] != didx]

    train_corpus = {
        'docs': [all_corpus['docs'][item] for item in out_domain_indices],
        'labels': [all_corpus['labels'][item] for item in out_domain_indices],
        'corpus': [all_corpus['corpus'][item] for item in out_domain_indices],
    }
    domain_corpus = {
        'docs': [item for item in train_corpus['docs']],
        'labels': [item for item in train_corpus['labels']],
        'corpus': [0] * len(train_corpus['docs']),  # first collect documents from out of domain
    }
    in_domain_corpus = {
        'docs': [all_corpus['docs'][item] for item in in_domain_indices],
        'labels': [all_corpus['labels'][item] for item in in_domain_indices],
        'corpus': [all_corpus['corpus'][item] for item in in_domain_indices],
    }
    
    domain_corpus['docs'].extend(in_domain_corpus['docs'])
    domain_corpus['labels'].extend(in_domain_corpus['labels'])
    domain_corpus['corpus'].extend([1] * len(in_domain_corpus['docs']))

    # 10% for training, 10% for valid, the rest for testing
    test_indices, val_indices, train_indices = data_split(in_domain_corpus)
    in_domain_train = {
        'docs': [in_domain_corpus['docs'][item] for item in train_indices],
        'labels': [in_domain_corpus['labels'][item] for item in train_indices],
        'corpus': [in_domain_corpus['corpus'][item] for item in train_indices]
    }
    train_corpus['docs'].extend(in_domain_train['docs'])
    train_corpus['labels'].extend(in_domain_train['labels'])
    train_corpus['corpus'].extend(in_domain_train['corpus'])

    valid_corpus = {
        'docs': [in_domain_corpus['docs'][item] for item in val_indices],
        'labels': [in_domain_corpus['labels'][item] for item in val_indices],
        'corpus': [in_domain_corpus['corpus'][item] for item in val_indices]
    }
    test_corpus = {
        'docs': [in_domain_corpus['docs'][item] for item in test_indices],
        'labels': [in_domain_corpus['labels'][item] for item in test_indices],
        'corpus': [in_domain_corpus['corpus'][item] for item in test_indices]
    }

    # start to iteratively train and test the proposed approach.
    train_data = TorchDataset(train_corpus, params['domain_name'])
    valid_data = TorchDataset(valid_corpus, params['domain_name'])
    test_data = TorchDataset(test_corpus, params['domain_name'])
    in_domain_train_data = TorchDataset(in_domain_train, params['domain_name'])
    domain_data = TorchDataset(domain_corpus, params['domain_name'])

    train_data_loader = DataLoader(
        train_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    valid_data_loader = DataLoader(
        valid_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    test_data_loader = DataLoader(
        test_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    in_domain_train_data_loader = DataLoader(
        in_domain_train_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    domain_data_loader = DataLoader(
        domain_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )

    omit_optim_names = ['bias', 'LayerNorm.weight']

    adapt_model = AdaptBERT(params)
    adapt_model = adapt_model.to(device)
    domain_criterion = nn.CrossEntropyLoss().to(device)
    criterion_adapt = nn.BCEWithLogitsLoss(reduction='none').to(device)
#     pred_params = [
#         param for name, param in adapt_model.named_parameters() if 'domain' not in name and not any(nd in name for nd in omit_optim_names)]
#     adapt_pred_optim = torch.optim.Adam(pred_params, lr=params['lr']*5)
    pred_params = [
        param for name, param in adapt_model.named_parameters() if 'domain' not in name]
    adapt_pred_optim = torch.optim.Adam(pred_params, lr=params['lr'])
#     domain_params = [
#         param for name, param in adapt_model.named_parameters() if ('domain' in name or 'bert' in name) and not any(nd in name for nd in omit_optim_names)]
#     adapt_domain_optim = torch.optim.Adam(domain_params, lr=params['lr'])
    domain_params = [
        param for name, param in adapt_model.named_parameters() if 'domain' in name]
    adapt_domain_optim = torch.optim.Adam(domain_params, lr=params['lr']*5)

    # train the networks
    print('Start to train...')
    print(params)
    best_valid_adapt = 0.    
    best_test_adapt = 0.

    for epoch in tqdm(range(params['epochs'])):
        train_loss_adapt = 0.
        adapt_model.train()
            
        # train discriminator first
        # adapt_model.freeze_layer(False)
        for _ in range(3):
            for step, train_batch in enumerate(domain_data_loader):
                train_batch = tuple(t.to(device) for t in train_batch)
                input_docs, input_labels, input_domains = train_batch
                adapt_domain_optim.zero_grad()
                domain_preds = adapt_model.discriminator(**{
                    'input_docs': input_docs
                })
                domain_loss = domain_criterion(domain_preds, input_domains)
                domain_loss.backward()
                adapt_domain_optim.step()
#         sys.exit(-1)
        # train predictor
        # adapt_model.freeze_layer(True)
        for step, train_batch in enumerate(train_data_loader):
            train_batch = tuple(t.to(device) for t in train_batch)
            input_docs, input_labels, input_domains = train_batch
            if len(input_docs) == 1:
                continue
            adapt_pred_optim.zero_grad()

            # adapt models
            adapt_preds = adapt_model(**{
                'input_docs': input_docs
            })
            loss_adapt = criterion_adapt(adapt_preds, input_labels)
            domain_preds = torch.sigmoid(adapt_model.discriminator(**{'input_docs': input_docs}))
            loss_adapt = loss_adapt.mean(axis=1)
#             loss_adapt = domain_preds[:, 1] * loss_adapt
            loss_adapt = loss_adapt.mean()
            train_loss_adapt += loss_adapt.item()
            loss_avg_adapt = train_loss_adapt / (step + 1)

            loss_adapt.backward()
            # torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 0.5)
            adapt_pred_optim.step()

        # fit on in domain corpus.
        for _ in range(3):
            for step, train_batch in enumerate(in_domain_train_data_loader):
                train_batch = tuple(t.to(device) for t in train_batch)
                input_docs, input_labels, input_domains = train_batch
                if len(input_docs) == 1:
                    continue
                adapt_pred_optim.zero_grad()
                adapt_preds = adapt_model(**{
                    'input_docs': input_docs
                })
                loss_adapt = criterion_adapt(adapt_preds, input_labels)
                loss_adapt = loss_adapt.mean()
                loss_adapt.backward()
                adapt_pred_optim.step()
        
        # evaluate on valid data
        adapt_model.eval()
        y_preds_adapt = []
        y_trues = []
        for valid_batch in valid_data_loader:
            valid_batch = tuple(t.to(device) for t in valid_batch)
            input_docs, input_labels, input_domains = valid_batch
            with torch.no_grad():
                preds_adapt = adapt_model(**{'input_docs': input_docs})

            logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
            y_preds_adapt.extend(logits_adapt)
            y_trues.extend(input_labels.to('cpu').numpy())

        eval_score_adapt = micro_f1_average(y_preds=y_preds_adapt, y_truths=y_trues)

        if eval_score_adapt > best_valid_adapt:
            best_valid_adapt = eval_score_adapt
            
        # test
        y_preds = []
        y_trues = []
        # evaluate on the test set
        for test_batch in test_data_loader:
            test_batch = tuple(t.to(device) for t in test_batch)
            input_docs, input_labels, input_domains = test_batch

            with torch.no_grad():
                preds_adapt = adapt_model(**{
                    'input_docs': input_docs,
                })
            logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
            y_preds.extend(logits_adapt)
            y_trues.extend(input_labels.to('cpu').numpy())

        test_score_adapt = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
        if best_test_adapt < test_score_adapt:
            best_test_adapt = test_score_adapt
            torch.save(adapt_model, params['model_dir'] + 'adapt_bert_moral.pth')
        test_score_adapt = 'Test on Adapt BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
                domain, epoch, test_score_adapt, best_valid_adapt)
#         print('Adapt Results: ', test_score_adapt)
#         wfile.write(test_score_adapt)
    print('Best on Adapt BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
        domain, best_test_adapt, best_valid_adapt))

  0%|          | 0/7 [00:00<?, ?it/s]

Run over domains...


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.

  0%|          | 0/10 [00:00<?, ?it/s][A

Start to train...
{'result_path': '../resource/results/adapt_bert.txt', 'model_dir': '../resource/model/adapt_bert/', 'dname': 'all', 'dpath': '../data/dataset.tsv', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 10, 'batch_size': 64, 'lr': 9e-05, 'max_len': 60, 'dp_rate': 0.2, 'optimizer': 'adam', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy'], 'bidirectional': False, 'device': device(type='cuda'), 'bert_name': 'vinai/bertweet-base', 'num_label': 11}



 10%|█         | 1/10 [03:48<34:19, 228.79s/it][A
 20%|██        | 2/10 [07:33<30:20, 227.59s/it][A
 30%|███       | 3/10 [11:17<26:25, 226.54s/it][A
 40%|████      | 4/10 [15:03<22:37, 226.32s/it][A
 50%|█████     | 5/10 [18:47<18:47, 225.58s/it][A
 60%|██████    | 6/10 [22:31<15:00, 225.11s/it][A
 70%|███████   | 7/10 [26:15<11:14, 224.83s/it][A
 80%|████████  | 8/10 [29:59<07:29, 224.61s/it][A
 90%|█████████ | 9/10 [33:43<03:44, 224.49s/it][A
100%|██████████| 10/10 [37:28<00:00, 224.82s/it][A
 29%|██▊       | 2/7 [37:36<1:34:00, 1128.01s/it]

Best on Adapt BERT, Domain Baltimore, F1-micro-average 0.6443542341900416, Valid Score 0.6408787997977187



Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.

  0%|          | 0/10 [00:00<?, ?it/s][A

Start to train...
{'result_path': '../resource/results/adapt_bert.txt', 'model_dir': '../resource/model/adapt_bert/', 'dname': 'all', 'dpath': '../data/dataset.tsv', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 10, 'batch_size': 64, 'lr': 9e-05, 'max_len': 60, 'dp_rate': 0.2, 'optimizer': 'adam', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy'], 'bidirectional': False, 'device': device(type='cuda'), 'bert_name': 'vinai/bertweet-base', 'num_label': 11}



 10%|█         | 1/10 [03:48<34:15, 228.42s/it][A
 20%|██        | 2/10 [07:32<30:17, 227.15s/it][A
 30%|███       | 3/10 [11:16<26:22, 226.08s/it][A
 40%|████      | 4/10 [15:00<22:32, 225.47s/it][A
 50%|█████     | 5/10 [18:44<18:45, 225.05s/it][A
 60%|██████    | 6/10 [22:28<14:58, 224.70s/it][A
 70%|███████   | 7/10 [26:11<11:13, 224.36s/it][A
 80%|████████  | 8/10 [29:55<07:28, 224.11s/it][A
 90%|█████████ | 9/10 [33:39<03:44, 224.13s/it][A
100%|██████████| 10/10 [37:23<00:00, 224.35s/it][A
 43%|████▎     | 3/7 [1:15:07<1:37:39, 1464.99s/it]

Best on Adapt BERT, Domain BLM, F1-micro-average 0.0, Valid Score 0.0



Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.

  0%|          | 0/10 [00:00<?, ?it/s][A

Start to train...
{'result_path': '../resource/results/adapt_bert.txt', 'model_dir': '../resource/model/adapt_bert/', 'dname': 'all', 'dpath': '../data/dataset.tsv', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 10, 'batch_size': 64, 'lr': 9e-05, 'max_len': 60, 'dp_rate': 0.2, 'optimizer': 'adam', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy'], 'bidirectional': False, 'device': device(type='cuda'), 'bert_name': 'vinai/bertweet-base', 'num_label': 11}



 10%|█         | 1/10 [03:49<34:22, 229.14s/it][A
 20%|██        | 2/10 [07:33<30:22, 227.80s/it][A
 30%|███       | 3/10 [12:56<30:11, 258.77s/it][A
 43%|████▎     | 3/7 [1:28:11<1:57:35, 1763.77s/it]


KeyboardInterrupt: 

In [None]:
print('Run over domains...')
for didx, domain in enumerate(tqdm(params['unique_domains'])):
    if domain in ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo']:
        continue
    wfile.write('Working on Domain {}, Domain index {} \n'.format(domain, didx))
    in_domain_indices = [item for item in range(len(all_corpus['corpus'])) if all_corpus['corpus'][item] == didx]
    out_domain_indices = [item for item in range(len(all_corpus['corpus'])) if all_corpus['corpus'][item] != didx]

    train_corpus = {
        'docs': [all_corpus['docs'][item] for item in out_domain_indices],
        'labels': [all_corpus['labels'][item] for item in out_domain_indices],
        'corpus': [all_corpus['corpus'][item] for item in out_domain_indices],
    }
    domain_corpus = {
        'docs': [item for item in train_corpus['docs']],
        'labels': [item for item in train_corpus['labels']],
        'corpus': [0] * len(train_corpus['docs']),  # first collect documents from out of domain
    }
    in_domain_corpus = {
        'docs': [all_corpus['docs'][item] for item in in_domain_indices],
        'labels': [all_corpus['labels'][item] for item in in_domain_indices],
        'corpus': [all_corpus['corpus'][item] for item in in_domain_indices],
    }
    
    domain_corpus['docs'].extend(in_domain_corpus['docs'])
    domain_corpus['labels'].extend(in_domain_corpus['labels'])
    domain_corpus['corpus'].extend([1] * len(in_domain_corpus['docs']))

    # 10% for training, 10% for valid, the rest for testing
    test_indices, val_indices, train_indices = data_split(in_domain_corpus)
    in_domain_train = {
        'docs': [in_domain_corpus['docs'][item] for item in train_indices],
        'labels': [in_domain_corpus['labels'][item] for item in train_indices],
        'corpus': [in_domain_corpus['corpus'][item] for item in train_indices]
    }
    train_corpus['docs'].extend(in_domain_train['docs'])
    train_corpus['labels'].extend(in_domain_train['labels'])
    train_corpus['corpus'].extend(in_domain_train['corpus'])

    valid_corpus = {
        'docs': [in_domain_corpus['docs'][item] for item in val_indices],
        'labels': [in_domain_corpus['labels'][item] for item in val_indices],
        'corpus': [in_domain_corpus['corpus'][item] for item in val_indices]
    }
    test_corpus = {
        'docs': [in_domain_corpus['docs'][item] for item in test_indices],
        'labels': [in_domain_corpus['labels'][item] for item in test_indices],
        'corpus': [in_domain_corpus['corpus'][item] for item in test_indices]
    }

    # start to iteratively train and test the proposed approach.
    train_data = TorchDataset(train_corpus, params['domain_name'])
    valid_data = TorchDataset(valid_corpus, params['domain_name'])
    test_data = TorchDataset(test_corpus, params['domain_name'])
    in_domain_train_data = TorchDataset(in_domain_train, params['domain_name'])
    domain_data = TorchDataset(domain_corpus, params['domain_name'])

    train_data_loader = DataLoader(
        train_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    valid_data_loader = DataLoader(
        valid_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    test_data_loader = DataLoader(
        test_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    in_domain_train_data_loader = DataLoader(
        in_domain_train_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )
    domain_data_loader = DataLoader(
        domain_data, batch_size=params['batch_size'], shuffle=True,
        collate_fn=DataEncoder(params, mtype='bert')
    )

    omit_optim_names = ['bias', 'LayerNorm.weight']
    regular_model = RegularBERT(params)
    regular_model = regular_model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    regular_optim = torch.optim.Adam(
        [p for n, p in regular_model.named_parameters() if not any(nd in n for nd in omit_optim_names)],
        lr=params['lr']
    )
    
    indomain_model = RegularBERT(params)
    indomain_model = indomain_model.to(device)
    indomain_optim = torch.optim.Adam(
        [p for n, p in indomain_model.named_parameters() if not any(nd in n for nd in omit_optim_names)],
        lr=params['lr']
    )

    adapt_model = AdaptBERT(params)
    adapt_model = adapt_model.to(device)
    domain_criterion = nn.CrossEntropyLoss().to(device)
    criterion_adapt = nn.BCEWithLogitsLoss(reduction='none').to(device)
    pred_params = [
        param for name, param in adapt_model.named_parameters() if 'domain' not in name and not any(nd in name for nd in omit_optim_names)]
    adapt_pred_optim = torch.optim.Adam(pred_params, lr=params['lr'])
    domain_params = [
        param for name, param in adapt_model.named_parameters() if 'domain' in name and not any(nd in name for nd in omit_optim_names)]
    adapt_domain_optim = torch.optim.Adam(domain_params, lr=params['lr'] * 0.1)

    # train the networks
    print('Start to train...')
    print(params)
    best_valid_regular = 0.
    best_valid_adapt = 0.
    best_valid_indomain = 0.
    
    best_test_regular = 0.
    best_test_adapt = 0.
    best_test_indomain = 0.

    for epoch in tqdm(range(params['epochs'])):
        train_loss_regular = 0.
        train_loss_adapt = 0.
        adapt_model.train()
        regular_model.train()
        indomain_model.train()
        
        # train indomain model for comparison
        for step, train_batch in enumerate(in_domain_train_data_loader):
            train_batch = tuple(t.to(device) for t in train_batch)
            input_docs, input_labels, input_domains = train_batch
            if len(input_docs) == 1:
                continue
            indomain_optim.zero_grad()
            
            # indomain models
            indomain_preds = indomain_model(**{'input_docs': input_docs})
            loss = criterion(indomain_preds, input_labels)
            loss.backward()
            indomain_optim.step()
            
        # train discriminator first
        # adapt_model.freeze_layer(False)
        for step, train_batch in enumerate(domain_data_loader):
            train_batch = tuple(t.to(device) for t in train_batch)
            input_docs, input_labels, input_domains = train_batch
            adapt_domain_optim.zero_grad()
            domain_preds = adapt_model.discriminator(**{
                'input_docs': input_docs
            })
            domain_loss = domain_criterion(domain_preds, input_domains)
            domain_loss.backward()
            adapt_domain_optim.step()

        # train predictor
        # adapt_model.freeze_layer(True)
        for step, train_batch in enumerate(train_data_loader):
            train_batch = tuple(t.to(device) for t in train_batch)
            input_docs, input_labels, input_domains = train_batch
            if len(input_docs) == 1:
                continue
            regular_optim.zero_grad()
            adapt_pred_optim.zero_grad()
            # adapt_domain_optim.zero_grad()

            # regular models
            regular_preds = regular_model(**{
                'input_docs': input_docs
            })
            loss = criterion(regular_preds, input_labels)
            train_loss_regular += loss.item()
            loss_avg_regular = train_loss_regular / (step + 1)

            # adapt models
            adapt_preds = adapt_model(**{
                'input_docs': input_docs
            })
            loss_adapt = criterion_adapt(adapt_preds, input_labels)
            domain_preds = torch.sigmoid(adapt_model.discriminator(**{'input_docs': input_docs}))
            loss_adapt = loss_adapt.mean(axis=1)
            loss_adapt = domain_preds[:, 1] * loss_adapt
            loss_adapt = loss_adapt.mean()
            train_loss_adapt += loss_adapt.item()
            loss_avg_adapt = train_loss_adapt / (step + 1)

            loss_adapt.backward()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 0.5)
            regular_optim.step()
            adapt_pred_optim.step()

        # fit on in domain corpus.
        for _ in range(3):
            for step, train_batch in enumerate(in_domain_train_data_loader):
                train_batch = tuple(t.to(device) for t in train_batch)
                input_docs, input_labels, input_domains = train_batch
                if len(input_docs) == 1:
                    continue
                adapt_pred_optim.zero_grad()
                adapt_preds = adapt_model(**{
                    'input_docs': input_docs
                })
                loss_adapt = criterion_adapt(adapt_preds, input_labels)
                loss_adapt = loss_adapt.mean()
                loss_adapt.backward()
                adapt_pred_optim.step()
        
        # evaluate on valid data
        regular_model.eval()
        adapt_model.eval()
        indomain_model.eval()
        y_preds_regular = []
        y_preds_adapt = []
        y_preds_indomain = []
        y_trues = []
        for valid_batch in valid_data_loader:
            valid_batch = tuple(t.to(device) for t in valid_batch)
            input_docs, input_labels, input_domains = valid_batch
            with torch.no_grad():
                preds_regular = regular_model(**{'input_docs': input_docs})
                preds_adapt = adapt_model(**{'input_docs': input_docs})
                preds_indomain = indomain_model(**{'input_docs': input_docs})

            logits_regular = (torch.sigmoid(preds_regular) > .5).long().cpu().numpy()
            logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
            logits_indomain = (torch.sigmoid(preds_indomain) > .5).long().cpu().numpy()
            
            y_preds_regular.extend(logits_regular)
            y_preds_adapt.extend(logits_adapt)
            y_preds_indomain.extend(logits_indomain)
            y_trues.extend(input_labels.to('cpu').numpy())

        eval_score_regular = micro_f1_average(y_preds=y_preds_regular, y_truths=y_trues)
        eval_score_adapt = micro_f1_average(y_preds=y_preds_adapt, y_truths=y_trues)
        eval_score_indomain = micro_f1_average(y_preds=y_preds_indomain, y_truths=y_trues)

        # test for regular model
        if eval_score_regular > best_valid_regular:
            best_valid_regular = eval_score_regular
            torch.save(regular_model, params['model_dir'] + 'regular_bert_moral.pth')

            # test
            y_preds = []
            y_trues = []
            # evaluate on the test set
            for test_batch in test_data_loader:
                test_batch = tuple(t.to(device) for t in test_batch)
                input_docs, input_labels, input_domains = test_batch

                with torch.no_grad():
                    preds_regular = regular_model(**{
                        'input_docs': input_docs,
                    })
                logits_regular = (torch.sigmoid(preds_regular) > .5).long().cpu().numpy()
                y_preds.extend(logits_regular)
                y_trues.extend(input_labels.to('cpu').numpy())

            test_score_regular = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
            if best_test_regular < test_score_regular:
                best_test_regular = test_score_regular
            regular_results = 'Test on Regular BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
                    domain, epoch, test_score_regular, best_valid_regular)
#             print('Regular Results: ', regular_results)
#             wfile.write(regular_results)
        
        # test for indomain model
        if eval_score_indomain > best_valid_indomain:
            best_valid_indomain = eval_score_indomain
            torch.save(indomain_model, params['model_dir'] + 'regular_bert_moral.pth')

            # test
            y_preds = []
            y_trues = []
            # evaluate on the test set
            for test_batch in test_data_loader:
                test_batch = tuple(t.to(device) for t in test_batch)
                input_docs, input_labels, input_domains = test_batch

                with torch.no_grad():
                    preds_indomain = indomain_model(**{
                        'input_docs': input_docs,
                    })
                logits_indomain = (torch.sigmoid(preds_indomain) > .5).long().cpu().numpy()
                y_preds.extend(logits_indomain)
                y_trues.extend(input_labels.to('cpu').numpy())

            test_score_indomain = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
            if best_test_indomain < test_score_indomain:
                best_test_indomain = test_score_indomain
            indomain_results = 'Test on Indomain BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
                    domain, epoch, test_score_indomain, best_valid_indomain)
#             print('Regular Results: ', indomain_results)
#             wfile.write(indomain_results)

        if eval_score_adapt > best_valid_adapt:
            best_valid_adapt = eval_score_adapt

            # test
            y_preds = []
            y_trues = []
            # evaluate on the test set
            for test_batch in test_data_loader:
                test_batch = tuple(t.to(device) for t in test_batch)
                input_docs, input_labels, input_domains = test_batch

                with torch.no_grad():
                    preds_adapt = adapt_model(**{
                        'input_docs': input_docs,
                    })
                logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
                y_preds.extend(logits_adapt)
                y_trues.extend(input_labels.to('cpu').numpy())

            test_score_adapt = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
            if best_test_adapt < test_score_adapt:
                best_test_adapt = test_score_adapt
                torch.save(adapt_model, params['model_dir'] + 'adapt_bert_moral.pth')
                
            test_score_adapt = 'Test on Adapt BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
                    domain, epoch, test_score_adapt, best_valid_adapt)
#             print('Adapt Results: ', test_score_adapt)
#             wfile.write(test_score_adapt)
            
    print('Best on Regular BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
        domain, best_test_regular, best_valid_regular))
    print('Best on InDomain BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
        domain, best_test_indomain, best_valid_indomain))
    print('Best on Adapt BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
        domain, best_test_adapt, best_valid_adapt))

In [5]:
# vaccine experiments
vaccine_data = {
    'docs': [],
    'labels': [],
}
# wfile.write('\nVaccine Evaluation---\n')

for idx, row in vaccine_df.iterrows():
    encode_label = [0] * params['num_label']
    for label_index, _ in enumerate(all_labels):
        if np.isnan(np.array(row[all_labels[label_index]], dtype=np.float32)):
            continue
        if int(row[all_labels[label_index]]) == 1:
            encode_label[label_index] = 1
    if sum(encode_label) == 0:
        encode_label[-1] = 1
    vaccine_data['docs'].append(row['text'])
    vaccine_data['labels'].append(encode_label)

vaccine_train_docs, vaccine_test_docs, vaccine_train_labels, vaccine_test_labels = train_test_split(
    vaccine_data['docs'], vaccine_data['labels'], test_size=.50)

vaccine_train = {
    'docs': [item for item in vaccine_train_docs],
    'labels': [item for item in vaccine_train_labels],
    'corpus': [1] * len(vaccine_train_docs)
}
vaccine_test = {
    'docs': [item for item in vaccine_data['docs'][250:]],
    'labels': [item for item in vaccine_data['labels'][250:]],
    'corpus': [1] * len(vaccine_test_docs)
}
all_train = {
    'docs': all_data.text.to_list(),
    'labels': all_data.labels.to_list(),
    'corpus': [0] * len(all_data.labels.to_list())
}
all_train['docs'].extend([item for item in vaccine_train['docs']])
all_train['labels'].extend([item for item in vaccine_train['labels']])
all_train['corpus'].extend([1] * len(vaccine_train['docs']))

all_data_corpus = {
    'docs': all_data.text.to_list(),
    'labels': all_data.labels.to_list(),
    'corpus': [0] * len(all_data.labels.to_list())
}
all_data_corpus['docs'].extend([item for item in vaccine_data['docs']])
all_data_corpus['labels'].extend([item for item in vaccine_data['labels']])
all_data_corpus['corpus'].extend([1] * 500)

vaccine_train_data = TorchDataset(vaccine_train, domain_name=params['domain_name'])
vaccine_test_data = TorchDataset(vaccine_test, domain_name=params['domain_name'])
all_train_data = TorchDataset(all_train, domain_name=params['domain_name'])
all_data_torch = TorchDataset(all_data_corpus, domain_name=params['domain_name'])

vaccine_train_data_loader = DataLoader(
    vaccine_train_data, batch_size=params['batch_size'], shuffle=True,
    collate_fn=DataEncoder(params, mtype='bert')
)
train_data_loader = DataLoader(
    all_train_data, batch_size=params['batch_size'], shuffle=True,
    collate_fn=DataEncoder(params, mtype='bert')
)
valid_data_loader = DataLoader(
    vaccine_train_data, batch_size=params['batch_size'], shuffle=True,
    collate_fn=DataEncoder(params, mtype='bert')
)
test_data_loader = DataLoader(
    vaccine_test_data, batch_size=params['batch_size'], shuffle=False,
    collate_fn=DataEncoder(params, mtype='bert')
)
all_data_loader = DataLoader(
    all_data_torch, batch_size=params['batch_size'], shuffle=True,
    collate_fn=DataEncoder(params, mtype='bert')
)

In [16]:
# adjust parameters
params['epochs'] = 10
params['lr'] = 9e-4
params['emb_dim'] = 300

In [None]:
omit_optim_names = ['bias', 'LayerNorm.weight']
# regular_model = RegularBERT(params)
# regular_model = regular_model.to(device)
# criterion = nn.BCEWithLogitsLoss().to(device)
# regular_optim = torch.optim.Adam(
#     [p for n, p in regular_model.named_parameters() if not any(nd in n for nd in omit_optim_names)],
#     lr=params['lr']
# )

# indomain_model = RegularBERT(params)
# indomain_model = indomain_model.to(device)
# indomain_optim = torch.optim.Adam(
#     [p for n, p in indomain_model.named_parameters() if not any(nd in n for nd in omit_optim_names)],
#     lr=params['lr']
# )

adapt_model = AdaptBERT(params)
adapt_model = adapt_model.to(device)
domain_criterion = nn.CrossEntropyLoss().to(device)
criterion_adapt = nn.BCEWithLogitsLoss(reduction='none').to(device)
pred_params = [param for name, param in adapt_model.named_parameters() if 'domain' not in name and 'bert' not in name]
adapt_pred_optim = torch.optim.Adam(pred_params, lr=params['lr'])
domain_params = [param for name, param in adapt_model.named_parameters() if 'domain' in name]
adapt_domain_optim = torch.optim.Adam(domain_params, lr=params['lr'])

# train the networks
print('Start to train...')
print(params)
# best_valid_regular = 0.
best_valid_adapt = 0.
# best_valid_indomain = 0.

# best_test_regular = 0.
best_test_adapt = 0.
# best_test_indomain = 0.

for epoch in tqdm(range(params['epochs'])):
#     train_loss_regular = 0.
    adapt_model.train()
#     regular_model.train()
#     indomain_model.train()
    
    # train indomain model for comparison
#     for step, train_batch in enumerate(valid_data_loader):
#         train_batch = tuple(t.to(device) for t in train_batch)
#         input_docs, input_labels, input_domains = train_batch
#         indomain_optim.zero_grad()
#         # indomain models
#         indomain_preds = indomain_model(**{'input_docs': input_docs})
#         loss = criterion(indomain_preds, input_labels)
#         loss.backward()
#         indomain_optim.step()

    # train discriminator first
    for _ in range(3):
        for step, train_batch in enumerate(all_data_loader):
            train_batch = tuple(t.to(device) for t in train_batch)
            input_docs, input_labels, input_domains = train_batch
            adapt_domain_optim.zero_grad()
            domain_preds = adapt_model.discriminator(**{'input_docs': input_docs})
            domain_loss = domain_criterion(domain_preds, input_domains)
            domain_loss.backward()
            adapt_domain_optim.step()

    # train predictor
    for step, train_batch in enumerate(train_data_loader):
        train_batch = tuple(t.to(device) for t in train_batch)
        input_docs, input_labels, input_domains = train_batch
        if len(input_docs) == 1:
            continue
#         regular_optim.zero_grad()
        adapt_pred_optim.zero_grad()
        # adapt_domain_optim.zero_grad()

        # regular models
#         regular_preds = regular_model(**{
#             'input_docs': input_docs
#         })
#         loss = criterion(regular_preds, input_labels)
#         train_loss_regular += loss.item()
#         loss_avg_regular = train_loss_regular / (step + 1)

        # adapt models
        adapt_preds = adapt_model(**{
            'input_docs': input_docs
        })
        loss_adapt = criterion_adapt(adapt_preds, input_labels)
        domain_preds = torch.sigmoid(adapt_model.discriminator(**{'input_docs': input_docs}))
        domain_preds_norm = domain_preds[:, 1]
        domain_preds_norm = domain_preds_norm / torch.max(domain_preds_norm)
        loss_adapt = loss_adapt.mean(axis=1)
        loss_adapt = domain_preds_norm * loss_adapt
        loss_adapt = loss_adapt.mean()

        loss_adapt.backward()
#         loss.backward()
        # torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 0.5)
#         regular_optim.step()
        adapt_pred_optim.step()

    # fit on in domain corpus.
#     for _ in range(1):
#         for step, train_batch in enumerate(valid_data_loader):
#             train_batch = tuple(t.to(device) for t in train_batch)
#             input_docs, input_labels, input_domains = train_batch
#             if len(input_docs) == 1:
#                 continue
#             adapt_pred_optim.zero_grad()
#             adapt_preds = adapt_model(**{'input_docs': input_docs})
#             loss_adapt = criterion_adapt(adapt_preds, input_labels)
#             loss_adapt = loss_adapt.mean()
#             loss_adapt.backward()
#             adapt_pred_optim.step()

    # evaluate on valid data
#     regular_model.eval()
    adapt_model.eval()
#     indomain_model.eval()
#     y_preds_regular = []
    y_preds_adapt = []
#     y_preds_indomain = []
    y_trues = []

    for valid_batch in valid_data_loader:
        valid_batch = tuple(t.to(device) for t in valid_batch)
        input_docs, input_labels, input_domains = valid_batch
        with torch.no_grad():
#             preds_regular = regular_model(**{'input_docs': input_docs})
            preds_adapt = adapt_model(**{'input_docs': input_docs})
#             preds_indomain = indomain_model(**{'input_docs': input_docs})

#         logits_regular = (torch.sigmoid(preds_regular) > .5).long().cpu().numpy()
        logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
#         logits_indomain = (torch.sigmoid(preds_indomain) > .4).long().cpu().numpy()

#         y_preds_regular.extend(logits_regular)
        y_preds_adapt.extend(logits_adapt)
#         y_preds_indomain.extend(logits_indomain)
        y_trues.extend(input_labels.to('cpu').numpy())

#     eval_score_regular = micro_f1_average(y_preds=y_preds_regular, y_truths=y_trues)
    eval_score_adapt = micro_f1_average(y_preds=y_preds_adapt, y_truths=y_trues)
#     eval_score_indomain = micro_f1_average(y_preds=y_preds_indomain, y_truths=y_trues)

    # test for regular model
#     if eval_score_regular > best_valid_regular:
#         best_valid_regular = eval_score_regular

#         # test
#         y_preds = []
#         y_trues = []
#         # evaluate on the test set
#         for test_batch in test_data_loader:
#             test_batch = tuple(t.to(device) for t in test_batch)
#             input_docs, input_labels, input_domains = test_batch

#             with torch.no_grad():
#                 preds_regular = regular_model(**{
#                     'input_docs': input_docs,
#                 })
#             logits_regular = (torch.sigmoid(preds_regular) > .5).long().cpu().numpy()
#             y_preds.extend(logits_regular)
#             y_trues.extend(input_labels.to('cpu').numpy())

#         test_score_regular = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
#         if best_test_regular < test_score_regular:
#             best_test_regular = test_score_regular
#             torch.save(regular_model, params['model_dir'] + 'regular_bert_vaccine.pth')
            
#         regular_results = 'Test on Regular BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
#                 'vaccine', epoch, test_score_regular, best_valid_regular)
#         print('Regular Results: ', regular_results)
#         wfile.write(regular_results)

    # test for indomain model
#     if eval_score_indomain > best_valid_indomain:
#         best_valid_indomain = eval_score_indomain

#         # test
#         y_preds = []
#         y_trues = []
#         # evaluate on the test set
#         for test_batch in test_data_loader:
#             test_batch = tuple(t.to(device) for t in test_batch)
#             input_docs, input_labels, input_domains = test_batch

#             with torch.no_grad():
#                 preds_indomain = indomain_model(**{
#                     'input_docs': input_docs,
#                 })
#             logits_indomain = (torch.sigmoid(preds_indomain) > .5).long().cpu().numpy()
#             y_preds.extend(logits_indomain)
#             y_trues.extend(input_labels.to('cpu').numpy())

#         test_score_indomain = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
#         if best_test_indomain < test_score_indomain:
#             best_test_indomain = test_score_indomain
#             torch.save(indomain_model, params['model_dir'] + 'indomain_bert_vaccine.pth')
            
#         indomain_results = 'Test on Indomain BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
#                 'vaccine', epoch, test_score_indomain, best_valid_indomain)
#         print('Regular Results: ', indomain_results)
#         wfile.write(indomain_results)

    if eval_score_adapt > best_valid_adapt:
        best_valid_adapt = eval_score_adapt

    # test
    y_preds = []
    y_trues = []
    # evaluate on the test set
    for test_batch in test_data_loader:
        test_batch = tuple(t.to(device) for t in test_batch)
        input_docs, input_labels, input_domains = test_batch

        with torch.no_grad():
            preds_adapt = adapt_model(**{
                'input_docs': input_docs,
            })
        logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
        y_preds.extend(logits_adapt)
        y_trues.extend(input_labels.to('cpu').numpy())

    test_score_adapt = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
    if test_score_adapt > best_test_adapt:
        best_test_adapt = test_score_adapt
        torch.save(adapt_model, params['model_dir'] + 'adapt_bert_vaccine.pth')
    test_score_adapt = 'Test on Adapt BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
            'vaccine', epoch, test_score_adapt, best_valid_adapt)
    print('Adapt Results: ', test_score_adapt)
#         wfile.write(test_score_adapt)

# wfile.write('\n\n\n')
# wfile.close()

# print('Best on Regular BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
#             domain, best_test_regular, best_valid_regular))
# print('Best on InDomain BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
#             domain, best_test_indomain, best_valid_indomain))
print('Best on Adapt BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
            domain, best_test_adapt, best_valid_adapt))

  0%|          | 0/10 [00:00<?, ?it/s]

Start to train...
{'result_path': '../resource/results/adapt_bert.txt', 'model_dir': '../resource/model/adapt_bert/', 'dname': 'all', 'dpath': '../data/dataset.tsv', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 10, 'batch_size': 16, 'lr': 9e-05, 'max_len': 60, 'dp_rate': 0.2, 'optimizer': 'adam', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy'], 'bidirectional': False, 'device': device(type='cuda'), 'bert_name': 'bert-base-uncased', 'num_label': 11}


 10%|█         | 1/10 [05:30<49:34, 330.55s/it]

Adapt Results:  Test on Adapt BERT, Domain vaccine, Epoch 0, F1-micro-average 0.6166666666666666, Valid Score 0.6004140786749482



 20%|██        | 2/10 [11:00<44:04, 330.51s/it]

Adapt Results:  Test on Adapt BERT, Domain vaccine, Epoch 1, F1-micro-average 0.6085106382978723, Valid Score 0.6004140786749482



In [24]:
omit_optim_names = ['bias', 'LayerNorm.weight']
regular_model = RegularBERT(params)
regular_model = regular_model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
domain_criterion = nn.CrossEntropyLoss().to(device)
regular_optim = torch.optim.Adam(
    [p for n, p in regular_model.named_parameters() if not any(nd in n for nd in omit_optim_names)],
    lr=params['lr']
)

indomain_model = RegularBERT(params)
indomain_model = indomain_model.to(device)
indomain_optim = torch.optim.Adam(
    [p for n, p in indomain_model.named_parameters() if not any(nd in n for nd in omit_optim_names)],
    lr=params['lr']
)

adapt_model = AdaptBERT(params)
adapt_model = adapt_model.to(device)
criterion_adapt = nn.BCEWithLogitsLoss(reduction='none').to(device)
pred_params = [param for name, param in adapt_model.named_parameters() if 'domain' not in name and not any(nd in name for nd in omit_optim_names)]
adapt_pred_optim = torch.optim.Adam(pred_params, lr=params['lr'])
domain_params = [param for name, param in adapt_model.named_parameters() if 'domain' in name and not any(nd in name for nd in omit_optim_names)]
adapt_domain_optim = torch.optim.Adam(domain_params, lr=params['lr'])

# train the networks
print('Start to train...')
print(params)
best_valid_regular = 0.
best_valid_adapt = 0.
best_valid_indomain = 0.

best_test_regular = 0.
best_test_adapt = 0.
best_test_indomain = 0.

for epoch in tqdm(range(params['epochs'])):
    train_loss_regular = 0.
    train_loss_adapt = 0.
    adapt_model.train()
    regular_model.train()
    indomain_model.train()
    
    # train indomain model for comparison
    for step, train_batch in enumerate(valid_data_loader):
        train_batch = tuple(t.to(device) for t in train_batch)
        input_docs, input_labels, input_domains = train_batch
        indomain_optim.zero_grad()
        # indomain models
        indomain_preds = indomain_model(**{'input_docs': input_docs})
        loss = criterion(indomain_preds, input_labels)
        loss.backward()
        indomain_optim.step()

    # train discriminator first
    for step, train_batch in enumerate(all_data_loader):
        train_batch = tuple(t.to(device) for t in train_batch)
        input_docs, input_labels, input_domains = train_batch
        adapt_domain_optim.zero_grad()
        domain_preds = adapt_model.discriminator(**{'input_docs': input_docs})
        domain_loss = domain_criterion(domain_preds, input_domains)
        domain_loss.backward()
        adapt_domain_optim.step()

    # train predictor
    for step, train_batch in enumerate(train_data_loader):
        train_batch = tuple(t.to(device) for t in train_batch)
        input_docs, input_labels, input_domains = train_batch
        if len(input_docs) == 1:
            continue
        regular_optim.zero_grad()
        adapt_pred_optim.zero_grad()
        # adapt_domain_optim.zero_grad()

        # regular models
        regular_preds = regular_model(**{
            'input_docs': input_docs
        })
        loss = criterion(regular_preds, input_labels)
        train_loss_regular += loss.item()
        loss_avg_regular = train_loss_regular / (step + 1)

        # adapt models
        adapt_preds = adapt_model(**{
            'input_docs': input_docs
        })
        loss_adapt = criterion_adapt(adapt_preds, input_labels)
        domain_preds = torch.sigmoid(adapt_model.discriminator(**{'input_docs': input_docs}))
        loss_adapt = loss_adapt.mean(axis=1)
        loss_adapt = domain_preds[:, 1] * loss_adapt
        loss_adapt = loss_adapt.mean()
        train_loss_adapt += loss_adapt.item()
        loss_avg_adapt = train_loss_adapt / (step + 1)

        loss_adapt.backward()
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 0.5)
        regular_optim.step()
        adapt_pred_optim.step()

    # fit on in domain corpus.
    for _ in range(5):
        for step, train_batch in enumerate(valid_data_loader):
            train_batch = tuple(t.to(device) for t in train_batch)
            input_docs, input_labels, input_domains = train_batch
            if len(input_docs) == 1:
                continue
            adapt_pred_optim.zero_grad()
            adapt_preds = adapt_model(**{'input_docs': input_docs})
            loss_adapt = criterion_adapt(adapt_preds, input_labels)
            loss_adapt = loss_adapt.mean()
            loss_adapt.backward()
            adapt_pred_optim.step()

    # evaluate on valid data
    regular_model.eval()
    adapt_model.eval()
    indomain_model.eval()
    y_preds_regular = []
    y_preds_adapt = []
    y_preds_indomain = []
    y_trues = []

    for valid_batch in valid_data_loader:
        valid_batch = tuple(t.to(device) for t in valid_batch)
        input_docs, input_labels, input_domains = valid_batch
        with torch.no_grad():
            preds_regular = regular_model(**{'input_docs': input_docs})
            preds_adapt = adapt_model(**{'input_docs': input_docs})
            preds_indomain = indomain_model(**{'input_docs': input_docs})

        logits_regular = (torch.sigmoid(preds_regular) > .5).long().cpu().numpy()
        logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
        logits_indomain = (torch.sigmoid(preds_indomain) > .4).long().cpu().numpy()

        y_preds_regular.extend(logits_regular)
        y_preds_adapt.extend(logits_adapt)
        y_preds_indomain.extend(logits_indomain)
        y_trues.extend(input_labels.to('cpu').numpy())

    eval_score_regular = micro_f1_average(y_preds=y_preds_regular, y_truths=y_trues)
    eval_score_adapt = micro_f1_average(y_preds=y_preds_adapt, y_truths=y_trues)
    eval_score_indomain = micro_f1_average(y_preds=y_preds_indomain, y_truths=y_trues)

    # test for regular model
    if eval_score_regular > best_valid_regular:
        best_valid_regular = eval_score_regular

        # test
        y_preds = []
        y_trues = []
        # evaluate on the test set
        for test_batch in test_data_loader:
            test_batch = tuple(t.to(device) for t in test_batch)
            input_docs, input_labels, input_domains = test_batch

            with torch.no_grad():
                preds_regular = regular_model(**{
                    'input_docs': input_docs,
                })
            logits_regular = (torch.sigmoid(preds_regular) > .5).long().cpu().numpy()
            y_preds.extend(logits_regular)
            y_trues.extend(input_labels.to('cpu').numpy())

        test_score_regular = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
        if best_test_regular < test_score_regular:
            best_test_regular = test_score_regular
            torch.save(regular_model, params['model_dir'] + 'regular_bert_vaccine.pth')
            
        regular_results = 'Test on Regular BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
                'vaccine', epoch, test_score_regular, best_valid_regular)
#         print('Regular Results: ', regular_results)
#         wfile.write(regular_results)

    # test for indomain model
    if eval_score_indomain > best_valid_indomain:
        best_valid_indomain = eval_score_indomain

        # test
        y_preds = []
        y_trues = []
        # evaluate on the test set
        for test_batch in test_data_loader:
            test_batch = tuple(t.to(device) for t in test_batch)
            input_docs, input_labels, input_domains = test_batch

            with torch.no_grad():
                preds_indomain = indomain_model(**{
                    'input_docs': input_docs,
                })
            logits_indomain = (torch.sigmoid(preds_indomain) > .5).long().cpu().numpy()
            y_preds.extend(logits_indomain)
            y_trues.extend(input_labels.to('cpu').numpy())

        test_score_indomain = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
        if best_test_indomain < test_score_indomain:
            best_test_indomain = test_score_indomain
            torch.save(indomain_model, params['model_dir'] + 'indomain_bert_vaccine.pth')
            
        indomain_results = 'Test on Indomain BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
                'vaccine', epoch, test_score_indomain, best_valid_indomain)
#         print('Regular Results: ', indomain_results)
#         wfile.write(indomain_results)

    if eval_score_adapt > best_valid_adapt:
        best_valid_adapt = eval_score_adapt

    # test
    y_preds = []
    y_trues = []
    # evaluate on the test set
    for test_batch in test_data_loader:
        test_batch = tuple(t.to(device) for t in test_batch)
        input_docs, input_labels, input_domains = test_batch

        with torch.no_grad():
            preds_adapt = adapt_model(**{
                'input_docs': input_docs,
            })
        logits_adapt = (torch.sigmoid(preds_adapt) > .5).long().cpu().numpy()
        y_preds.extend(logits_adapt)
        y_trues.extend(input_labels.to('cpu').numpy())

    test_score_adapt = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
    if test_score_adapt > best_test_adapt:
        best_test_adapt = test_score_adapt
        torch.save(adapt_model, params['model_dir'] + 'adapt_bert_vaccine.pth')
    test_score_adapt = 'Test on Adapt BERT, Domain {}, Epoch {}, F1-micro-average {}, Valid Score {}\n'.format(
            'vaccine', epoch, test_score_adapt, best_valid_adapt)
#         print('Adapt Results: ', test_score_adapt)
#         wfile.write(test_score_adapt)

# wfile.write('\n\n\n')
# wfile.close()

print('Best on Regular BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
            domain, best_test_regular, best_valid_regular))
print('Best on InDomain BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
            domain, best_test_indomain, best_valid_indomain))
print('Best on Adapt BERT, Domain {}, F1-micro-average {}, Valid Score {}\n'.format(
            domain, best_test_adapt, best_valid_adapt))

  0%|          | 0/10 [00:00<?, ?it/s]

Start to train...
{'result_path': '../resource/results/adapt_bert.txt', 'model_dir': '../resource/model/adapt_bert/', 'dname': 'all', 'dpath': '../data/dataset.tsv', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 10, 'batch_size': 64, 'lr': 9e-05, 'max_len': 60, 'dp_rate': 0.2, 'optimizer': 'adam', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy'], 'bidirectional': False, 'device': device(type='cuda'), 'bert_name': 'bert-base-uncased', 'num_label': 11}


100%|██████████| 10/10 [37:38<00:00, 225.87s/it]

Best on Regular RNN, Domain Sandy, F1-micro-average 0.7905074592477391, Valid Score 0.9978333054952397

Best on InDomain RNN, Domain Sandy, F1-micro-average 0.625988787343522, Valid Score 0.7001587826129008

Best on Adapt RNN, Domain Sandy, F1-micro-average 0.584, Valid Score 0.6252505010020041




