In [14]:
from collections import Counter
import re
import pickle
import os
import json

import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_selection import mutual_info_classif

import numpy as np
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
import gensim
from imblearn.over_sampling import RandomOverSampler

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from torch.utils.data import Dataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# One-time setup: uncomment to fetch the NLTK stopword corpus if it is not installed locally.
# nltk.download('stopwords')
# English stopwords plus the lowercase token 'url'.
# NOTE(review): preprocess() emits its link placeholder as uppercase "URL"; confirm the casing
# matches wherever spw_set is consumed (no consumer is visible in this part of the notebook).
spw_set = set(stopwords.words('english'))
spw_set.add('url')
# Shared tweet-aware tokenizer used by preprocess().
tokenizer = TweetTokenizer()

In [68]:
# Load the annotated tweet corpus; dtype=str reads every column as a string.
all_data = pd.read_csv('../data/dataset.tsv', sep='\t', dtype=str)
all_data['tid'] = all_data['tid'].map(str)
# Keep only the rows that carry both a tweet text and a label annotation.
has_text_and_labels = all_data['text'].notna() & all_data['labels'].notna()
all_data = all_data[has_text_and_labels]
print(len(all_data))
all_data.head()

31849


Unnamed: 0,tid,text,corpus,labels
0,521033092132503552,@fergusonoctober @fox2now #alllivesmatter peac...,ALM,"care,purity"
1,537681598989475841,wholeheartedly support these protests acts of ...,ALM,"subversion,loyalty"
2,624644420705648640,this sandra bland situation man no disrespect ...,ALM,cheating
3,752979765984890884,"commitment to peace, healing and loving neighb...",ALM,"care,purity"
4,548029362348765185,injustice for one is an injustice for all #all...,ALM,"cheating,loyalty"


In [16]:
# preprocessing url
def preprocess(tweet):
    """
    Lowercase a tweet, mask its links and split it into tokens.

    User mentions and hashtags are left as they are; only links are replaced.

    :param tweet: raw tweet text
    :return: list of non-empty, whitespace-stripped tokens produced by the
        module-level ``tokenizer``
    """
    # links are masked after lowercasing, so the placeholder itself stays uppercase
    masked = re.sub(r"https?:\S+", "URL", tweet.lower())

    tokens = []
    for raw_token in tokenizer.tokenize(masked):
        cleaned = raw_token.strip()
        if cleaned:
            tokens.append(cleaned)
    return tokens

In [69]:
# tokenize every tweet, keep only those with more than three tokens,
# then store the tokens back as one space-joined string
all_data.text = all_data.text.apply(preprocess)
long_enough = all_data.text.apply(len) > 3
all_data = all_data[long_enough]
all_data.text = all_data.text.apply(' '.join)
print(len(all_data))

30979


In [70]:
def label_encoder(raw_label):
    """
    Turn a comma-separated label string into a multi-hot vector.

    The vector has one slot per entry of ``all_labels`` below, in that order.

    :param raw_label: comma-separated label names, e.g. ``"care,purity"``;
        whitespace around each name is ignored
    :return: list of 0/1 ints with ``len(all_labels)`` entries
    """
    all_labels = [
        'subversion', 'loyalty', 'care', 'cheating',
        'purity', 'fairness', 'degradation', 'betrayal', 'harm', 'authority'
    ]
    label_to_index = {name: position for position, name in enumerate(all_labels)}
    encode_label = [0] * len(all_labels)

    # NOTE(review): non-string input and unrecognised label names both set the
    # LAST slot, which is also the 'authority' slot, so the two cases cannot be
    # told apart downstream. Kept as-is because the vector length is part of
    # the model contract (num_label); confirm this is intended.
    if not isinstance(raw_label, str):
        encode_label[-1] = 1
        return encode_label

    for label in raw_label.split(','):
        # tolerate "care, purity": stray whitespace must not turn a valid
        # label into an unknown one
        label = label.strip()
        if label in label_to_index:
            encode_label[label_to_index[label]] = 1
        else:
            encode_label[-1] = 1
    return encode_label

In [71]:
# Replace the raw comma-separated label strings with multi-hot vectors.
# NOTE: not idempotent - re-running this cell passes lists to label_encoder, which maps
# every non-string input to a vector with only the last slot set.
all_data.labels = all_data.labels.apply(lambda x: label_encoder(x))

In [126]:
def micro_f1_average(y_preds, y_truths):
    """
    F1 built from per-example precision and recall, each averaged over examples.

    Despite the name this is not the pooled ("micro") F1: precision is computed
    separately for every example that has at least one predicted positive,
    recall for every example that has at least one true positive, the two lists
    are averaged, and the harmonic mean of the two averages is returned.

    :param y_preds: iterable of 0/1 prediction vectors, one per example
    :param y_truths: iterable of 0/1 ground-truth vectors, aligned with ``y_preds``
    :return: the F1 value; 0 when precision or recall cannot be computed for
        any example, or when both averages are zero
    """
    precisions = []
    recalls = []
    for y_pred, y_truth in zip(y_preds, y_truths):
        true_positives = np.sum(np.logical_and(y_truth, y_pred))

        # tp + fp for this example; precision is undefined without predictions
        predicted_positives = np.sum(y_pred)
        if predicted_positives != 0:
            precisions.append(true_positives / predicted_positives)

        # tp + fn for this example; recall is undefined without true labels
        actual_positives = np.sum(y_truth)
        if actual_positives != 0:
            recalls.append(true_positives / actual_positives)

    # np.mean([]) is NaN, which would slip past the zero check below and be
    # returned as the score; treat "nothing to average" as a score of 0.
    if not precisions or not recalls:
        return 0

    precision = np.mean(precisions)
    recall = np.mean(recalls)
    if precision + recall == 0:
        return 0
    return 2 * precision * recall / (precision + recall)
def multi_label_f1(y_preds, y_truths, mode='weighted'):
    """
    Average of the per-label F1 scores of a multi-label prediction.

    Each label position is scored on its own with ``sklearn.metrics.f1_score``
    and the unweighted mean over label positions is returned.

    :param y_preds: sequence of prediction vectors, one per example
    :param y_truths: sequence of ground-truth vectors, aligned with ``y_preds``
    :param mode: forwarded to ``f1_score`` as ``average``
    :return: mean of the per-label F1 scores
    """
    # regroup the (examples x labels) input into one column per label position
    preds = dict()
    truths = dict()
    for idx in range(len(y_truths)):
        for jdx in range(len(y_truths[idx])):
            preds.setdefault(jdx, []).append(y_preds[idx][jdx])
            truths.setdefault(jdx, []).append(y_truths[idx][jdx])

    # f1_score expects (y_true, y_pred). Passing them the other way round is
    # harmless for average='binary' but changes the class supports used by
    # 'weighted' (the default here), so the order matters.
    results = [
        metrics.f1_score(truths[jdx], preds[jdx], average=mode)
        for jdx in preds
    ]
    return np.average(results)

In [72]:
# Load the vaccine tweets and run them through the same preprocessing as the main corpus.
# Short tweets are kept here: the length filter below stays disabled.
vaccine_df = pd.read_csv('../data/vaccine_morality.csv', dtype=str)
vaccine_df.text = vaccine_df.text.apply(lambda x: preprocess(x))
# vaccine_df = vaccine_df[vaccine_df.text.apply(lambda x: len(x) > 3)]
vaccine_df.text = vaccine_df.text.apply(lambda x: ' '.join(x))
# Shuffle with a fixed seed: the train/test halves are later taken by position,
# so an unseeded shuffle would produce a different split on every run.
vaccine_df = vaccine_df.sample(frac=1, random_state=33).reset_index(drop=True)
print(len(vaccine_df))

500


In [117]:
# Texts and multi-hot label vectors of the vaccine tweets, in vaccine_df row order.
vaccine_data = {
    'train_x': [],
    'train_y': [],
}

# Label order shared by every multi-hot vector in this notebook.
all_labels = [
    'subversion', 'loyalty', 'care', 'cheating',
    'purity', 'fairness', 'degradation', 'betrayal', 'harm', 'authority'
]

for idx, row in vaccine_df.iterrows():
    encode_label = [0] * len(all_labels)
    for label_index, label in enumerate(all_labels):
        cell = row[label]
        # vaccine_df is read with dtype=str, so a cell is either a string or NaN
        if pd.isna(cell):
            continue
        # float() accepts "1" as well as "1.0"; int() would raise on the latter
        if float(cell) == 1:
            encode_label[label_index] = 1
    # NOTE(review): tweets without any annotated label are put into the last
    # slot, which is also the 'authority' slot - confirm this is intended.
    if sum(encode_label) == 0:
        encode_label[-1] = 1
    vaccine_data['train_x'].append(row['text'])
    vaccine_data['train_y'].append(encode_label)

In [118]:
# Sanity check: show the first five preprocessed vaccine tweets and their multi-hot label vectors.
print(vaccine_data['train_x'][:5])
print(vaccine_data['train_y'][:5])

['just got a covid vaccine shout out to fatness :)', '@shurrell1 @badcovid19takes no , they believe the vaccine is deadlier than covid .', '@wcvb why would you take a vaccine for covid when the recovery rate is 99.9 % using therapeutics ? 🇺 🇸 URL', 'just hoping covid gets it ’ s ass kicked as hard as mine did ice skating w / this new vaccine #newprofilepic URL', 'i got my first covid vaccine ! crying tears of joy in the vaccination area , feeling especially proud and grateful for science today ! ! !']
[[0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]


In [121]:
# encode domains: a domain's integer id is its position in this list; 'vaccine' is last
domain_encoder = list(all_data.corpus.unique()) + ['vaccine']
vaccine_domain_id = len(domain_encoder) - 1

# the first n_vaccine_train vaccine tweets are used for training, the rest for testing
# (with the 500 tweets loaded above this is an even half/half split)
n_vaccine_train = 250
vaccine_train_docs = vaccine_data['train_x'][:n_vaccine_train]
vaccine_train_labels = vaccine_data['train_y'][:n_vaccine_train]
vaccine_test_docs = vaccine_data['train_x'][n_vaccine_train:]
vaccine_test_labels = vaccine_data['train_y'][n_vaccine_train:]

# full training set: the whole annotated corpus plus the vaccine training part
all_train = {
    'docs': all_data.text.to_list() + vaccine_train_docs,
    'labels': all_data.labels.to_list() + vaccine_train_labels,
    'corpus': all_data.corpus.to_list() + ['vaccine'] * len(vaccine_train_docs)
}
all_train['corpus'] = [domain_encoder.index(item) for item in all_train['corpus']]

# the domain lists are sized from the slices themselves, so they stay aligned
# with docs/labels even if the vaccine file does not hold exactly 500 rows
vaccine_train = {
    'docs': vaccine_train_docs,
    'labels': vaccine_train_labels,
    'corpus': [vaccine_domain_id] * len(vaccine_train_docs)
}
vaccine_test = {
    'docs': vaccine_test_docs,
    'labels': vaccine_test_labels,
    'corpus': [vaccine_domain_id] * len(vaccine_test_docs)
}

In [23]:
def build_wt(tkn, emb_path, opath):
    """Build an embedding weight matrix for the tokenizer vocabulary and save it.

    Row ``i`` receives the pretrained vector of the word whose tokenizer index
    is ``i``; rows of words that are absent from the embedding file (and row 0)
    stay all-zero.

    :param tkn: fitted tokenizer exposing ``word_index`` (word -> int index)
        and ``num_words`` (int cap, or None for no cap)
    :param emb_path: pretrained embeddings; a ``.bin`` path is loaded as a
        binary word2vec file through gensim, anything else is read as text
        with one ``word v1 v2 ...`` entry per line
    :param opath: destination handed to ``np.save``
    :return: list of ``embed_len + 1`` row vectors
    """
    embed_len = len(tkn.word_index)
    # num_words may be None (no vocabulary cap); then every index is admissible
    vocab_cap = tkn.num_words if tkn.num_words is not None else embed_len + 1
    if embed_len > vocab_cap:
        embed_len = vocab_cap

    def _row_for(word):
        """Return the matrix row for ``word``, or None when it is not wanted."""
        row = tkn.word_index.get(word)
        if row is None or row >= vocab_cap:
            return None
        return row

    if emb_path.endswith('.bin'):
        embeds = gensim.models.KeyedVectors.load_word2vec_format(
            emb_path, binary=True, unicode_errors='ignore'
        )
        emb_size = embeds.vector_size
        emb_matrix = list(np.zeros((embed_len + 1, emb_size)))
        # gensim 4 dropped `.wv` on KeyedVectors and renamed index2word/syn0
        # to index_to_key/vectors; support both generations of the API.
        keyed = getattr(embeds, 'wv', embeds)
        words = keyed.index_to_key if hasattr(keyed, 'index_to_key') else keyed.index2word
        vectors = keyed.vectors if hasattr(keyed, 'vectors') else keyed.syn0
        for word, vector in zip(words, vectors):
            row = _row_for(word)
            if row is not None:
                emb_matrix[row] = np.asarray(vector, dtype=np.float32)
    else:
        # First pass: find the vector width. A short first line is taken to be
        # a "count dim" header, in which case the second line is used instead.
        with open(emb_path, encoding='utf-8') as dfile:
            line = dfile.readline().strip().split()
            if len(line) < 5:
                line = dfile.readline().strip().split()
        emb_size = len(line[1:])
        emb_matrix = list(np.zeros((embed_len + 1, emb_size)))

        # Second pass: copy the vectors of the words the tokenizer knows.
        with open(emb_path, encoding='utf-8') as dfile:
            for line in dfile:
                line = line.strip().split()
                # skip blank lines, a header line and rows of the wrong width
                # (they would raise or make the matrix ragged)
                if len(line) != emb_size + 1:
                    continue
                row = _row_for(line[0])
                if row is not None:
                    emb_matrix[row] = np.asarray([
                        float(item) for item in line[1:]
                    ], dtype=np.float32)

    np.save(opath, emb_matrix)
    return emb_matrix


def build_tok(docs, max_feature, opath):
    """Load the tokenizer cached at ``opath``, or fit one on ``docs`` and cache it.

    :param docs: texts to fit the tokenizer on (ignored when the cache exists)
    :param max_feature: ``num_words`` cap for a newly fitted tokenizer
        (ignored when the cache exists)
    :param opath: pickle file used as the cache
    :return: the loaded or newly fitted tokenizer
    """
    if os.path.exists(opath):
        # NOTE: a cache hit ignores `docs` and `max_feature`; delete the file
        # after changing the corpus or the vocabulary size.
        # SECURITY: pickle.load can execute arbitrary code - only point `opath`
        # at files this notebook wrote itself.
        with open(opath, 'rb') as rfile:
            return pickle.load(rfile)

    tkn = Tokenizer(num_words=max_feature)
    tkn.fit_on_texts(docs)

    with open(opath, 'wb') as wfile:
        pickle.dump(tkn, wfile)
    return tkn

class TorchDataset(Dataset):
    """Expose a dict of parallel lists as a dataset of (doc, label, domain) triples."""

    def __init__(self, dataset, domain_name):
        # dataset: dict holding 'docs' and 'labels', optionally a list under `domain_name`
        self.dataset, self.domain_name = dataset, domain_name

    def __len__(self):
        docs = self.dataset['docs']
        return len(docs)

    def __getitem__(self, idx):
        doc = self.dataset['docs'][idx]
        label = self.dataset['labels'][idx]
        # -1 stands in for the domain when the dataset carries no domain column
        domain = -1
        if self.domain_name in self.dataset:
            domain = self.dataset[self.domain_name][idx]
        return doc, label, domain


In [58]:
class RegularRNN(nn.Module):
    """GRU document encoder followed by a linear multi-label prediction head.

    Reads from ``params``: ``word_emb_path`` (optional), ``max_feature``,
    ``emb_dim``, ``bidirectional``, ``dp_rate`` and ``num_label``.
    The forward pass returns raw logits of shape ``(batch, num_label)``.
    """

    def __init__(self, params):
        super(RegularRNN, self).__init__()
        self.params = params

        if 'word_emb_path' in self.params and os.path.exists(self.params['word_emb_path']):
            # from_pretrained keeps the loaded weights frozen by default
            self.wemb = nn.Embedding.from_pretrained(
                torch.FloatTensor(np.load(
                    self.params['word_emb_path'], allow_pickle=True))
            )
        else:
            self.wemb = nn.Embedding(
                self.params['max_feature'], self.params['emb_dim']
            )
            self.wemb.reset_parameters()
            nn.init.kaiming_uniform_(self.wemb.weight, a=np.sqrt(5))

        # a bidirectional GRU splits the budget between the two directions
        num_directions = 2 if self.params['bidirectional'] else 1
        self.word_hidden_size = self.params['emb_dim'] // num_directions

        # NOTE: nn.GRU applies `dropout` only between stacked layers; with the
        # single layer used here dp_rate has no effect on the forward pass.
        self.doc_net_general = nn.GRU(
            self.wemb.embedding_dim, self.word_hidden_size,
            bidirectional=self.params['bidirectional'], dropout=self.params['dp_rate'],
            batch_first=True
        )
        # Size the head from what the GRU really emits. This equals emb_dim
        # except for an odd emb_dim in bidirectional mode, where the two halves
        # add up to emb_dim - 1 and a head sized emb_dim would not fit.
        self.predictor = nn.Linear(
            self.word_hidden_size * num_directions, self.params['num_label'])

    def forward(self, input_docs):
        """Encode a batch of token-id sequences and return label logits.

        :param input_docs: integer tensor of token ids, ``(batch, seq_len)``
        :return: float tensor of logits, ``(batch, num_label)``
        """
        doc_embs = self.wemb(input_docs)
        # only the final hidden state is used; per-step outputs are dropped
        _, doc_general = self.doc_net_general(doc_embs)

        if self.params['bidirectional']:
            # join the final states of the forward and backward direction
            doc_general = torch.cat((doc_general[0], doc_general[1]), -1)
        else:
            # Drop the leading direction axis explicitly. Deciding this from
            # `shape[0] == 1` also squeezed away the batch axis of a
            # bidirectional batch that held a single document.
            doc_general = doc_general[0]

        return self.predictor(doc_general)

In [78]:
class DataEncoder(object):
    """Collate function turning (text, label, domain) samples into tensors."""

    def __init__(self, params, mtype='rnn'):
        """
        Load the tokenizer that matches the model type.

        :param params: settings dict; the rnn encoder reads ``tok_dir`` and
            ``dname``, the bert encoder reads ``bert_name`` and ``max_len``
        :param mtype: Model type, rnn or bert
        :raises ValueError: for any other ``mtype``
        """
        self.params = params
        self.mtype = mtype
        if self.mtype == 'rnn':
            tok_path = os.path.join(
                params['tok_dir'], '{}.tok'.format(params['dname']))
            # SECURITY: pickle.load can execute arbitrary code - the file is
            # expected to be the tokenizer this notebook pickled itself.
            with open(tok_path, 'rb') as rfile:
                self.tok = pickle.load(rfile)
        elif self.mtype == 'bert':
            # NOTE(review): BertTokenizer is not imported anywhere in the
            # visible notebook, so this branch raises NameError unless it is
            # imported elsewhere - confirm before using mtype='bert'.
            self.tok = BertTokenizer.from_pretrained(params['bert_name'])
        else:
            raise ValueError('Only support BERT and RNN data encoders')

    def __call__(self, batch):
        """
        Encode one batch.

        :param batch: iterable of (text, label, domain) triples
        :return: (docs, labels, domains) - docs and domains as long tensors,
            labels as a float tensor
        """
        docs = []
        labels = []
        domains = []
        for text, label, domain in batch:
            if self.mtype == 'bert':
                text = self.tok.encode_plus(
                    text, padding='max_length', max_length=self.params['max_len'],
                    return_tensors='pt', return_token_type_ids=False,
                    truncation=True,
                )
                docs.append(text['input_ids'][0])
            else:
                docs.append(text)
            labels.append(label)
            domains.append(domain)

        labels = torch.tensor(labels, dtype=torch.float)
        domains = torch.tensor(domains, dtype=torch.long)
        if self.mtype == 'rnn':
            # Map texts to id sequences and pad them to the longest sequence of
            # this batch (max_len is not applied on the rnn path).
            docs = self.tok.texts_to_sequences(docs)
            docs = pad_sequences(docs)
            # convert straight to int64 instead of detouring through float32
            docs = torch.as_tensor(docs, dtype=torch.long)
        else:
            docs = torch.stack(docs).long()
        return docs, labels, domains

In [45]:
# Output locations. makedirs also creates missing parent directories and does
# nothing when the directory already exists.
result_dir = '../resource/results/'
os.makedirs(result_dir, exist_ok=True)
model_dir = '../resource/model/'
model_dir = model_dir + 'vaccine/'
os.makedirs(model_dir, exist_ok=True)

# Experiment configuration shared by the cells below.
params = {
    'result_path': os.path.join(result_dir, 'vaccine.txt'),
    'model_dir': model_dir,
    'dname': 'vaccine',
    'max_feature': 15000,
    'over_sample': True,
    'domain_name': 'corpus',
    'epochs': 15,
    'batch_size': 64,
    'lr': 9e-5,
    'max_len': 100,
    'dp_rate': .2,
    'optimizer': 'rmsprop',
    # NOTE: machine-specific absolute path
    'emb_path': '/data/models/glove.twitter.27B.200d.txt',  # adjust for different languages
    'emb_dim': 200,
    'unique_domains': [],
    'bidirectional': False,
    'device': 'cuda',
    'num_label': len(all_labels),
}

In [50]:
# use the configured device only when CUDA is present and 'cpu' was not requested
wants_accelerator = params['device'] != 'cpu'
device = torch.device(
    params['device'] if torch.cuda.is_available() and wants_accelerator else 'cpu'
)
params['device'] = device

print('Loading Data...')
params['unique_domains'] = [*all_data.corpus.unique(), 'vaccine']

# tokenizer and embedding matrix are cached next to the model files
tok_dir = os.path.dirname(params['model_dir'])
params['tok_dir'] = tok_dir
params['word_emb_path'] = os.path.join(tok_dir, params['dname'] + '.npy')

tok_path = os.path.join(tok_dir, '{}.tok'.format(params['dname']))
corpus_docs = all_data.text.tolist() + vaccine_df.text.tolist()
tok = build_tok(corpus_docs, max_feature=params['max_feature'], opath=tok_path)

# build the embedding matrix only once; later runs reuse the saved file
if not os.path.exists(params['word_emb_path']):
    build_wt(tok, params['emb_path'], params['word_emb_path'])

Loading Data...


In [139]:
# vaccine data only: train on vaccine_train, report the score on vaccine_test after every epoch
# NOTE: this overwrites the shared params, so cells run afterwards also see 50 epochs.
params['epochs'] = 50
data_encoder = DataEncoder(params, mtype='rnn')

train_data = TorchDataset(vaccine_train, params['domain_name'])
train_data_loader = DataLoader(
    train_data, batch_size=params['batch_size'], shuffle=True,
    collate_fn=data_encoder
)
test_data = TorchDataset(vaccine_test, params['domain_name'])
test_data_loader = DataLoader(
    test_data, batch_size=params['batch_size'], shuffle=False,
    collate_fn=data_encoder
)

# build model
rnn_model = RegularRNN(params)
rnn_model = rnn_model.to(device)
# the model emits raw logits; the sigmoid is part of the loss
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.RMSprop(rnn_model.parameters(), lr=params['lr'])

# train the networks
print('Start to train...')
print(params)
best_score = 0.
for epoch in tqdm(range(params['epochs'])):
    train_loss = 0
    rnn_model.train()

    for step, train_batch in enumerate(train_data_loader):
        train_batch = tuple(t.to(device) for t in train_batch)
        input_docs, input_labels, input_domains = train_batch
        optimizer.zero_grad()
        predictions = rnn_model(**{
            'input_docs': input_docs
        })
        loss = criterion(predictions, input_labels)
        train_loss += loss.item()

        # running mean of the loss, printed every 301 steps
        loss_avg = train_loss / (step + 1)
        if (step + 1) % 301 == 0:
            print('Epoch: {}, Step: {}'.format(epoch, step))
            print('\tLoss: {}.'.format(loss_avg))
            print('-------------------------------------------------')

        loss.backward()
        # torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 0.5)
        optimizer.step()

    y_preds = []
    y_probs = []
    y_trues = []
    y_domains = []
    # evaluate on the test set; switch to eval mode first so that
    # training-only behaviour cannot leak into the reported score
    rnn_model.eval()
    for test_batch in test_data_loader:
        test_batch = tuple(t.to(device) for t in test_batch)
        input_docs, input_labels, input_domains = test_batch

        with torch.no_grad():
            predictions = rnn_model(**{
                'input_docs': input_docs,
            })
        # despite its name, `logits` holds the 0/1 decisions at threshold 0.5
        logits = (torch.sigmoid(predictions) > .5).long().cpu().numpy()
        y_preds.extend(logits)
        y_trues.extend(input_labels.to('cpu').numpy())
        # NOTE(review): this stores the 0/1 decision for label index 1, not a probability
        y_probs.extend([item[1] for item in logits])
        y_domains.extend(input_domains.detach().cpu().numpy())
    print('Epoch: ', epoch)
    print(micro_f1_average(y_preds=y_preds, y_truths=y_trues))

  4%|▍         | 2/50 [00:00<00:03, 14.89it/s]

Start to train...
{'result_path': '../resource/results/vaccine.txt', 'model_dir': '../resource/model/vaccine/', 'dname': 'vaccine', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 50, 'batch_size': 64, 'lr': 9e-05, 'max_len': 100, 'dp_rate': 0.2, 'optimizer': 'rmsprop', 'emb_path': '/data/models/glove.twitter.27B.200d.txt', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy', 'vaccine'], 'bidirectional': False, 'device': device(type='cuda'), 'num_label': 10, 'tok_dir': '../resource/model/vaccine', 'word_emb_path': '../resource/model/vaccine/vaccine.npy'}
Epoch:  0
0.6432646281867767
Epoch:  1
0.67698670605613
Epoch:  2


  8%|▊         | 4/50 [00:00<00:03, 14.64it/s]

0.67698670605613
Epoch:  3
0.67698670605613
Epoch:  4
0.67698670605613


 16%|█▌        | 8/50 [00:00<00:02, 14.44it/s]

Epoch:  5
0.67698670605613
Epoch:  6
0.67698670605613
Epoch:  7
0.67698670605613


 20%|██        | 10/50 [00:00<00:02, 14.57it/s]

Epoch:  8
0.67698670605613
Epoch:  9
0.67698670605613
Epoch:  10
0.67698670605613


 28%|██▊       | 14/50 [00:00<00:02, 13.78it/s]

Epoch:  11
0.67698670605613
Epoch:  12
0.67698670605613
Epoch:  13
0.67698670605613


 32%|███▏      | 16/50 [00:01<00:02, 14.04it/s]

Epoch:  14
0.67698670605613
Epoch:  15
0.67698670605613
Epoch:  16
0.67698670605613


 40%|████      | 20/50 [00:01<00:02, 13.99it/s]

Epoch:  17
0.67698670605613
Epoch:  18
0.67698670605613
Epoch:  19
0.67698670605613


 44%|████▍     | 22/50 [00:01<00:02, 13.96it/s]

Epoch:  20
0.67698670605613
Epoch:  21
0.67698670605613
Epoch:  22
0.67698670605613


 52%|█████▏    | 26/50 [00:01<00:01, 14.30it/s]

Epoch:  23
0.67698670605613
Epoch:  24
0.67698670605613
Epoch:  25
0.67698670605613


 56%|█████▌    | 28/50 [00:01<00:01, 14.08it/s]

Epoch:  26
0.67698670605613
Epoch:  27
0.67698670605613
Epoch:  28
0.67698670605613


 64%|██████▍   | 32/50 [00:02<00:01, 13.93it/s]

Epoch:  29
0.67698670605613
Epoch:  30
0.67698670605613
Epoch:  31
0.67698670605613


 68%|██████▊   | 34/50 [00:02<00:01, 13.53it/s]

Epoch:  32
0.67698670605613
Epoch:  33
0.67698670605613
Epoch:  34
0.67698670605613


 76%|███████▌  | 38/50 [00:02<00:00, 13.68it/s]

Epoch:  35
0.67698670605613
Epoch:  36
0.67698670605613
Epoch:  37
0.67698670605613


 80%|████████  | 40/50 [00:02<00:00, 14.03it/s]

Epoch:  38
0.67698670605613
Epoch:  39
0.67698670605613
Epoch:  40
0.67698670605613


 88%|████████▊ | 44/50 [00:03<00:00, 13.91it/s]

Epoch:  41
0.67698670605613
Epoch:  42
0.67698670605613
Epoch:  43
0.67698670605613


 92%|█████████▏| 46/50 [00:03<00:00, 13.47it/s]

Epoch:  44
0.67698670605613
Epoch:  45
0.67698670605613
Epoch:  46
0.67698670605613


100%|██████████| 50/50 [00:03<00:00, 13.92it/s]

Epoch:  47
0.67698670605613
Epoch:  48
0.67698670605613
Epoch:  49
0.67698670605613





In [137]:
# Joint training: every epoch makes one pass over all_train (full corpus plus the vaccine
# training part), then ten extra passes over vaccine_train, then scores vaccine_test.
# NOTE(review): the output captured under this cell reports 'epochs': 15 although the cell
# above it sets params['epochs'] = 50; the cells appear to have been run out of order.
data_encoder = DataEncoder(params, mtype='rnn')

# Disabled experiment: random train/valid/test split with optional random over-sampling.
# # train_indices, val_indices, test_indices = data_split(data)
# train_data = {
#     'docs': [data['docs'][item] for item in train_indices],
#     'labels': [data['labels'][item] for item in train_indices],
#     params['domain_name']: [data[params['domain_name']][item] for item in train_indices],
# }
# valid_data = {
#     'docs': [data['docs'][item] for item in val_indices],
#     'labels': [data['labels'][item] for item in val_indices],
#     params['domain_name']: [data[params['domain_name']][item] for item in val_indices],
# }
# test_data = {
#     'docs': [data['docs'][item] for item in test_indices],
#     'labels': [data['labels'][item] for item in test_indices],
#     params['domain_name']: [data[params['domain_name']][item] for item in test_indices],
# }
# if params['over_sample']:
#     ros = RandomOverSampler(random_state=33)
#     sample_indices = [[item] for item in range(len(train_data['docs']))]
#     sample_indices, _ = ros.fit_resample(sample_indices, train_data['labels'])
#     sample_indices = [item[0] for item in sample_indices]
#     train_data = {
#         'docs': [train_data['docs'][item] for item in sample_indices],
#         'labels': [train_data['labels'][item] for item in sample_indices],
#         params['domain_name']: [train_data[params['domain_name']][item] for item in sample_indices],
#     }

train_data = TorchDataset(all_train, params['domain_name'])
train_data_loader = DataLoader(
    train_data, batch_size=params['batch_size'], shuffle=True,
    collate_fn=data_encoder
)
# NOTE: despite the "valid" name this loader wraps vaccine_train and is used for
# gradient updates in the loop below, not for validation.
valid_data = TorchDataset(vaccine_train, params['domain_name'])
valid_data_loader = DataLoader(
    valid_data, batch_size=params['batch_size'], shuffle=False,
    collate_fn=data_encoder
)
test_data = TorchDataset(vaccine_test, params['domain_name'])
test_data_loader = DataLoader(
    test_data, batch_size=params['batch_size'], shuffle=False,
    collate_fn=data_encoder
)

# build model
rnn_model = RegularRNN(params)
rnn_model = rnn_model.to(device)
# the model emits raw logits; the sigmoid is part of the loss
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.RMSprop(rnn_model.parameters(), lr=params['lr'])

# train the networks
print('Start to train...')
print(params)
best_score = 0.
for epoch in tqdm(range(params['epochs'])):
    train_loss = 0
    rnn_model.train()

    # phase 1: one pass over the combined training set
    for step, train_batch in enumerate(train_data_loader):
        train_batch = tuple(t.to(device) for t in train_batch)
        input_docs, input_labels, input_domains = train_batch
        optimizer.zero_grad()
        predictions = rnn_model(**{
            'input_docs': input_docs
        })
        loss = criterion(predictions, input_labels)
#         train_loss += loss.item()

#         loss_avg = train_loss / (step + 1)
#         if (step + 1) % 301 == 0:
#             print('Epoch: {}, Step: {}'.format(epoch, step))
#             print('\tLoss: {}.'.format(loss_avg))
#             print('-------------------------------------------------')

        loss.backward()
        # torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 0.5)
        optimizer.step()
        
    # phase 2: ten additional training passes over the vaccine training tweets only
    for _ in range(10):
        for valid_batch in valid_data_loader:
            valid_batch = tuple(t.to(device) for t in valid_batch)
            input_docs, input_labels, input_domains = valid_batch
            optimizer.zero_grad()
            predictions = rnn_model(**{
                'input_docs': input_docs,
            })
            # torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 0.5)
            loss = criterion(predictions, input_labels)
            loss.backward()
            optimizer.step()

    # Disabled: validation-based model selection and checkpointing.
    # evaluate on the valid set
#     y_preds = []
#     y_trues = []
#     rnn_model.eval()
#     for valid_batch in valid_data_loader:
#         valid_batch = tuple(t.to(device) for t in valid_batch)
#         input_docs, input_labels, input_domains = valid_batch
#         with torch.no_grad():
#             predictions = rnn_model(**{
#                 'input_docs': input_docs,
#             })
#         logits = (torch.sigmoid(predictions) > .5).long().cpu().numpy()
#         y_preds.extend(logits)
#         y_trues.extend(input_labels.to('cpu').numpy())
    
#     eval_score = multi_label_f1(y_preds=y_preds, y_truths=y_trues, mode='binary')
#     if eval_score > best_score:
#         best_score = eval_score
#         torch.save(rnn_model, params['model_dir'] + '{}.pth'.format(os.path.basename(__file__)))

    y_preds = []
    y_probs = []
    y_trues = []
    y_domains = []
    # evaluate on the test set
    rnn_model.eval()
    for test_batch in test_data_loader:
        test_batch = tuple(t.to(device) for t in test_batch)
        input_docs, input_labels, input_domains = test_batch

        with torch.no_grad():
            predictions = rnn_model(**{
                'input_docs': input_docs,
            })
        # despite its name, `logits` holds the 0/1 decisions at threshold 0.5
        logits = (torch.sigmoid(predictions) > .5).long().cpu().numpy()
        y_preds.extend(logits)
        y_trues.extend(input_labels.to('cpu').numpy())
        # NOTE(review): this stores the 0/1 decision for label index 1, not a probability
        y_probs.extend([item[1] for item in logits])
        y_domains.extend(input_domains.detach().cpu().numpy())
    print('Epoch: ', epoch)
#     print(multi_label_f1(y_preds=y_preds, y_truths=y_trues, mode='binary'))
    print(micro_f1_average(y_preds=y_preds, y_truths=y_trues))

  0%|          | 0/15 [00:00<?, ?it/s]

Start to train...
{'result_path': '../resource/results/vaccine.txt', 'model_dir': '../resource/model/vaccine/', 'dname': 'vaccine', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 15, 'batch_size': 64, 'lr': 9e-05, 'max_len': 100, 'dp_rate': 0.2, 'optimizer': 'rmsprop', 'emb_path': '/data/models/glove.twitter.27B.200d.txt', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy', 'vaccine'], 'bidirectional': False, 'device': device(type='cuda'), 'num_label': 10, 'tok_dir': '../resource/model/vaccine', 'word_emb_path': '../resource/model/vaccine/vaccine.npy'}


  7%|▋         | 1/15 [00:05<01:11,  5.11s/it]

Epoch:  0
0.6703248729733335


 13%|█▎        | 2/15 [00:10<01:07,  5.18s/it]

Epoch:  1
0.6543571206991894


 20%|██        | 3/15 [00:15<01:02,  5.22s/it]

Epoch:  2
0.6671585666154024


 27%|██▋       | 4/15 [00:20<00:57,  5.21s/it]

Epoch:  3
0.6657906263688129


 33%|███▎      | 5/15 [00:26<00:52,  5.21s/it]

Epoch:  4
0.6644082613921976


 40%|████      | 6/15 [00:31<00:47,  5.22s/it]

Epoch:  5
0.6741350286685978


 47%|████▋     | 7/15 [00:36<00:42,  5.27s/it]

Epoch:  6
0.6809760562501299


 53%|█████▎    | 8/15 [00:42<00:37,  5.29s/it]

Epoch:  7
0.6710782524035537


 60%|██████    | 9/15 [00:47<00:31,  5.33s/it]

Epoch:  8
0.6639940255015784


 67%|██████▋   | 10/15 [00:52<00:25,  5.18s/it]

Epoch:  9
0.6743091835058321


 73%|███████▎  | 11/15 [00:57<00:20,  5.17s/it]

Epoch:  10
0.6855179111725374


 80%|████████  | 12/15 [01:02<00:15,  5.21s/it]

Epoch:  11
0.6899440426715533


 87%|████████▋ | 13/15 [01:08<00:10,  5.26s/it]

Epoch:  12
0.7017387967768645


 93%|█████████▎| 14/15 [01:13<00:05,  5.27s/it]

Epoch:  13
0.7068620572138804


100%|██████████| 15/15 [01:18<00:00,  5.25s/it]

Epoch:  14
0.708523895271939





In [140]:
params['epochs'] = 50

data_encoder = DataEncoder(params, mtype='rnn')

# # train_indices, val_indices, test_indices = data_split(data)
# train_data = {
#     'docs': [data['docs'][item] for item in train_indices],
#     'labels': [data['labels'][item] for item in train_indices],
#     params['domain_name']: [data[params['domain_name']][item] for item in train_indices],
# }
# valid_data = {
#     'docs': [data['docs'][item] for item in val_indices],
#     'labels': [data['labels'][item] for item in val_indices],
#     params['domain_name']: [data[params['domain_name']][item] for item in val_indices],
# }
# test_data = {
#     'docs': [data['docs'][item] for item in test_indices],
#     'labels': [data['labels'][item] for item in test_indices],
#     params['domain_name']: [data[params['domain_name']][item] for item in test_indices],
# }
# if params['over_sample']:
#     ros = RandomOverSampler(random_state=33)
#     sample_indices = [[item] for item in range(len(train_data['docs']))]
#     sample_indices, _ = ros.fit_resample(sample_indices, train_data['labels'])
#     sample_indices = [item[0] for item in sample_indices]
#     train_data = {
#         'docs': [train_data['docs'][item] for item in sample_indices],
#         'labels': [train_data['labels'][item] for item in sample_indices],
#         params['domain_name']: [train_data[params['domain_name']][item] for item in sample_indices],
#     }

train_data = TorchDataset(all_train, params['domain_name'])
train_data_loader = DataLoader(
    train_data, batch_size=params['batch_size'], shuffle=True,
    collate_fn=data_encoder
)
valid_data = TorchDataset(vaccine_train, params['domain_name'])
valid_data_loader = DataLoader(
    valid_data, batch_size=params['batch_size'], shuffle=False,
    collate_fn=data_encoder
)
test_data = TorchDataset(vaccine_test, params['domain_name'])
test_data_loader = DataLoader(
    test_data, batch_size=params['batch_size'], shuffle=False,
    collate_fn=data_encoder
)

# build model
rnn_model = RegularRNN(params)
rnn_model = rnn_model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.RMSprop(rnn_model.parameters(), lr=params['lr'])

# train the networks
def _run_training_pass(model, loader, loss_fn, optim, target_device):
    """Run one optimisation pass over ``loader``, updating ``model`` in place.

    Every batch must unpack into three tensors (docs, labels, domains); only the
    docs and labels take part in the update.
    """
    model.train()
    for batch in loader:
        input_docs, input_labels, _ = tuple(t.to(target_device) for t in batch)
        optim.zero_grad()
        loss = loss_fn(model(input_docs=input_docs), input_labels)
        loss.backward()
        optim.step()


def _predict(model, loader, target_device, threshold=.5):
    """Score every batch in ``loader`` without tracking gradients.

    :return: four parallel lists with one entry per sample -- thresholded 0/1
        predictions, sigmoid probabilities, ground-truth labels, and the third
        element of each batch (the domain entries).
    """
    preds, probs, trues, domains = [], [], [], []
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_docs, input_labels, input_domains = batch
            batch_probs = torch.sigmoid(model(input_docs=input_docs.to(target_device)))
            preds.extend((batch_probs > threshold).long().cpu().numpy())
            # Keep the real per-label probabilities; the thresholded values
            # carry no ranking information for AUC-style metrics.
            probs.extend(batch_probs.cpu().numpy())
            trues.extend(input_labels.cpu().numpy())
            domains.extend(input_domains.cpu().numpy())
    return preds, probs, trues, domains


print('Start to train...')
print(params)
best_score = 0.
# Number of extra passes over valid_data_loader per epoch (was a hard-coded 10).
finetune_passes = params.get('finetune_passes', 10)
checkpoint_path = os.path.join(params['model_dir'], '{}.pth'.format('vaccine_rnn'))
os.makedirs(params['model_dir'], exist_ok=True)

for epoch in tqdm(range(params['epochs'])):
    # One pass over the full training loader ...
    _run_training_pass(rnn_model, train_data_loader, criterion, optimizer, device)
    # ... followed by repeated passes over valid_data_loader. Despite its name
    # this loader is used for parameter updates here, not for validation.
    for _ in range(finetune_passes):
        _run_training_pass(rnn_model, valid_data_loader, criterion, optimizer, device)

    # evaluate on the test set
    y_preds, y_probs, y_trues, y_domains = _predict(rnn_model, test_data_loader, device)
    print('Epoch: ', epoch)
    test_score = micro_f1_average(y_preds=y_preds, y_truths=y_trues)
    print(test_score)
    # NOTE(review): the checkpoint is selected on the test score, so best_score
    # is an optimistic estimate; a held-out validation split would be needed
    # for an unbiased number.
    if test_score > best_score:
        best_score = test_score
        torch.save(rnn_model, checkpoint_path)

  0%|          | 0/50 [00:00<?, ?it/s]

Start to train...
{'result_path': '../resource/results/vaccine.txt', 'model_dir': '../resource/model/vaccine/', 'dname': 'vaccine', 'max_feature': 15000, 'over_sample': True, 'domain_name': 'corpus', 'epochs': 50, 'batch_size': 64, 'lr': 9e-05, 'max_len': 100, 'dp_rate': 0.2, 'optimizer': 'rmsprop', 'emb_path': '/data/models/glove.twitter.27B.200d.txt', 'emb_dim': 200, 'unique_domains': ['ALM', 'Baltimore', 'BLM', 'Davidson', 'Election', 'MeToo', 'Sandy', 'vaccine'], 'bidirectional': False, 'device': device(type='cuda'), 'num_label': 10, 'tok_dir': '../resource/model/vaccine', 'word_emb_path': '../resource/model/vaccine/vaccine.npy'}


  2%|▏         | 1/50 [00:05<04:21,  5.33s/it]

Epoch:  0
0.6703248729733335


  4%|▍         | 2/50 [00:10<04:17,  5.36s/it]

Epoch:  1
0.6569765018134319


  6%|▌         | 3/50 [00:16<04:13,  5.40s/it]

Epoch:  2
0.6657323307189895


  8%|▊         | 4/50 [00:21<04:09,  5.42s/it]

Epoch:  3
0.6727744800636425


 10%|█         | 5/50 [00:27<04:03,  5.41s/it]

Epoch:  4
0.6727744800636425


 12%|█▏        | 6/50 [00:32<03:57,  5.39s/it]

Epoch:  5
0.6783432644603227


 14%|█▍        | 7/50 [00:37<03:51,  5.39s/it]

Epoch:  6
0.6825931464147722


 16%|█▌        | 8/50 [00:43<03:47,  5.41s/it]

Epoch:  7
0.6784002037736989


 18%|█▊        | 9/50 [00:48<03:41,  5.39s/it]

Epoch:  8
0.6784557412450767


 20%|██        | 10/50 [00:53<03:34,  5.37s/it]

Epoch:  9
0.6887619687552494


 22%|██▏       | 11/50 [00:59<03:30,  5.41s/it]

Epoch:  10
0.7098205804962632


 24%|██▍       | 12/50 [01:04<03:25,  5.40s/it]

Epoch:  11
0.7254899653680671


 26%|██▌       | 13/50 [01:10<03:18,  5.35s/it]

Epoch:  12
0.7348574456972564


 28%|██▊       | 14/50 [01:15<03:12,  5.35s/it]

Epoch:  13
0.739360913827107


 30%|███       | 15/50 [01:20<03:04,  5.26s/it]

Epoch:  14
0.7404514934361075


 32%|███▏      | 16/50 [01:25<02:59,  5.27s/it]

Epoch:  15
0.7065078120917594


 34%|███▍      | 17/50 [01:31<02:54,  5.28s/it]

Epoch:  16
0.7420491527325775


 36%|███▌      | 18/50 [01:36<02:48,  5.27s/it]

Epoch:  17
0.7267994145210566


 38%|███▊      | 19/50 [01:41<02:43,  5.29s/it]

Epoch:  18
0.7357203426889599


 40%|████      | 20/50 [01:46<02:38,  5.28s/it]

Epoch:  19
0.7409142835739836


 42%|████▏     | 21/50 [01:52<02:32,  5.27s/it]

Epoch:  20
0.739819167079484


 44%|████▍     | 22/50 [01:57<02:26,  5.25s/it]

Epoch:  21
0.739819167079484


 46%|████▌     | 23/50 [02:02<02:21,  5.26s/it]

Epoch:  22
0.739242543246298


 48%|████▊     | 24/50 [02:07<02:16,  5.27s/it]

Epoch:  23
0.7401076883350388


 50%|█████     | 25/50 [02:13<02:11,  5.26s/it]

Epoch:  24
0.7402443165756979


 52%|█████▏    | 26/50 [02:18<02:06,  5.27s/it]

Epoch:  25
0.7392681337429443


 54%|█████▍    | 27/50 [02:23<02:00,  5.26s/it]

Epoch:  26
0.7391960997323865


 56%|█████▌    | 28/50 [02:28<01:55,  5.26s/it]

Epoch:  27
0.7407002831399901


 58%|█████▊    | 29/50 [02:34<01:50,  5.28s/it]

Epoch:  28
0.7356105181340223


 60%|██████    | 30/50 [02:39<01:43,  5.15s/it]

Epoch:  29
0.7350974219995965


 62%|██████▏   | 31/50 [02:44<01:39,  5.21s/it]

Epoch:  30
0.7373846225928875


 64%|██████▍   | 32/50 [02:49<01:34,  5.22s/it]

Epoch:  31
0.7387936146848032


 66%|██████▌   | 33/50 [02:55<01:29,  5.28s/it]

Epoch:  32
0.7321240380605785


 68%|██████▊   | 34/50 [03:00<01:25,  5.33s/it]

Epoch:  33
0.7326247758020248


 70%|███████   | 35/50 [03:05<01:20,  5.35s/it]

Epoch:  34
0.7278798477918283


 72%|███████▏  | 36/50 [03:11<01:14,  5.35s/it]

Epoch:  35
0.727201981991376


 74%|███████▍  | 37/50 [03:16<01:09,  5.38s/it]

Epoch:  36
0.7207520215528785


 76%|███████▌  | 38/50 [03:22<01:04,  5.36s/it]

Epoch:  37
0.7229772530953066


 78%|███████▊  | 39/50 [03:27<00:59,  5.37s/it]

Epoch:  38
0.7202507744044693


 80%|████████  | 40/50 [03:32<00:53,  5.38s/it]

Epoch:  39
0.7129904385097263


 82%|████████▏ | 41/50 [03:38<00:48,  5.40s/it]

Epoch:  40
0.7163352101386397


 84%|████████▍ | 42/50 [03:43<00:43,  5.40s/it]

Epoch:  41
0.7162790697674419


 86%|████████▌ | 43/50 [03:49<00:37,  5.39s/it]

Epoch:  42
0.7160997572900911


 88%|████████▊ | 44/50 [03:54<00:32,  5.39s/it]

Epoch:  43
0.7130584799453589


 90%|█████████ | 45/50 [03:59<00:26,  5.31s/it]

Epoch:  44
0.7146417664197704


 92%|█████████▏| 46/50 [04:05<00:21,  5.33s/it]

Epoch:  45
0.7162790697674419


 94%|█████████▍| 47/50 [04:10<00:16,  5.35s/it]

Epoch:  46
0.7131897004729374


 96%|█████████▌| 48/50 [04:15<00:10,  5.36s/it]

Epoch:  47
0.705572598939784


 98%|█████████▊| 49/50 [04:21<00:05,  5.38s/it]

Epoch:  48
0.7003265691138455


100%|██████████| 50/50 [04:26<00:00,  5.33s/it]

Epoch:  49
0.7123357571118766





In [141]:
# Highest test_score seen so far; the training loop overwrites it whenever an epoch's test_score improves.
best_score

0.7420491527325775

In [None]:


#         with open(params['result_path'], 'a') as wfile:
#             wfile.write('{}...............................\n'.format(datetime.datetime.now()))
#             wfile.write('Performance Evaluation for the task: {}\n'.format(params['dname']))
#             wfile.write('F1-weighted score: {}\n'.format(
#                 metrics.f1_score(y_true=y_trues, y_pred=y_preds, average='weighted')
#             ))
#             fpr, tpr, _ = metrics.roc_curve(y_true=y_trues, y_score=y_probs)
#             wfile.write('AUC score: {}\n'.format(
#                 metrics.auc(fpr, tpr)
#             ))
#             report = metrics.classification_report(
#                 y_true=y_trues, y_pred=y_preds, digits=3
#             )
#             print(report)
#             wfile.write(report)
#             wfile.write('\n')

#             wfile.write('Fairness Evaluation\n')

#             wfile.write('...............................\n\n')
#             wfile.flush()