In [1]:
import os
import torch

import sys
sys.path.append('../')
os.chdir('../')


from xlm.utils import AttrDict
from xlm.data.dictionary import Dictionary, BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD, MASK_WORD
from xlm.model.transformer import TransformerModel

FAISS library was not found.
FAISS not available. Switching to standard nearest neighbors search implementation.


## Reload a pretrained model

In [2]:
# Path of the pretrained checkpoint to reload.
# NOTE(review): absolute, machine-specific path — adjust it for your environment.
model_path = "/projectnb/statnlp/gkuwanto/XLM/dumped/baseline_para_0/q3v4i6kl9t/best-valid_mlm_ppl.pth"

# Deserialize every tensor onto the CPU so that loading does not depend on the
# device(s) the checkpoint was saved from; the model is moved to the GPU
# explicitly later, right before fine-tuning.
# SECURITY: torch.load unpickles the file — only load checkpoints you trust.
reloaded = torch.load(model_path, map_location='cpu')

# Training-time hyper-parameters stored in the checkpoint, exposed as attributes.
params = AttrDict(reloaded['params'])
print("Supported languages: %s" % ", ".join(params.lang2id.keys()))

Supported languages: en, id


## Build dictionary / update parameters / build model

In [3]:
# build dictionary / update parameters
dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
params.n_words = len(dico)
# Special-token indices are looked up in the dictionary stored with the checkpoint.
params.bos_index = dico.index(BOS_WORD)
params.eos_index = dico.index(EOS_WORD)
params.pad_index = dico.index(PAD_WORD)
params.unk_index = dico.index(UNK_WORD)
params.mask_index = dico.index(MASK_WORD)

# build model / reload weights
# NOTE(review): the two positional booleans are presumably `is_encoder` and
# `with_output` — confirm against the TransformerModel signature.
model = TransformerModel(params, dico, True, True)
model.eval()

from collections import OrderedDict

# Saved parameter names may carry a leading "module." (presumably added by a
# DataParallel-style wrapper at training time — confirm). Strip it only when it
# is a *prefix*: a plain str.replace would also rewrite names that merely
# contain "module." somewhere else (e.g. "submodule.weight").
state_prefix = 'module.'
reloaded_model = OrderedDict(
    (k[len(state_prefix):] if k.startswith(state_prefix) else k, v)
    for k, v in reloaded['model'].items()
)
model.load_state_dict(reloaded_model)

<All keys matched successfully>


## Get sentence representations

Sentences have to be in the BPE format, i.e. tokenized sentences on which you applied fastBPE.

In [4]:
# Below is one way to bpe-ize sentences
codes = "" # path to the codes of the model
fastbpe = os.path.join(os.getcwd(), 'tools/fastBPE/fast')

def to_bpe(sentences):
    """Apply the fastBPE codes in `codes` to a list of tokenized sentences.

    Args:
        sentences: iterable of strings, one sentence per item.

    Returns:
        A list of strings, one per line of fastBPE output, right-stripped.
        When `codes` is empty (no codes file configured) there is nothing to
        apply, so the input sentences are returned right-stripped and
        otherwise unchanged.

    Raises:
        FileNotFoundError: if the `fastbpe` binary does not exist.
        subprocess.CalledProcessError: if fastBPE exits with a non-zero status.
    """
    import subprocess
    import tempfile

    if not codes:
        # No BPE codes configured: treat the sentences as already BPE-ized.
        return [sent.rstrip() for sent in sentences]

    # A private temporary directory avoids clashing with other runs that would
    # otherwise share fixed /tmp file names, and is cleaned up automatically.
    with tempfile.TemporaryDirectory() as tmp_dir:
        raw_path = os.path.join(tmp_dir, 'sentences')
        bpe_path = os.path.join(tmp_dir, 'sentences.bpe')

        # write the raw sentences to the *input* file
        with open(raw_path, 'w', encoding='utf-8') as fwrite:
            for sent in sentences:
                fwrite.write(sent + '\n')

        # fastBPE command line: `fast applybpe <output> <input> <codes>`.
        # An argument list (no shell) keeps paths with spaces or special
        # characters intact; check=True surfaces failures instead of silently
        # reading stale or missing output.
        subprocess.run([fastbpe, 'applybpe', bpe_path, raw_path, codes], check=True)

        # load bpe-ized sentences from the *output* file
        with open(bpe_path, encoding='utf-8') as f:
            return [line.rstrip() for line in f]


In [5]:
# Example input: a plain list of sentence strings (assumed to be already in
# BPE format, as the original note on this cell states).
sentences = [
    'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !',
    'aaa',
]

# Pass the sentences through the BPE helper and display them
sentences = to_bpe(sentences)
print('\n\n'.join(sentences))

# Count how many whitespace-separated tokens are missing from the dictionary
all_tokens = ' '.join(sentences).split()
n_w = len(all_tokens)
n_oov = sum(1 for tok in all_tokens if tok not in dico.word2id)
print('Number of out-of-vocab words: %s/%s' % (n_oov, n_w))

# Surround each sentence with </s> delimiters and split it into a token list
sentences = [('</s> %s </s>' % text.strip()).split() for text in sentences]

warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !

aaa
Number of out-of-vocab words: 2/67


### Create batch

In [6]:
# Batch geometry: number of sentences, per-sentence token counts, longest sentence
bs = len(sentences)
lengths = torch.LongTensor([len(tokens) for tokens in sentences])
slen = max(len(tokens) for tokens in sentences)

# (slen, bs) matrix of token ids: one column per sentence, padded with pad_index
word_ids = torch.full((slen, bs), params.pad_index, dtype=torch.long)
for col, tokens in enumerate(sentences):
    ids = torch.LongTensor([dico.index(w) for w in tokens])
    word_ids[:len(ids), col] = ids

# Language ids: the 'id' language index broadcast to every position of the
# batch when the model knows several languages, otherwise no language input.
if params.n_langs > 1:
    langs = torch.LongTensor([params.lang2id['id']]).unsqueeze(0).expand(slen, bs)
else:
    langs = None


In [7]:
word_ids

tensor([[    1,     1],
        [ 3367, 14369],
        [   20,     1],
        [ 1017,     2],
        [   45,     2],
        [ 1891,     2],
        [ 1616,     2],
        [  177,     2],
        [   16,     2],
        [   48,     2],
        [ 2375,     2],
        [   60,     2],
        [  772,     2],
        [   82,     2],
        [  177,     2],
        [  539,     2],
        [   18,     2],
        [  647,     2],
        [   14,     2],
        [  177,     2],
        [ 1089,     2],
        [   15,     2],
        [25377,     2],
        [ 3301,     2],
        [ 2851,     2],
        [   15,     2],
        [25377,     2],
        [    3,     2],
        [   15,     2],
        [ 4504,     2],
        [ 3367,     2],
        [   16,     2],
        [ 3075,     2],
        [  969,     2],
        [  323,     2],
        [ 4369,     2],
        [  177,     2],
        [   15,     2],
        [ 1702,     2],
        [  969,     2],
        [  303,     2],
        [   76, 

### Forward

In [8]:
# Non-causal forward pass over the batch; the result is made contiguous.
fwd_inputs = dict(x=word_ids, lengths=lengths, langs=langs, causal=False)
tensor = model('fwd', **fwd_inputs).contiguous()
# Show the size of the last dimension of the output
print(tensor.shape[-1])

1024


In [9]:
from torch import nn

# Classification head placed on the GPU: dropout followed by a linear layer.
# 1024 is the size of the last dimension printed by the forward cell; 5 is
# presumably the number of emotion labels — confirm against the label map.
proj = nn.Sequential(
    nn.Dropout(params.dropout),
    nn.Linear(1024, 5),
).cuda()

In [10]:
logits = proj(tensor[0].cuda())

In [11]:
logits

tensor([[ 0.7605,  0.6221, -0.7498,  0.0441, -1.0310],
        [-0.9203,  1.3316,  0.0143,  0.4023, -0.2172]], device='cuda:0',
       grad_fn=<AddmmBackward>)

In [12]:
logits.data.max(1)[1]

tensor([0, 1], device='cuda:0')

The variable `tensor` is of shape `(sequence_length, batch_size, model_dimension)`.

`tensor[0]` is a tensor of shape `(batch_size, model_dimension)` that corresponds to the first hidden state of the last layer of each sentence.

This is the vector that we use to fine-tune on the GLUE and XNLI tasks.

# Fine-tuning on EmoT (emotion classification)

In [13]:
from xlm_indo_nlu_utils.data_loader_utils import EmotionDetectionDataset, EmotionDetectionDataLoader

import random

import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm


from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn



def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators with `seed`."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)
set_seed(13131)

In [14]:
train_dataset_path = './dataset/emot_emotion-twitter/train_preprocess.csv'
valid_dataset_path = './dataset/emot_emotion-twitter/valid_preprocess.csv'
test_dataset_path = './dataset/emot_emotion-twitter/test_preprocess_masked_label.csv'

In [15]:
# Datasets: every split is built with the same dictionary, params and lowercasing
dataset_options = dict(lowercase=True)
train_dataset = EmotionDetectionDataset(train_dataset_path, dico, params, **dataset_options)
valid_dataset = EmotionDetectionDataset(valid_dataset_path, dico, params, **dataset_options)
test_dataset = EmotionDetectionDataset(test_dataset_path, dico, params, **dataset_options)

# Loaders: identical settings for all splits; only the training split is shuffled
loader_options = dict(params=params, max_seq_len=512, batch_size=16, num_workers=16)
train_loader = EmotionDetectionDataLoader(dataset=train_dataset, shuffle=True, **loader_options)
valid_loader = EmotionDetectionDataLoader(dataset=valid_dataset, shuffle=False, **loader_options)
test_loader = EmotionDetectionDataLoader(dataset=test_dataset, shuffle=False, **loader_options)

In [16]:
train_dataset[0]

(tensor([   3,  677,  367,    3,  252,  425,  252,  958,  252,  877,    3,  877,
          252,  958,    3,  882,  252,  677,  792,    3,  824,  252,  958,  367,
          677,  792,    3,  543,  655,  543,  458,  252,  877,  252,  792,  367,
          252,  868,  252,  677,    3,  357,  252,  252,  710,    3,  458,  367,
          252,  357,  868, 1318,    3, 1044,  749,  710,  749,    3,  458,  655,
         1099,  357,  252,  543,  252,    3,    3,    3,    3,    3,  739,    3,
          655,  710, 1318, 1099,  677,    3, 1044,    3,  877,  655,    3,  367,
          710,  710,  958,  655,    3, 1099,  367,  677,  524,  655,    3,  739,
            3,  252,  824,  824,  882,    3,  655,  655,    3,  877, 1318,  958,
            3,  252,  882]),
 array(4),
 'Ini adalah hal yang paling membahagiakan saat biasku foto bersama ELF #ReturnOfTheLittlePrince #HappyHeeChulDay')

In [17]:
w2i, i2w = EmotionDetectionDataset.LABEL2INDEX, EmotionDetectionDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'sadness': 0, 'anger': 1, 'love': 2, 'fear': 3, 'happy': 4}
{0: 'sadness', 1: 'anger', 2: 'love', 3: 'fear', 4: 'happy'}


In [18]:
import time

## Train and test

In [19]:
# NOTE: this rebinds `forward_sequence_classification`, replacing the function of
# the same name imported from `utils.forward_fn` in an earlier cell; the
# training and evaluation loops below therefore use this version.
from xlm_indo_nlu_utils.model_utils import forward_sequence_classification

In [20]:
# Move both modules to the GPU *before* constructing the optimizers, so the
# optimizers are built over parameters that already live on the device they
# will be trained on (as the torch.optim documentation recommends).
model = model.cuda()
proj = proj.cuda()

# Two optimizers with different learning rates: a small one for the reloaded
# pretrained model, a larger one for the newly initialised projection head.
optimizer_m = optim.Adam(model.parameters(), lr=3e-6)
optimizer_p = optim.Adam(proj.parameters(), lr=3e-4)

In [21]:
n_epochs = 5

for epoch in range(n_epochs):
    # ---- Training phase ----
    model.train()
    proj.train()

    total_train_loss = 0
    n_train_batches = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    # Gradient tracking is scoped to this phase with a context manager instead
    # of flipping the process-wide switch, so the cell neither depends on nor
    # alters the grad mode left behind by other cells.
    with torch.enable_grad():
        for i, batch_data in enumerate(train_pbar):
            # Forward model (the last element of each batch is not passed on)
            loss, logits, batch_hyp, batch_label = forward_sequence_classification(proj, model, batch_data[:-1], i2w=i2w, device='cuda')

            # One optimisation step for both the model and the projection head
            optimizer_m.zero_grad()
            optimizer_p.zero_grad()
            loss.backward()
            optimizer_m.step()
            optimizer_p.step()

            total_train_loss += loss.item()
            n_train_batches = i + 1

            list_hyp += batch_hyp
            list_label += batch_label

            train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f}".format((epoch+1),
                total_train_loss/n_train_batches))

    # Calculate train metric (the batch count is tracked explicitly so the
    # average does not rely on a loop variable that may be undefined or stale)
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {}".format((epoch+1),
        total_train_loss/max(n_train_batches, 1), metrics))

    # ---- Validation phase ----
    model.eval()
    proj.eval()

    total_loss = 0
    n_valid_batches = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    with torch.no_grad():
        for i, batch_data in enumerate(pbar):
            loss, logits, batch_hyp, batch_label = forward_sequence_classification(proj, model, batch_data[:-1], i2w=i2w, device='cuda')

            # Calculate total loss
            total_loss += loss.item()
            n_valid_batches = i + 1

            # Collect predictions and labels for the evaluation metrics
            list_hyp += batch_hyp
            list_label += batch_label

            pbar.set_description("VALID LOSS:{:.4f}".format(total_loss/n_valid_batches))

    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/max(n_valid_batches, 1), metrics))
    

(Epoch 1) TRAIN LOSS:1.6140: 100%|██████████| 221/221 [00:48<00:00,  4.55it/s]


(Epoch 1) TRAIN LOSS:1.6140 {'ACC': 0.2556092019312695, 'F1': 0.21262843014044536, 'REC': 0.2246601268011652, 'PRE': 0.22506197185373916}


VALID LOSS:1.5314: 100%|██████████| 28/28 [00:02<00:00, 10.10it/s]


(Epoch 1) VALID LOSS:1.5314 {'ACC': 0.3181818181818182, 'F1': 0.22680458415540666, 'REC': 0.2731539661319073, 'PRE': 0.2760442773600668}


(Epoch 2) TRAIN LOSS:1.5496: 100%|██████████| 221/221 [00:48<00:00,  4.53it/s]


(Epoch 2) TRAIN LOSS:1.5496 {'ACC': 0.30247088895200225, 'F1': 0.247253289361893, 'REC': 0.2648728559798887, 'PRE': 0.2704362341643616}


VALID LOSS:1.5016: 100%|██████████| 28/28 [00:02<00:00, 10.26it/s]
  _warn_prf(average, modifier, msg_start, len(result))


(Epoch 2) VALID LOSS:1.5016 {'ACC': 0.32727272727272727, 'F1': 0.25964385695145287, 'REC': 0.2858797534165181, 'PRE': 0.28630270703866934}


(Epoch 3) TRAIN LOSS:1.5157: 100%|██████████| 221/221 [00:48<00:00,  4.53it/s]


(Epoch 3) TRAIN LOSS:1.5157 {'ACC': 0.3146833285998296, 'F1': 0.27696039531644867, 'REC': 0.2840499341599402, 'PRE': 0.3078698778808213}


VALID LOSS:1.4824: 100%|██████████| 28/28 [00:02<00:00, 10.23it/s]


(Epoch 3) VALID LOSS:1.4824 {'ACC': 0.3659090909090909, 'F1': 0.29681121086319096, 'REC': 0.32540275492481374, 'PRE': 0.45556220963995353}


(Epoch 4) TRAIN LOSS:1.4813: 100%|██████████| 221/221 [00:48<00:00,  4.55it/s]


(Epoch 4) TRAIN LOSS:1.4813 {'ACC': 0.35245668844078387, 'F1': 0.314444911501365, 'REC': 0.32190024239119, 'PRE': 0.35222993068475755}


VALID LOSS:1.4562: 100%|██████████| 28/28 [00:02<00:00, 10.29it/s]


(Epoch 4) VALID LOSS:1.4562 {'ACC': 0.3477272727272727, 'F1': 0.3077723414425235, 'REC': 0.31817953288541523, 'PRE': 0.3524220687891574}


(Epoch 5) TRAIN LOSS:1.4578: 100%|██████████| 221/221 [00:48<00:00,  4.54it/s]


(Epoch 5) TRAIN LOSS:1.4578 {'ACC': 0.36864527122976426, 'F1': 0.3443539197149434, 'REC': 0.34603650294812266, 'PRE': 0.37035793240403175}


VALID LOSS:1.4674: 100%|██████████| 28/28 [00:02<00:00, 10.31it/s]

(Epoch 5) VALID LOSS:1.4674 {'ACC': 0.34545454545454546, 'F1': 0.30887079217141533, 'REC': 0.3183009392568216, 'PRE': 0.392724862111069}





In [22]:

# Evaluate on test
model.eval()
proj.eval()

# Only predictions are collected: the test file is the "masked_label" split.
list_hyp = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
# Disable gradient tracking only for this loop rather than process-wide, so
# cells run afterwards (e.g. re-running training) are not silently affected.
with torch.no_grad():
    for i, batch_data in enumerate(pbar):
        loss, logits, batch_hyp, batch_label = forward_sequence_classification(proj, model, batch_data[:-1], i2w=i2w, device='cuda')
        list_hyp += batch_hyp

# Save prediction (reset_index adds an explicit `index` column with the row number)
df = pd.DataFrame({'label':list_hyp}).reset_index()
# df.to_csv('pred.txt', index=False)

print(df)

100%|██████████| 28/28 [00:02<00:00, 10.17it/s]

     index    label
0        0  sadness
1        1    happy
2        2    anger
3        3    happy
4        4    happy
..     ...      ...
435    435  sadness
436    436  sadness
437    437     fear
438    438    anger
439    439    happy

[440 rows x 2 columns]





In [23]:
df['label'].value_counts()

sadness    232
anger      121
happy       39
fear        32
love        16
Name: label, dtype: int64

In [24]:
# df.to_csv('/projectnb/statnlp/gik/XLM/output/pred-emot.csv', index=False)

In [25]:
# torch.save(model.state_dict(), '/projectnb/statnlp/gik/XLM/output/smsa_xlm_finetuned_model.pth')
# torch.save(proj.state_dict(), '/projectnb/statnlp/gik/XLM/output/smsa_proj.pth')