In [1]:
from google.colab import drive

# Colab-only step: attach Google Drive so the homework package and data files
# stored there are reachable from the notebook's filesystem. force_remount=True
# is passed so that re-running this cell requests a fresh mount.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT, force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import os
import sys
import time
import random
import argparse
import numpy as np

import torch
from sklearn import metrics
import torch.optim as optim

In [3]:
# Make the homework package (the `utils` modules imported in the next cell)
# importable from Google Drive.
path = '/content/drive/My Drive/NLP_COLAB/homework8/'
# Guard the append so that re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if path not in sys.path:
    sys.path.append(path)
print(sys.path)

['', '/env/python', '/usr/lib/python36.zip', '/usr/lib/python3.6', '/usr/lib/python3.6/lib-dynload', '/usr/local/lib/python3.6/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.6/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/My Drive/NLP_COLAB/homework8/']


In [0]:
from utils.core_nns import BiRNN as fNN
from utils.other_utils import Progbar, Timer, SaveloadHP
from utils.data_utils import Vocab, Data2tensor, Txtfile, seqPAD, Embeddings

In [0]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 28 03:38:32 2018

@author: duytinvo
"""


# Seed every random number generator this notebook touches (NumPy, PyTorch and
# Python's own `random`) with one shared value so runs are repeatable.
seed_num = 12345
np.random.seed(seed_num)
torch.manual_seed(seed_num)
random.seed(seed_num)


class Classifier(object):
    """Build, train, evaluate and query a neural text classifier.

    The network is created by ``fNN``; every hyper-parameter, the vocabulary and the
    file locations are read from the ``args`` object given to the constructor.
    """

    def __init__(self, args=None):
        """Create the model, its optimizer and the word/label encoders.

        :param args: hyper-parameter container. Although the default is ``None``, its
            attributes are read immediately, so a populated object is required. It must
            already carry ``vocab`` and ``word_pred_embs`` (``build_data`` sets both).
        """
        self.args = args
        self.device = torch.device("cuda:0" if self.args.use_cuda else "cpu")

        word_bidirect = True
        word_HPs = [self.args.word_nnmode, len(self.args.vocab.w2i), self.args.word_dim,
                    self.args.word_pred_embs, self.args.word_hidden_dim, self.args.dropout,
                    self.args.word_layers, word_bidirect, self.args.zero_padding, self.args.word_att]

        self.model = fNN(word_HPs=word_HPs, filter_size=self.args.filter_size,
                         out_channels=self.args.out_channels,
                         use_batchnorm=self.args.use_batchnorm,
                         num_labels=len(self.args.vocab.l2i)).to(self.device)

        # Optimizers that only need the learning rate; any other name falls back to
        # SGD with momentum 0.9, exactly as the original if/elif chain did.
        optimizer_classes = {"adamax": optim.Adamax, "adam": optim.Adam,
                             "adadelta": optim.Adadelta, "adagrad": optim.Adagrad}
        optimizer_class = optimizer_classes.get(self.args.optimizer.lower())
        if optimizer_class is None:
            self.optimizer = optim.SGD(self.model.parameters(), lr=self.args.lr, momentum=0.9)
        else:
            self.optimizer = optimizer_class(self.model.parameters(), lr=self.args.lr)

        self.word2idx = self.args.vocab.wd2idx(vocab_words=self.args.vocab.w2i, allow_unk=True,
                                               start_end=self.args.start_end)
        self.tag2idx = self.args.vocab.tag2idx(vocab_tags=self.args.vocab.l2i)

    def evaluate_batch(self, eva_data):
        """Run the model over ``eva_data`` and score its predictions.

        :param eva_data: dataset accepted by ``args.vocab.minibatches``
        :return: ``(measures, speed)`` where ``measures`` is the dict produced by
            ``class_metrics`` and ``speed`` is evaluated samples per second
        """
        wl = self.args.vocab.wl
        batch_size = self.args.batch_size
        # Switch the model to evaluation mode.
        self.model.eval()
        start = time.time()
        y_true = Data2tensor.idx2tensor([], self.device)
        y_pred = Data2tensor.idx2tensor([], self.device)
        with torch.no_grad():
            for words, label_ids in self.args.vocab.minibatches(eva_data, batch_size=batch_size):
                word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=wl)

                data_tensors = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, self.device)
                label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

                y_true = torch.cat([y_true, label_tensor])
                label_score = self.model(word_tensor, sequence_lengths)
                label_prob, label_pred = self.model.inference(label_score, k=1)

                y_pred = torch.cat([y_pred, label_pred])

        # The metric functions work on host arrays, so move the tensors off the device
        # first; handing them CUDA tensors fails. reshape(-1) (instead of a bare
        # squeeze()) keeps a 1-D array even when only one sample was evaluated.
        # Assumes inference(k=1) yields one prediction per sample -- TODO confirm.
        y_true_np = y_true.detach().cpu().numpy()
        y_pred_np = y_pred.detach().reshape(-1).cpu().numpy()
        measures = self.class_metrics(y_true_np, y_pred_np)

        elapsed = time.time() - start
        # Guard against a zero reading from a coarse clock on tiny inputs.
        speed = len(y_true) / elapsed if elapsed > 0 else float("inf")
        return measures, speed

    def train_batch(self, train_data):
        """Train the model for one pass over ``train_data``.

        :param train_data: dataset accepted by ``args.vocab.minibatches``; must support ``len``
        :return: mean of the per-batch losses
        """
        wl = self.args.vocab.wl
        clip_rate = self.args.clip

        batch_size = self.args.batch_size
        num_train = len(train_data)
        # Ceiling division: the previous `n // batch_size + 1` over-counted by one
        # whenever n is an exact multiple of batch_size, so the bar never completed.
        # Presumes minibatches() also yields the final partial batch -- TODO confirm.
        total_batch = max(1, (num_train + batch_size - 1) // batch_size)
        prog = Progbar(target=total_batch)
        # Switch the model to training mode.
        self.model.train()
        train_loss = []
        for i, (words, label_ids) in enumerate(self.args.vocab.minibatches(train_data, batch_size=batch_size)):
            word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=wl)

            data_tensors = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, self.device)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

            self.model.zero_grad()
            label_score = self.model(word_tensor, sequence_lengths)
            batch_loss = self.model.NLL_loss(label_score, label_tensor)
            loss_value = batch_loss.item()
            train_loss.append(loss_value)

            batch_loss.backward()

            # Gradient-norm clipping is only applied for a positive clip value.
            if clip_rate > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_rate)

            self.optimizer.step()

            prog.update(i + 1, [("Train loss", loss_value)])
        return np.mean(train_loss)

    def lr_decay(self, epoch):
        """Set every optimizer parameter group's learning rate to ``lr / (1 + decay_rate * epoch)``.

        :param epoch: zero-based epoch index
        """
        lr = self.args.lr/(1+self.args.decay_rate*epoch)
        print("INFO: - Learning rate is setted as: %f"%lr)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

    def _evaluate_best_on_test(self, test_data):
        """Reload the checkpoint stored at ``args.model_name`` and evaluate it on ``test_data``."""
        self.model.load_state_dict(torch.load(self.args.model_name))
        self.model.to(self.device)
        return self.evaluate_batch(test_data)

    def train(self):
        """Train with early stopping on the dev macro-F1, then report test metrics.

        After each epoch the model is scored on the dev file. Whenever the macro-F1
        improves, the weights are saved to ``args.model_name``. Training ends after
        ``args.max_epochs`` epochs or after ``args.patience`` consecutive epochs without
        improvement; in both cases the saved weights are reloaded and evaluated on the
        test file.
        """
        train_data = Txtfile(self.args.train_file, firstline=False, word2idx=self.word2idx, tag2idx=self.tag2idx)
        dev_data = Txtfile(self.args.dev_file, firstline=False, word2idx=self.word2idx, tag2idx=self.tag2idx)
        test_data = Txtfile(self.args.test_file, firstline=False, word2idx=self.word2idx, tag2idx=self.tag2idx)

        max_epochs = self.args.max_epochs
        saved_epoch = 0
        best_dev = -1
        best_metrics = {}

        nepoch_no_imprv = 0
        epoch_start = time.time()
        for epoch in range(max_epochs):
            if self.args.decay_rate > 0:
                self.lr_decay(epoch)
            print("Epoch: %s/%s" %(epoch,max_epochs))
            train_loss = self.train_batch(train_data)
            # Score the current weights on the development set.
            dev_metrics, dev_speed = self.evaluate_batch(dev_data)
            dev_metric_standard = dev_metrics["prf_macro"][2]
            if dev_metric_standard > best_dev:
                nepoch_no_imprv = 0
                saved_epoch = epoch
                best_dev = dev_metric_standard
                best_metrics = dev_metrics
                print("UPDATES: - New improvement")
                print("         - Train loss: %.4f"%train_loss)
                print("         - Dev acc: %.2f(%%); Dev P: %.2f(%%); Dev R: %.2f(%%);Dev F1: %.2f(%%); Dev speed: %.2f(sent/s)"%(100*dev_metrics["acc"],
                      100*dev_metrics["prf_macro"][0], 100*dev_metrics["prf_macro"][1], 100*dev_metrics["prf_macro"][2], dev_speed))
                print("         - Save the model to %s at epoch %d"%(self.args.model_name,saved_epoch))
                # Move the model to the CPU while saving to avoid running out of GPU memory.
                self.model.to("cpu")
                torch.save(self.model.state_dict(), self.args.model_name)
                self.model.to(self.device)
            else:
                nepoch_no_imprv += 1
                if nepoch_no_imprv >= self.args.patience:
                    test_metrics, test_speed = self._evaluate_best_on_test(test_data)
                    print("\nSUMMARY: - Early stopping after %d epochs without improvements"%(nepoch_no_imprv))
                    print("         - Dev acc: %.2f(%%); Dev P: %.2f(%%); Dev R: %.2f(%%);Dev F1: %.2f(%%)"%(100*best_metrics["acc"],
                          100*best_metrics["prf_macro"][0], 100*best_metrics["prf_macro"][1], 100*best_metrics["prf_macro"][2]))
                    print("         - Load the best model from: %s at epoch %d"%(self.args.model_name,saved_epoch))
                    print("         - Test acc: %.2f(%%); Test P: %.2f(%%); Test R: %.2f(%%);Test F1: %.2f(%%); "
                          "Test speed: %.2f(sent/s)"%(100*test_metrics["acc"], 100*test_metrics["prf_macro"][0],
                                                      100*test_metrics["prf_macro"][1],
                                                      100*test_metrics["prf_macro"][2], test_speed))

                    return

            epoch_finish = Timer.timeEst(epoch_start,(epoch+1)/max_epochs)
            print("\nINFO: - Trained time(Remained time for %d epochs: %s"%(max_epochs, epoch_finish))

        test_metrics, test_speed = self._evaluate_best_on_test(test_data)
        print("\nSUMMARY: - Completed %d epoches"%(max_epochs))
        print("         - Dev acc: %.2f(%%); Dev P: %.2f(%%); Dev R: %.2f(%%);Dev F1: %.2f(%%)"%(100*best_metrics["acc"],
              100*best_metrics["prf_macro"][0], 100*best_metrics["prf_macro"][1], 100*best_metrics["prf_macro"][2]))
        print("         - Load the best model from: %s at epoch %d"%(self.args.model_name,saved_epoch))
        print("         - Test acc: %.2f(%%); Test P: %.2f (%%); Test R: %.2f(%%);Test F1: %.2f(%%); Test speed: %.2f(sent/s)"%(100*test_metrics["acc"],
              100*test_metrics["prf_macro"][0], 100*test_metrics["prf_macro"][1], 100*test_metrics["prf_macro"][2], test_speed))
        return

    def predict(self, sent, k=1):
        """Predict the top-``k`` labels for one sentence.

        :param sent: processed sentence, in the form accepted by ``self.word2idx``
        :param k: int, number of predictions to return
        :return: ``(label_prob, label_pred)`` as produced by ``model.inference``
        """
        wl = self.args.vocab.wl
        # Switch the model to evaluation mode.
        self.model.eval()

        # sort_tensors expects labels, so supply a placeholder that is never used.
        fake_label = [0]
        words = self.word2idx(sent)
        word_ids, sequence_lengths = seqPAD.pad_sequences([words], pad_tok=0, wthres=wl)

        data_tensors = Data2tensor.sort_tensors(fake_label, word_ids, sequence_lengths, self.device)
        fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

        # Inference needs no gradients; skip autograd bookkeeping like evaluate_batch does.
        with torch.no_grad():
            label_score = self.model(word_tensor, sequence_lengths)
            label_prob, label_pred = self.model.inference(label_score, k)
        return label_prob, label_pred

    @staticmethod
    def class_metrics(y_true, y_pred):
        """Compute accuracy and precision/recall/F-score summaries.

        :param y_true: gold label ids
        :param y_pred: predicted label ids
        :return: dict with keys ``acc``, ``prf_macro``, ``prf_weighted`` and
            ``prf_individual``; each ``prf_*`` value is the tuple returned by
            ``precision_recall_fscore_support`` for that averaging mode
        """
        acc = metrics.accuracy_score(y_true, y_pred)
        f1_ma = metrics.precision_recall_fscore_support(y_true, y_pred, average='macro')
        f1_we = metrics.precision_recall_fscore_support(y_true, y_pred, average='weighted')
        f1_no = metrics.precision_recall_fscore_support(y_true, y_pred, average=None)
        measures = {"acc":acc, "prf_macro":f1_ma, "prf_weighted":f1_we, "prf_individual":f1_no}
        return measures


def build_data(args):
    """Build the vocabulary (and optional pretrained embeddings) and persist ``args``.

    :param args: hyper-parameter container; reads ``model_args``, ``word_thres``,
        ``cutoff``, ``train_file``, ``dev_file``, ``test_file``, ``emb_file`` and
        ``word_dim``
    :return: the same ``args`` object, with ``vocab`` and ``word_pred_embs`` set
        (``word_pred_embs`` is ``None`` when ``emb_file`` is the empty string)
    """
    print("Building dataset...")
    model_dir, _ = os.path.split(args.model_args)
    # os.path.split yields "" for a bare filename, and os.mkdir("") raises; os.mkdir
    # also fails when parent directories are missing. Only create a directory when
    # one is named, and create any missing parents along the way.
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    vocab = Vocab(wl_th=args.word_thres, cutoff=args.cutoff)
    vocab.build([args.train_file, args.dev_file, args.test_file], firstline=False)
    args.vocab = vocab
    if args.emb_file != "":
        args.word_pred_embs = Embeddings.get_W(args.emb_file, wsize=args.word_dim, vocabx=vocab.w2i)
    else:
        args.word_pred_embs = None
    # Persist the enriched arguments so a later session can reload them.
    SaveloadHP.save(args, args.model_args)
    return args


In [6]:
if __name__ == '__main__':
    # Single definition of the Drive folder holding the data, embeddings and model
    # artefacts; every file path below is derived from it.
    data_dir = "/content/drive/My Drive/NLP_COLAB/homework8/data"

    # Start from the previously saved hyper-parameters, then override what this run needs.
    args = SaveloadHP.load(os.path.join(data_dir, "classifier.args"))

    # Input corpora and pretrained embeddings.
    args.train_file = os.path.join(data_dir, "train.txt")
    args.dev_file = os.path.join(data_dir, "val.txt")
    args.test_file = os.path.join(data_dir, "test.txt")
    args.emb_file = os.path.join(data_dir, "glove.6B.50d.txt")

    # Output artefacts: model weights and the hyper-parameter file.
    args.model_name = os.path.join(data_dir, "classifier.m")
    args.model_args = os.path.join(data_dir, "classifier.args")

    args.word_dim = 50
    args.word_hidden_dim = 100
    args.use_batchnorm = True
    args.dropout = 0.4
    args.use_cuda = False
    args.word_att = True  # enable attention when training the LSTM
    args.filter_size = [2, 3, 4, 5]  # filter sizes for the CNN
    args.out_channels = 32  # number of output channels for the CNN

    # build_data attaches the vocabulary/embeddings and writes args back to args.model_args.
    args = build_data(args)

    classifier = Classifier(args)

    classifier.train()

Reading hyper-parameters from /content/drive/My Drive/NLP_COLAB/homework8/data/classifier.args
Building dataset...
Extracting vocabulary:
	24988 total samples, 2652560 total tokens, 24988 total labels
	47091 unique tokens, 5 unique labels
	21802 unique tokens appearing at least 2 times
Extracting pretrained embeddings:
	400000 pre-trained word embeddings
Mapping to vocabulary:
	3019 randomly word vectors;
	0 partially word vectors;
	18787 pre-trained embeddings.
Writing hyper-parameters into /content/drive/My Drive/NLP_COLAB/homework8/data/classifier.args
INFO: - Learning rate is setted as: 0.001000
Epoch: 0/32




UPDATES: - New improvement
         - Train loss: 1.3554
         - Dev acc: 49.98(%); Dev P: 50.25(%); Dev R: 49.85(%);Dev F1: 49.06(%); Dev speed: 294.69(sent/s)
         - Save the model to /content/drive/My Drive/NLP_COLAB/homework8/data/classifier.m at epoch 0

INFO: - Trained time(Remained time for 32 epochs: 9m 9s (- 284m 5s)
INFO: - Learning rate is setted as: 0.000952
Epoch: 1/32
UPDATES: - New improvement
         - Train loss: 1.1269
         - Dev acc: 53.14(%); Dev P: 52.57(%); Dev R: 53.04(%);Dev F1: 51.89(%); Dev speed: 298.36(sent/s)
         - Save the model to /content/drive/My Drive/NLP_COLAB/homework8/data/classifier.m at epoch 1

INFO: - Trained time(Remained time for 32 epochs: 18m 19s (- 274m 52s)
INFO: - Learning rate is setted as: 0.000909
Epoch: 2/32
UPDATES: - New improvement
         - Train loss: 1.0490
         - Dev acc: 56.30(%); Dev P: 55.24(%); Dev R: 56.15(%);Dev F1: 54.96(%); Dev speed: 298.22(sent/s)
         - Save the model to /content/drive/My Dr

In [0]:
# Record the installed package versions for reproducibility.
%pip freeze