In [76]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 Zimeng Qiu <zimengq@andrew.cmu.edu>
# Licensed under the Apache License v2.0 - http://www.apache.org/licenses/

import torch
import torch.nn as nn
import pdb

#from configs import DEVICE
from utils.data import is_english_word
from utils.model import weight_init


class FNNLM(nn.Module):
    """
    Feed-forward Neural Network Language Model
    """
    def __init__(self, n_words, emb_size, hid_size, num_hist, dropout):
        super(FNNLM, self).__init__()
        self.embedding = nn.Embedding(n_words, emb_size)
        self.fnn = nn.Sequential(
            nn.Linear(num_hist*emb_size, hid_size), nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(hid_size, n_words)
        )

    def forward(self, words):
        emb = self.embedding(words)       # 3D Tensor of size [batch_size x num_hist x emb_size]
        feat = emb.view(emb.size(0), -1)  # 2D Tensor of size [batch_size x (num_hist*emb_size)]
        logit = self.fnn(feat)            # 2D Tensor of size [batch_size x nwords]

        return logit


class DualLSTM(nn.Module):
    """
    Dual LSTM Language Model
    """
    def __init__(self, batch_size, hidden_size, embed_size, n_gram, vocab,
                 dropout=0.5, embedding=None, freeze=False, dataset='seame', pretrain=None):
        super(DualLSTM, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.dataset = dataset
        self.pretrain = True if pretrain is not None else False

        if self.pretrain:
            self.vocab = pretrain.vocab
            self.vocab.extend(vocab)
            print("Extended vocab from pre-trained model!")
        else:
            self.vocab = vocab
        self.vocab_size = len(self.vocab)
        print(self.vocab_size)

        if embedding is not None:
            self.embedding = nn.Embedding.from_pretrained(embeddings=embedding, freeze=freeze)
        else:
            self.embedding = nn.Embedding(self.vocab_size, embed_size)

        self.dummy_tok = torch.zeros((1, embed_size)).to(DEVICE)

        self.lstm_en = nn.LSTMCell(input_size=embed_size*n_gram, hidden_size=hidden_size, bias=False).to(DEVICE)
        self.lstm_cn = nn.LSTMCell(input_size=embed_size*n_gram, hidden_size=hidden_size, bias=False).to(DEVICE)

        self.fc = nn.Sequential(
            nn.Linear(2*hidden_size, 2*hidden_size),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(2*hidden_size, self.vocab_size)
        ).to(DEVICE)

        self.lang_classifier = nn.Linear(2*self.hidden_size, 2)

        # [batch_size, hidden_size]
        self.hidden_en = self.init_hidden()
        self.hidden_cn = self.init_hidden()
        self.cell = self.init_hidden()

        if self.pretrain:
            for name, param in pretrain.state_dict().items():
                if name not in self.state_dict() or name == 'fc':
                    continue
                if isinstance(param, nn.Parameter):
                    # backwards compatibility for serialized parameters
                    param = param.data
                    if name == 'embedding':
                        self.state_dict()[name].copy_(nn.Parameter(
                            torch.cat((param, sample_gumbel(self.vocab_size - param.shape[0])), dim=0)),
                            requires_grad=True)
                    else:
                        self.state_dict()[name].copy_(param)
        else:
            self.init_weights()

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size).to(DEVICE)

    def detach(self):
        self.hidden_en.detach_()
        self.hidden_cn.detach_()
        self.cell.detach_()

    def init_weights(self):
        self.apply(weight_init)

    def forward(self, sentence, lang_ids=None):
        sent_embed, embed_mask = self.embed_sentence(sentence, lang_ids)
        lstm_out = []
        for i in range(len(sent_embed)):
            if self.training:
                lang_id = embed_mask[i]
            else:
                if i == 0:
                    lang_id = 1
                else:
                    lang_id = torch.argmax(self.lang_classifier(torch.cat((self.hidden_en, self.hidden_cn), dim=1)), dim=1)
            if lang_id > 0:
                self.hidden_en, self.cell = self.lstm_en(sent_embed[i], (self.hidden_en, self.cell))
                self.hidden_cn, self.cell = self.lstm_cn(self.dummy_tok, (self.hidden_en, self.cell))
            else:
                self.hidden_cn, self.cell = self.lstm_cn(sent_embed[i], (self.hidden_cn, self.cell))
                self.hidden_en, self.cell = self.lstm_en(self.dummy_tok, (self.hidden_cn, self.cell))
            lstm_out.append(torch.cat((self.hidden_en, self.hidden_cn), dim=1))
        lstm_out = torch.stack(lstm_out)

        prediction = self.fc(torch.squeeze(lstm_out))
        lang_ids_pred = self.lang_classifier(torch.squeeze(lstm_out))
        return prediction, lang_ids_pred

    def embed_sentence(self, sentence, lang_ids=None):
        embedding = []
        if self.dataset == 'seame' or self.dataset == 'qg' or self.dataset == 'ner':
            embed_mask = torch.zeros(len(sentence))
            for idx, token in enumerate(sentence[:-1]):
                try:
                    embedding.append(self.embedding(torch.LongTensor([self.vocab[token]]).to(DEVICE)))
                    embed_mask[idx] = 1. if is_english_word(token) else 0.
                except Exception as e:
                    print(e, sentence, self.vocab_size, token, self.vocab[token])
        else:
            for idx, token in enumerate(sentence[:-1]):
                try:
                    embedding.append(self.embedding(torch.LongTensor([self.vocab[token]]).to(DEVICE)))
                except Exception as e:
                    print(e, sentence, self.vocab_size, token, self.vocab[token])
            if lang_ids is not None:
                embed_mask = lang_ids
            else:
                embed_mask = None
        return torch.stack(embedding).to(DEVICE), embed_mask.to(DEVICE) if embed_mask is not None else embed_mask



In [29]:
import torch
import random
from torch.utils import data
from vocab import Vocab, Vectors
# from configs import DEVICE


class DataSet(data.Dataset):
    def __init__(self, vocab=None, examples=None, padding=True, sort=False, sort_key=None):
        super(DataSet, self).__init__()
        self.examples = examples if examples is not None else []
        self.vocab = vocab
        self.padding = padding

        if sort and sort_key is not None:
            self.examples.sort(key=sort_key)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        assert item < len(self), 'index out of range'
        return self.examples[item]

    def get_subset(self, start, end):
        assert start < end, 'start index should be less than end index'
        return self.examples[start:end]

    def add(self, example):
        self.examples.append(example)

    def collate(self, batch):
        texts, labels, idxs = zip(*batch)

        # [batch_size]
        lens = torch.LongTensor(
            [len(text) for text in texts]
        )
        max_len = max(lens)
        # [batch_size, max_len]
        texts = torch.LongTensor(
            [
                torch.cat((text, torch.full((max_len - len(text)), self.vocab.stoi['<pad>'])))
                if self.padding and len(text) < max_len else text
                for text in texts
                ]
        )
        labels = torch.LongTensor(labels)

        return texts.to(DEVICE), labels.to(DEVICE)


class BilingualDataSet(DataSet):
    def __init__(self, vocab, examples=None, padding=True, sort=False, sort_key=None):
        super(BilingualDataSet, self).__init__(vocab, examples, padding, sort, sort_key)

    def collate(self, batch):
        # [batch_size]
        lens = [len(text) for text in batch]
        max_len = max(lens)
        # [batch_size, max_len]
        texts = [text + (max_len - len(text)) * ['<pad>']
                 if self.padding and len(text) < max_len else text for text in batch]

        return texts

In [37]:
import logging
import os
from datetime import datetime

timestamp = datetime.now().strftime('%m%d-%H%M%S')
_log_file = '{}/{}.{}.log'.format(log_dir, mode, timestamp)


def init_logger():
    if not os.path.exists(log_dir):
        try:
            os.mkdir(log_dir)
        except IOError:
            print("Can not create log file directory.")
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    fh = logging.FileHandler(_log_file)
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    ch.setLevel(logging.ERROR)
    logger.addHandler(fh)
    logger.addHandler(ch)
    return logger


In [91]:
args_epoch=20
args_model='lstm'
args_batch=64
args_hidden=512
args_embed=300
args_ngram=1
args_maxlen=30
args_optim='adam'
args_dp=0.5
args_mode='train'
args_nworkers=12
args_lr=1e-4
args_mm=0.9
args_clip=0.25
args_data= '../NER/split'
args_subset=1.0
args_models_dir='models'
args_log_dir='log'
args_gpu_id=0
args_qg=False
args_dataset='ner'
args_finetune=False
args_model_path='models/best.pt'
args_lm_path='models/best_hd_1024_full.pt'
args_submission_csv=' data/submission.csv'
args_DEVICE = torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() else 'cpu')

In [94]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2019 Zimeng Qiu <zimengq@andrew.cmu.edu>
# Licensed under the Apache License v2.0 - http://www.apache.org/licenses/

import os
import math
import time
import random
import logging
import torch
import torch.nn as nn
# import torch.nn.functional as F
import pdb

# from lm import FNNLM, DualLSTM
from utils.data import *
from utils.model import *
# from configs import *
from vocab import Vocab
# from dataset import BilingualDataSet
from torch.utils.data import DataLoader
# from log import init_logger


def calc_sent_loss(sent, model, criterion, lang_ids=None):
    """
    Calculate the loss value for the entire sentence
    """
    if lang_ids is not None:
        lang_ids = torch.LongTensor([1 if _ == 'eng' or _ == 'engspa' or _ == '<s>' else 0 for _ in lang_ids]).to(DEVICE)
    targets = torch.LongTensor([model.vocab[tok] for tok in sent[1:]]).to(DEVICE)
    logits, lang_ids_pred = model(sent, lang_ids)
    loss = criterion(logits, targets)
    if lang_ids is not None:
        loss += criterion(lang_ids_pred, lang_ids[1:])

    # gen_sent = ' '.join([model.vocab[idx] for idx in torch.argmax(logits, dim=1)])
    # with open('log/{}_gen_sent.txt'.format( args_dataset), 'a+') as f:
    #     f.write(gen_sent + '\n')
    return loss


def generate_sent(model, max_len):
    """
    Generate a sentence
    """
    hist = ['<s>']
    eos = model.vocab['<s>']
    sent = []

    while len(sent) < max_len:
        logits = model(hist + ['<s>'])[0]
        if logits.dim() > 1:
            logits = logits[-1]
        next_word = gumbel_argmax(logits, dim=0)
        if next_word == eos:
            break
        sent.append(model.vocab.itos[next_word])
        hist += [model.vocab.itos[next_word]]

    return sent


def calc_sentence_logprob(model, sentence):
    """
    Calculates the sentence log-prob
    """
    if len(sentence) < 1:
        return -float('inf')

    log_probs = torch.log(F.softmax(model(sentence), dim=0))
    ids = torch.Tensor(sentence[1:]).long()
    sentence_log_prob = torch.sum(log_probs.gather(1, ids.view(-1, 1)))

    return sentence_log_prob.item()

if __name__ == '__main__':
    # initialize logger
    logger = init_logger()
#     logger.info(args)

    # Load data
    logger.info('Loading {} dataset...'.format( args_dataset))
    if  args_dataset.lower() == 'seame' or  args_dataset.lower() == 'qg' or  args_dataset.lower() == 'ner':
        dataset = read_dataset( args_data, dataset= args_dataset)
        dataset = dataset[: int(len(dataset) *  args_subset)]
        train = dataset[: int(len(dataset)*0.8)]
        dev = dataset[int(len(dataset)*0.8) + 1: -1]
        train_ids = None
    elif  args_dataset.lower() == 'miami' or  args_dataset.lower() == 'tagalog':
        train, dev, test, train_ids, dev_ids, test_ids, miami_dict = read_miami_data( args_data)
    elif  args_dataset.lower() == 'opensub':
        train, dev, train_ids, dev_ids = read_opensub_data( args_data)
    else:
        raise NotImplemented

    vocab = Vocab(train)

    if  args_dataset.lower() == 'seame':
        train_chn_tok_num, train_eng_tok_num = 0, 0
        for sent in train:
            for tok in sent:
                if is_chinese_word(tok):
                    train_chn_tok_num += 1
                else:
                    train_eng_tok_num += 1

        dev_chn_tok_num, dev_eng_tok_num = 0, 0
        for sent in dev:
            for tok in sent:
                if is_chinese_word(tok):
                    dev_chn_tok_num += 1
                else:
                    dev_eng_tok_num += 1

        logger.info('#' * 60)
        logger.info('Training samples: {}'.format(len(train)))
        logger.info('Dev samples:      {}'.format(len(dev)))
        logger.info('Vocabulary size:  {}'.format(len(vocab)))
        logger.info('Training CHN token amount: {}'.format(train_chn_tok_num))
        logger.info('Training ENG token amount: {}'.format(train_eng_tok_num))
        logger.info('Dev CHN token amount {}'.format(dev_chn_tok_num))
        logger.info('Dev ENG token amount {}'.format(dev_eng_tok_num))
        logger.info('#' * 60)

    else:
        logger.info('#' * 60)
        logger.info('Training samples: {}'.format(len(train)))
        logger.info('Dev samples:      {}'.format(len(dev)))
        logger.info('Vocabulary size:  {}'.format(len(vocab)))
        logger.info('#' * 60)

    # Initialize the model and the optimizer
    if  args_finetune is True:
        logger.info("Loading pre-trained model...")
        pretrain = torch.load( args_model_path)
    else:
        pretrain = None

    logger.info('Building model...')
    if  args_model.lower() == 'lstm':
        model = DualLSTM(batch_size= args_batch, hidden_size= args_hidden,
                         embed_size= args_embed, n_gram= args_ngram,
                         vocab=vocab, dropout= args_dp, embedding=None,
                         freeze=False, dataset= args_dataset, pretrain=pretrain)
    elif  args_model.lower() == 'fnn':
        model = FNNLM(n_words=len(vocab), emb_size= args_embed,
                      hid_size= args_hidden, num_hist= args_ngram, dropout= args_dp)
    else:
        raise NotImplemented

    model = model.to(DEVICE)

    # Construct loss function and Optimizer.
    criterion = torch.nn.CrossEntropyLoss()
    if  args_optim.lower() == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr= args_lr)
    elif  args_optim.lower() == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr= args_lr, momentum= args_mm)
    elif  args_optim.lower() == 'adagrad':
        optimizer = torch.optim.Adagrad(model.parameters())
    else:
        optimizer = torch.optim.Adadelta(model.parameters())

    train_loss_record = []
    valid_loss_record = []
    train_acc_record = []
    valid_acc_record = []
    best_valid_predictions = []
    test_predictions = []
    last_dev = 1e20
    best_dev = 1e20

    if  args_dataset in ['miami', 'tagalog', 'opensub']:
        train = [(sent, idx) for sent, idx in zip(train, train_ids)]
        dev = [(sent, idx) for sent, idx in zip(dev, dev_ids)]

    # Perform training
    for epoch in range( args_epoch):
        # shuffle training data
        random.shuffle(train)
        # set the model to training mode
        model.train()
        train_words, train_loss = 0, 0.0
        train_sents = 0
        start = time.time()
        for idx, sent in enumerate(train):
            if  args_dataset in ['miami', 'tagalog', 'opensub']:
                lang_ids = ['<s>'] + sent[1] + ['<s>']
                sent = ['<s>'] + sent[0] + ['<s>']
                if len(sent) == 2 or len(lang_ids) == 2:
                    continue
                if len(sent) != len(lang_ids):
                    print(sent)
                    continue
            else:
                if len(sent) <= 2:
                    continue
                lang_ids = None
            # TODO: mean or sum loss?
            loss = calc_sent_loss(sent, model, criterion, lang_ids)
            train_loss += loss.data
            train_words += (len(sent) - 2)
            train_sents += 1
            optimizer.zero_grad()
            loss.backward()
            # TODO: add clip_grad?
            # clip_grad_norm helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(),  args_clip)
            for p in model.parameters():
                try:
                    p.data.add_(- args_lr, p.grad.data)
                except:
                    continue
            optimizer.step()
            if train_sents % 500 == 0:
                logger.info("--finished %r sentences (sentence/sec=%.2f)"
                            % (train_sents, train_sents / (time.time() - start)))

            model.detach()

        logger.info("Epoch %r: train loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (
            epoch, train_loss / train_words, math.exp(train_loss / train_words),
            train_words / (time.time() - start)))

        train_loss_record.append(train_loss)

        # Evaluate on dev set
        # set the model to evaluation mode
        # if  args_dataset == 'opensub':
        #     torch.save(model, "{}/opensub_epoch_{}.pt".format( args_models_dir, epoch))
        #     continue
        model.eval()
        dev_words, dev_loss = 0, 0.0
        start = time.time()
        with torch.no_grad():
            for sent in dev:
                if  args_dataset in ['miami', 'tagalog', 'opensub']:
                    if len(sent[0]) == 0 or len(sent[1]) == 0:
                        print("empty sentence")
                        continue
                    lang_ids = ['<s>'] + sent[1] + ['<s>']
                    sent = ['<s>'] + sent[0] + ['<s>']
                    if len(sent) != len(lang_ids):
                        print(sent)
                        continue
                else:
                    lang_ids = None
                # sentences = batch.to(DEVICE)
                loss = calc_sent_loss(sent, model, criterion, lang_ids)
                dev_loss += loss.data
                dev_words += (len(sent) - 2)

        # Keep track of the development accuracy and reduce the learning rate if it got worse
        if last_dev < dev_loss and hasattr(optimizer, 'learning_rate'):
            optimizer.learning_rate /= 2
        last_dev = dev_loss

        # Keep track of the best development accuracy, and save the model only if it's the best one
        if best_dev > dev_loss:
            if not os.path.exists( args_models_dir):
                try:
                    os.mkdir( args_models_dir)
                except Exception as e:
                    print("Can not create models directory, %s" % e)
            torch.save(model, "{}/best_{}.pt".format( args_models_dir,  args_dataset))
            best_dev = dev_loss

        # Save the model
        logger.info("Epoch %r: dev loss/word=%.4f, ppl=%.4f (word/sec=%.2f)" % (
            epoch, dev_loss / dev_words, math.exp(dev_loss / dev_words),
            dev_words / (time.time() - start)))
        torch.save(model.state_dict(), "{}/epoch_{}.pt".format( args_models_dir, epoch))

        # Generate a few sentences
        for _ in range(5):
            sentence = generate_sent(model,  args_maxlen)
            logger.debug(" ".join([word for word in sentence]))


2020-07-22 01:43:44,676 - __main__ - INFO - Loading ner dataset...


TypeError: exceptions must derive from BaseException

In [88]:
args_dataset

'ner'

In [48]:
dataset = read_dataset( args_data, dataset= args_dataset)
dataset = dataset[: int(len(dataset) *  args_subset)]
train = dataset[: int(len(dataset)*0.8)]
dev = dataset[int(len(dataset)*0.8) + 1: -1]
train_ids = None

Process ForkPoolWorker-7:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/opt/conda/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/opt/conda/lib/python3.6/multiprocessing/queues.py", line 335, in get
    res = self._reader.recv_bytes()
  File "/opt/conda/lib/python3.6/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/opt/conda/lib/python3.6/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/opt/conda/lib/python3.6/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


In [52]:
len(dataset)

294329

In [67]:
data_path = args_data
data_path = '../NER/split'
num_workers=1
dataset = 'seame'

In [93]:
def read_dataset(data_path, num_workers=1, dataset='seame'):
    data = []
    all_file_paths = glob(os.path.join(data_path, '*_ys.txt'), recursive=True)
    #all_file_paths = glob(os.path.join(data_path), recursive=True)
    num_files = len(all_file_paths)
    files_per_worker = num_files // num_workers

    pool = mp.Pool(processes=num_workers)

    if dataset == 'seame':
        extraction_result = pool.map(read_seame_data,
                                     (all_file_paths[start_idx:start_idx+files_per_worker]
                                      for start_idx in range(0, num_files, files_per_worker)))
    elif dataset == 'qg':
        extraction_result = pool.map(read_qg_data,
                                     (all_file_paths[start_idx:start_idx + files_per_worker]
                                      for start_idx in range(0, num_files, files_per_worker)))
    else:
        raise NotImplemented

    for result in extraction_result:
        data.extend(result)
    return data

In [69]:
all_file_paths

['../NER/split/train_ys.txt',
 '../NER/split/dev_ys.txt',
 '../NER/split/test_ys.txt']

In [70]:
num_files

3

In [71]:
data

[['<s>', '歡', '迎', '收', '聽', '創', '設', '市', '集', '<s>'],
 ['<s>',
  '台',
  '灣',
  '科',
  '技',
  '大',
  '學',
  '設',
  '計',
  '系',
  '林',
  '廷',
  '宜',
  '這',
  '個',
  '節',
  '目',
  '是',
  '由',
  '教',
  '育',
  '電',
  '台',
  '和',
  '台',
  '科',
  '大',
  '合',
  '作',
  '製',
  '播',
  '<s>'],
 ['<s>',
  '每',
  '週',
  '四',
  '晚',
  '間',
  '六',
  '點',
  '零',
  '五',
  '分',
  '到',
  '七',
  '點',
  '播',
  '出',
  '<s>'],
 ['<s>',
  '我',
  '和',
  '三',
  '位',
  '老',
  '師',
  '同',
  '仁',
  '共',
  '同',
  '主',
  '持',
  '在',
  '空',
  '中',
  '為',
  '您',
  '服',
  '務',
  '<s>'],
 ['<s>',
  '個',
  '新',
  '節',
  '目',
  '透',
  '過',
  '對',
  '設',
  '計',
  '界',
  '翹',
  '楚',
  '與',
  '新',
  '銳',
  '設',
  '計',
  '師',
  '的',
  '專',
  '訪',
  '分',
  '享',
  '國',
  '際',
  '設',
  '計',
  '趨',
  '勢',
  '<s>'],
 ['<s>',
  '發',
  '展',
  '並',
  '且',
  '藉',
  '由',
  '國',
  '內',
  '外',
  '相',
  '關',
  '案',
  '例',
  '的',
  '分',
  '享',
  '讓',
  '聽',
  '眾',
  '瞭',
  '解',
  '設',
  '計',
  '人',
  '的',
  '思',
  '維',
  '以',
  '及',
  '設

In [98]:
'/'.join(csv_path.split('/')[:-1]) + 

'../NER/baseline/output/baseline/v4'