In [1]:
import os
import pandas as pd
import numpy as np
import itertools
import re
import unicodedata
import json
import tensorflow as tf
from model_v2 import LanguageModel
from tqdm import tqdm

In [2]:
def _read_json(path):
    """Parse and return the JSON document stored at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Lookup tables shipped with the language-model corpus. `char2idx` is the one
# used further down to turn the characters of each word into integer ids.
word2idx = _read_json('baomoi_punc/word2idx.json')
char2idx = _read_json('baomoi_punc/char2idx.json')

In [3]:
# TensorFlow 1.x session with gpu_options.allow_growth enabled (per the TF docs,
# GPU memory is then claimed incrementally instead of all at once).
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
# Model hyper-parameters stored next to the checkpoint; they are forwarded
# verbatim as keyword arguments to LanguageModel below.
with open('15/checkpoints/model_configs.json', 'r') as inp:
    params = json.load(inp)

# NOTE(review): the exact effect of is_training / is_encoding is defined in
# model_v2 -- presumably inference-only, feature-extraction mode; confirm there.
model = LanguageModel(**params, is_training=False, is_encoding=True)

# The graph must exist before the Saver is created, because the Saver is built
# from the current list of global variables.
model.build_model()
# Restore every global variable except those registered in the
# 'LSTM_SAVED_STATE' collection.
saver = tf.train.Saver([x for x in tf.global_variables() if x not in tf.get_collection('LSTM_SAVED_STATE')])
# Order matters: initialise all global variables first (this is the only place
# the variables excluded from the Saver receive a value), then overwrite the
# saved ones from the checkpoint.
sess.run(tf.global_variables_initializer())
saver.restore(sess, '15/checkpoints/test/model.cpkt-315616')

INFO:tensorflow:Restoring parameters from 15/checkpoints/test/model.cpkt-315616


In [4]:
def clean_text(x):
    """Normalise raw text into a single space-separated token string.

    The steps are applied in this order:

    1. lower-case the text and apply Unicode NFKC normalisation;
    2. surround every non-word character (whitespace included) with spaces,
       so punctuation becomes a separate token;
    3. replace a line break, together with any spaces / line breaks that
       follow it, by the marker token ``L``;
    4. collapse runs of spaces into a single space;
    5. replace every run of digits by the placeholder ``N``.

    Args:
        x: the input string.

    Returns:
        The cleaned string. Leading/trailing spaces are not removed; callers
        in this notebook use ``.split()`` on the result.
    """
    text = str(unicodedata.normalize('NFKC', x.lower()))
    # Raw strings throughout: '\d' and '\g' are not valid Python string escapes,
    # so the non-raw originals triggered invalid-escape warnings.
    text = re.sub(r'(?P<punc>\W)', r' \g<punc> ', text)
    text = re.sub(r'[\n\r][ \n\r]*', ' L ', text)
    text = re.sub(r'[ ]+', ' ', text)
    return re.sub(r'\d+', 'N', text)

def pad_sequence(words):
    """Right-pad a list of id sequences with zeros into one array.

    Args:
        words: a sequence of sequences of numeric ids (one inner sequence per
            word). May be empty.

    Returns:
        A float ``numpy`` array of shape ``(len(words), 1, maxlen)`` where
        ``maxlen`` is the length of the longest inner sequence. Row ``i``
        holds ``words[i]`` followed by zeros. An empty ``words`` yields an
        array of shape ``(0, 1, 0)`` instead of raising.
    """
    # default=0 keeps an empty batch from raising ValueError inside max().
    maxlen = max((len(ids) for ids in words), default=0)
    arr = np.zeros(shape=(len(words), 1, maxlen))
    for row, ids in zip(arr, words):
        # `row` is a view into `arr`, so this writes the ids in place.
        row[0, :len(ids)] = ids
    return arr

def __embed_sequence(sentence):
    """Feed one tokenised sentence through the restored model.

    Every word of `sentence` is converted to a list of character ids via the
    global `char2idx` table; characters missing from the table use the id
    stored under the key 'U'. The id lists are zero-padded with
    `pad_sequence` and evaluated in the global session `sess`, with the
    number of words as the only sequence length and `reset_state` set to True.

    Returns whatever `model.concated_timewise_output` evaluates to.
    NOTE(review): callers squeeze axis 1 of the result, so it is presumably
    (words, 1, features) -- confirm against model_v2.
    """
    fallback_id = char2idx['U']
    char_ids = []
    for word in sentence:
        char_ids.append([char2idx.get(ch, fallback_id) for ch in word])

    feed = {
        model.inputs: pad_sequence(char_ids),
        model.seq_lens: [len(char_ids)],
        model.reset_state: True,
    }
    return sess.run(model.concated_timewise_output, feed_dict=feed)
def embed_sentence(sentence):
    """Embed an already-tokenised sentence.

    `sentence` is handed to `__embed_sequence` unchanged: no cleaning or
    splitting happens here, so it must already be a sequence of words.
    """
    embeddings = __embed_sequence(sentence)
    return embeddings

In [5]:
def _txt_files_under(root):
    """Return the paths of all '.txt' files found while walking `root`."""
    found = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith('.txt'):
                found.append(os.path.join(dirpath, filename))
    return found


# Annotated corpus files, in the order os.walk yields them.
train = _txt_files_under('VLSP/train')
test = _txt_files_under('VLSP/test')

In [6]:
# Tag -> index tables, filled as a side effect of build_data(). Indices are
# handed out in first-seen order, so the order of build_data() calls matters.
pos2idx = {}
ner2idx = {}


def _spread_tag(tag, n_tokens):
    """Expand one tag over `n_tokens` sub-tokens: the first keeps `tag`, the rest get its 'I-' form ('O' stays 'O')."""
    if tag == 'O':
        return [tag] * n_tokens
    return [tag if i == 0 else 'I-{}'.format(tag[2:]) for i in range(n_tokens)]


def _register_tag(tag, tag2idx):
    """Give `tag`, then its 'I-' form, the next free index in `tag2idx` if not yet present."""
    if tag not in tag2idx:
        tag2idx[tag] = len(tag2idx)
    if tag != 'O':
        inside_tag = 'I-{}'.format(tag[2:])
        if inside_tag not in tag2idx:
            tag2idx[inside_tag] = len(tag2idx)


def build_data(filenames):
    """Read annotated corpus files into parallel per-sentence token/tag lists.

    For every file the first three lines are skipped. Each remaining line is
    stripped and split on tabs:

    * a line with exactly five fields is a token line; the 1st field is the
      word, the 3rd is stored in the ``pos`` lists and the 4th in the ``ner``
      lists (the 2nd and 5th are ignored);
    * any other line is a delimiter. Delimiters alternate between opening and
      closing a sentence; a closing delimiter stores the tokens collected so
      far. NOTE(review): presumably these are ``<s>`` / ``</s>`` markers --
      confirm against the corpus.

    A word is split on ``_`` and each part is passed through ``clean_text``;
    the resulting sub-tokens share the word's tags, with every sub-token
    after the first receiving the ``I-`` form of the tag.

    Parser state is reset for every file and tokens still pending at the end
    of a file are stored as a final sentence. This keeps a file with a
    missing closing delimiter from shifting the open/close phase of all
    following files and from losing or merging sentences.

    Side effects:
        * adds unseen tags to the module-level ``pos2idx`` / ``ner2idx``;
        * prints the file name whenever a closing delimiter finds no tokens.

    Args:
        filenames: iterable of paths to UTF-8 text files.

    Returns:
        ``(words, pos_tags, ner_tags)``: three lists with one entry per
        sentence, each entry being a list with one item per sub-token.
    """
    words, pos_tags, ner_tags = [], [], []

    for fn in filenames:
        # Fresh state per file so nothing leaks into the next one.
        curr_words, curr_pos, curr_ner = [], [], []
        inside_sentence = False

        # Explicit encoding: the default depends on the machine's locale.
        with open(fn, 'r', encoding='utf-8') as inp:
            for line in itertools.islice(inp, 3, None):
                fields = line.strip().split('\t')
                if len(fields) != 5:
                    if inside_sentence:
                        # The three lists always grow together, so checking
                        # one of them is enough.
                        if curr_words:
                            words.append(curr_words)
                            pos_tags.append(curr_pos)
                            ner_tags.append(curr_ner)
                        else:
                            print(fn)
                        curr_words, curr_pos, curr_ner = [], [], []
                    inside_sentence = not inside_sentence
                    continue

                word, _, pos, ner, _ = fields
                tokens = [t for part in word.split('_') for t in clean_text(part).split()]
                pos = _spread_tag(pos.strip(), len(tokens))
                ner = _spread_tag(ner.strip(), len(tokens))
                for tag in pos:
                    _register_tag(tag, pos2idx)
                for tag in ner:
                    _register_tag(tag, ner2idx)
                curr_words.extend(tokens)
                curr_pos.extend(pos)
                curr_ner.extend(ner)

        # Keep a sentence that was never closed instead of dropping it or
        # gluing it onto the next file.
        if curr_words:
            words.append(curr_words)
            pos_tags.append(curr_pos)
            ner_tags.append(curr_ner)

    return words, pos_tags, ner_tags

In [7]:
# Each call returns a (words, pos_tags, ner_tags) tuple of per-sentence lists.
# build_data also fills the global pos2idx / ner2idx tables in first-seen
# order, so train must be processed before test to keep the indices stable.
train_data = build_data(train)
test_data = build_data(test)

VLSP/train/81724.txt
VLSP/train/90159.txt
VLSP/train/8010.txt
VLSP/train/81914.txt
VLSP/train/81914.txt
VLSP/train/26554.txt
VLSP/train/8160.txt
VLSP/train/90069.txt
VLSP/train/83391.txt
VLSP/train/88105.txt
VLSP/train/83595.txt
VLSP/train/46165.txt
VLSP/train/50501.txt
VLSP/train/8456.txt
VLSP/train/87642.txt
VLSP/train/90324.txt
VLSP/train/103977.txt
VLSP/train/46803.txt
VLSP/train/45817.txt
VLSP/train/5932.txt
VLSP/train/104056.txt
VLSP/train/92260.txt
VLSP/train/89724.txt
VLSP/train/103780.txt
VLSP/train/81911.txt
VLSP/train/89705.txt
VLSP/train/83014.txt
VLSP/train/83121.txt
VLSP/train/45098.txt
VLSP/train/89917.txt
VLSP/train/59256.txt
VLSP/train/89319.txt
VLSP/train/90832.txt
VLSP/train/88517.txt
VLSP/train/46245.txt
VLSP/train/81533.txt
VLSP/train/6825.txt
VLSP/train/88713.txt
VLSP/train/80237.txt
VLSP/train/46273.txt
VLSP/train/89518.txt
VLSP/train/59401.txt
VLSP/train/82846.txt
VLSP/train/59602.txt
VLSP/train/60050.txt
VLSP/train/82853.txt
VLSP/train/89503.txt
VLSP/train/8253

In [8]:
# One embedding array per training sentence; the size-1 axis 1 of the model
# output is squeezed away.
train_sentences = train_data[0]
train_embed = [np.squeeze(embed_sentence(tokens), axis=1) for tokens in tqdm(train_sentences)]

100%|██████████| 16859/16859 [05:09<00:00, 54.44it/s]


In [9]:
# One embedding array per test sentence; the size-1 axis 1 of the model
# output is squeezed away.
test_sentences = test_data[0]
test_embed = [np.squeeze(embed_sentence(tokens), axis=1) for tokens in tqdm(test_sentences)]

100%|██████████| 2830/2830 [00:55<00:00, 50.93it/s]


In [10]:
# Persist every training sentence as three files named after its running index:
# '<idx>e.npy' (embeddings), '<idx>p.npy' (pos_tags entry), '<idx>n.npy' (ner_tags entry).
train_triples = zip(train_embed, train_data[1], train_data[2])
for idx, triple in enumerate(tqdm(train_triples)):
    embedding, pos_seq, ner_seq = triple
    np.save('VLSP/train/{}e.npy'.format(idx), embedding)
    np.save('VLSP/train/{}p.npy'.format(idx), np.array(pos_seq))
    np.save('VLSP/train/{}n.npy'.format(idx), np.array(ner_seq))

16859it [01:23, 202.43it/s]


In [11]:
# Persist every test sentence as three files named after its running index:
# '<idx>e.npy' (embeddings), '<idx>p.npy' (pos_tags entry), '<idx>n.npy' (ner_tags entry).
test_triples = zip(test_embed, test_data[1], test_data[2])
for idx, triple in enumerate(tqdm(test_triples)):
    embedding, pos_seq, ner_seq = triple
    np.save('VLSP/test/{}e.npy'.format(idx), embedding)
    np.save('VLSP/test/{}p.npy'.format(idx), np.array(pos_seq))
    np.save('VLSP/test/{}n.npy'.format(idx), np.array(ner_seq))

2830it [00:15, 184.46it/s]


In [12]:
# Write both tag -> index tables to the working directory as JSON.
for path, mapping in (('pos2idx.json', pos2idx), ('ner2idx.json', ner2idx)):
    with open(path, 'w') as out:
        json.dump(mapping, out)

In [13]:
# Display the tag -> index table that build_data collected from the third column.
pos2idx

{'O': 0,
 'B-VP': 1,
 'I-VP': 2,
 'B-NP': 3,
 'I-NP': 4,
 'B-PP': 5,
 'I-PP': 6,
 'B-AP': 7,
 'I-AP': 8,
 'B-IP': 9,
 'I-IP': 10,
 'B-NPb': 11,
 'I-NPb': 12,
 'B-VPb': 13,
 'I-VPb': 14,
 'B-EP': 15,
 'I-EP': 16,
 'B-MP': 17,
 'I-MP': 18,
 'I-RP': 19,
 'B-PER': 20,
 'I-PER': 21}

In [14]:
# Display the tag -> index table that build_data collected from the fourth column.
ner2idx

{'O': 0,
 'B-LOC': 1,
 'I-LOC': 2,
 'B-PER': 3,
 'I-PER': 4,
 'B-ORG': 5,
 'I-ORG': 6,
 'B-MISC': 7,
 'I-MISC': 8}