In [1]:
import pandas as pd
import json
import re
import unicodedata
import tensorflow as tf
import numpy as np
from model_v2 import LanguageModel
from tqdm import tqdm_notebook
from utils import clean_text
from sklearn.model_selection import train_test_split

In [2]:
# Load the word-level and character-level vocabulary mappings.
# JSON text is UTF-8; the encoding is passed explicitly so non-ASCII tokens
# decode identically regardless of the platform's default locale encoding.
with open('102/word2idx.json', 'r', encoding='utf-8') as inp:
    word2idx = json.load(inp)
with open('102/char2idx.json', 'r', encoding='utf-8') as inp:
    char2idx = json.load(inp)

# Reverse lookups (mapped value -> original key).
idx2word = {idx: word for word, idx in word2idx.items()}
idx2char = {idx: char for char, idx in char2idx.items()}

In [3]:
# TF1 session configured with `allow_growth`, so GPU memory is claimed
# incrementally rather than all up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

# Constructor arguments stored next to the checkpoint are forwarded to the
# model as keyword arguments.
with open('102/checkpoints/model_configs.json', 'r') as inp:
    params = json.load(inp)

model = LanguageModel(**params, is_training=False, is_encoding=True)
model.build_model()

# The saver handles every global variable except the members of the
# 'LSTM_SAVED_STATE' collection.
lstm_saved_state = tf.get_collection('LSTM_SAVED_STATE')
restorable_vars = [v for v in tf.global_variables() if v not in lstm_saved_state]
saver = tf.train.Saver(restorable_vars)

# Initialise all global variables, including the ones the saver skips.
sess.run(tf.global_variables_initializer())

In [4]:
# Load the checkpointed values for the variables handled by `saver`
# into the live session.
checkpoint_path = '102/checkpoints/test/model.cpkt-1045430'
saver.restore(sess, checkpoint_path)

INFO:tensorflow:Restoring parameters from 102/checkpoints/test/model.cpkt-1045430


In [5]:
# Join the last entries of `model.layerwise_avg` and `model.layerwise_encode`
# along the final axis.
# NOTE(review): the embedding helpers visible in this notebook fetch
# `model.layerwise_avg[-1]` directly and never reference `sent_emb` --
# confirm which representation is intended.
sent_emb = tf.concat(
    [model.layerwise_avg[-1], model.layerwise_encode[-1]],
    axis=-1,
)

In [5]:
def pad_sequence(sentences):
    """Zero-pad a batch of character-encoded sentences into dense arrays.

    Args:
        sentences: sequence of sentences; each sentence is a sequence of
            words and each word is a sequence of numeric character ids.

    Returns:
        A tuple ``(chars, lens)`` of float64 arrays (``np.zeros`` default):

        * ``chars`` has shape ``(max_words, batch, max_chars)`` and holds the
          character ids, zero-padded on the word and character axes.
        * ``lens`` has shape ``(max_words, batch)`` and holds the number of
          characters of every word, ``0`` at padded word positions.

        The word-position axis comes first in both arrays. An empty batch,
        or a batch that contains no words at all, produces zero-sized arrays
        instead of raising.
    """
    word_lens = [[len(word) for word in sentence] for sentence in sentences]
    # `default=0` keeps max() from raising ValueError when the batch is empty
    # or when none of its sentences contains a word.
    sen_max_len = max((len(sentence) for sentence in sentences), default=0)
    word_max_len = max((n for row in word_lens for n in row), default=0)

    arr = np.zeros(shape=(len(sentences), sen_max_len, word_max_len))
    lens = np.zeros(shape=(len(sentences), sen_max_len))
    for ir, sentence in enumerate(sentences):
        lens[ir, :len(sentence)] = word_lens[ir]
        for ic, word in enumerate(sentence):
            arr[ir, ic, :len(word)] = word

    # Filled batch-first above; swap the first two axes for the caller.
    return np.transpose(arr, (1, 0, 2)), np.transpose(lens, (1, 0))

def __embed_sequence(sentences):
    """Encode tokenised sentences as character ids and evaluate the model on them.

    Args:
        sentences: list of sentences, each a list of word strings.

    Returns:
        The value ``sess.run`` produces for ``model.layerwise_avg[-1]`` when
        fed this batch.
    """
    # Words absent from word2idx are replaced by the character ids of the
    # literal string '<UNK>'.
    unk_char = [char2idx[ch] for ch in '<UNK>']

    def encode_word(word):
        """Return the character-id list for one word (or the '<UNK>' ids)."""
        if word in word2idx:
            return [char2idx[ch] for ch in word]
        return unk_char

    encoded = [[encode_word(word) for word in sentence] for sentence in sentences]
    seq_len = [len(sentence) for sentence in encoded]
    inputs, char_lens = pad_sequence(encoded)

    feed = {
        model.inputs: inputs,
        model.seq_lens: seq_len,
        model.bptt: 1,
        model.char_lens: char_lens,
    }
    return sess.run(model.layerwise_avg[-1], feed_dict=feed)
def embed_sentence(sentences):
    """Embed a batch of raw text strings.

    Every string is passed through ``clean_text`` (with ``add_bos=True`` and
    ``add_eos=True``), split on whitespace into tokens, and the tokenised
    batch is handed to ``__embed_sequence``. No URL or hashtag stripping is
    done in this function.

    Args:
        sentences: iterable of text strings.

    Returns:
        The result of ``__embed_sequence`` for the tokenised batch.
    """
    tokenised = []
    for text in sentences:
        cleaned = clean_text(text, add_bos=True, add_eos=True)
        tokenised.append(cleaned.split())
    return __embed_sequence(tokenised)

In [6]:
# Source corpus; later cells read its 'title' and 'cate' columns.
data_path = 'baomoi_noseg.csv'
data = pd.read_csv(data_path)

In [7]:
# Draw a fixed-seed random subsample of 10,000 (title, category) pairs.
# The complementary split returned by train_test_split is discarded.
titles, _, cate, _ = train_test_split(
    data['title'],
    data['cate'],
    train_size=10000,
    random_state=20,
)



In [8]:
def get_batch(texts, bs):
    """Yield consecutive slices of ``texts`` containing at most ``bs`` items.

    The last slice is shorter when ``len(texts)`` is not a multiple of ``bs``.

    Args:
        texts: any sliceable sequence with a length.
        bs: batch size, used as the step between slice starts.
    """
    starts = range(0, len(texts), bs)
    yield from (texts[start:start + bs] for start in starts)

In [9]:
# Embed the sampled titles batch by batch.
# get_batch is a generator, so tqdm cannot infer how many batches there are;
# the count is passed explicitly so the progress bar shows a total.
BATCH_SIZE = 32
n_batches = -(-len(titles) // BATCH_SIZE)  # ceiling division
embedding = [
    embed_sentence(batch)
    for batch in tqdm_notebook(get_batch(titles, BATCH_SIZE), total=n_batches)
]

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
# Merge the per-batch outputs into a single array along axis 0.
# `embedding` is rebound from a list of batches to the merged array, so the
# cell is guarded on the type: re-running it on the already-merged array
# would treat that array's rows as the pieces to join (or fail outright)
# and silently corrupt the data used by the following cells.
if isinstance(embedding, list):
    embedding = np.concatenate(embedding, axis=0)

In [11]:
# Pair every embedding with its title so both go through the same shuffle.
# 1,500 examples form the training side; the remainder is the test side.
embedding_title_pairs = list(zip(embedding, titles))
X_train, X_test, y_train, y_test = train_test_split(
    embedding_title_pairs,
    cate,
    train_size=1500,
    random_state=20,
)

In [12]:
# Drop the titles again and stack the per-example embeddings along a new
# leading axis, giving one array per split.
train_data = np.stack([pair[0] for pair in X_train], axis=0)
test_data = np.stack([pair[0] for pair in X_test], axis=0)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [14]:
# Logistic-regression classifier trained on the precomputed embeddings.
# Only the solver, multi-class mode and worker count are set explicitly.
classifier = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    n_jobs=4,
)

In [15]:
# Fit on the training embeddings; the fitted estimator is the cell's output.
classifier.fit(X=train_data, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto', n_jobs=4,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
from sklearn.metrics import classification_report

In [17]:
# Predicted category for every held-out example.
test_pred = classifier.predict(X=test_data)

In [18]:
# Training-set report: a sanity check only, because the classifier was fitted
# on exactly these examples. It is labelled so it cannot be mistaken for the
# held-out result printed below.
print('Train')
print(classification_report(y_true=y_train, y_pred=classifier.predict(train_data)))

# Held-out report, reusing the predictions already stored in `test_pred`
# instead of running predict on the test set a second time.
print('Test')
print(classification_report(y_true=y_test, y_pred=test_pred))

                                 precision    recall  f1-score   support

                       Giáo dục       1.00      1.00      1.00        48
     Giáo dục/Học bổng - Du học       1.00      1.00      1.00        50
      Giáo dục/Đào tạo - Thi cử       1.00      1.00      1.00        50
            Giải trí/Thời trang       1.00      1.00      1.00        41
               Giải trí/Âm nhạc       1.00      1.00      1.00        46
Giải trí/Điện ảnh - Truyền hình       1.00      1.00      1.00        36
      KH - CN/CNTT - Viễn thông       1.00      1.00      1.00        52
    KH - CN/Khoa học - Tự nhiên       1.00      1.00      1.00        64
   KH - CN/Thiết bị - Phần cứng       1.00      1.00      1.00        55
            Kinh tế/Chứng khoán       1.00      1.00      1.00        53
             Kinh tế/Kinh doanh       1.00      1.00      1.00        50
    Kinh tế/Lao động - Việc làm       1.00      1.00      1.00        66
              Kinh tế/Tài chính       1.00      1.