In [1]:
import re
import pickle
from collections import Counter
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm

import keras
from keras.models import Sequential, Model
from keras.layers import *
from keras.optimizers import *
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

Using TensorFlow backend.


In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
# 設定 gpu memory 動態成長
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.3
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

In [4]:
df = pd.read_csv("")
df_train = df[df["source"] == "train"].copy()
df_dev = df[df["source"] == "dev"].copy()
df_test = df[df["source"] == "test"].copy()
len(df_train), len(df_dev), len(df_test)

(42120, 2448, 8062)

In [5]:
def get_content(sentences):
    # session裡去掉0:、1:開頭、斷詞
    sentences = sentences.split("\n")
    sentences = [row[1:].strip() if re.match("[0-9]", row[0]) else row for row in sentences]
    sentences = [" ".join(list(row.replace(" ", ""))) for row in sentences]
    return sentences

In [6]:
# 變成tuple是為了排序
train_list = [] # list of tuple: [ (["first sentence", "second sentence"], l1_label), ... ]
dev_list = []
test_list = []

train_list = [(get_content(row["sentences"]), row["l1_label"]) for index, row in df_train.iterrows()]
dev_list = [(get_content(row["sentences"]), row["l1_label"]) for index, row in df_dev.iterrows()]
test_list = [(get_content(row["sentences"]), row["l1_label"]) for index, row in df_test.iterrows()]

In [7]:
# sort by number of sentences
train_list.sort(key=lambda s: len(s[0]), reverse=True)
dev_list.sort(key=lambda s: len(s[0]), reverse=True)
test_list.sort(key=lambda s: len(s[0]), reverse=True)

In [8]:
# 句子部份
train_text_list = [r[0] for r in train_list]
dev_text_list = [r[0] for r in dev_list]
test_text_list = [r[0] for r in test_list]

# label部份
train_label_list = [r[1] for r in train_list]
dev_label_list = [r[1] for r in dev_list]
test_label_list = [r[1] for r in test_list]

In [None]:
train_text_list[-10]

In [None]:
sent_len = [len(row.split()) for text in train_text_list for row in text]
pd.DataFrame(sent_len, columns=["所有句子長度分布"]).describe([0, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 1])

## parameters

In [9]:
min_word_count = 3 # 最少出現次數才要算入vocabulary
PAD_INDEX = 0
UNK_INDEX = 1
START_INDEX = 2

### build vocabulary

In [10]:
word_counter = Counter()
word_index = dict()      # word: index
label_index = dict()     # label: class
inv_label_index = dict() # class: label

In [11]:
# 算字出現的次數
for text in train_text_list: # for each document
    for row in text:         # for each row in a document
        for w in row.split():# for each word in a row
            word_counter[w] += 1

In [12]:
word_counter.most_common(5)

[('的', 559679), ('是', 503225), ('个', 499197), ('您', 466854), ('我', 448510)]

In [13]:
# 出現次數 >= min_word_count 的字進到 vocabulary(word_index)
word_index = dict()
index = START_INDEX
for w, v in word_counter.most_common():
    if v < min_word_count:
        break
    word_index[w] = index
    index += 1
print("total %d words" % len(word_index))

total 2946 words


In [None]:
# label_encoder
label_index = {v:k for k, v in enumerate(set(df_train["l1_label"]))}
print(label_index)
inv_label_index = {v:k for k, v in label_index.items()}
print(inv_label_index)

### vectorizer

In [15]:
def get_feature(sents):
    """
    字->index
    input: 1D array: a document(list of sentences) space separated. eg: ["w11 w12", "w21, w22, w23", .. ]
    return 2D array: shape=(sentence_len, word_len)
    """
    feature = []
    for row in sents:
        seq = [word_index.get(w, UNK_INDEX) for w in row.split()]
        feature.append(seq)
            
    return feature

In [16]:
tr_seq = [get_feature(s) for s in train_text_list]
dev_seq = [get_feature(s) for s in dev_text_list]
test_seq = [get_feature(s) for s in test_text_list]

In [17]:
tr_y = np.array([label_index[w] for w in train_label_list])
dev_y = np.array([label_index[w] for w in dev_label_list])
test_y = np.array([label_index[w] for w in test_label_list])

## pretrain embedding weight

In [None]:
with open("pretrain_emb/1.6m_cleaned_cbow_mincount_5_window_5_cbowmean_1.vec") as fr:
    data = fr.read().strip().split("\n")[2:]
    print("%d words" % len(data))
    
    word_weight = dict()
    for row in data:
        w = row.split()
        word_weight[w[0]] = [float(i) for i in w[1:]]

In [None]:
pretrain_emb = np.random.uniform(-0.05, 0.05, (len(word_index)+2, 300))
count = 0
for word, index in word_index.items():
    if word in word_weight:
        pretrain_emb[index] = word_weight[word]
        count += 1
print("{} / {} = {:.4f} has pretrained weight".format(count, len(word_index), count/len(word_index)))

## one-hot embedding

In [18]:
def one_hot_embedding(x, vocab_size):
    import tensorflow as tf
    e = np.eye(vocab_size)
    e[0, 0] = 0
    one_hot_array = tf.constant(e, dtype="float32")
    return tf.nn.embedding_lookup(one_hot_array, x)

## model

In [18]:
# embedding_size = pretrain_emb.shape[1]
embedding_size = 200
hidden_size = 50
uw_size = 100
us_size = 100
batch_size = 64
max_sent_len = 50 # 一句最多看50個字

#### 先做word-level的model

這個subgraph的   
input是一個句子. eg: ```[30, 288, 7, 0, ..., 0]```   
output是這個句子的representation. 即每個word_representation 的 weighted sum. 在這裡```output shape=(100,)```

In [None]:
class Attention(Layer):
    def __init__(self, **kwargs):
        self.att_size = kwargs.pop("att_size")
        self.output_dim = kwargs.pop("output_dim")
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.uw = self.add_weight(
            name="attention_size", 
            shape=(self.att_size, ),
            initializer="uniform",
            trainable=True)
        super(Attention, self).build(input_shape)  # Be sure to call this somewhere!
        
    def call(self, x):
        m = tf.reduce_sum(x * self.uw, axis=-1)
        alpha = tf.nn.softmax(m)
        s = x * tf.expand_dims(alpha, axis=-1)
        return tf.reduce_mean(s, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)
    
    def compute_mask(self, x, mask=None):
        return None
    
    def get_config(self):
        config = {
            "att_size": self.att_size,
            "output_dim": self.output_dim,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [64]:
# shape = number of words
input_sentence_i = Input(shape=(None,), name="input_sentence_i", dtype="int32")

embedding = Embedding(len(word_index)+2, embedding_size, mask_zero=True,
                      name="word_embedding")(input_sentence_i)
# embedding = Lambda(
#     one_hot_embedding, output_shape=(None, len(word_index)+2),
#     arguments={"vocab_size": len(word_index)+2}, input_shape=(None,))(input_sentence_i)

# output shape = (time_steps, hidden_size*2). 在這裡 time_steps = None
word_h_seq = Bidirectional(GRU(hidden_size, return_sequences=True), name="word_h_seq")(embedding)

# 用 TimeDistributed 對 input=(time_steps, hidden_size*2)的每個 time_steps apply a same Dense layer
# output shape = (time_steps, uw_size)
word_mlp = TimeDistributed(Dense(uw_size, activation="tanh"), name="word_mlp")(word_h_seq)

# 將 word hidden state sequence 丟進去算 attention，得到這句 input_sentence_i representation
# si_representation = Lambda(word_attention, output_shape=(hidden_size*2,), name="si_representation")(word_mlp)
# si_representation = Attention(att_size=uw_size, output_dim=hidden_size*2, name="si_representation")(word_mlp)

In [65]:
si_representation = Lambda(
    lambda x: tf.reduce_mean(x, axis=1), output_shape=(uw_size, ), name="si_representation")(word_mlp)
# sent_representation

In [66]:
sent_encoder = Model(inputs=input_sentence_i, outputs=si_representation)

In [67]:
sent_encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_sentence_i (InputLayer (None, None)              0         
_________________________________________________________________
word_embedding (Embedding)   (None, None, 200)         589600    
_________________________________________________________________
word_h_seq (Bidirectional)   (None, None, 100)         75300     
_________________________________________________________________
word_mlp (TimeDistributed)   (None, None, 100)         10100     
_________________________________________________________________
si_representation (Lambda)   (None, 100)               0         
Total params: 675,000
Trainable params: 675,000
Non-trainable params: 0
_________________________________________________________________


In [None]:
SVG(model_to_dot(sent_encoder).create(prog="dot", format="svg"))

#### sentence-level model

input是一篇文章. eg: ```[ [30, 288, 7, 0, ..., 0], [10, 30, ..., 0, 0], .., [0, 0, ..., 0] ]```   
最後會得到這篇文章的representation. 即每個sentence representation 的 weighted sum.   
output是doc_repre + Dense去分類. ```output shape=(label_size,)```

In [68]:
# shape = (num_sentences, num_words=70)
input_document = Input(shape=(None, max_sent_len), name="input_document", dtype="int32")

# output shape = (time_steps, si_representation=hidden_size*2) 在這裡 time_steps = 句子數 = document length = None
# 把每一句丟到 sent_encoder，得到這句的 sentence representation
sentence_representation = TimeDistributed(sent_encoder, name="sentence_representation")(input_document)

# output shape = (time_steps, hidden_size*2)
sent_h_seq = Bidirectional(GRU(hidden_size, return_sequences=True), name="sent_h_seq")(sentence_representation)

# output shape = (time_steps, us_size)
sent_mlp = TimeDistributed(Dense(us_size, activation="tanh"), name="sent_mlp")(sent_h_seq)

# 將 sentence hidden state sequence 丟進去算 attention，得到這篇 input_document representation
# doc_representation = Lambda(sentence_attention, output_shape=(hidden_size*2,), name="doc_representation")(sent_mlp)
# doc_representation = Attention(att_size=us_size, output_dim=hidden_size*2, name="doc_representation")(sent_mlp)

In [69]:
doc_representation = Lambda(
    lambda x: tf.reduce_mean(x, axis=1), output_shape=(us_size, ), name="doc_representation")(sent_mlp)
# doc_representation

In [70]:
predict = Dense(len(label_index), activation="softmax", name="output")(doc_representation)

In [71]:
model = Model(inputs=input_document, outputs=predict)

In [72]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_document (InputLayer)  (None, None, 50)          0         
_________________________________________________________________
sentence_representation (Tim (None, None, 100)         675000    
_________________________________________________________________
sent_h_seq (Bidirectional)   (None, None, 100)         45300     
_________________________________________________________________
sent_mlp (TimeDistributed)   (None, None, 100)         10100     
_________________________________________________________________
doc_representation (Lambda)  (None, 100)               0         
_________________________________________________________________
output (Dense)               (None, 4)                 404       
Total params: 730,804
Trainable params: 730,804
Non-trainable params: 0
_________________________________________________________________


In [None]:
SVG(model_to_dot(model).create(prog="dot", format="svg"))

In [73]:
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"])

### 每個batch內的document pad到相同句子數，所有句子pad到相同長度(=max_sent_len)

In [55]:
def get_minibatches(n, minibatch_size):
    # 取得每個 batch 要 train 的 index
    idx_list = np.arange(0, n, minibatch_size)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    return minibatches

In [None]:
def get_minibatches_for_one_hot(n, minibatch_size):
    # 取得每個 batch 要 train 的 index
    # for one hot 是因為為了一二篇超長文章，把整個 batch pad 到最長會導致 gpu oom
    # 前2個 batch 改成 batch_size=8 試試看
    
    idx_list = np.arange(0, n, minibatch_size)
    minibatches = []
    for idx in idx_list:
        minibatches.append(np.arange(idx, min(idx + minibatch_size, n)))
    
    long_minibatches = []
    for i in range(2): # 要拆掉幾個 minibatches
        mini = minibatches.pop(0)
        batches = np.split(mini, 8)
        long_minibatches += batches
    
    return long_minibatches + minibatches

In [31]:
def gen_samples(sequences, labels, minibatches):
    x = [] # 一個element是之後要train的一個batch
    y = [] # 一個element是之後要train的一個batch
    
    for idx in minibatches:
        r = []
        max_doc_len = max([len(sequences[i]) for i in idx]) # 文章長度 = 這個batch裡最多句子數
        for i in range(len(idx)):
            sample = np.zeros((max_doc_len, max_sent_len), dtype="int32") # padding，一句的長度固定是 max_sent_len=50
            seq = sequences[idx[i]]
            for j in range(len(seq)):
                sample[j, :len(seq[j])] = seq[j][:max_sent_len]
            r.append(sample)
        x.append(np.array(r))
        y.append(np.array(labels[idx]))
    return x, y

In [74]:
minibatches = get_minibatches(len(tr_seq), batch_size)
print("train: %d batches" % len(minibatches))
batch_tr_x, batch_tr_y = gen_samples(tr_seq, tr_y, minibatches)

train: 659 batches


In [None]:
minibatches[0].shape, minibatches[1].shape, minibatches[15].shape, minibatches[16].shape

In [None]:
minibatches[0], minibatches[1], minibatches[15], minibatches[16]

In [75]:
minibatches = get_minibatches(len(dev_seq), 16)
print("dev: %d batches" % len(minibatches))
batch_dev_x, batch_dev_y = gen_samples(dev_seq, dev_y, minibatches)

dev: 153 batches


In [76]:
minibatches = get_minibatches(len(test_seq), 16)
print("test: %d batches" % len(minibatches))
batch_test_x, batch_test_y = gen_samples(test_seq, test_y, minibatches)

test: 504 batches


In [59]:
def run_evaluate(m, bx, by):
    # bx, by: batch_bucket
    
    losses = []
    right = 0
    for i in range(len(bx)):
        l, r = m.evaluate(bx[i], by[i], verbose=0)
        losses.append(l)
        right += r
        
    return sum(losses)/len(losses), right/len(losses)

In [77]:
num_epochs = 15
log_folder = "model8/run3"
tr_loss = []
tr_acc = []
dev_loss = []
dev_acc = []

for epoch in range(num_epochs):
    # 隨機排序batch index
    p = np.random.permutation(len(batch_tr_x))
    for i in tqdm(p):
#     for i in range(len(batch_tr_x)):
#         print(i)
        model.train_on_batch(batch_tr_x[i], batch_tr_y[i])
    
    model.save(log_folder + "/model.{}.h5".format(epoch))
    
    loss, acc = run_evaluate(model, batch_tr_x, batch_tr_y)
    tr_loss.append(loss)
    tr_acc.append(acc)
    
    loss, acc = run_evaluate(model, batch_dev_x, batch_dev_y)
    dev_loss.append(loss)
    dev_acc.append(acc)
    
    print("epoch {}. train loss: {:.6f} acc: {:.6f}. dev loss: {:.6f} acc: {:.6f}".format(
        epoch, tr_loss[-1], tr_acc[-1], dev_loss[-1], dev_acc[-1],
    ))
    log = {"tr_loss": tr_loss, "tr_acc": tr_acc, "dev_loss": dev_loss, "dev_acc": dev_acc}
    pickle.dump(log, open(log_folder + "/log.{}.pickle".format(epoch), "wb"))

100%|██████████| 659/659 [03:57<00:00,  2.78it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 0. train loss: 0.618052 acc: 0.743124. dev loss: 0.618839 acc: 0.746732


100%|██████████| 659/659 [03:54<00:00,  2.81it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 1. train loss: 0.551910 acc: 0.771932. dev loss: 0.627221 acc: 0.742647


100%|██████████| 659/659 [03:54<00:00,  2.80it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 2. train loss: 0.521009 acc: 0.787344. dev loss: 0.586805 acc: 0.745507


100%|██████████| 659/659 [03:54<00:00,  2.80it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 3. train loss: 0.474337 acc: 0.808446. dev loss: 0.579806 acc: 0.759395


100%|██████████| 659/659 [03:55<00:00,  2.80it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 4. train loss: 0.445960 acc: 0.824308. dev loss: 0.557102 acc: 0.765523


100%|██████████| 659/659 [03:54<00:00,  2.81it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 5. train loss: 0.414821 acc: 0.838534. dev loss: 0.578401 acc: 0.761029


100%|██████████| 659/659 [03:54<00:00,  2.81it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 6. train loss: 0.410433 acc: 0.841284. dev loss: 0.641300 acc: 0.732026


100%|██████████| 659/659 [03:55<00:00,  2.79it/s]
  0%|          | 1/659 [00:00<01:28,  7.47it/s]

epoch 7. train loss: 0.384009 acc: 0.850318. dev loss: 0.671849 acc: 0.743873


100%|██████████| 659/659 [03:54<00:00,  2.81it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 8. train loss: 0.340540 acc: 0.873459. dev loss: 0.618790 acc: 0.759804


100%|██████████| 659/659 [03:56<00:00,  2.79it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 9. train loss: 0.315649 acc: 0.884626. dev loss: 0.623527 acc: 0.768791


100%|██████████| 659/659 [03:54<00:00,  2.81it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 10. train loss: 0.281541 acc: 0.897548. dev loss: 0.656743 acc: 0.767974


100%|██████████| 659/659 [03:54<00:00,  2.81it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 11. train loss: 0.293235 acc: 0.891526. dev loss: 0.656238 acc: 0.764706


100%|██████████| 659/659 [03:55<00:00,  2.80it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 12. train loss: 0.234018 acc: 0.919291. dev loss: 0.745364 acc: 0.741830


100%|██████████| 659/659 [03:54<00:00,  2.81it/s]
  0%|          | 0/659 [00:00<?, ?it/s]

epoch 13. train loss: 0.224572 acc: 0.920879. dev loss: 0.769269 acc: 0.749592


100%|██████████| 659/659 [03:55<00:00,  2.80it/s]


epoch 14. train loss: 0.192161 acc: 0.936220. dev loss: 0.829379 acc: 0.736111


### load model to evaluate test set

In [78]:
# m = keras.models.load_model(log_folder + "/model.14.h5", custom_objects={"Attention": Attention})
m = keras.models.load_model(log_folder + "/model.9.h5", custom_objects={"tf": tf})

In [79]:
run_evaluate(m, batch_dev_x, batch_dev_y)

(0.62352723847417268, 0.76879084967320266)

In [80]:
run_evaluate(m, batch_test_x, batch_test_y)

(0.60254765594644211, 0.78268494900493391)

In [None]:
batch_test_x[0][0].shape

In [None]:
batch_test_x[0].shape

In [None]:
batch_test_x[0][0].shape

In [None]:
np.array([batch_test_x[0][0]]).shape

In [None]:
m.evaluate(np.array([batch_test_x[0][0]]), np.array([batch_test_y[0][0]]))

In [None]:
m.predict(np.array([batch_test_x[0][0]]))

### online inference parameters

In [None]:
variables = {
    "word_index": word_index, # v
    "inv_label_index": inv_label_index, # v
    "max_sent_len": max_sent_len, # v
    "min_word_count": min_word_count,
    "hidden_size": hidden_size,
    "uw_size": uw_size,
    "us_size": uw_size,
    "embedding_size": embedding_size,
    "word_counter": word_counter,
    "label_index": label_index,
#     "tr_seq": tr_seq,
#     "tr_y": tr_y,
#     "dev_seq": dev_seq,
#     "dev_y": dev_y,
#     "test_seq": test_seq,
#     "test_y": test_y,
}

In [None]:
pickle.dump(variables, open(log_folder + "/var.pickle", "wb"))