# BiLSTM classifier

In [1]:
import os
import numpy as np
import tensorflow as tf
# Run from the project root so the relative ./data paths used later resolve.
# The location can be overridden with the TEXT_CLASSIFIER_ROOT environment
# variable; the default is the original checkout path.
PROJECT_ROOT = os.environ.get(
    "TEXT_CLASSIFIER_ROOT",
    "/home/yick/Projects/github.com/text-classifier",
)
os.chdir(PROJECT_ROOT)

# Seed NumPy and TensorFlow with the same value so runs are repeatable.
SEED = 42
np.random.seed(SEED)
tf.set_random_seed(SEED)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### build model


In [2]:
import numpy as np
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers  import Adam

WORD_VECTOR_PATH = "/home/yick/Models/tencent/embeddings/tencent-ailab-embedding-zh-d200-v0.2.0-s.txt"


def load_wv(vocab, fpath=WORD_VECTOR_PATH):
    """Build an embedding matrix for ``vocab`` from a text word-vector file.

    The file is read as UTF-8. Its first line is a header whose second
    whitespace-separated field is the vector dimension. Every following line
    is expected to be ``word v1 ... v_dim``; lines with a different number of
    fields are reported and skipped.

    Only vectors for words present in ``vocab`` are parsed and kept, so the
    memory used does not grow with the size of the word-vector file.

    Args:
        vocab: Mapping from word to integer row index. Each index is used
            directly as a row of the returned matrix, which has
            ``len(vocab) + 1`` rows.
        fpath: Path of the word-vector text file.

    Returns:
        A ``(embedding_mat, embedding_dim)`` tuple. ``embedding_mat`` has
        shape ``(len(vocab) + 1, embedding_dim)``; rows are drawn from
        ``np.random.rand`` and overwritten with the pretrained vector for
        every vocab word found in the file.

    Raises:
        ValueError: If the file is empty, i.e. has no header line.
    """
    pretrained = {}
    embedding_dim = None
    found_cnt = 0
    # Explicit encoding: the default depends on the locale and can fail on
    # non-ASCII words.
    with open(fpath, encoding="utf-8") as f:
        for i, line in enumerate(f):
            values = line.split()
            if i == 0:
                # Header line: "<word count> <dimension>".
                embedding_dim = int(values[1])
                continue
            if len(values) != embedding_dim + 1:
                print(f"error values: {values[:5]}, values len: {len(values)}")
                continue
            found_cnt += 1
            word = values[0]
            # Skip the float conversion for words that will never be used.
            if word in vocab:
                pretrained[word] = np.asarray(values[1:], dtype="float32")
    if embedding_dim is None:
        raise ValueError(f"no header line found in word vector file: {fpath}")
    print(f"Found {found_cnt} word vectors." )
    vocab_size = len(vocab)
    # Words without a pretrained vector keep their random initialisation.
    embedding_mat = np.random.rand(vocab_size+1, embedding_dim)
    for word, vector in pretrained.items():
        embedding_mat[vocab[word]] = vector
    word_embedding_cnt = len(pretrained)
    print(f"vocab size: {vocab_size}")
    print(f"word_embedding_cnt: {word_embedding_cnt}")
    return embedding_mat, embedding_dim


def build_model(vocab, num_classes, max_len=30):
    """Assemble and compile the BiLSTM text classifier.

    Pretrained vectors for ``vocab`` are loaded with ``load_wv`` and used as
    the initial, trainable weights of the embedding layer. The embedded
    sequence goes through a bidirectional LSTM, is averaged over the sequence
    axis, and is then classified by a small dense head.

    Args:
        vocab: Mapping from word to integer index, passed to ``load_wv``; the
            embedding layer is sized ``len(vocab) + 1``.
        num_classes: Width of the softmax output layer.
        max_len: Length of the integer input sequences.

    Returns:
        The compiled Keras ``Model``. Its summary is printed as a side effect.
    """
    pretrained_weights, vector_size = load_wv(vocab)

    token_ids = Input(shape=(max_len,), dtype="int32")
    embedding_layer = Embedding(
        input_dim=len(vocab) + 1,
        output_dim=vector_size,
        input_length=max_len,
        weights=[pretrained_weights],
        embeddings_regularizer=l2(0.00),
        trainable=True,
    )
    embeddings = embedding_layer(token_ids)
    print(f"embeddings: {embeddings.shape}")

    # Sequence encoder: BiLSTM outputs at every step, averaged over the steps.
    encoder = Bidirectional(LSTM(128, return_sequences=True))
    hidden = encoder(embeddings)
    hidden = GlobalAveragePooling1D()(hidden)

    # Classification head: dropout + batch norm around one hidden layer.
    hidden = Dropout(0.5)(hidden)
    hidden = BatchNormalization()(hidden)
    hidden = Dense(128, activation="relu", kernel_regularizer=l2(1e-4))(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = BatchNormalization()(hidden)
    class_probs = Dense(
        num_classes,
        activation="softmax",
        kernel_regularizer=l2(1e-4),
    )(hidden)

    classifier = Model(inputs=token_ids, outputs=class_probs)
    classifier.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(lr=5e-3),
        metrics=["accuracy"],
    )
    classifier.summary()
    return classifier

### load data

In [3]:
import pandas as pd
# Read both splits from the data directory (relative to the working directory).
train_df = pd.read_csv("./data/train_data.csv")
test_df = pd.read_csv("./data/test_data.csv")

# Report the dimensions of each split, one per line.
print(
    f"train_df shape: {train_df.shape}",
    f"test_df shape: {test_df.shape}",
    sep="\n",
)


train_df shape: (8718, 2)
test_df shape: (741, 2)


### make label encoder

In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
# Map the string labels of the training set to integer ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(train_df["label"].tolist())

# One-hot encode the ids; the width is the number of distinct labels seen.
num_classes = len(label_encoder.classes_)
y = to_categorical(labels, num_classes)
print(f"y shape: {y.shape}")

y shape: (8718, 54)


### make vocab

In [5]:
import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Segment every training text with jieba, strip each token, and drop tokens
# that are empty after stripping.
texts = [
    [token for token in map(str.strip, jieba.cut(doc.strip())) if token]
    for doc in train_df["text"].tolist()
]

# Fit the vocabulary on the segmented texts (no cap on vocabulary size).
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(texts)
vocab = tokenizer.word_index

# Convert to id sequences, then left-pad / left-truncate to 30 tokens.
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=30, padding="pre", truncating="pre")

print(f"vocab size: {len(vocab)}")
print(f"X shape: {X.shape}")

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.409 seconds.
Prefix dict has been built successfully.


vocab size: 2390
X shape: (8718, 30)


### model train

In [None]:
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import EarlyStopping
model = build_model(vocab, num_classes, max_len=30)

# Stop once validation loss has not improved for 3 consecutive epochs.
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    mode="min"
)
# Keep only the full model (not just weights) with the lowest validation loss.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_model.h5",
    monitor="val_loss",
    save_weights_only=False,
    save_best_only=True,
    mode="min"
)

# "balanced" weights are inversely proportional to class frequency.
classes = np.unique(labels)
weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=labels
)
# Keras expects a dict mapping class index -> weight, not a bare array, so
# pair each weight with the class id it was computed for.
class_weights = {int(cls): float(w) for cls, w in zip(classes, weights)}

model.fit(
    x=X,
    y=y,
    class_weight=class_weights,
    batch_size=128,
    epochs=50,
    validation_split=0.05,
    shuffle=True,
    verbose=2,
    callbacks=[early_stopping, checkpoint]
)

### load best model

In [None]:
from tensorflow.keras.models import load_model
# Replace the in-memory model with the checkpoint saved during training.
best_model_path = "best_model.h5"
model = load_model(best_model_path)
print("load best model done")


### model test

In [None]:
from sklearn.metrics import classification_report
# Tokenise the test texts with the same normalisation used for the training
# texts (strip the text, strip each token, drop empty tokens) so both splits
# go through an identical preprocessing path.
test_texts = [
    [w.strip() for w in jieba.cut(d.strip()) if w.strip()]
    for d in test_df["text"].tolist()
]
test_x = tokenizer.texts_to_sequences(test_texts)
test_x = pad_sequences(test_x, maxlen=30, padding="pre", truncating="pre")

# Predict class probabilities and take the most likely class per sample.
probs = model.predict(test_x)
preds = np.argmax(probs, axis=1)

# Map predicted ids back to the original label strings for the report.
pred_labels = label_encoder.inverse_transform(preds.tolist())
true_labels = test_df["label"].tolist()
report = classification_report(true_labels, pred_labels, digits=4)
print(report)