# BiLSTM classifier

In [14]:
import os
os.chdir("/home/yick/Projects/github.com/text-classifier")

### build model


In [15]:
import numpy as np
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers  import Adam

WORD_VECTOR_PATH = "/home/yick/Models/tencent/embeddings/light_Tencent_AILab_ChineseEmbedding.txt"

def load_wv(vocab, fpath=WORD_VECTOR_PATH):
    word2vec = {}
    embedding_dim = None
    with open(fpath) as f:
        for i, line in enumerate(f):
            values = line.split()
            if i == 0:
                embedding_dim = int(values[1])
                continue
            if len(values) != embedding_dim + 1:
                print(f"error values: {values[:5]}, values len: {len(values)}")
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            word2vec[word] = coefs
    print(f"Found {len(word2vec)} word vectors." )
    vocab_size = len(vocab)
    embedding_mat = np.random.rand(vocab_size+1, embedding_dim)
    word_embedding_cnt = 0
    for word, i in vocab.items():
        if word in word2vec:
            word_embedding_cnt += 1
            embedding_mat[i] = word2vec.get(word)
    print(f"vocab size: {vocab_size}")
    print(f"word_embedding_cnt: {word_embedding_cnt}")
    return embedding_mat, embedding_dim


def build_model(vocab, num_classes, max_len=30):
    embedding_mat, embedding_dim = load_wv(vocab)

    inputs = Input(shape=(max_len,), dtype="int32")
    embeddings = Embedding(
        input_dim=len(vocab)+1,
		output_dim=embedding_dim,
		input_length=max_len,
		weights=[embedding_mat],
		# embeddings_regularizer=l2(0.00),
		trainable=True
    )(inputs)
    print(f"embeddings: {embeddings.shape}")
    x = Bidirectional(LSTM(128, return_sequences=True))(embeddings)
    x = GlobalAveragePooling1D()(x)
    x = Dense(128, activation="relu", kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(128, activation="relu", kernel_regularizer=l2(1e-4))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    outputs = Dense(
        num_classes,
        activation="softmax",
        kernel_regularizer=l2(1e-4)
    )(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss="categorical_crossentropy",
		optimizer=Adam(lr=5e-3),
		metrics=["accuracy"]
    )
    model.summary()
    return model

###  load data

In [16]:
import pandas as pd
train_df = pd.read_csv("./data/train_data.csv")
test_df = pd.read_csv("./data/test_data.csv")
print(f"train_df shape: {train_df.shape}")
print(f"test_df shape: {test_df.shape}")


train_df shape: (8718, 2)
test_df shape: (741, 2)


### make label encoder

In [17]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(train_df["label"].tolist())
num_classes = len(label_encoder.classes_)
y = to_categorical(labels, num_classes=num_classes)
print(f"y shape: {y.shape}")

y shape: (8718, 54)


### make vocab

In [18]:
import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
texts = [[w for w in jieba.cut(d)] for d in train_df["text"].tolist()]
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=30, padding="pre", truncating="pre")
vocab = tokenizer.word_index
print(f"vocab size: {len(vocab)}")
print(f"X shape: {X.shape}")

vocab size: 2392
X shape: (8718, 30)


### model train

In [19]:
from tensorflow.keras.callbacks import EarlyStopping
model = build_model(vocab, num_classes, max_len=30)
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,
    mode="min"
)
model.fit(
    x=X,
    y=y,
    batch_size=32,
    epochs=20,
    validation_split=0.05,
    shuffle=True,
    verbose=2,
    callbacks=[early_stopping]
)

error values: ['中共中央', '国务院关于完善产权保护制度依法保护产权的意见', '0.141571', '-0.006408', '-0.813869'], values len: 202
error values: ['杨', '光', '-0.045217', '-0.197674', '0.007343'], values len: 202
error values: ['王', '琪', '0.052169', '-0.368297', '-0.304854'], values len: 202
error values: ['我', '末代工农兵学员', '0.485441', '0.84239', '0.347323'], values len: 202
error values: ['财政部', '国家税务总局关于非货币性资产投资企业所得税政策问题的通知', '-0.088186', '-0.23139', '0.024681'], values len: 202
Found 143607 word vectors.
vocab size: 2392
word_embedding_cnt: 2034
embeddings: (?, 30, 200)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 30, 200)           478600    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 30, 256)           

<tensorflow.python.keras.callbacks.History at 0x7fd211386a10>

### model test

In [20]:
from sklearn.metrics import classification_report
test_texts = [[w for w in jieba.cut(d)] for d in test_df["text"].tolist()]
test_x = tokenizer.texts_to_sequences(test_texts)
test_x = pad_sequences(test_x, maxlen=30, padding="pre", truncating="pre")
probs = model.predict(test_x)
preds = np.argmax(probs, axis=1)
pred_labels = label_encoder.inverse_transform(preds.tolist())
true_labels = test_df["label"].tolist()
report = classification_report(true_labels, pred_labels, digits=4)
print(report)

                   precision    recall  f1-score   support

              上征信     0.0526    1.0000    0.1000         1
            之前被拒了     0.0000    0.0000    0.0000         1
             人工服务     1.0000    0.4286    0.6000         7
             什么平台     0.9615    0.8065    0.8772        31
          会不会放款失败     0.0000    0.0000    0.0000         4
              利息高     0.2857    0.3333    0.3077         6
加下微信/发信息/发个短信/发资料     0.0000    0.0000    0.0000         0
           号码是哪来的     0.0000    0.0000    0.0000         0
            否定/拒绝     0.9394    0.3690    0.5299        84
          咨询APP名字     0.6250    1.0000    0.7692         5
          咨询利息/费用     0.8667    0.7027    0.7761        37
           咨询提前还款     1.0000    1.0000    1.0000         1
             咨询操作     1.0000    0.2333    0.3784        30
           咨询放款速度     0.2308    0.7500    0.3529         4
          咨询额度-通用     0.5488    0.8333    0.6618        54
        嗯啊哦额/模糊回答     0.5000    0.4444    0.4706       