# BiLSTM classifier

### set random seed

In [90]:
import numpy as np
import tensorflow as tf
np.random.seed(42)
tf.set_random_seed(42)
print("set random seed done")

set random seed done


### build model


In [91]:
import numpy as np
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers  import Adam

# download url: https://ai.tencent.com/ailab/nlp/en/data/tencent-ailab-embedding-zh-d200-v0.2.0-s.tar.gz
WORD_VECTOR_PATH = "/home/yick/Models/tencent/embeddings/tencent-ailab-embedding-zh-d200-v0.2.0-s.txt"

def load_wv(vocab, fpath=WORD_VECTOR_PATH):
    word2vec = {}
    embedding_dim = None
    with open(fpath) as f:
        for i, line in enumerate(f):
            values = line.split()
            if i == 0:
                embedding_dim = int(values[1])
                continue
            if len(values) != embedding_dim + 1:
                print(f"error values: {values[:5]}, values len: {len(values)}")
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype="float32")
            word2vec[word] = coefs
    print(f"Found {len(word2vec)} word vectors." )
    vocab_size = len(vocab)
    embedding_mat = np.random.rand(vocab_size+1, embedding_dim)
    word_embedding_cnt = 0
    for word, i in vocab.items():
        if word in word2vec:
            word_embedding_cnt += 1
            embedding_mat[i] = word2vec.get(word)
    print(f"vocab size: {vocab_size}")
    print(f"word_embedding_cnt: {word_embedding_cnt}")
    return embedding_mat, embedding_dim


def build_model(vocab, num_classes, max_len=30):
    embedding_mat, embedding_dim = load_wv(vocab)

    inputs = Input(shape=(max_len,), dtype="int32")
    embeddings = Embedding(
        input_dim=len(vocab)+1,
		output_dim=embedding_dim,
		input_length=max_len,
		weights=[embedding_mat],
		trainable=True
    )(inputs)
    print(f"embeddings: {embeddings.shape}")
    x = Bidirectional(
            LSTM(128, return_sequences=True, dropout=0.5)
    )(embeddings)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(128, activation="relu", kernel_regularizer=l2(1e-4))(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    outputs = Dense(
        num_classes,
        activation="softmax",
        kernel_regularizer=l2(1e-4)
    )(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss="categorical_crossentropy",
		optimizer=Adam(lr=6e-3),
		metrics=["accuracy"]
    )
    model.summary()
    return model

###  load data

In [92]:
import os
import pandas as pd
data_dir = "/home/yick/Projects/github.com/text-classifier/data"
train_file = os.path.join(data_dir, "train_data.csv")
test_file = os.path.join(data_dir, "test_data.csv")
train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)
print(f"train_df shape: {train_df.shape}")
print(f"test_df shape: {test_df.shape}")

train_df shape: (8718, 2)
test_df shape: (741, 2)


### make label encoder

In [93]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(train_df["label"].tolist())
num_classes = len(label_encoder.classes_)
y = to_categorical(labels, num_classes=num_classes)
print(f"y shape: {y.shape}")

y shape: (8718, 54)


### make vocab

In [94]:
import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
texts = [
    [w.strip() for w in jieba.cut(d.strip()) if w.strip()]
    for d in train_df["text"].tolist()
]
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X, maxlen=30, padding="pre", truncating="pre")
vocab = tokenizer.word_index
print(f"vocab size: {len(vocab)}")
print(f"X shape: {X.shape}")

vocab size: 2390
X shape: (8718, 30)


### model train

In [95]:
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import EarlyStopping
model = build_model(vocab, num_classes, max_len=30)
early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=5,
    mode="min"
)
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_model.h5",
    monitor="val_loss",
    save_weights_only=False,
    save_best_only=True,
    mode="min"
)
weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(labels),
    y=labels
)
model.fit(
    x=X,
    y=y,
    class_weight=weights,
    batch_size=128,
    epochs=50,
    validation_split=0.05,
    shuffle=True,
    verbose=2,
    callbacks=[early_stopping, checkpoint]
)

Found 2000000 word vectors.
vocab size: 2390
word_embedding_cnt: 2165
embeddings: (?, 30, 200)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 30)                0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 30, 200)           478200    
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 30, 256)           336896    
_________________________________________________________________
global_average_pooling1d_11  (None, 256)               0         
_________________________________________________________________
dropout_22 (Dropout)         (None, 256)               0         
_________________________________________________________________
batch_normalization_v1_22 (B (None, 256)               1024      
_______________________________________________

<tensorflow.python.keras.callbacks.History at 0x7fa333411c10>

### load best model

In [96]:
from tensorflow.keras.models import load_model
model = load_model("best_model.h5")
print("load best model done")


load best model done


### model test

In [97]:
from sklearn.metrics import classification_report
test_texts = [
    [w.strip() for w in jieba.cut(d.strip()) if w.strip()]
    for d in test_df["text"].tolist()
]
test_x = tokenizer.texts_to_sequences(test_texts)
test_x = pad_sequences(test_x, maxlen=30, padding="pre", truncating="pre")
probs = model.predict(test_x)
preds = np.argmax(probs, axis=1)
pred_labels = label_encoder.inverse_transform(preds.tolist())
true_labels = test_df["label"].tolist()
report = classification_report(true_labels, pred_labels, digits=4)
print(report)

                   precision    recall  f1-score   support

              上征信     0.5000    1.0000    0.6667         1
              不舒服     0.0000    0.0000    0.0000         0
            之前被拒了     0.0000    0.0000    0.0000         1
             人工服务     1.0000    0.8571    0.9231         7
             什么平台     0.8056    0.9355    0.8657        31
          会不会放款失败     0.5000    0.2500    0.3333         4
              利息高     0.5000    0.1667    0.2500         6
加下微信/发信息/发个短信/发资料     0.0000    0.0000    0.0000         0
           号码是哪来的     0.0000    0.0000    0.0000         0
            否定/拒绝     0.8028    0.6786    0.7355        84
          咨询APP名字     0.7143    1.0000    0.8333         5
          咨询利息/费用     0.8684    0.8919    0.8800        37
           咨询提前还款     1.0000    1.0000    1.0000         1
             咨询操作     0.8947    0.5667    0.6939        30
           咨询放款速度     0.4000    0.5000    0.4444         4
          咨询额度-通用     0.7419    0.8519    0.7931       