In [2]:
import sys 
sys.path.append("..") 

import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import regularizers
# from attention import Attention
from sklearn.model_selection import train_test_split

from utils import data_helper

print(tf.__version__)

2.2.0-dev20200508


In [3]:
data_path='../data/train_preprocessed.csv'

data_df=pd.read_csv(data_path)[['text','cutted_text','label']]

train_df, test_df=train_test_split(data_df,test_size=0.1,random_state=1)
print(train_df['label'].value_counts()) # 分布均匀

1    17244
0    17210
Name: label, dtype: int64


In [4]:
tokenizer_path='../data/tokenizer.pickle'
tokenizer_mode='load'
max_num_words=100000
max_seq_len=128

X=train_df['cutted_text'].values
# Y= train_df['label'].values # np.reshape(train_df['label'].values,(-1,1))
Y = pd.get_dummies(train_df['label']).values

if not os.path.exists(tokenizer_path):
    tokenizer_mode = 'create'

X,lang_tokenizer=data_helper.tokenize(X,mode=tokenizer_mode,path=tokenizer_path,max_num_words=max_num_words,max_sequence_len=max_seq_len)

print('* X shape ', X.shape)
print('* Y shape ', Y.shape)
# print(Y)

** Load tokenzier from:  ../data/tokenizer.pickle
** Total different words: 73745.
* X shape  (34454, 128)
* Y shape  (34454, 2)


In [5]:
embedding_dims = 128
dropout = 0.2
lstm_units = 128
regularizer_factor = 0.005
output_units=2

# build model
model = tf.keras.models.Sequential([
            tf.keras.layers.Embedding(
                max_num_words, embedding_dims, input_length=max_seq_len),
            # tf.keras.layers.SpatialDropout1D(dropout),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
                lstm_units, return_sequences=True, kernel_regularizer=regularizers.l2(regularizer_factor))),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
                int(lstm_units), kernel_regularizer=regularizers.l2(regularizer_factor))),
            tf.keras.layers.Dense(
                int(lstm_units/2), activation='relu', kernel_regularizer=regularizers.l2(regularizer_factor)),
            tf.keras.layers.Dropout(dropout),
            tf.keras.layers.Dense(output_units, activation='softmax'),

        ])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[
              tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 128)          12800000  
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 256)          263168    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 13,473,986
Trainable params: 13,473,986
Non-trainable params: 0
____________________________________________

In [6]:
epochs=3
batch_size=64
val_split=0.1 # 验证集划分
checkpoint_path='../saved_models/bilstm.h5'

save_model_cb = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path, monitor='val_loss', mode='auto', save_best_only=True, save_weights_only=False, verbose=1, save_freq='epoch')
callbacks=[save_model_cb]

history=model.fit(x=X,y=Y,batch_size=batch_size,epochs=epochs,callbacks=callbacks,validation_split=val_split)

# model.save(checkpoint_path)

Epoch 1/3
Epoch 00001: val_loss improved from inf to 0.14469, saving model to ../saved_models\bilstm.h5
Epoch 2/3
Epoch 00002: val_loss did not improve from 0.14469
Epoch 3/3
Epoch 00003: val_loss improved from 0.14469 to 0.09124, saving model to ../saved_models\bilstm.h5


In [7]:
def f1_score(precision,recall):
    return 2*(precision*recall)/(precision+recall)

train_f1=f1_score(history.history['precision'][-1],history.history['recall'][-1])
val_f1=f1_score(history.history['val_precision'][-1],history.history['val_recall'][-1])


test_X,_=data_helper.tokenize(test_df['cutted_text'].values,mode='load',path=tokenizer_path,max_num_words=max_num_words,max_sequence_len=max_seq_len)
test_Y=pd.get_dummies(test_df['label']).values

_,p,r=model.evaluate(test_X,test_Y)

print('train f1 ',train_f1)
print('val f1 ',val_f1)
print('test_f1 ',f1_score(p,r))

** Load tokenzier from:  ../data/tokenizer.pickle
** Total different words: 73745.
train f1  0.9901960492134094
val f1  0.983749270439148
test_f1  0.9840689301490784
