Ví dụ sử dụng mạng nơ-ron tích chập (CNN) và véc-tơ từ (fastText) trong phân loại văn bản.

Mô hình được tham khảo từ bài báo "Convolutional Neural Networks for Sentence Classification"

https://www.aclweb.org/anthology/D14-1181

In [1]:
import pandas
import numpy
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model, Sequential
from keras import initializers
import keras
%matplotlib inline
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
def replace_words(text):
    """Return *text* with everything but Latin letters, whitespace and digits removed.

    The previous implementation called ``regex.sub`` with a ``\\p{Latin}``
    character class, but the notebook only imports the standard ``re`` module
    (which has no ``\\p{...}`` support), so any call raised ``NameError``.
    This version gets the same effect with the standard library only.

    The text is first normalised to NFC so that accented letters written as
    "base letter + combining mark" become single precomposed Latin letters
    instead of losing their diacritics.  A character is treated as Latin when
    its Unicode name starts with ``LATIN``; this is an approximation of the
    Unicode *Latin script* property that covers ordinary accented letters.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading/trailing whitespace stripped.
    """
    import unicodedata  # local import so the cell does not depend on another cell

    def _is_kept(ch):
        # Whitespace and decimal digits mirror the \s and \d of the old pattern.
        if ch.isspace() or ch.isdecimal():
            return True
        return unicodedata.name(ch, '').startswith('LATIN ')

    normalized = unicodedata.normalize('NFC', text)
    return ''.join(ch for ch in normalized if _is_kept(ch)).strip()

Các hằng số

In [3]:
# Hyper-parameters shared by the cells below.
NUM_CLASS = 7               # width of the softmax output layer
VALIDATION_SPLIT = 0.2      # fraction of the samples held out for validation
EMBEDDING_DIM = 300         # number of columns in the embedding matrix
MAX_SEQUENCE_LENGTH = 5500  # every token sequence is padded/truncated to this length

In [4]:
# Load the dataset; later cells read its 'Title' and 'HouseTypeCssClass' columns.
df = pandas.read_csv('bds.csv', sep=',')

Chuyển nhãn sang dạng số từ 0 đến NUM_CLASS-1 cho phù hợp với đầu vào và đầu ra của thư viện đang sử dụng.  

In [5]:
# Distinct class names in sorted order; a name's position becomes its numeric id.
label_text = sorted(set(df['HouseTypeCssClass']))
label_to_id = dict(zip(label_text, range(len(label_text))))


def to_id(i):
    """Return the integer id assigned to the class name *i*."""
    return label_to_id[i]


# Replace the textual labels in place with their integer ids.
df['HouseTypeCssClass'] = df['HouseTypeCssClass'].apply(to_id)

In [6]:
# Show the sorted class names; each name's list position is its numeric id.
label_text

['tp-apartment',
 'tp-building',
 'tp-hotel',
 'tp-house',
 'tp-none',
 'tp-villa',
 'tp-warehouse']

In [7]:
# Sanity check: list the distinct values now stored in the label column.
set(df['HouseTypeCssClass'])

{0, 1, 2, 3, 4, 5, 6}

Chuyển tiêu đề và nhãn vào 2 danh sách

In [8]:
# Copy the two columns into plain Python lists.  list() iterates the Series
# exactly as the former append loops did, without the boilerplate.
texts = list(df['Title'])
labels = list(df['HouseTypeCssClass'])

Tách từ

In [9]:
# Build the vocabulary from the titles, then encode every title as a list of
# integer word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word -> integer index mapping; used later to fill the embedding matrix.
word_index = tokenizer.word_index

Tạo ma trận đầu vào: đệm (padding) các chuỗi chỉ số từ về cùng độ dài MAX_SEQUENCE_LENGTH

In [10]:
# Bring every index sequence to length MAX_SEQUENCE_LENGTH so they form one
# array; later cells use data.shape[0] as the number of samples.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Chuyển nhãn sang dạng véc-tơ. 

In [11]:
# Convert the integer class ids into one-hot rows (one column per class).
labels = to_categorical(numpy.asarray(labels))

Ví dụ: nhãn của mẫu đầu tiên thuộc lớp số 3 (tp-house)

In [12]:
# Inspect the one-hot label vector of the first sample.
labels[0]

array([0., 0., 0., 1., 0., 0., 0.], dtype=float32)

Xáo trộn dữ liệu rồi chia thành 2 phần: tập huấn luyện và tập kiểm định (validation)

In [13]:
# Shuffle samples and labels with the same permutation so rows stay aligned.
indices = numpy.arange(data.shape[0])
numpy.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Split at an explicit non-negative index.  Slicing with -nb_validation_samples
# breaks when that count is 0: data[:-0] is empty and data[-0:] is everything,
# which would leave the training set empty.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

Đọc file vector và lưu vào từ điển 

In [14]:
# Read the pre-trained word vectors into a {word: float32 vector} dictionary.
#
# Changes from the previous version:
#   * the file is opened in a ``with`` block (closed even if parsing fails) and
#     decoded explicitly as UTF-8 instead of the platform default encoding;
#   * tab-separated rows are parsed as before, but rows without a tab fall back
#     to the space-separated layout of standard fastText ``.vec`` files, which
#     previously produced no usable entry at all;
#   * rows that do not carry exactly EMBEDDING_DIM numbers (e.g. the
#     "<count> <dim>" header line) are skipped, so a wrongly-sized vector can
#     never reach the embedding matrix.
embeddings_dict = {}
with open('wiki.vi.vec', encoding='utf-8') as f:
    for line in f:
        line = line.rstrip('\n')
        if '\t' in line:
            values = line.split('\t')
        else:
            # Split from the right so only the last EMBEDDING_DIM fields are
            # treated as numbers and the remainder is kept as the word.
            values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            continue  # header or malformed row
        word = values[0]
        coefs = numpy.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = coefs

Tạo ma trận mới, trong đó hàng thứ i là véc-tơ của từ có chỉ số i trong word_index (từ không có trong từ điển véc-tơ giữ giá trị khởi tạo ngẫu nhiên).

In [15]:
# One row per word index plus one extra row (presumably index 0, which the
# tokenizer does not assign to any word -- TODO confirm).  Rows start out
# random and are overwritten wherever a pre-trained vector is available.
matrix_shape = (len(word_index) + 1, EMBEDDING_DIM)
embedding_matrix = numpy.random.random(matrix_shape)
for word, i in word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: keep the random row.
        continue
    embedding_matrix[i] = embedding_vector

Xóa bộ nhớ

In [16]:
# The vectors that are needed have been copied into embedding_matrix, so empty
# the dictionary to release its entries.
embeddings_dict.clear()

In [17]:
# Embedding layer sized to the vocabulary (+1 row), initialised from
# embedding_matrix; trainable=True lets training update these weights.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)

In [18]:
# Model input: one sequence of MAX_SEQUENCE_LENGTH int32 word indices per sample.
input_sequence = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

In [19]:
# Apply the embedding layer to the input; this tensor feeds the first Conv1D.
embedded_sequences = embedding_layer(input_sequence)

Tạo mô hình CNN

In [24]:
# Convolutional feature extractor: each stage is Conv1D(relu) -> MaxPooling1D(2).
# Only the first stage uses 'same' padding; the others use 'valid' (the Keras
# default the original code relied on).  Layers are created in the same order
# as before, so the architecture is unchanged.
#
# NOTE: the initializer used to be written he_normal(0.001).  he_normal's only
# argument is the random *seed*, not a scale, so 0.001 was a non-integer seed
# shared by every layer.  The initializer is now created without a seed.
conv_stages = [
    # (filters, kernel_size, padding)
    (8, 2, 'same'),
    (12, 3, 'valid'),
    (16, 3, 'valid'),
    (32, 3, 'valid'),
    (48, 2, 'valid'),
    (64, 2, 'valid'),
    (64, 2, 'valid'),
    (128, 2, 'valid'),
    (128, 2, 'valid'),
    (128, 2, 'valid'),
    (128, 2, 'valid'),
]

out = embedded_sequences
for n_filters, kernel_width, pad_mode in conv_stages:
    out = Conv1D(filters=n_filters,
                 kernel_size=kernel_width,
                 strides=1,
                 padding=pad_mode,
                 activation='relu',
                 kernel_initializer=initializers.he_normal())(out)
    out = MaxPooling1D(2)(out)

# Classifier head: flatten, two dense layers with dropout, softmax over classes.
out = Flatten()(out)
out = Dropout(0.4)(out)
out = Dense(256, activation='relu', kernel_initializer=initializers.he_normal())(out)
out = Dropout(0.4)(out)
out = Dense(128, activation='relu', kernel_initializer=initializers.he_normal())(out)
out = Dense(NUM_CLASS, activation='softmax')(out)

model = Model(input_sequence, out)

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(lr=0.0001),
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 5500)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5500, 300)         1535400   
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 5500, 8)           4808      
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 2750, 8)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 2748, 12)          300       
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 1374, 12)          0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 1372, 16)          592       
__________

Huấn luyện

In [28]:
# Train for 30 epochs with the progress output suppressed, evaluating on the
# validation split; the returned object's .history dict is plotted below.
history=model.fit(x_train, y_train, 
                  validation_data=(x_val, y_val),
                  epochs=30, 
                  batch_size=64,
                  verbose=False)

In [None]:
# Loss per epoch: training curve in red, validation curve in blue.
fig1 = plt.figure()
for metric_key, line_fmt in (('loss', 'r'), ('val_loss', 'b')):
    plt.plot(history.history[metric_key], line_fmt)
plt.legend(['Training loss', 'Validation Loss'])
plt.xlabel('Epochs ')
plt.ylabel('Loss')
plt.show()

In [None]:
# Accuracy per epoch: training curve in red, validation curve in blue.
fig2 = plt.figure()
# Depending on the Keras version, the metric requested as 'accuracy' is
# recorded under 'acc'/'val_acc' or 'accuracy'/'val_accuracy'; use whichever
# key is present instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key], 'r')
plt.plot(history.history['val_' + acc_key], 'b')
plt.legend(['Training Accuracy', 'Validation Accuracy'])
plt.xlabel('Epochs ')
plt.ylabel('Accuracy')
plt.show()