In [26]:
import gensim, re
import numpy as np
import pandas as pd
import pickle
from os import listdir

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Embedding, GRU, Bidirectional

import sys
import os

# Step 1. Mount drive 
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
%cd /content/gdrive/My\ Drive
# Step 2. Load and make pickle file

def txtTokenizer(texts):
    tokenizer = Tokenizer()
    # fit the tokenizer on our text
    tokenizer.fit_on_texts(texts)

    # get all words that the tokenizer knows
    word_index = tokenizer.word_index
    return tokenizer, word_index

def preProcess(sentences):

    text = [re.sub(r'([^\s\w]|_)+', '', sentence) for sentence in sentences if sentence!='']
    text = [sentence.lower().strip().split() for sentence in text]
    #print("Tex=",text)
    return text

def loadData(data_folder):

    texts = []
    labels = []
    #
    for folder in listdir(data_folder):
        #
        if folder != ".DS_Store":
            print("Load cat: ",folder)
            for file in listdir(os.path.join(data_folder , folder)):
                #
                if file!=".DS_Store":
                    print("Load file: ", file)
                    with open(os.path.join(data_folder ,folder , file), 'r', encoding="utf-8") as f:
                        all_of_it = f.read()
                        sentences  = all_of_it.split('.')

                        # Remove garbage
                        sentences = preProcess(sentences)

                        texts = texts + sentences
                        label = [folder for _ in sentences]
                        labels = labels + label
                        del all_of_it, sentences


    return texts, labels

data_folder = "data"
texts, labels = loadData(data_folder)
tokenizer, word_index = txtTokenizer(texts)

# put the tokens in a matrix
X = tokenizer.texts_to_sequences(texts)
X = pad_sequences(X)

# prepare the labels
y = pd.get_dummies(labels)
file = open(os.path.join(data_folder ,"data.pkl"), 'wb')
pickle.dump([X,y, texts],file)
file.close()


In [6]:

print("After loading raw data")
print(X.shape)
print((X[10:30]))
print((y[10:30]))
print((texts[10:30]))

After loading raw data
(129310, 438)
[[   0    0    0 ...   10  120   13]
 [   0    0    0 ...   38  110   48]
 [   0    0    0 ...  428  110  252]
 ...
 [   0    0    0 ...   39  496   13]
 [   0    0    0 ...  662  581 1249]
 [   0    0    0 ...  163    8 2117]]
    Economy  Education  Medical
10        0          1        0
11        0          1        0
12        0          1        0
13        0          1        0
14        0          1        0
15        0          1        0
16        0          1        0
17        0          1        0
18        0          1        0
19        0          1        0
20        0          1        0
21        0          1        0
22        0          1        0
23        0          1        0
24        0          1        0
25        0          1        0
26        0          1        0
27        0          1        0
28        0          1        0
29        0          1        0
[['theo', 'hội', 'đồng', 'thi', 'sơn', 'la', 'các', 'phòng', 'b

In [0]:
# split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True)

# train Word2Vec model on our data
word_model = gensim.models.Word2Vec(texts, size=300, min_count=1, iter=10)
word_model.save(os.path.join(data_folder,"word_model.save"))

In [28]:
# check the most similar word to 'cơm'
print(word_model.wv.most_similar('cơm'))


embedding_matrix = np.zeros((len(word_model.wv.vocab) , 300))
for i, vec in enumerate(word_model.wv.vectors):
  embedding_matrix[i] = vec

[('cháo', 0.7353115081787109), ('nấu', 0.7341228723526001), ('bún', 0.7034357786178589), ('nướng', 0.6905111074447632), ('xào', 0.6796316504478455), ('mì', 0.6782970428466797), ('phở', 0.6766970157623291), ('nát', 0.6602506637573242), ('rán', 0.6399729251861572), ('chén', 0.6389163732528687)]


  if np.issubdtype(vec.dtype, np.int):


In [29]:
# init layer
model = Sequential()
model.add(Embedding(len(word_model.wv.vocab),300,input_length=X.shape[1],weights=[embedding_matrix],trainable=False))
model.add(LSTM(300,return_sequences=False))
model.add(Dense(y.shape[1],activation="softmax"))
model.summary()
model.compile(optimizer="adam",loss="categorical_crossentropy",metrics=['acc'])

batch = 64
epochs = 1
model.fit(X_train,y_train,batch,epochs)
model.save(os.path.join(data_folder , "predict_model.save"))

# LSTM - 1188s 10ms/step - loss: 0.3378 - acc: 0.8716

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 438, 300)          11724600  
_________________________________________________________________
lstm_4 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 903       
Total params: 12,446,703
Trainable params: 722,103
Non-trainable params: 11,724,600
_________________________________________________________________
Epoch 1/1


In [30]:

model.evaluate(X_test,y_test)



[0.2597367326775022, 0.9033330678939819]