# Tải thư viện và các file liên quan

Import một số thư viện

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

Kết nối tới Google Drive

In [3]:
# Connect to Google Drive (Colab only) so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

import os

# Project folder, given relative to the current working directory.
project_dir = "drive/My Drive/Colab Notebooks/LSTM_glove_voi_positive_negative"
try:
  os.chdir(project_dir)
except OSError as err:
  # Best effort: keep the notebook running, but report why the directory change
  # failed instead of silently printing an empty line. Later cells open their
  # files by relative path, so they depend on the working directory being right.
  print('Could not change directory to %r: %s' % (project_dir, err))

Mounted at /content/drive


Tải trained glove và dataset

In [4]:
# Load the review dataset from the current working directory.
raw_data = pd.read_csv('new_edited_LSTM_data.csv')
# Missing clean_text values are read as NaN; a plain astype(str) would turn them
# into the literal word "nan", which the tokenizer would then count as a real
# token. Replace them with empty strings before casting to str.
raw_data['clean_text'] = raw_data['clean_text'].fillna('').astype(str)

In [5]:
# Display the loaded DataFrame as the cell output.
raw_data

Unnamed: 0,text,sentiment,rate,tokens,clean_text
0,The camera is very slow :(,neutral,3.0,"['camera', 'slow']",camera slow
1,I was happy I got a case and a charger that wa...,negative,1.0,"['happy', 'got', 'case', 'charger', 'descripti...",happy got case charger description told phone ...
2,I bought this phone for my wife and I wanted t...,positive,5.0,"['bought', 'phone', 'wife', 'wanted', 'wait', ...",bought phone wife wanted wait month half leave...
3,HTC One in White is good; in black it is bette...,positive,5.0,"['tc', 'one', 'white', 'good', 'black', 'bette...",tc one white good black better love new phone ...
4,This phone is definitely one of the fastest on...,positive,5.0,"['phone', 'definitely', 'one', 'fastest', 'mar...",phone definitely one fastest market moment dro...
...,...,...,...,...,...
89995,Fast but hot. Which makes it run slower in hig...,neutral,3.0,"['fast', 'hot', 'makes', 'run', 'slower', 'hig...",fast hot makes run slower high temperature sit...
89996,no devise help,neutral,3.0,"['no', 'devise', 'help']",no devise help
89997,I love the watch/phone and everything it offer...,neutral,3.0,"['love', 'watch', 'phone', 'everything', 'offe...",love watch phone everything offered keep freez...
89998,Replacement,neutral,3.0,['replacement'],replacement


In [6]:
# Drop every row whose sentiment is 'neutral'; the remaining rows are used for
# tokenizing and training.
is_not_neutral = raw_data['sentiment'].ne('neutral')
data = raw_data.loc[is_not_neutral]
data

Unnamed: 0,text,sentiment,rate,tokens,clean_text
1,I was happy I got a case and a charger that wa...,negative,1.0,"['happy', 'got', 'case', 'charger', 'descripti...",happy got case charger description told phone ...
2,I bought this phone for my wife and I wanted t...,positive,5.0,"['bought', 'phone', 'wife', 'wanted', 'wait', ...",bought phone wife wanted wait month half leave...
3,HTC One in White is good; in black it is bette...,positive,5.0,"['tc', 'one', 'white', 'good', 'black', 'bette...",tc one white good black better love new phone ...
4,This phone is definitely one of the fastest on...,positive,5.0,"['phone', 'definitely', 'one', 'fastest', 'mar...",phone definitely one fastest market moment dro...
5,Not available or not working good and I HAD TO...,negative,1.0,"['not', 'available', 'not', 'working', 'good',...",not available not working good ad ay shipping ...
...,...,...,...,...,...
72442,I'm sorry I accidentally forgot to send the fi...,negative,1.0,"['sorry', 'accidentally', 'forgot', 'send', 'f...",sorry accidentally forgot send first phone bac...
72443,This phone was defective. Was very unhappy tha...,negative,1.0,"['phone', 'defective', 'unhappy', 'send', 'back']",phone defective unhappy send back
72444,Now the phone is starting shut down on its own...,negative,1.0,"['phone', 'starting', 'shut', 'annoyed', 'also...",phone starting shut annoyed also gets extremel...
72446,This phone is no good i purchase this phone an...,negative,1.0,"['phone', 'no', 'good', 'purchase', 'phone', '...",phone no good purchase phone less month starte...


In [7]:
# Load the pretrained GloVe vectors into emmbed_dict (word -> float32 vector).
# Each non-empty line is parsed as "<word> <v1> <v2> ...".
emmbed_dict = {}
# Explicit encoding so the vocabulary is decoded the same way on every platform;
# without it open() falls back to the locale's default encoding.
# NOTE(review): assumes the file is UTF-8 encoded - confirm for this custom file.
with open('pre_train_final_300d_100ep_100th.txt', 'r', encoding='utf-8') as f:
  for line in f:
    values = line.split()
    if not values:
      continue  # skip blank lines instead of failing on values[0]
    word = values[0]
    vector = np.asarray(values[1:], 'float32')
    emmbed_dict[word] = vector

In [8]:
# Length of one embedding vector, probed through the entry for the word 'the'.
emmbed_dict['the'].shape[0]

300

# Thực hiện tokenize và xây dựng mô hình

Thực hiện Tokenize đoạn văn bản

In [9]:
vocab_size = 15000

# Fit the tokenizer on the cleaned review texts.
clean_texts = data['clean_text'].values
tokenizer = Tokenizer(num_words=vocab_size, split=' ')
tokenizer.fit_on_texts(clean_texts)
print()
# Turn every text into a sequence of word indices, then pad all sequences to
# one common length.
sequences = tokenizer.texts_to_sequences(clean_texts)
X = pad_sequences(sequences)
MAX_LEN = X.shape[1]  # padded sequence length, reused by the model and at inference
print(MAX_LEN)
words_to_index = tokenizer.word_index





1369


In [10]:
# Select only top 15000 words in word index
from itertools import islice

def take(n, iterable):
    """Collect at most ``n`` leading items of ``iterable`` into a list.

    Args:
        n: maximum number of items to take.
        iterable: any iterable; it is consumed only as far as needed.

    Returns:
        A list with the first ``n`` items, or fewer when ``iterable`` is shorter.
    """
    head = islice(iterable, n)
    return [item for item in head]

# Keep only the first vocab_size (word, index) pairs of the tokenizer's word index.
# NOTE(review): this relies on word_index being ordered from most to least
# frequent word - confirm against the Tokenizer implementation in use.
leading_entries = take(vocab_size, words_to_index.items())
top_n_words_index = dict(leading_entries)
len(top_n_words_index)

15000

Khởi tạo model LSTM

In [11]:
# Embedding input size: one row per word index, plus one extra row for index 0.
inp_dim = 1 + vocab_size
# Width of the embedding vectors, read from the vector stored for 'mobile'.
probe_vector = emmbed_dict['mobile']
embed_dim = probe_vector.shape[0]
hidden_nodes = 150  # dimension of the LSTM output
output_dim = 2      # number of units in the final softmax layer

print(vocab_size, inp_dim, sep='\n')

15000
15001


In [12]:
# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer index is i.
emb_matrix = np.zeros((inp_dim, embed_dim))
for token, row in top_n_words_index.items():
  pretrained = emmbed_dict.get(token)
  if pretrained is None:
    continue  # no GloVe vector for this word: its row stays all zeros
  emb_matrix[row] = pretrained

In [13]:
# Sequential model: frozen GloVe embedding -> spatial dropout -> LSTM -> softmax.
model = Sequential()
# Map each of the MAX_LEN word indices in a row of X to its "embed_dim"-dimensional
# vector from emb_matrix; trainable=False keeps those pretrained weights fixed.
model.add(Embedding(input_dim = inp_dim, output_dim = embed_dim, input_length = MAX_LEN, weights = [emb_matrix], trainable=False))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(hidden_nodes, dropout=0.2, recurrent_dropout=0.2))  # add return_sequences=True if you want many-to-many LSTM model
model.add(Dense(output_dim,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1369, 300)         4500300   
                                                                 
 spatial_dropout1d (SpatialD  (None, 1369, 300)        0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 150)               270600    
                                                                 
 dense (Dense)               (None, 2)                 302       
                                                                 
Total params: 4,771,202
Trainable params: 270,902
Non-trainable params: 4,500,300
_________________________________________________________________
None


# Tạo tập train test và tiến hành huấn luyện mô hình

Tạo tập train và test

In [14]:
# One-hot encode the sentiment labels (one indicator column per distinct label).
sentiment_dummies = pd.get_dummies(data['sentiment'])
Y = sentiment_dummies.values
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
  print(features.shape, labels.shape)

(48000, 1369) (48000, 2)
(12000, 1369) (12000, 2)


In [15]:
# Inspect the one-hot encoded training labels.
Y_train

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [0, 1],
       [0, 1],
       [1, 0]], dtype=uint8)

Thực hiện train network 

In [16]:
batch_size = 32
# Train for 4 epochs on the training split; the returned History object is
# shown as the cell output.
model.fit(x=X_train, y=Y_train, batch_size=batch_size, epochs=4, verbose=1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f4672bb5c50>

Lưu model xuống drive

In [17]:
# Save the trained model to 'LSTM_Model.h5' (relative path, so it lands in the
# current working directory).
model.save('LSTM_Model.h5')

In [18]:
# Persist the train/test splits as comma-separated text files.
from numpy import savetxt

split_files = (
  ('X_train.csv', X_train),
  ('Y_train.csv', Y_train),
  ('X_test.csv', X_test),
  ('Y_test.csv', Y_test),
)
for file_name, split_array in split_files:
  savetxt(file_name, split_array, delimiter=',')

In [19]:
# Persist the fitted tokenizer and the padded sequence length so new text can
# later be preprocessed the same way as the training data.
import pickle

with open('tokenizer.pickle', 'wb') as tokenizer_file:
  pickle.dump(tokenizer, tokenizer_file, pickle.HIGHEST_PROTOCOL)

with open('MAX_LEN.txt', 'w') as max_len_file:
  max_len_file.write('%d' % MAX_LEN)

# Thử nghiệm và đánh giá mô hình

Test sử dụng LSTM

In [20]:
# Class names in the column order of pd.get_dummies(data['sentiment']), which sorts
# the labels. Neutral reviews are removed before training and the model has only
# two outputs, so index 1 is "positive" (the previous mapping printed "neutral"
# for it and had an unreachable branch for index 2).
sentiment_labels = ['negative', 'positive']

test_texts = ["""product bad never buy"""]
# Vectorize the text with the already-fitted tokenizer instance.
test_sequences = tokenizer.texts_to_sequences(test_texts)
# Pad to MAX_LEN so the input has exactly the length the embedding layer expects.
test = pad_sequences(test_sequences, maxlen=MAX_LEN, dtype='int32', value=0)
print(test)
# predict() returns one row of class probabilities per input; take the only row.
sentiment = model.predict(test,batch_size=1,verbose = 2)[0]
print(sentiment_labels[np.argmax(sentiment)])

[[ 0  0  0 ... 62 38 21]]
1/1 - 1s - 882ms/epoch - 882ms/step
negative


Validation và evaluation

In [21]:
# Evaluate on the held-out split; with metrics=['accuracy'] the result is the
# pair (loss, accuracy).
test_metrics = model.evaluate(x=X_test, y=Y_test, batch_size=batch_size, verbose=2)
score, acc = test_metrics
print("score: %.2f" % score)
print("acc: %.2f" % acc)

375/375 - 183s - loss: 0.1500 - accuracy: 0.9434 - 183s/epoch - 487ms/step
score: 0.15
acc: 0.94


Dropout 20% (tham số `dropout` và `recurrent_dropout` của lớp LSTM) được chọn theo bài viết: https://towardsdatascience.com/choosing-the-right-hyperparameters-for-a-simple-lstm-using-keras-f8e9ed76f046


Tham khảo: https://colab.research.google.com/drive/1fAuZ7L8YQZrF-MyRXrdyR-Dh13AfA_X0

https://towardsdatascience.com/sentiment-analysis-using-lstm-and-glove-embeddings-99223a87fe8e