In [None]:
import os

import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from keras.layers import Embedding
from keras.layers import LSTM, GRU, Bidirectional, TimeDistributed
from keras.layers import Dense, Input, Dropout, Concatenate
from keras.models import Model
from keras.models import load_model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils.np_utils import to_categorical
from sklearn.metrics import classification_report

In [None]:
# Load the training split, force `content` to text, and encode the `polarity`
# label as integer category codes in a new `sentiment` column.
clean_train_comments = pd.read_csv(
    "./corpus/prosa/data_clean/data_train_full.csv"
).assign(
    content=lambda frame: frame['content'].astype('str'),
    sentiment=lambda frame: frame['polarity'].astype('category').cat.codes,
)

clean_train_comments.head()

In [None]:
# Load the test split and force `content` to text.
clean_test_comments = pd.read_csv("./corpus/prosa/data_clean/data_testing_full.csv")
clean_test_comments['content'] = clean_test_comments['content'].astype('str')

# Encode test labels against the categories of the TRAINING split. Deriving the
# codes from the test frame alone would number the labels by whatever polarity
# values happen to occur in the test file, so a missing or extra class would
# silently shift the integer <-> label mapping relative to training.
# Polarity values that never occur in training are encoded as -1.
train_polarity_categories = clean_train_comments['polarity'].astype('category').cat.categories
clean_test_comments['sentiment'] = pd.Categorical(
    clean_test_comments['polarity'],
    categories=train_polarity_categories,
).codes

clean_test_comments.head()

In [None]:
# Shape of one encoded document: at most `max_sents` sentences, each truncated
# to `max_sen_len` word ids; `emb_dim` is the width of one embedding vector.
max_sen_len, max_sents, emb_dim = 30, 15, 500

# Accumulators filled by the prepare_data_* helpers:
#   lines_* -> one list of sentences per document
#   texts_* -> one flat string per document
lines_train, texts_train = [], []
lines_test, texts_test = [], []

In [None]:
def _split_document(text):
    """Lower-case ``text`` and return ``(sentences, flat_text)``.

    ``sentences`` is the lower-cased text split on ``'.'``; ``flat_text`` is the
    lower-cased text with every ``'.'`` replaced by a space.
    """
    lowered = text.lower()
    return lowered.split('.'), lowered.replace(".", " ")


def prepare_data_train(text, lines=None, texts=None):
    """Split one training document and record it.

    Args:
        text: Raw document string.
        lines: List receiving the document's sentence list. Defaults to the
            module-level ``lines_train``.
        texts: List receiving the flattened document string. Defaults to the
            module-level ``texts_train``.

    Returns:
        None. The results are appended to ``lines`` and ``texts``.
    """
    sentences, flat_text = _split_document(text)
    # The globals are only looked up when no explicit target list is given.
    (lines_train if lines is None else lines).append(sentences)
    (texts_train if texts is None else texts).append(flat_text)


def prepare_data_test(text, lines=None, texts=None):
    """Split one test document and record it.

    Args:
        text: Raw document string.
        lines: List receiving the document's sentence list. Defaults to the
            module-level ``lines_test``.
        texts: List receiving the flattened document string. Defaults to the
            module-level ``texts_test``.

    Returns:
        None. The results are appended to ``lines`` and ``texts``.
    """
    sentences, flat_text = _split_document(text)
    # The globals are only looked up when no explicit target list is given.
    (lines_test if lines is None else lines).append(sentences)
    (texts_test if texts is None else texts).append(flat_text)

In [None]:
# Empty the accumulators first so re-running this cell does not append every
# document a second time (which would desynchronise them from the labels).
for accumulator in (lines_train, texts_train, lines_test, texts_test):
    del accumulator[:]

# The prepare_* helpers are called for their side effect only, so iterate
# directly instead of building a throw-away Series of None via `.apply`.
for document in clean_train_comments['content']:
    prepare_data_train(document)
for document in clean_test_comments['content']:
    prepare_data_test(document)

labels_train = clean_train_comments['sentiment'].tolist()
labels_test = clean_test_comments['sentiment'].tolist()

# Each document must have exactly one label.
assert len(lines_train) == len(texts_train) == len(labels_train)
assert len(lines_test) == len(texts_test) == len(labels_test)

In [None]:
# Build the vocabulary from the training texts only; `word_index` is the
# tokenizer's word -> integer id mapping used by the encoding and embedding
# cells below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts_train)
word_index = tokenizer.word_index

In [None]:
def encode_documents(documents, vocab, n_sents, n_words):
    """Encode documents as a zero-padded ``(n_docs, n_sents, n_words)`` id array.

    Args:
        documents: Sequence of documents, each a list of sentence strings.
        vocab: Mapping from word to integer id.
        n_sents: Maximum number of sentences kept per document.
        n_words: Maximum number of words kept per sentence.

    Returns:
        An ``int32`` array in which position ``[d, s, w]`` holds the id of the
        w-th word of the s-th sentence of document d. Positions past the end of
        a sentence or document, and words missing from ``vocab``, are 0.
    """
    encoded = np.zeros((len(documents), n_sents, n_words), dtype='int32')
    for doc_idx, sentences in enumerate(documents):
        # Slicing truncates to the configured limits without a manual counter.
        for sent_idx, sent in enumerate(sentences[:n_sents]):
            for word_idx, word in enumerate(text_to_word_sequence(sent)[:n_words]):
                encoded[doc_idx, sent_idx, word_idx] = vocab.get(word, 0)
    return encoded


# Train and test go through the same code path, so out-of-vocabulary handling
# cannot drift between the two splits.
data_train = encode_documents(lines_train, tokenizer.word_index, max_sents, max_sen_len)
data_test = encode_documents(lines_test, tokenizer.word_index, max_sents, max_sen_len)

In [None]:
# Model-facing aliases: x_* are the encoded documents, y_* the integer labels.
x_train, x_test = data_train, data_test
y_train, y_test = labels_train, labels_test

In [None]:
# Load the pre-trained Word2Vec model. Word lookups go through `.wv` (the
# KeyedVectors object): indexing / membership tests directly on the model are
# deprecated in gensim and no longer supported in gensim 4.
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')
keyed_vectors = word2vec.wv

# Start from a random matrix so that row 0 (never assigned below) and words
# without a pre-trained vector keep a random embedding, then overwrite the rows
# of every word Word2Vec knows.
embedding_matrix = np.random.random((len(word_index) + 1, emb_dim))
for word, i in word_index.items():
    if word in keyed_vectors:
        embedding_matrix[i, :] = keyed_vectors[word]

In [None]:
# Frozen embedding layer initialised from `embedding_matrix`
# (trainable=False, so these weights are not updated during training).
embedding_layer = Embedding(len(word_index) + 1,
                            emb_dim,
                            weights=[embedding_matrix],
                            input_length=max_sen_len,
                            trainable=False)

# Sentence encoder: `max_sen_len` word ids -> embeddings -> LSTM(20) output.
sentence_input = Input(shape=(max_sen_len,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = LSTM(20)(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

# Document encoder: the sentence encoder is applied to each of the `max_sents`
# sentences via TimeDistributed, and the resulting sequence is fed to a
# bidirectional GRU(20), a 20-unit ReLU layer, dropout, and a 3-way softmax.
doc_input = Input(shape=(max_sents,max_sen_len), dtype='int32')
doc_encoder = TimeDistributed(sentEncoder)(doc_input)
l_lstm_sent = Bidirectional(GRU(20))(doc_encoder)
dense_1 = Dense(20, activation="relu")(l_lstm_sent)
drop_1 = Dropout(0.5)(dense_1)
# Disabled alternative: apply dropout directly to the GRU output, bypassing dense_1.
# drop_1 = Dropout(0.5)(l_lstm_sent)
preds = Dense(3, activation='softmax')(drop_1)
model = Model(doc_input, preds)

# Targets are expected one-hot encoded (categorical cross-entropy loss).
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.summary()

In [None]:
# Take the one-hot width from the model's output layer. Without an explicit
# `num_classes`, to_categorical infers the width from the largest label present,
# so a split that lacks the highest class would produce targets that do not
# match the softmax layer.
num_classes = model.output_shape[-1]

# NOTE(review): the test split doubles as validation data here, so the reported
# validation metrics are not an independent estimate if they guide model choices.
model.fit(x_train, to_categorical(y_train, num_classes=num_classes),
          epochs=10,
          validation_data=(x_test, to_categorical(y_test, num_classes=num_classes)),
          batch_size=128)

In [None]:
# Create the target directory first so a missing folder does not abort the
# save after the whole training run has completed.
model_path = './model/lstm_bi_gru/lstm_bi_gru_model_08.h5'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

In [None]:
# Reload the saved model from disk and evaluate it on the test split.
model = load_model('./model/lstm_bi_gru/lstm_bi_gru_model_08.h5')

# Predicted class id = index of the highest softmax probability per document.
y_predict = np.argmax(model.predict(x_test, verbose=1), axis=1)

report = classification_report(y_test, y_predict, labels = [0, 1, 2], digits=8)
print(report)