In [None]:
import gensim
import pandas as pd
import pickle
from gensim.models import Doc2Vec
from gensim.models.word2vec import Word2Vec
import numpy as np
from keras import optimizers
from keras.models import load_model
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional

In [None]:
# Read the cleaned training corpus and derive, in order:
#   content   - the original column coerced to str
#   tokens    - whitespace-split content
#   sentiment - integer category codes of the `polarity` column
clean_train_comments = (
    pd.read_csv("./corpus/prosa/data_clean_punctuation/data_train_full.csv")
    .assign(content=lambda frame: frame['content'].astype('str'))
    .assign(tokens=lambda frame: frame['content'].str.split())
    .assign(sentiment=lambda frame: frame['polarity'].astype('category').cat.codes)
)

clean_train_comments.head()

In [None]:
# Read the cleaned test corpus; coerce content to str and split it on whitespace.
clean_test_comments = pd.read_csv("./corpus/prosa/data_clean_punctuation/data_testing_full.csv")
clean_test_comments['content'] = clean_test_comments['content'].astype('str')
clean_test_comments["tokens"] = clean_test_comments["content"].str.split()

# Encode the labels with the categories observed in the TRAINING polarity column.
# Deriving the codes from the test column alone assigns integers by whatever labels
# happen to be present in the test file, so the same polarity could map to a
# different integer than in training if the label sets differ.
# A test label that never occurs in training is encoded as -1.
train_label_categories = clean_train_comments['polarity'].astype('category').cat.categories
clean_test_comments['sentiment'] = pd.Categorical(
    clean_test_comments['polarity'], categories=train_label_categories
).codes

clean_test_comments.head()

In [None]:
# Corpus statistics for the training split: every token occurrence, the token
# count of each comment, and the sorted set of distinct tokens.
all_training_words = [
    token
    for sentence in clean_train_comments["tokens"]
    for token in sentence
]
training_sentence_lengths = list(map(len, clean_train_comments["tokens"]))
TRAINING_VOCAB = sorted(set(all_training_words))
print(
    "%s words total, with a vocabulary size of %s"
    % (len(all_training_words), len(TRAINING_VOCAB))
)
print("Max sentence length is %s" % max(training_sentence_lengths))

In [None]:
# Corpus statistics for the test split: every token occurrence, the token
# count of each comment, and the sorted set of distinct tokens.
all_test_words = [
    token
    for sentence in clean_test_comments["tokens"]
    for token in sentence
]
test_sentence_lengths = list(map(len, clean_test_comments["tokens"]))
TEST_VOCAB = sorted(set(all_test_words))
print(
    "%s words total, with a vocabulary size of %s"
    % (len(all_test_words), len(TEST_VOCAB))
)
print("Max sentence length is %s" % max(test_sentence_lengths))

In [None]:
# Number of rows in each split.
n_text_train = len(clean_train_comments)
n_text_test = len(clean_test_comments)

# Number of rows (time steps) in every per-document feature matrix.
max_sequences = 95

# Widths of the word-vector part and the document-vector part of each row;
# data_dim is the full row width (500 + 200 = 700).
word_size = 500
doc_size = 200
data_dim = word_size + doc_size

In [None]:
# Load the pre-trained embedding models and the pickled `tfidf` object
# (indexed by word in build_doc_Vector).
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')

# Use a context manager so the file handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./vectorizer/prosa/tfidf.pickle', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

model_dbow = Doc2Vec.load('./vectorizer/prosa/model_dbow.model')
model_dmc = Doc2Vec.load('./vectorizer/prosa/model_dmc.model')
model_dmm = Doc2Vec.load('./vectorizer/prosa/model_dmm.model')

def build_doc_Vector(tokens, size):
    """Return the mean weighted document vector for ``tokens``.

    For every token, the ``model_dbow`` and ``model_dmm`` vectors are each
    multiplied by ``tfidf[token]`` and joined end to end. Tokens for which any
    of these lookups raises ``KeyError`` are skipped. The joined vectors are
    summed and divided by the number of tokens that contributed.

    Parameters:
        tokens: iterable of tokens.
        size: width of the result; expected to match the combined length of
            the two joined vectors.

    Returns:
        numpy.ndarray of shape ``(1, size)``; all zeros when no token
        contributed.
    """
    total = np.zeros((1, size))
    matched = 0.
    for token in tokens:
        try:
            weighted = np.append(
                model_dbow[token] * tfidf[token],
                model_dmm[token] * tfidf[token],
            )
        except KeyError:
            # Token unknown to one of the lookups: it does not contribute.
            continue
        total += weighted
        matched += 1
    if matched:
        total /= matched
    return total

def build_Vector(tokens, word_size, doc_size):
    """Build the per-token feature matrix for one document.

    Every token row is the document vector from ``build_doc_Vector`` followed
    by the token's ``word2vec`` vector. Rows are right-aligned: the leading
    ``max_sequences - len(tokens)`` rows are all-zero padding. A token that
    raises ``KeyError`` in ``word2vec`` keeps the document part and gets a
    zero word part.

    Changes from the previous implementation:
      * The result really is 2-D. The old code discarded the value of
        ``vec.reshape(...)`` and returned a flat array; callers that reshape
        to ``(max_sequences, doc_size + word_size)`` get the same values.
      * Documents longer than ``max_sequences`` are truncated to their first
        ``max_sequences`` tokens instead of raising on a negative padding
        size. The document vector is still computed from all tokens.
      * Rows are written into a preallocated array rather than grown with
        repeated ``np.append`` calls.

    Parameters:
        tokens: list of tokens for one document.
        word_size: length of a ``word2vec`` vector.
        doc_size: length of the document vector.

    Returns:
        numpy.ndarray of shape ``(max_sequences, doc_size + word_size)``.
    """
    # build_doc_Vector returns shape (1, doc_size); flatten it to one row.
    doc_vec = build_doc_Vector(tokens, doc_size).reshape(doc_size)

    kept_tokens = tokens[:max_sequences]
    first_row = max_sequences - len(kept_tokens)

    vec = np.zeros((max_sequences, doc_size + word_size))
    # Every token row starts with the same document vector.
    vec[first_row:, :doc_size] = doc_vec
    for row, word in enumerate(kept_tokens, start=first_row):
        try:
            vec[row, doc_size:] = word2vec[word]
        except KeyError:
            # Out-of-vocabulary token: leave the word part as zeros.
            pass
    return vec

In [None]:
# Zero-initialised float32 feature tensors, one (max_sequences, data_dim)
# matrix per document in each split.
data_train = np.zeros(
    shape=(n_text_train, max_sequences, data_dim),
    dtype='float32',
)
data_test = np.zeros(
    shape=(n_text_test, max_sequences, data_dim),
    dtype='float32',
)

In [None]:
# Next free row index in data_train / data_test; advanced by
# prepare_data_train / prepare_data_test.
n_train = n_test = 0

def prepare_data_train(tokens):
    """Store the feature matrix for ``tokens`` in the next row of ``data_train``.

    The module-level counter ``n_train`` is the row index to write and is
    advanced by one after the write.
    """
    global n_train
    features = build_Vector(tokens, word_size, doc_size)
    data_train[n_train] = features.reshape((max_sequences, data_dim))
    n_train = n_train + 1
    
def prepare_data_test(tokens):
    """Store the feature matrix for ``tokens`` in the next row of ``data_test``.

    The module-level counter ``n_test`` is the row index to write and is
    advanced by one after the write.
    """
    global n_test
    features = build_Vector(tokens, word_size, doc_size)
    data_test[n_test] = features.reshape((max_sequences, data_dim))
    n_test = n_test + 1

In [None]:
# Reset the row counters so this cell can be re-run safely: prepare_data_train
# and prepare_data_test write to data_train[n_train] / data_test[n_test] and
# increment the counter, so a second run without a reset would index past the
# end of the arrays.
n_train = 0
n_test = 0

clean_train_comments['tokens'].apply(prepare_data_train)
clean_test_comments['tokens'].apply(prepare_data_test)

# Every row of both tensors must have been written exactly once.
assert n_train == n_text_train, "data_train was not completely filled"
assert n_test == n_text_test, "data_test was not completely filled"

data_train[0]

In [None]:
# Drop the references to the embedding models and the tfidf object now that
# the feature tensors have been built.
del word2vec, tfidf, model_dbow, model_dmc, model_dmm

In [None]:
# Integer sentiment labels as column vectors of shape (n_rows, 1).
train_labels = clean_train_comments['sentiment'].values
y_train = train_labels.reshape(n_text_train, 1)
test_labels = clean_test_comments['sentiment'].values
y_test = test_labels.reshape(n_text_test, 1)
del train_labels, test_labels

In [None]:
# Training hyperparameters. timesteps mirrors max_sequences, the number of
# rows in each per-document feature matrix.
batch_size, num_epochs, hidden_size = 256, 10, 10
timesteps = max_sequences

In [None]:
# Bidirectional LSTM -> dropout -> 3-way softmax classifier.
model = Sequential()
# input_shape belongs on the first layer of the Sequential model, which is the
# Bidirectional wrapper rather than the LSTM it wraps. Declaring it here lets
# the model be built immediately (e.g. model.summary() works before fit).
model.add(
    Bidirectional(
        LSTM(hidden_size),
        merge_mode='concat',
        input_shape=(timesteps, data_dim),
    )
)
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model.
# - batch_size is passed explicitly; it was defined with the other
#   hyperparameters but never used, so Keras fell back to its default.
# - num_classes=3 pins the one-hot width to the Dense(3) output, so a split
#   that happens to miss the highest class still yields 3 columns.
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics are not an independent estimate.
model.fit(
    data_train,
    to_categorical(y_train, num_classes=3),
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(data_test, to_categorical(y_test, num_classes=3)),
)

In [None]:
# Persist the trained model to an HDF5 file.
model.save(filepath='./model/bi_lstm_3_pv/bi_lstm_pv_model_02.h5')

In [None]:
# Reload the saved model, take the arg-max class of each prediction on the
# test tensor, and print a per-class report for labels 0, 1 and 2.
model = load_model('./model/bi_lstm_3_pv/bi_lstm_pv_model_02.h5')
y_pred = np.argmax(model.predict(data_test), axis=1)
report = classification_report(y_test, y_pred, labels=[0, 1, 2], digits=8)
print(report)
del report