In [None]:
import os
import sys
import gensim
import pandas as pd
import pickle
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn import utils
import numpy as np
from keras import optimizers
from keras.models import load_model
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, GRU, Bidirectional

In [None]:
# Load the review corpus from ./corpus/tripadvisor/test.csv. This is the only
# frame the cells below read from; loading a separate training CSV is disabled.
# train_set = pd.read_csv('./corpus/tripadvisor/train_set.csv')
test_set = pd.read_csv('./corpus/tripadvisor/test.csv')
# Preview the first rows of the loaded frame.
test_set.head()

In [None]:
def sentiment_label(polarity):
    """Map a polarity value to a binary class id.

    Args:
        polarity: value compared against the string ``'negative'``.

    Returns:
        0 when ``polarity`` equals ``'negative'``, 1 for any other value.
    """
    return 0 if polarity == 'negative' else 1

In [None]:
# Add a binary 'sentiment' column derived from 'polarity' via sentiment_label
# (0 for 'negative', 1 for every other value). The train_set variant is disabled.
# train_set['sentiment'] = train_set['polarity'].apply(sentiment_label)
test_set['sentiment'] = test_set['polarity'].apply(sentiment_label)
# Preview the frame with the new column.
test_set.head()

In [None]:
# Fixed seed so the random split below is reproducible.
SEED = 2000

# Hold out 10% of the rows of test_set for validation; the remaining 90% are
# used for training. Inputs are the 'content' texts, targets the 'sentiment' ids.
x_train, x_validation, y_train, y_validation = train_test_split(
    test_set['content'],
    test_set['sentiment'],
    test_size=0.1,
    random_state=SEED,
)

In [None]:
# x_train = train_set['content']
# x_validation = test_set['content']
# y_train = train_set['sentiment']
# y_validation = test_set['sentiment']

In [None]:
def labelize_text(text, label):
    """Wrap every document in ``text`` in a tagged ``LabeledSentence``.

    Args:
        text: iterable of strings that also exposes an ``index`` attribute
            (iterated in parallel with the values).
        label: string prefix for the tags.

    Returns:
        A list with one ``LabeledSentence`` per document. Its words are the
        whitespace-split document and its single tag is ``'<label>_<index>'``.
    """
    return [
        LabeledSentence(document.split(), [label + '_%s' % idx])
        for idx, document in zip(text.index, text)
    ]
  
# Replace the raw texts with lists of LabeledSentence objects tagged
# 'TRAIN_<index>' / 'TEST_<index>'.
# NOTE(review): x_train / x_validation are rebound to lists here, so re-running
# this cell without re-running the split cell passes a list (which has no
# .index) to labelize_text.
x_train = labelize_text(x_train, 'TRAIN')
x_validation = labelize_text(x_validation, 'TEST')

In [None]:
# Number of rows (timesteps) every document is padded to in build_Vector.
MAX_SEQUENCE_LENGTH = 85
# Feature width of one timestep; used when reshaping the vectors and as the
# GRU input width. It has to equal doc_size + word_size passed to build_Vector.
data_dim = 700

In [None]:
# Pre-trained artefacts read by build_doc_Vector / build_Vector.
# Alternative word2vec model (disabled):
# word2vec = Word2Vec.load("./vectorizer/tripadvisor/word2vec_300.model")
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')

# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles from a trusted source.
# The context manager closes the file handle, which a bare open() would leak.
with open('./vectorizer/tripadvisor/tfidf.pickle', 'rb') as tfidf_file:
    tfidf = pickle.load(tfidf_file)

model_dbow = Doc2Vec.load("./vectorizer/tripadvisor/model_dbow.model")
# NOTE(review): model_dmc is loaded but not referenced by the visible cells.
model_dmc = Doc2Vec.load("./vectorizer/tripadvisor/model_dmc.model")
model_dmm = Doc2Vec.load("./vectorizer/tripadvisor/model_dmm.model")

def build_doc_Vector(tokens, size):
    """Average the tf-idf weighted DBOW+DMM vectors of the known tokens.

    For every token the ``model_dbow`` and ``model_dmm`` vectors are each
    multiplied by ``tfidf[token]`` and joined end to end. Tokens for which any
    of these lookups raises ``KeyError`` are skipped.

    Args:
        tokens: iterable of tokens to look up.
        size: length of the joined vector (DBOW length + DMM length).

    Returns:
        Array of shape ``(1, size)`` holding the mean of the joined vectors,
        or all zeros when no token could be looked up.
    """
    total = np.zeros((1, size))
    matched = 0.
    for token in tokens:
        try:
            dbow_part = model_dbow[token] * tfidf[token]
            dmm_part = model_dmm[token] * tfidf[token]
        except KeyError:
            # Token unknown to one of the models or to the tf-idf table.
            continue
        total += np.concatenate((np.ravel(dbow_part), np.ravel(dmm_part)))
        matched += 1
    if matched != 0:
        total /= matched
    return total

def build_Vector(tokens, word_size, doc_size):
    """Build the padded per-timestep feature matrix for one document.

    Each token becomes one row: the document vector from ``build_doc_Vector``
    followed by the token's ``word2vec`` vector (zeros when the token is not
    in ``word2vec``). Rows are right-aligned; leading rows are zero padding.

    Compared with the previous version this
    * returns the 2-D matrix (the old ``vec.reshape(...)`` result was
      discarded, so a flat array was returned),
    * truncates documents longer than ``MAX_SEQUENCE_LENGTH`` instead of
      failing on a negative padding size, and
    * fills a preallocated array instead of growing it with ``np.append``.

    Args:
        tokens: sequence of tokens of one document.
        word_size: length of a ``word2vec`` vector.
        doc_size: length of the document vector.

    Returns:
        Array of shape ``(MAX_SEQUENCE_LENGTH, doc_size + word_size)``.
    """
    # The document vector is computed from the full document, before truncation.
    doc_vec = np.ravel(build_doc_Vector(tokens, doc_size))

    # Keep only the last MAX_SEQUENCE_LENGTH tokens so padding never goes negative.
    tokens = list(tokens)
    tokens = tokens[max(0, len(tokens) - MAX_SEQUENCE_LENGTH):]

    vec = np.zeros((MAX_SEQUENCE_LENGTH, doc_size + word_size))
    first_row = MAX_SEQUENCE_LENGTH - len(tokens)
    for row, word in enumerate(tokens, start=first_row):
        vec[row, :doc_size] = doc_vec
        try:
            vec[row, doc_size:] = word2vec[word]
        except KeyError:
            # Unknown word: its word-vector slice stays zero.
            pass
    return vec

In [None]:
# Vector widths handed to build_Vector; together they must equal data_dim,
# because the vectors are later reshaped to (n, timesteps, data_dim).
WORD_VEC_SIZE = 500
DOC_VEC_SIZE = 200
assert WORD_VEC_SIZE + DOC_VEC_SIZE == data_dim, "word + doc vector sizes must add up to data_dim"


def vectorize_documents(documents):
    """Stack the build_Vector output of every document along a new first axis.

    Args:
        documents: sequence of objects exposing the token list as ``.words``.

    Returns:
        Array whose first axis indexes the documents.
    """
    # Iterating the sequence directly (not a map object) lets tqdm show a total.
    return np.concatenate(
        [[build_Vector(doc.words, WORD_VEC_SIZE, DOC_VEC_SIZE)] for doc in tqdm(documents)]
    )


train_vecs = vectorize_documents(x_train)
val_vecs = vectorize_documents(x_validation)

In [None]:
# Training hyperparameters.
batch_size = 56  # intended mini-batch size
num_epochs = 10  # passed to model.fit as epochs
hidden_size = 10  # number of units of the GRU layer
timesteps = MAX_SEQUENCE_LENGTH  # sequence length of every input sample
num_class = 1  # width of the target arrays after reshaping

In [None]:
num_data = len(train_vecs)
num_data_val = len(val_vecs)

# Inputs: (samples, timesteps, data_dim), the layout the recurrent layer consumes.
train_vecs = train_vecs.reshape((num_data, timesteps, data_dim))
val_vecs = val_vecs.reshape((num_data_val, timesteps, data_dim))

# Targets: (samples, num_class). The targets come out of train_test_split as
# pandas Series, and Series.reshape is deprecated/removed in pandas, so convert
# to a NumPy array first. np.asarray also accepts an array, which keeps this
# cell safe to re-run.
y_train = np.asarray(y_train).reshape((num_data, num_class))
y_validation = np.asarray(y_validation).reshape((num_data_val, num_class))

In [None]:
# Bidirectional GRU binary classifier over (timesteps, data_dim) sequences.
model = Sequential()
# input_shape is given to the Bidirectional wrapper, which is the first layer
# of the Sequential model, rather than to the wrapped GRU.
model.add(Bidirectional(GRU(hidden_size), merge_mode='concat', input_shape=(timesteps, data_dim)))
model.add(Dropout(0.5))
# One sigmoid unit per target column: probability of class 1.
model.add(Dense(num_class, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# batch_size was declared with the other hyperparameters but never passed on,
# so training silently used the Keras default. validation_data is a tuple, as
# documented for Model.fit.
model.fit(
    train_vecs,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(val_vecs, y_validation),
)

In [None]:
# model.save('./model/bi_gru_pv/bi_gru_model_01.h5')  

In [None]:
# model = load_model('./model/bi_gru_pv/bi_gru_model_01.h5')
# Keep the raw sigmoid outputs and derive hard labels from them with one
# vectorised threshold, instead of overwriting the predictions element by
# element in a Python loop. A value of exactly 0.5 maps to 0, as it did with
# round().
y_prob = model.predict(val_vecs)
y_pred = (y_prob > 0.5).astype(int).ravel()
# Flatten the targets as well so both label arrays are 1-D for the metrics.
y_true = np.asarray(y_validation).ravel()

print("Accuracy: ", accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred, labels=[0, 1], digits=8))