In [1]:
import os
import sys
import gensim
import pandas as pd
from gensim.models.doc2vec import LabeledSentence
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
from tqdm import tqdm
from sklearn import utils
import numpy as np
from keras import optimizers
from keras.models import load_model
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, GRU, Bidirectional

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# train_set = pd.read_csv('./corpus/tripadvisor/train_set.csv')
test_set = pd.read_csv('./corpus/tripadvisor/test.csv')
test_set.head()

Unnamed: 0,content,polarity
0,lokasi hotel tidak jauh dari komplek mall kali...,negative
1,lokasi hotel tidak jauh dengan taksi ke area m...,negative
2,lokasi hotel yang sangat strategis masih di ko...,negative
3,lokasi hotel yang sangat strategis masih di ko...,negative
4,lokasi hotel yang strategis di tengah kota dan...,negative


In [3]:
def sentiment_label(polarity):
    if polarity=='negative':
        return 0
    else:
        return 1

In [4]:
# train_set['sentiment'] = train_set['polarity'].apply(sentiment_label)
test_set['sentiment'] = test_set['polarity'].apply(sentiment_label)
test_set.head()

Unnamed: 0,content,polarity,sentiment
0,lokasi hotel tidak jauh dari komplek mall kali...,negative,0
1,lokasi hotel tidak jauh dengan taksi ke area m...,negative,0
2,lokasi hotel yang sangat strategis masih di ko...,negative,0
3,lokasi hotel yang sangat strategis masih di ko...,negative,0
4,lokasi hotel yang strategis di tengah kota dan...,negative,0


In [5]:
SEED = 2000

x_train, x_validation, y_train, y_validation = train_test_split(test_set['content'], test_set['sentiment'], test_size=.1, random_state=SEED)

In [6]:
# x_train = train_set['content']
# x_validation = test_set['content']
# y_train = train_set['sentiment']
# y_validation = test_set['sentiment']

In [6]:
def labelize_text(text,label):
    result = []
    prefix = label
    for i, t in zip(text.index, text):
        result.append(LabeledSentence(t.split(), [prefix + '_%s' % i]))
    return result
  
x_train = labelize_text(x_train, 'TRAIN')
x_validation = labelize_text(x_validation, 'TEST')

  """


In [7]:
MAX_SEQUENCE_LENGTH = 85
data_dim = 500

In [8]:
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')
# word2vec = Word2Vec.load("./vectorizer/tripadvisor/word2vec_300.model")

In [9]:
def build_Word_Vector(tokens, size):
    vec = np.zeros((MAX_SEQUENCE_LENGTH - len(tokens), size))
    for word in tokens:
        try:
            vec = np.append(vec, word2vec[word])
        except KeyError: 
            vec = np.append(vec, np.zeros((1, size)))
            continue
    vec.reshape(MAX_SEQUENCE_LENGTH, size)
    return vec

In [10]:
train_vecs = np.concatenate([[build_Word_Vector(z, 500)] for z in tqdm(map(lambda x: x.words, x_train))])
val_vecs = np.concatenate([[build_Word_Vector(z, 500)] for z in tqdm(map(lambda x: x.words, x_validation))])

  """
746it [00:01, 432.52it/s]
83it [00:00, 413.39it/s]


In [11]:
batch_size = 56
num_epochs = 10
hidden_size = 10
timesteps = MAX_SEQUENCE_LENGTH
num_class = 1

In [12]:
num_data = len(train_vecs)
num_data_val = len(val_vecs)

train_vecs = train_vecs.reshape((num_data, timesteps, data_dim))
y_train = y_train.reshape((num_data, num_class))
val_vecs = val_vecs.reshape((num_data_val, timesteps, data_dim))
y_validation = y_validation.reshape((num_data_val, num_class))

  """
  import sys


In [13]:
model = Sequential()
model.add(Bidirectional(GRU(hidden_size, input_shape=(timesteps, data_dim)), merge_mode='concat'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(train_vecs, y_train, epochs=num_epochs, validation_data=[val_vecs, y_validation])

Train on 746 samples, validate on 83 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


Epoch 10/10


<keras.callbacks.History at 0x253aaef6c50>

In [None]:
# model.save('./model/bi_gru/bi_gru_model_01.h5')  

In [14]:
# model = load_model('./model/bi_gru/bi_gru_model_01.h5')
y_pred = model.predict(val_vecs)
for i in range(len(y_pred)):
    y_pred[i][0] = round(y_pred[i][0])

print("Accuracy: ", accuracy_score(y_validation, y_pred))
print(classification_report(y_validation, y_pred, labels = [0, 1], digits=8))

Accuracy:  0.8192771084337349
             precision    recall  f1-score   support

          0  0.91666667 0.80000000 0.85436893        55
          1  0.68571429 0.85714286 0.76190476        28

avg / total  0.83875502 0.81927711 0.82317620        83

