In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
import keras.layers.merge
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import LSTM, Bidirectional
from keras.models import Model
from keras.callbacks import EarlyStopping
import gensim
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import codecs
import matplotlib.pyplot as plt
from gensim.models.word2vec import Word2Vec
from keras.models import load_model
from sklearn.metrics import accuracy_score, classification_report

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [17]:
EMBEDDING_DIM = 500 # width of each word vector (number of columns in the embedding matrix)
MAX_VOCAB_SIZE = 175303 # upper bound on vocabulary size; passed to the Keras Tokenizer as num_words
MAX_SEQUENCE_LENGTH = 80 # fixed length passed to pad_sequences as maxlen for every comment

# Training parameters, forwarded to model.fit further down.
batch_size = 256  
num_epochs = 10 

In [3]:
def sentiment_label(polarity):
    """Translate a polarity value into a binary class id.

    Returns 0 when ``polarity`` equals the string ``'negative'`` and 1 for
    every other value.
    """
    return 0 if polarity == 'negative' else 1

In [5]:
# Load the training reviews and derive the columns used by the rest of the notebook.
tokenizer = RegexpTokenizer(r'\w+')  # splits text into runs of \w characters

clean_train_comments = pd.read_csv("./corpus/tripadvisor/train_set.csv")

# Cast to str so that every row can be handed to the tokenizer.
clean_train_comments['content'] = clean_train_comments['content'].astype('str')
clean_train_comments["tokens"] = clean_train_comments["content"].apply(tokenizer.tokenize)

# Binary target: sentiment_label gives 0 for 'negative' and 1 for anything else.
clean_train_comments['sentiment'] = clean_train_comments['polarity'].apply(sentiment_label)

# (The bare `.dtypes` expression that used to sit mid-cell was never displayed and has been dropped.)
clean_train_comments.head()

Unnamed: 0,content,polarity,tokens,sentiment
0,<number> ruangan itu tidak siap oleh <number>...,negative,"[number, ruangan, itu, tidak, siap, oleh, numb...",0
1,ada tidak ada pantai dan <number> menit berjal...,negative,"[ada, tidak, ada, pantai, dan, number, menit, ...",0
2,ada tidak ada restoran karena dalam perbaikan ...,negative,"[ada, tidak, ada, restoran, karena, dalam, per...",0
3,ada tidak ada restoran karena dalam perbaikan ...,negative,"[ada, tidak, ada, restoran, karena, dalam, per...",0
4,ada yang bilang harga tidak boong atau ada har...,negative,"[ada, yang, bilang, harga, tidak, boong, atau,...",0


In [6]:
# Load the test reviews and apply the same preprocessing as for the training split.
clean_test_comments = pd.read_csv("./corpus/tripadvisor/test_set.csv")

# Cast to str so that every row can be handed to the tokenizer.
clean_test_comments['content'] = clean_test_comments['content'].astype('str')
# `tokenizer` is the RegexpTokenizer created in the training-data cell.
clean_test_comments["tokens"] = clean_test_comments["content"].apply(tokenizer.tokenize)

# Binary target: sentiment_label gives 0 for 'negative' and 1 for anything else.
clean_test_comments['sentiment'] = clean_test_comments['polarity'].apply(sentiment_label)

# (The bare `.dtypes` expression that used to sit mid-cell was never displayed and has been dropped.)
clean_test_comments.head()

Unnamed: 0,content,polarity,tokens,sentiment
0,kekecewaan untuk ritz standar menginap <number...,negative,"[kekecewaan, untuk, ritz, standar, menginap, n...",0
1,kekecewaan untuk ritz standar menginap <number...,negative,"[kekecewaan, untuk, ritz, standar, menginap, n...",0
2,kekurangan <number> tidak ada fasilitas apapun...,negative,"[kekurangan, number, tidak, ada, fasilitas, ap...",0
3,kelebihan * lokasi strategis * breakfast stand...,negative,"[kelebihan, lokasi, strategis, breakfast, stan...",0
4,kelebihan + kamar luas dan ada balkon di setia...,negative,"[kelebihan, kamar, luas, dan, ada, balkon, di,...",0


In [7]:
# Corpus statistics for the training split: flat token list, per-comment lengths, sorted vocabulary.
all_training_words = [token for row in clean_train_comments["tokens"] for token in row]
training_sentence_lengths = list(map(len, clean_train_comments["tokens"]))
TRAINING_VOCAB = sorted(set(all_training_words))

print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

593489 words total, with a vocabulary size of 19911
Max sentence length is 79


In [8]:
# Corpus statistics for the test split: flat token list, per-comment lengths, sorted vocabulary.
all_test_words = [token for row in clean_test_comments["tokens"] for token in row]
test_sentence_lengths = list(map(len, clean_test_comments["tokens"]))
TEST_VOCAB = sorted(set(all_test_words))

print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

113774 words total, with a vocabulary size of 7737
Max sentence length is 71


In [10]:
# Load the pre-trained word2vec model from disk. Its vectors are later copied row by row
# into an embedding matrix of width EMBEDDING_DIM, so the two dimensions have to agree.
# NOTE(review): presumably gensim's load() unpickles the file - only load models from a
# trusted source; confirm against the gensim documentation.
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')
# Alternative model, kept for reference:
# word2vec = Word2Vec.load("./vectorizer/tripadvisor/word2vec_300.model")

In [8]:
# def get_average_word2vec(tokens_list, vector, generate_missing=False, k=100):
#     if len(tokens_list)<1:
#         return np.zeros(k)
#     if generate_missing:
#         vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
#     else:
#         vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
#     length = len(vectorized)
#     summed = np.sum(vectorized, axis=0)
#     averaged = np.divide(summed, length)
#     return averaged

# def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
#     embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
#                                                                                 generate_missing=generate_missing))
#     return list(embeddings)

In [78]:
# training_embeddings = get_word2vec_embeddings(word2vec, clean_train_comments, generate_missing=True)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


In [11]:
# Fit the Keras tokenizer on the TRAINING texts only; the resulting word -> index mapping
# is what the test split is encoded with afterwards.
# NOTE: this rebinds `tokenizer`, which the data-loading cells bound to the NLTK
# RegexpTokenizer. Re-running those cells after this one will fail; use Restart & Run All.
train_texts = clean_train_comments["content"].tolist()
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Look words up on the model's KeyedVectors (`.wv`). Indexing the Word2Vec object itself
# (`word2vec[word]`, `word in word2vec`) is deprecated and is rejected by current gensim.
word_vectors = word2vec.wv

# One row per word index plus one extra row for index 0, which the loop below never
# assigns (assuming word_index is 1-based, as in Keras) and therefore stays all-zero.
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word_vectors:
        train_embedding_weights[index, :] = word_vectors[word]
    else:
        # Out-of-vocabulary word: random vector drawn uniformly from [0, 1).
        train_embedding_weights[index, :] = np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

Found 19908 unique tokens.


  
  


(19909, 500)


In [12]:
# Encode the test texts with the tokenizer fitted on the training texts,
# then bring every sequence to the same fixed length as the training data.
test_sequences = tokenizer.texts_to_sequences(
    clean_test_comments["content"].tolist()
)
test_cnn_data = pad_sequences(
    test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [13]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build, compile and summarise a 1D-CNN text classifier over a pre-initialised embedding.

    Parameters
    ----------
    embeddings : array-like, expected shape (num_words, embedding_dim)
        Initial weights handed to the Embedding layer.
    max_sequence_length : int
        Length of every (padded) input sequence of word indices.
    num_words : int
        Number of rows of the Embedding layer.
    embedding_dim : int
        Width of each embedding vector.
    labels_index : int
        Number of units in the final sigmoid output layer.
    trainable : bool, default False
        Whether the embedding weights are updated during training.
    extra_conv : bool, default True
        If truthy, use three parallel Conv1D branches (kernel sizes 3, 4 and 5),
        each followed by max pooling and concatenated along the time axis - the
        multi-filter-size design in the spirit of Yoon Kim
        (https://arxiv.org/abs/1408.5882).
        Otherwise use a single Conv1D (kernel size 3) followed by max pooling.

    Returns
    -------
    keras.models.Model
        Model compiled with binary cross-entropy loss, the Adam optimizer and
        the accuracy metric. ``model.summary()`` is printed as a side effect.
    """
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Only the branch that actually feeds the classifier head is built; the
    # unused alternative is no longer instantiated and then thrown away.
    if extra_conv:
        # Multi-filter-size branches, pooled and concatenated along the time axis.
        pooled_branches = []
        for filter_size in (3, 4, 5):
            l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            l_pool = MaxPooling1D(pool_size=3)(l_conv)
            pooled_branches.append(l_pool)
        features = Concatenate(axis=1)(pooled_branches)
    else:
        # Single convolution + (windowed, not global) max pooling.
        conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    # Classifier head, shared by both variants.
    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [14]:
# Binary targets as arrays, in the same row order as the padded sequences
# built from the same data frames.
y_tr = clean_train_comments["sentiment"].values
y_ts = clean_test_comments["sentiment"].values

In [15]:
# Conventional (x, y) aliases for the padded sequences and their labels.
x_train, y_train = train_cnn_data, y_tr
x_test, y_test = test_cnn_data, y_ts

In [16]:
# Single sigmoid output unit, frozen embedding; extra_conv keeps its default.
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index)+1,
    embedding_dim=EMBEDDING_DIM,
    labels_index=1,
    trainable=False,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 80)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 80, 500)      9954500     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 78, 128)      192128      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 77, 128)      256128      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [16]:
#define callbacks
# early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.1, patience=4, verbose=1)
# callbacks_list = [early_stopping]

In [18]:
# NOTE(review): the test split doubles as validation data here, so the val_* curves are
# not independent of the final test evaluation performed on the same x_test / y_test.
hist = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
)  # callbacks=callbacks_list

Train on 12389 samples, validate on 2429 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [87]:
# model.save('./model/yoon_kim/cnn_model_09.h5')  

In [19]:
# model = load_model('./model/yoon_kim/cnn_model_09.h5')
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Keep the raw sigmoid outputs in `y_proba` and derive hard labels separately instead of
# overwriting the probabilities element by element. Thresholding at > 0.5 matches what
# round() gave for values in [0, 1] (round-half-to-even sends exactly 0.5 to 0).
y_proba = model.predict(x_test, batch_size=batch_size, verbose=1)
# Same shape and dtype as the model output, so `y_predict` holds what it held before.
y_predict = (y_proba > 0.5).astype(y_proba.dtype)
print(classification_report(y_test, y_predict, labels = [0, 1], digits=8))

Test loss: 0.7350773357373297
Test accuracy: 0.8130918072054247
             precision    recall  f1-score   support

          0  0.83359498 0.81441718 0.82389449      1304
          1  0.79047619 0.81155556 0.80087719      1125

avg / total  0.81362436 0.81309181 0.81323395      2429



In [None]:
#generate plots
# plt.figure()
# plt.plot(hist.history['loss'], lw=2.0, color='b', label='train')
# plt.plot(hist.history['val_loss'], lw=2.0, color='r', label='val')
# plt.title('CNN sentiment')
# plt.xlabel('Epochs')
# plt.ylabel('Cross-Entropy Loss')
# plt.legend(loc='upper right')
# plt.show()

In [None]:
# plt.figure()
# plt.plot(hist.history['acc'], lw=2.0, color='b', label='train')
# plt.plot(hist.history['val_acc'], lw=2.0, color='r', label='val')
# plt.title('CNN sentiment')
# plt.xlabel('Epochs')
# plt.ylabel('Accuracy')
# plt.legend(loc='upper left')
# plt.show()