In [None]:
import codecs
import pickle
import re

import gensim
import keras
import keras.layers.merge
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models import KeyedVectors
from gensim.models.doc2vec import LabeledSentence
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Dense, Input, Flatten, Dropout, Concatenate
from keras.layers import LSTM, Bidirectional
from keras.models import Model
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Width of each per-token feature row fed to the network. The vectorisation
# cell builds rows from a 500-wide and a 200-wide part, which sum to this value.
EMBEDDING_DIM = 700 
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
MAX_VOCAB_SIZE = 19911 
# Number of token rows per document (first axis of each per-document feature matrix).
MAX_SEQUENCE_LENGTH = 80 

# Training parameters passed to model.fit.
batch_size = 256  
num_epochs = 20 

In [None]:
def sentiment_label(polarity):
    """Map a polarity value to a binary class id.

    Returns 0 when ``polarity`` equals ``'negative'`` and 1 for every other value.
    """
    return 0 if polarity == 'negative' else 1

In [None]:
# Read the labelled reviews and attach a binary `sentiment` column derived from `polarity`.
test_set = pd.read_csv('./corpus/tripadvisor/test.csv')
test_set = test_set.assign(sentiment=test_set['polarity'].apply(sentiment_label))
test_set.head()

In [None]:
# Hold out 10% of the labelled reviews for validation, with a fixed seed so the
# split is reproducible.
# `train_test_split` is imported from `sklearn.model_selection` in the imports
# cell; the former `sklearn.cross_validation` module was removed in scikit-learn 0.20.
SEED = 2000

x_train, x_validation, y_train, y_validation = train_test_split(
    test_set['content'],
    test_set['sentiment'],
    test_size=.1,
    random_state=SEED,
)

In [None]:
# tokenizer = RegexpTokenizer(r'\w+')
# clean_train_comments = pd.read_csv("./corpus/tripadvisor/train_set.csv")
# clean_train_comments['content'] = clean_train_comments['content'].astype('str') 
# clean_train_comments["tokens"] = clean_train_comments["content"].apply(tokenizer.tokenize)
# clean_train_comments['sentiment'] = clean_train_comments['polarity'].apply(sentiment_label)
   
# clean_train_comments.head()

In [None]:
# clean_test_comments = pd.read_csv("./corpus/tripadvisor/test_set.csv")
# clean_test_comments['content'] = clean_test_comments['content'].astype('str') 
# clean_test_comments["tokens"] = clean_test_comments["content"].apply(tokenizer.tokenize)
# clean_test_comments['sentiment'] = clean_test_comments['polarity'].apply(sentiment_label)

# clean_test_comments.head()

In [None]:
# all_training_words = [word for tokens in clean_train_comments["tokens"] for word in tokens]
# training_sentence_lengths = [len(tokens) for tokens in clean_train_comments["tokens"]]
# TRAINING_VOCAB = sorted(list(set(all_training_words)))
# print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
# print("Max sentence length is %s" % max(training_sentence_lengths))

In [None]:
# all_test_words = [word for tokens in clean_test_comments["tokens"] for word in tokens]
# test_sentence_lengths = [len(tokens) for tokens in clean_test_comments["tokens"]]
# TEST_VOCAB = sorted(list(set(all_test_words)))
# print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
# print("Max sentence length is %s" % max(test_sentence_lengths))

In [None]:
def labelize_text(text, label):
    """Wrap every document in ``text`` as a tagged, whitespace-tokenised document.

    Parameters
    ----------
    text : pandas.Series-like
        Object exposing ``.index`` and iterating over document strings.
    label : str
        Tag prefix; each document is tagged ``'<label>_<index value>'``.

    Returns
    -------
    list
        One ``TaggedDocument(words, tags)`` per document, in input order.
    """
    # TaggedDocument replaces gensim's deprecated LabeledSentence alias and
    # exposes the same `.words` / `.tags` fields, so consumers are unaffected.
    return [
        TaggedDocument(doc.split(), [label + '_%s' % idx])
        for idx, doc in zip(text.index, text)
    ]
  
# Disabled variant that labelled the separately cleaned train/test frames:
# x_train = labelize_text(clean_train_comments["content"], 'TRAIN')
# x_validation = labelize_text(clean_test_comments["content"], 'TEST')

# Rebind the split texts to lists of labelled documents tagged 'TRAIN_<index>' / 'TEST_<index>'.
# NOTE(review): this overwrites its own inputs, and labelize_text reads `.index`
# from its argument, so re-running this cell without first re-running the split
# cell is expected to fail — confirm and consider distinct names.
x_train = labelize_text(x_train, 'TRAIN')
x_validation = labelize_text(x_validation, 'TEST')

In [None]:
# Load the word-embedding model that build_Vector indexes per token.
word2vec = Word2Vec.load('./prosa-w2v/prosa.vec')
# Alternative model path, currently disabled:
# word2vec = Word2Vec.load('./vectorizer/prosa/word2vec.model')

In [None]:
# Load the pickled `tfidf` object (indexed per word in build_doc_Vector) and the
# three Doc2Vec models.
# NOTE(security): pickle.load can execute arbitrary code from the file — only
# load artefacts produced by a trusted pipeline.
with open('./vectorizer/tripadvisor/tfidf.pickle', 'rb') as tfidf_file:
    # The context manager closes the handle; the bare open() call used to leak it.
    tfidf = pickle.load(tfidf_file)
model_dbow = Doc2Vec.load("./vectorizer/tripadvisor/model_dbow.model")
model_dmc = Doc2Vec.load("./vectorizer/tripadvisor/model_dmc.model")
model_dmm = Doc2Vec.load("./vectorizer/tripadvisor/model_dmm.model")

In [None]:
def build_doc_Vector(tokens, size):
    """Average the ``tfidf``-weighted ``model_dbow`` and ``model_dmm`` vectors of ``tokens``.

    For each token, the vector looked up in ``model_dbow`` and the one looked up
    in ``model_dmm`` are each multiplied by ``tfidf[token]`` and joined end to
    end. Tokens for which any lookup raises ``KeyError`` are skipped.

    Parameters
    ----------
    tokens : iterable
        Tokens of one document.
    size : int
        Length of the result; must equal the length of a joined vector.

    Returns
    -------
    numpy.ndarray of shape (1, size)
        Mean of the joined vectors over the tokens that were found, or all
        zeros when none were found.
    """
    total = np.zeros((1, size))
    hits = 0.
    for token in tokens:
        try:
            dbow_part = model_dbow[token] * tfidf[token]
            dmm_part = model_dmm[token] * tfidf[token]
        except KeyError:
            # Unknown token: it contributes nothing to the average.
            continue
        total += np.append(dbow_part, dmm_part)
        hits += 1
    if hits:
        total /= hits
    return total

def build_Vector(tokens, word_size, doc_size):
    """Build the zero-padded per-token feature matrix for one document.

    Each token row is the document vector from ``build_doc_Vector`` followed by
    the token's ``word2vec`` vector; tokens missing from ``word2vec`` keep a
    zero word vector. Documents shorter than ``MAX_SEQUENCE_LENGTH`` are padded
    with leading all-zero rows; longer documents are truncated to their first
    ``MAX_SEQUENCE_LENGTH`` tokens.

    Parameters
    ----------
    tokens : list
        Tokens of one document.
    word_size : int
        Length of a ``word2vec`` vector.
    doc_size : int
        Length of the document vector.

    Returns
    -------
    numpy.ndarray of shape (MAX_SEQUENCE_LENGTH, doc_size + word_size)
        Always 2-D. The original discarded its ``reshape`` result, so it
        returned a flat array for non-empty documents and a 2-D one for empty
        documents. Stacking the results and reshaping to
        ``(n, MAX_SEQUENCE_LENGTH, doc_size + word_size)`` still works.
    """
    # The document vector is computed from all tokens, before any truncation.
    doc_vec = build_doc_Vector(tokens, doc_size)
    # Truncate so an over-long document no longer yields a negative padding size.
    kept = tokens[:MAX_SEQUENCE_LENGTH]
    first_row = MAX_SEQUENCE_LENGTH - len(kept)

    # Preallocate once instead of growing the buffer with np.append per token.
    vec = np.zeros((MAX_SEQUENCE_LENGTH, doc_size + word_size))
    # Every token row starts with the same document vector; padding rows stay zero.
    vec[first_row:, :doc_size] = doc_vec
    for row, word in enumerate(kept, start=first_row):
        try:
            vec[row, doc_size:] = word2vec[word]
        except KeyError:
            # Out-of-vocabulary token: leave the zero word vector in place.
            continue
    return vec

In [None]:
# Widths of the word2vec part and the document-vector part of each feature row.
WORD_VEC_DIM = 500
DOC_VEC_DIM = 200
# Fail early if the two parts do not add up to the row width the model is built with.
assert WORD_VEC_DIM + DOC_VEC_DIM == EMBEDDING_DIM

# Iterate the document lists directly so tqdm can read their length and show a
# total/ETA; wrapping a `map` object hides the length.
train_vecs = np.concatenate([[build_Vector(doc.words, WORD_VEC_DIM, DOC_VEC_DIM)] for doc in tqdm(x_train)])
val_vecs = np.concatenate([[build_Vector(doc.words, WORD_VEC_DIM, DOC_VEC_DIM)] for doc in tqdm(x_validation)])

In [None]:
# Give both feature arrays the explicit (documents, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM) layout.
num_data = train_vecs.shape[0]
num_data_val = val_vecs.shape[0]

train_vecs = train_vecs.reshape(num_data, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
val_vecs = val_vecs.reshape(num_data_val, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

In [None]:
def ConvNet(max_sequence_length, embedding_dim, labels_index, trainable=False, extra_conv=True):
    """Build, compile and summarise a 1-D convolutional classifier.

    The network takes float32 input of shape ``(max_sequence_length, embedding_dim)``.

    Parameters
    ----------
    max_sequence_length : int
        Number of rows (time steps) per input sample.
    embedding_dim : int
        Number of features per row.
    labels_index : int
        Number of sigmoid output units.
    trainable : bool
        Not used anywhere in this function body.
    extra_conv : bool
        When equal to ``True``, the classifier head is fed by the concatenated
        width-3/4/5 convolution branches; otherwise by the single width-3 branch.

    Returns
    -------
    The compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
    accuracy metric). As a side effect, ``model.summary()`` is printed.
    """

    sequence_input = Input(shape=(max_sequence_length, embedding_dim,), dtype='float32')

    # Yoon Kim model (https://arxiv.org/abs/1408.5882)
    # One convolution + max-pooling branch per filter width, all reading the same input.
    convs = []
    filter_sizes = [3,4,5]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(sequence_input)
        l_pool = MaxPooling1D(pool_size=3)(l_conv)
        convs.append(l_pool)

    # Older API form kept for reference: l_merge = Merge(mode='concat', concat_axis=1)(convs)
    # Join the pooled branches along axis 1 (the sequence axis).
    l_merge = Concatenate(axis=1)(convs)

    # Single width-3 convolution branch. Note that it uses MaxPooling1D(pool_size=3),
    # not a global max-pooling layer. It is built regardless of `extra_conv`.
    conv = Conv1D(filters=128, kernel_size=3, activation='relu')(sequence_input)
    pool = MaxPooling1D(pool_size=3)(conv)

    if extra_conv==True:
        x = Dropout(0.5)(l_merge)  
    else:
        # NOTE(review): this branch was labelled "Original Yoon Kim model", but it
        # uses only the single width-3 convolution — confirm which branch the
        # label was meant for.
        x = Dropout(0.5)(pool)
    # Shared classifier head: flatten, one hidden dense layer, dropout, sigmoid output.
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc']) 
    model.summary()
    return model

In [None]:
# Disabled variant that took labels from the separately cleaned frames:
# y_tr = clean_train_comments['sentiment'].values
# y_ts = clean_test_comments['sentiment'].values
# Extract the label arrays from the train/validation split via `.values`.
# NOTE(review): a later cell rebinds `y_train` to `y_tr`, so re-running this cell
# afterwards presumably fails on `.values` — confirm.
y_tr = y_train.values
y_ts = y_validation.values

In [None]:
# Final arrays handed to Keras: features from the vectorisation step, labels from the split.
x_train, y_train = train_vecs, y_tr
x_test, y_test = val_vecs, y_ts

In [None]:
# Binary classifier with a single sigmoid output unit.
model = ConvNet(
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    embedding_dim=EMBEDDING_DIM,
    labels_index=1,
    trainable=False,
)

In [None]:
# Train, evaluating on the held-out arrays after every epoch.
hist = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
)

In [None]:
# model.save('./model/yoon_kim_pv/cnn_model_04.h5') 

In [None]:
# model = load_model('./model/yoon_kim_pv/cnn_model_04.h5')

# Loss and accuracy on the held-out arrays (model compiled with metrics=['acc']).
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Round the sigmoid outputs to hard 0/1 labels in one vectorised step instead of
# a per-row Python loop. np.rint rounds halves to even, like the built-in round().
# The shared `batch_size` setting replaces the previously hard-coded 256.
y_predict = np.rint(model.predict(x_test, batch_size=batch_size, verbose=1))
print(classification_report(y_test, y_predict, labels = [0, 1], digits=8))