In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import Counter
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
import re
#print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [2]:
def preprocess_text(s):
    """Normalise one raw review string before tokenisation.

    The steps are applied in this order:

    1. Lower-case the text.
    2. Replace every line break (together with any whitespace around it) with
       a single space, so the last word of one line is not fused with the
       first word of the next.
    3. Replace each of the characters ``( ) $ ! - /`` and the backslash with
       the literal marker ``SYMBOL``.
    4. Replace each run of one to five digits with the literal marker
       ``NUMBER`` (a longer run therefore yields several markers in a row).
    5. Insert a space after every period.

    Args:
        s: Raw review text.

    Returns:
        The normalised text as a new string.
    """
    s = s.lower()
    # Raw strings keep the regex escapes out of Python's own string-escape
    # handling (non-raw "\d" / "\-" / "\." are invalid escape sequences).
    s = re.sub(r"\s*\n\s*", " ", s)
    s = re.sub(r"[()$!\-/\\]", "SYMBOL", s)
    s = re.sub(r"\d{1,5}", "NUMBER", s)
    s = re.sub(r"\.", ". ", s)
    return s

In [3]:
# Load the first 100,000 rows of the Yelp review CSV.
df = pd.read_csv(
    os.path.join("data", "yelp_reviews", "yelp_review.csv"),
    nrows=100000,
)

In [4]:
# Keep every review whose star rating is not 3.
df = df.loc[df["stars"] != 3]

In [5]:
# Normalised review text and the star rating that goes with each review.
reviews = df["text"].apply(preprocess_text)
labels = df["stars"]

# Give every distinct rating a consecutive integer id, in order of first appearance.
label2idx = {rating: position for position, rating in enumerate(labels.unique())}
NUM_LABELS = len(label2idx)

In [6]:
from nltk import tokenize

In [7]:
# Count every token NLTK's word tokenizer produces across all reviews.
vocab = Counter(
    token
    for review in reviews
    for token in tokenize.word_tokenize(review)
)

# Keep tokens seen more than 5 times. Ids start at 1, so id 0 is never
# assigned to a word.
most_frequent_words = [token for token, count in vocab.items() if count > 5]
word2idx = {token: position for position, token in enumerate(most_frequent_words, start=1)}

In [8]:
# Number of embedding rows: the kept words (ids 1..len(word2idx)) plus two
# extra rows: id 0, which no word is mapped to, and one final row that is
# later initialised for the unknown token.
VOCAB_SIZE = len(word2idx) + 2
# Width of each word vector (the embeddings file used below is "...-300d-...").
EMBEDDING_DIM = 300

In [9]:
# Read the whole fastText .vec file into memory, one string per line.
# The `with` block closes the handle even if reading fails; `f` remains bound
# to the (closed) file object, so a later `f.close()` is a harmless no-op.
with open(os.path.join("data", "embeddings", "wiki-news-300d-1M.vec"), encoding="utf8") as f:
    contents = f.readlines()

In [10]:
# Collect the pretrained vector of every vocabulary word present in the
# embeddings file. The first line of `contents` is skipped (presumably the
# .vec header line rather than a word vector; confirm against the file).
fasttext_words = {}
for raw_line in contents[1:]:
    # Each remaining line is "<token> <float> <float> ...".
    token, *components = raw_line.replace("\n", "").strip().split(" ")
    if not word2idx.get(token):
        continue
    fasttext_words[token] = np.array([float(value) for value in components])

In [11]:
# Embedding matrix with one row per token id. It starts as zeros (not
# np.empty) so that rows this loop never writes, notably row 0 which no
# word is mapped to, hold well-defined values instead of leftover memory.
embedding_mat = np.zeros(shape=(VOCAB_SIZE, EMBEDDING_DIM))
for word, idx in word2idx.items():
    pretrained = fasttext_words.get(word)
    if pretrained is not None:
        embedding_mat[idx, :] = pretrained
    else:
        # No pretrained vector for this word: small random initialisation.
        embedding_mat[idx, :] = np.random.uniform(-0.05, 0.05, size=EMBEDDING_DIM)

In [12]:
# The last row of the matrix is the unknown-token row; it has no pretrained
# vector, so it is drawn from a small uniform range.
embedding_mat[-1] = np.random.uniform(low=-0.05, high=0.05, size=EMBEDDING_DIM)

In [13]:
# Release the embeddings file handle; its lines are already held in `contents`.
f.close()

In [14]:
# Each review is encoded as a fixed-size grid: at most this many sentences...
MAX_NUM_SENT = 6
# ...and at most this many tokens per sentence.
MAX_NUM_WORDS = 30

In [15]:
def review_tokenizer(review, word2idx, num_sent, num_words, unknown_token):
    """Encode one review as a fixed-size grid of word ids.

    The review is split into sentences with NLTK's sentence tokenizer and
    each sentence into words with NLTK's word tokenizer. The same word
    tokenizer is used to build the vocabulary, so using it here makes the
    tokens line up with the keys of ``word2idx`` (a plain split on spaces
    leaves punctuation attached to words, which then never match).

    Args:
        review: Review text.
        word2idx: Mapping from token to integer id.
        num_sent: Number of sentence rows in the output; extra sentences are
            dropped.
        num_words: Number of word columns in the output; extra words in a
            sentence are dropped.
        unknown_token: Id written for any token missing from ``word2idx``.

    Returns:
        ``np.int32`` array of shape ``(num_sent, num_words)``; positions with
        no sentence or word stay 0.
    """
    tokenized_reviews = np.zeros((num_sent, num_words), dtype=np.int32)
    for n_s, sentence in enumerate(tokenize.sent_tokenize(review)[:num_sent]):
        for n_w, word in enumerate(tokenize.word_tokenize(sentence)[:num_words]):
            tokenized_reviews[n_s, n_w] = word2idx.get(word, unknown_token)
    return tokenized_reviews

In [16]:
# Out-of-vocabulary words get id VOCAB_SIZE - 1, i.e. the last row of
# embedding_mat (the row initialised for the unknown token). VOCAB_SIZE itself
# would be one past the end of the (VOCAB_SIZE, EMBEDDING_DIM) embedding table.
tokenized_reviews = reviews.apply(
    lambda r: review_tokenizer(r, word2idx, MAX_NUM_SENT, MAX_NUM_WORDS, VOCAB_SIZE - 1)
)
# Stack the per-review grids into one (num_reviews, MAX_NUM_SENT, MAX_NUM_WORDS) array.
tokenized_reviews = np.stack(tokenized_reviews.values, axis=0)

In [17]:
# Zero-based star ratings (stars - 1).
multi_class_labels = labels.values - 1
# Binary target: 1 when the rating is above 3 stars, otherwise 0.
binary_labels = (labels > 3).astype(np.int64).values

In [18]:
# Mini-batch size; passed both to the custom loss wrapper and to model.fit.
BATCH_SIZE = 64

In [19]:
from keras.models import Model
from keras.layers import Input, Embedding, TimeDistributed, Flatten, Dense, Lambda, Reshape, Concatenate, Multiply
from keras.layers import Conv1D, MaxPooling1D, Dropout, GlobalMaxPooling1D, BatchNormalization
from keras.losses import binary_crossentropy
from keras.initializers import Ones, Constant, TruncatedNormal, RandomUniform
from keras import regularizers
from keras.optimizers import Adam
import keras.backend as K

import tensorflow as tf

Using TensorFlow backend.


In [20]:
# Pairwise sentence similarity for full batch
def pairwise_dist(x, sigma=0.5):
    """Similarity between every pair of rows of ``x`` after merging its two leading axes.

    The first two axes of ``x`` are flattened into one, the Euclidean distance
    between every pair of resulting rows is computed, and the distance is
    turned into a similarity with ``exp(-dist / (2 * sigma ** 2))``.

    Args:
        x: Backend tensor; assumed to be (batch, sentences, features), to be
            confirmed against the caller. Only that rank-3 case has been
            reasoned through here.
        sigma: Bandwidth of the kernel. The default 0.5 reproduces the
            previously hard-coded constant.

    Returns:
        Tensor of pairwise similarities with everything above the main
        diagonal zeroed (``matrix_band_part(..., -1, 0)`` keeps the lower
        triangle including the diagonal).
    """
    new_shape = K.int_shape(x)[2:]
    x = K.reshape(x, (-1,) + new_shape)
    x1 = K.expand_dims(x, len(new_shape) - 1)
    x2 = K.expand_dims(x, len(new_shape))
    sq_dist = K.sum(K.square(x1 - x2), axis=-1)
    # The distance of a row to itself is exactly 0, where the gradient of sqrt
    # is infinite and back-propagation yields NaN. Clamping to a tiny positive
    # value keeps the forward result essentially unchanged and the gradient finite.
    dist = K.sqrt(K.maximum(sq_dist, K.epsilon()))
    c = dist / (2 * (sigma ** 2))
    sims = K.exp(-c)
    return tf.matrix_band_part(sims, -1, 0)

# Pairwise prediction difference for full batch
def y_derivative(y):
    """Squared difference between every pair of predictions in ``y``.

    The two leading axes of ``y`` are merged into one, the merged tensor is
    broadcast against itself along two newly inserted axes, and the squared
    element-wise difference is passed through ``matrix_band_part(..., -1, 0)``.

    NOTE(review): ``matrix_band_part`` works on the two innermost axes. If
    ``y`` carries a trailing axis of size 1, the band is taken over that axis
    pair rather than over the pair-by-pair axes; confirm this is intended.
    """
    trailing_dims = K.int_shape(y)[2:]
    n_trailing = len(trailing_dims)
    flat = K.reshape(y, (-1,) + trailing_dims)
    first_view = K.expand_dims(flat, n_trailing - 1)
    second_view = K.expand_dims(flat, n_trailing)
    return tf.matrix_band_part(K.square(first_view - second_view), -1, 0)

# similarity loss function
def custom_sim_loss(encoded_reviews, y_hat, batch_size):
    """Scalar similarity penalty built from sentence encodings and sentence predictions.

    Combines the pairwise similarities of ``encoded_reviews`` with the pairwise
    squared prediction differences of ``y_hat``, sums the result and divides
    by ``batch_size ** 2``.

    NOTE(review): the two pairwise tensors are combined with ``K.dot`` (a
    matrix product), not an element-wise product; confirm which is intended.
    """
    similarity = pairwise_dist(encoded_reviews)
    prediction_gap = y_derivative(y_hat)
    combined_total = K.sum(K.dot(similarity, prediction_gap))
    return combined_total / (batch_size ** 2)

# Full custom loss function
def custom_loss_wrapper(encoded_reviews, y_hat, batch_size, l, a):
    """Build a Keras loss: ``l * binary cross-entropy + a * similarity penalty``.

    Args:
        encoded_reviews: Tensor of sentence encodings fed to the similarity penalty.
        y_hat: Tensor of per-sentence predictions fed to the similarity penalty.
        batch_size: Batch size used to normalise the similarity penalty.
        l: Weight of the cross-entropy term.
        a: Weight of the similarity term.

    Returns:
        A ``loss(y_true, y_pred)`` function suitable for ``model.compile``.
    """
    # A literal zero weight means the similarity term contributes nothing, so
    # it is left out of the graph entirely (this also keeps its gradient out
    # of back-propagation). Tensor/variable weights are never skipped.
    sim_disabled = isinstance(a, (int, float)) and a == 0

    def loss(y_true, y_pred):
        ent_loss = binary_crossentropy(y_true, y_pred)
        ent_loss = K.reshape(ent_loss, (-1, 1))
        total = l * ent_loss
        if not sim_disabled:
            sim_loss = custom_sim_loss(encoded_reviews, y_hat, batch_size)
            # Reshape the similarity term itself. Reshaping ent_loss here
            # would silently replace the penalty with a second cross-entropy.
            sim_loss = K.reshape(sim_loss, (-1, 1))
            total = total + (a * sim_loss)
        return total
    return loss

In [21]:
def ConvMax1D(layer, n_layer, ks, padding, activation, dropout_prob):
    """One convolutional channel: Conv1D with 128 filters followed by dropout.

    Despite the name, no pooling is applied.

    Args:
        layer: Input tensor.
        n_layer: Channel index; the convolution is named ``channel_<n_layer>``.
        ks: Kernel size of the convolution.
        padding: Padding mode passed to Conv1D.
        activation: Activation passed to Conv1D.
        dropout_prob: Dropout rate applied to the convolution output.

    Returns:
        The output tensor of the dropout layer.
    """
    # Honour the padding/activation arguments instead of hard-coded values.
    x = Conv1D(128, ks, padding=padding, activation=activation, name="channel_{}".format(n_layer))(layer)
    x = Dropout(dropout_prob)(x)
    return x

In [22]:
def get_sent_encoder(max_num_words, max_num_sent, vocab_size, dropout_prob, embedding_dim, embedding_mat, embedding_trainable):
    """Build the sentence encoder: a sequence of word ids in, one 2048-wide vector out.

    Args:
        max_num_words: Length of the input id sequence.
        max_num_sent: Accepted but not used inside this function.
        vocab_size: Number of rows of the embedding table.
        dropout_prob: Rate used by every Dropout layer created here.
        embedding_dim: Width of each embedding vector.
        embedding_mat: Initial weights of the embedding layer.
        embedding_trainable: Whether the embedding weights are trainable.

    Returns:
        A Keras ``Model`` from the ``sent_input`` tensor to the encoded sentence.
    """
    sent = Input((max_num_words,), name="sent_input")
    embed = Embedding(vocab_size, embedding_dim, weights=[embedding_mat], trainable=embedding_trainable, name="sent_embed")(sent)
    # Five parallel convolution channels with kernel sizes 2..6, concatenated.
    channels = [ConvMax1D(embed, i, ks, "same", "relu", dropout_prob) for i, ks in enumerate([2,3,4,5,6])]
    x = Concatenate()(channels)
    # Four Conv1D -> MaxPooling1D(2) -> Dropout stages (128, 256, 512, 512 filters).
    # NOTE(review): only the first of these convolutions sets an activation;
    # the other three rely on Conv1D's default. Confirm that is intended.
    x = Conv1D(128, 3, padding = "same", activation="relu")(x)
    #x = BatchNormalization()(x)
    x = MaxPooling1D(2)(x)
    x = Dropout(dropout_prob)(x)
    x = Conv1D(256, 3, padding = "same")(x)
    x = MaxPooling1D(2)(x)
    x = Dropout(dropout_prob)(x)
    x = Conv1D(512, 3, padding = "same")(x)
    x = MaxPooling1D(2)(x)
    x = Dropout(dropout_prob)(x)
    # The last convolution is the only layer with an L2 kernel regulariser.
    x = Conv1D(512, 3, padding = "same", kernel_regularizer=regularizers.l2(0.01))(x)
    x = MaxPooling1D(2)(x)
    x = Dropout(dropout_prob)(x)
    # Collapse the remaining time axis, then two dense layers of width 2048.
    x = GlobalMaxPooling1D()(x)
    x = Dense(2048, activation = "relu")(x)
    x = Dense(2048, activation = "relu")(x)
    x = Dropout(dropout_prob)(x)
    sent_encode = x
    
    sent_encoder = Model(inputs=sent, outputs=sent_encode)
    return sent_encoder

In [23]:
def get_model(sent_encoder, max_num_words, max_num_sent, dropout_prob):
    """Build the review-level model on top of a sentence encoder.

    The sentence encoder is applied to every sentence of a review, a sigmoid
    unit predicts a sentiment per sentence, the predictions are multiplied by
    the sentence mask, and the review prediction is the masked mean of the
    sentence predictions.

    Args:
        sent_encoder: Model that encodes one sentence of word ids.
        max_num_words: Number of word ids per sentence.
        max_num_sent: Number of sentences per review.
        dropout_prob: Dropout rate applied to the encoded sentences.

    Returns:
        Tuple ``(model, encoded_reviews, y_hat)``: the review-level model
        (inputs: review ids and mask), the dropped-out sentence encodings, and
        the masked per-sentence predictions.
    """
    review = Input((max_num_sent, max_num_words))
    mask = Input((max_num_sent, 1))

    encoded_reviews = TimeDistributed(sent_encoder)(review)
    encoded_reviews = Dropout(dropout_prob)(encoded_reviews)

    # Predictions of sentence sentiments; masked-out sentence slots become 0.
    y_hat = Dense(1, activation="sigmoid", name="sent_sentiment")(encoded_reviews)
    y_hat = Multiply(name="masked_sent_sentiment")([y_hat, mask])

    # Masked mean over the sentence axis. The denominator is clamped so that a
    # review whose mask is all zeros gives 0 instead of 0/0 = NaN; any review
    # with at least one sentence is unaffected.
    sent_avg_out = Lambda(
        lambda x: K.sum(x[0], axis=[-2]) / K.maximum(K.sum(x[1], axis=[-2]), K.epsilon()),
        name="sent_agg_pred",
    )([y_hat, mask])

    model = Model(inputs=[review, mask], outputs=sent_avg_out)

    return model, encoded_reviews, y_hat

In [24]:
# Sentence encoder: dropout 0.3, embedding weights not trainable.
sent_encoder = get_sent_encoder(
    max_num_words=MAX_NUM_WORDS,
    max_num_sent=MAX_NUM_SENT,
    vocab_size=VOCAB_SIZE,
    dropout_prob=0.3,
    embedding_dim=EMBEDDING_DIM,
    embedding_mat=embedding_mat,
    embedding_trainable=False,
)

# Review-level model (dropout 0.4) plus the tensors the custom loss needs.
model, encoded_reviews, y_hat = get_model(
    sent_encoder=sent_encoder,
    max_num_words=MAX_NUM_WORDS,
    max_num_sent=MAX_NUM_SENT,
    dropout_prob=0.4,
)

# Second model over the same inputs that exposes the masked per-sentence predictions.
sent_sentiment_layer = model.get_layer("masked_sent_sentiment")
sent_sentiment_model = Model(inputs=model.input, outputs=sent_sentiment_layer.output)

In [25]:
# Compile with Adam and the custom loss: cross-entropy weight 1, similarity weight 0.
model.compile(
    optimizer=Adam(),
    loss=custom_loss_wrapper(encoded_reviews, y_hat, batch_size=BATCH_SIZE, l=1, a=0),
    metrics=["accuracy"],
)

In [26]:
# Sentence mask of shape (num_reviews, MAX_NUM_SENT, 1): sum the token ids of
# each sentence slot and cap positive sums at 1, so a slot is 1 when it holds
# any non-zero id and 0 when it is all zeros.
mask_mat = np.minimum(
    tokenized_reviews.sum(axis=-1).reshape(-1, MAX_NUM_SENT, 1),
    1,
)

In [27]:
from keras.callbacks import EarlyStopping, LearningRateScheduler

In [28]:
def decaying_lr(epoch):
    """Step-decay schedule: start at 0.001 and halve the rate every two epochs.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate for that epoch.
    """
    base_rate = 0.001
    completed_pairs = np.floor(epoch / 2)
    divisor = 2 ** completed_pairs
    return base_rate / divisor

In [29]:
# Callback that applies the decaying_lr schedule during training.
lr_scheduler = LearningRateScheduler(decaying_lr)

In [30]:
# Early-stopping callback with a patience of 2 epochs (other settings left at Keras defaults).
earlystopping = EarlyStopping(patience = 2)

In [None]:
# Train on the binary labels for up to 10 epochs, holding out 20% of the data
# for validation, with early stopping and the learning-rate schedule attached.
model.fit(
    x=[tokenized_reviews, mask_mat],
    y=binary_labels,
    batch_size=BATCH_SIZE,
    epochs=10,
    validation_split=0.2,
    callbacks=[earlystopping, lr_scheduler],
)

Train on 70553 samples, validate on 17639 samples
Epoch 1/10


In [None]:
# Masked per-sentence sentiment predictions for every review
# (output of the "masked_sent_sentiment" layer).
sent_sentiment_pred = sent_sentiment_model.predict([tokenized_reviews, mask_mat])

In [None]:
# Print reviews 170..179 sentence by sentence, colouring each sentence by its
# predicted sentiment: above 0.7 -> background code 42, below 0.3 -> background
# code 41, anything in between -> background code 47.
for row in range(170, 180):
    print("Review ratings: {}".format(labels.iloc[row]))
    shown_sentences = tokenize.sent_tokenize(reviews.iloc[row])[:MAX_NUM_SENT]
    for sentence, score in zip(shown_sentences, sent_sentiment_pred[row]):
        if score == 0:
            # A prediction of exactly 0 is treated as a masked-out sentence
            # slot; nothing further is printed for this review.
            break
        if score > 0.7:
            template = '\033[1;42m{}\033[1;m'
        elif score < 0.3:
            template = '\033[1;41m{}\033[1;m'
        else:
            template = '\033[1;47m{}\033[1;m'
        print(template.format(sentence))
    print("\n")