In [2]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import os

from numpy import asarray

from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, Embedding, LSTM, Bidirectional, Conv2D, MaxPooling2D
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report
from keras.preprocessing.text import hashing_trick, text_to_word_sequence
from keras.regularizers import l2

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [3]:
mixeddataname = "mixed.pkl"
modelname = 'BLSTM+CNN.h5'
num_adv = 20568

num_class = 2
max_features = 100000
maxlen = 100
batch_size = 10
epochs = 5

learning_rate = 0.1
dim_embed = 300
num_hidden_unit_lstm = 300
num_filter = 100
size_batch = 10
size_pool = 2
size_window = 3
dropout_embed = 0.5
dropout_blstm = 0.2
dropout_penultimate = 0.4
l2_lambda = 0.00001

onehot_encoder = OneHotEncoder(sparse=False)
t = Tokenizer()
# t = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)

In [4]:
def getEmbeddingMatrix(vocabulary_size):
    embeddings_index = dict()
    f = open('glove.6b/glove.6B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    embedding_matrix = np.zeros((vocabulary_size + 1, dim_embed))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

### 2 classes(0: 1-2 star, 1: 4-5 star)

In [5]:
def getData(filename):
    print("---------------")
    print("| Getting data...")
    print("---------------")
    data = pd.read_pickle(filename)
    
    reviewTextList = data.reviewText.values
    t.fit_on_texts(reviewTextList)
    text = t.texts_to_sequences(reviewTextList)
    text = sequence.pad_sequences(text, maxlen=maxlen)
    print('text shape:', text.shape)
    
    data_X = np.array(text)
    X = data_X[0: data_X.shape[0] - 4 * num_adv - 1]
    X_adv = data_X[data_X.shape[0] - 4 * num_adv -1: data_X.shape[0] - 3 * num_adv -1]
    X_adv1 = data_X[data_X.shape[0] - 3 * num_adv -1: data_X.shape[0] - 2 * num_adv -1]
    X_adv2 = data_X[data_X.shape[0] - 2 * num_adv -1: data_X.shape[0] - num_adv -1]
    X_adv3 = data_X[data_X.shape[0] - num_adv -1: -1]
    
    data_Y = np_utils.to_categorical(data['overall'], num_class)
    Y = data_Y[0: data_X.shape[0] - 4 * num_adv - 1]
    Y_adv = data_Y[data_X.shape[0] - 4 * num_adv -1: data_X.shape[0] - 3 * num_adv -1]
    Y_adv1 = data_Y[data_X.shape[0] - 3 * num_adv -1: data_X.shape[0] - 2 * num_adv -1]
    Y_adv2 = data_Y[data_X.shape[0] - 2 * num_adv -1: data_X.shape[0] - num_adv -1]
    Y_adv3 = data_Y[data_X.shape[0] - num_adv -1: -1]
    
    return X, Y, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, len(t.word_index)

In [6]:
def splitData(x, y):
    X_trains, X_test, Y_trains, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    X_train, X_validation, Y_train, Y_validation = train_test_split(X_trains, Y_trains, test_size=0.2, random_state=42)
    return X_train, X_validation, X_test, Y_train, Y_validation, Y_test

In [7]:
def train(containsGlove, x_train, x_validation, y_train, y_validation, dim_input, embedding_matrix):
    print("---------------")
    print("| Training...")
    print("---------------")
    
    model = Sequential()
    if (containsGlove == False):
        model.add(Embedding(input_dim=dim_input, output_dim=dim_embed, input_length=maxlen))
    else:
        model.add(Embedding(input_dim=dim_input + 1, output_dim=dim_embed, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Dropout(0.5))
    model.add(
        Bidirectional(
            LSTM(
                num_hidden_unit_lstm, 
                return_sequences=True, 
                kernel_regularizer=l2(l2_lambda)
            ), 
            merge_mode="sum"
        ))
    model.add(Dropout(0.2))
    model.add(Reshape((maxlen, dim_embed, 1)))
    model.add(Conv2D(
        num_filter,
        kernel_size=(size_window,size_window),
        padding='valid',
        activation='relu',
        strides=1,
        kernel_regularizer=l2(l2_lambda)))
    model.add(MaxPooling2D(pool_size=size_pool))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(num_class, kernel_regularizer=l2(l2_lambda)))
    model.add(Activation('sigmoid'))

    print(model.summary())
    model.compile(
        loss='categorical_crossentropy', 
        optimizer='Adadelta',
        metrics=['categorical_accuracy'])
    
    history = model.fit(x_train, y_train,
              batch_size=size_batch,
              epochs=epochs,
              validation_data=(x_validation, y_validation),)
    
#     plt.figure('categorical_accuracy')
#     plt.xlabel('Iterations')
#     plt.ylabel('Categorical Accuracy')
#     plt.plot(history.history['categorical_accuracy'])
#     plt.show()
    
    model.save(modelname)
    del model

In [8]:
def test(x_test, y_test):
    print("---------------")
    print("| Testing...")
    print("---------------")
    model = load_model(modelname)
    
    score = model.evaluate(x_test,y_test, batch_size = size_batch)
    print('Test loss:',score[0])
    print('Test accuracy:',score[1])
    
    y_test=np.argmax(y_test,axis=1)
    y_pred=model.predict_classes(x_test)
    
    print(classification_report(y_test,y_pred))

### Test with Adversasial Dataset

In [8]:
# def run():
X, Y, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, dim_input = getData(mixeddataname)
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = splitData(X, Y)

---------------
| Getting data...
---------------
text shape: (185110, 100)


In [9]:
embeddingMatrix = getEmbeddingMatrix(dim_input)

Loaded 400000 word vectors.


In [10]:
train(True, X_train, X_validation, Y_train, Y_validation, dim_input, embeddingMatrix) 

---------------
| Training...
---------------
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          27510600  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 298, 100)      1000      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (

In [11]:
test(X_test, Y_test)

---------------
| Testing...
---------------
Test loss: 0.38248311260078305
Test accuracy: 0.9025184681145703
             precision    recall  f1-score   support

          0       0.91      0.90      0.90     10321
          1       0.90      0.91      0.90     10247

avg / total       0.90      0.90      0.90     20568



In [9]:
test(X_adv, Y_adv)

---------------
| Testing...
---------------
Test loss: 0.5099321414511151
Test accuracy: 0.8364935774299617
             precision    recall  f1-score   support

          0       0.80      0.91      0.85     10364
          1       0.89      0.76      0.82     10204

avg / total       0.84      0.84      0.84     20568



In [10]:
test(X_adv1, Y_adv1)

---------------
| Testing...
---------------
Test loss: 0.38860158811065065
Test accuracy: 0.9063107664716795
             precision    recall  f1-score   support

          0       0.91      0.90      0.91     10363
          1       0.90      0.91      0.91     10205

avg / total       0.91      0.91      0.91     20568



In [11]:
test(X_adv2, Y_adv2)

---------------
| Testing...
---------------
Test loss: 0.4123528675983639
Test accuracy: 0.8978996425635246
             precision    recall  f1-score   support

          0       0.89      0.90      0.90     10363
          1       0.90      0.89      0.90     10205

avg / total       0.90      0.90      0.90     20568



In [8]:
X, Y, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, dim_input = getData(mixeddataname)

---------------
| Getting data...
---------------
text shape: (185110, 100)


In [9]:
test(X_adv3, Y_adv3)

---------------
| Testing...
---------------
Test loss: 0.43893659002542823
Test accuracy: 0.8863768891972218
             precision    recall  f1-score   support

          0       0.87      0.90      0.89     10363
          1       0.90      0.87      0.88     10205

avg / total       0.89      0.89      0.89     20568



### Without Glove

In [8]:
modelname = 'BLSTM+CNNWithG.h5'

In [9]:
X, Y, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, dim_input = getData(mixeddataname)
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = splitData(X, Y)

---------------
| Getting data...
---------------
text shape: (185110, 100)


In [10]:
train(False, X_train, X_validation, Y_train, Y_validation, dim_input, []) 

---------------
| Training...
---------------
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          27510300  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 298, 100)      1000      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (

In [10]:
test(X_test, Y_test)

---------------
| Testing...
---------------
Test loss: 0.2754522282242607
Test accuracy: 0.9065538628214485
             precision    recall  f1-score   support

          0       0.87      0.95      0.91     10321
          1       0.95      0.86      0.90     10247

avg / total       0.91      0.91      0.91     20568



In [10]:
test(X_adv, Y_adv)

---------------
| Testing...
---------------
Test loss: 0.43349540734954123
Test accuracy: 0.8038700861499172
             precision    recall  f1-score   support

          0       0.73      0.97      0.83     10364
          1       0.95      0.64      0.76     10204

avg / total       0.84      0.80      0.80     20568



In [10]:
test(X_adv1, Y_adv1)

---------------
| Testing...
---------------
Test loss: 0.27508308078140276
Test accuracy: 0.9106378763166442
             precision    recall  f1-score   support

          0       0.87      0.96      0.92     10363
          1       0.95      0.86      0.91     10205

avg / total       0.91      0.91      0.91     20568



In [10]:
test(X_adv2, Y_adv2)

---------------
| Testing...
---------------
Test loss: 0.3039883074449547
Test accuracy: 0.8956631585486154
             precision    recall  f1-score   support

          0       0.85      0.96      0.90     10363
          1       0.95      0.83      0.89     10205

avg / total       0.90      0.90      0.90     20568



In [10]:
test(X_adv3, Y_adv3)

---------------
| Testing...
---------------
Test loss: 0.336660547578256
Test accuracy: 0.877722668811789
             precision    recall  f1-score   support

          0       0.83      0.96      0.89     10363
          1       0.95      0.79      0.87     10205

avg / total       0.89      0.88      0.88     20568



### Adversarial Test

In [8]:
modelname = 'BLSTM+CNN_AdvTrain.h5'
mixeddataname = "adversarialTest.pkl"

In [9]:
def getDataAdvTrain(filename):
    print("---------------")
    print("| Getting data...")
    print("---------------")
    data = pd.read_pickle(filename)
    
    reviewTextList = data.reviewText.values
    t.fit_on_texts(reviewTextList)
    text = t.texts_to_sequences(reviewTextList)
    text = sequence.pad_sequences(text, maxlen=maxlen)
    print('text shape:', text.shape)
    
    data_X = np.array(text)
    X = data_X[0: data_X.shape[0] - 5 * num_adv - 1]
    X_test = data_X[data_X.shape[0] - 5 * num_adv -1: data_X.shape[0] - 4 * num_adv -1]
    X_adv = data_X[data_X.shape[0] - 4 * num_adv -1: data_X.shape[0] - 3 * num_adv -1]
    X_adv1 = data_X[data_X.shape[0] - 3 * num_adv -1: data_X.shape[0] - 2 * num_adv -1]
    X_adv2 = data_X[data_X.shape[0] - 2 * num_adv -1: data_X.shape[0] - num_adv -1]
    X_adv3 = data_X[data_X.shape[0] - num_adv -1: -1]
    
    data_Y = np_utils.to_categorical(data['overall'], num_class)
    Y = data_Y[0: data_X.shape[0] - 5 * num_adv - 1]
    Y_test = data_Y[data_X.shape[0] - 5 * num_adv -1: data_X.shape[0] - 4 * num_adv -1]
    Y_adv = data_Y[data_X.shape[0] - 4 * num_adv -1: data_X.shape[0] - 3 * num_adv -1]
    Y_adv1 = data_Y[data_X.shape[0] - 3 * num_adv -1: data_X.shape[0] - 2 * num_adv -1]
    Y_adv2 = data_Y[data_X.shape[0] - 2 * num_adv -1: data_X.shape[0] - num_adv -1]
    Y_adv3 = data_Y[data_X.shape[0] - num_adv -1: -1]
    
    return X, Y, X_test, Y_test, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, len(t.word_index)

In [12]:
X, Y, X_test, Y_test, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, dim_input = getDataAdvTrain(mixeddataname)
X_train, X_validation, Y_train, Y_validation = X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=0.2, random_state=42)

---------------
| Getting data...
---------------
text shape: (267380, 100)


In [13]:
embeddingMatrix = getEmbeddingMatrix(dim_input)

Loaded 400000 word vectors.


In [14]:
print(len(X))
print(len(X_test))
print(len(X_adv))
print(len(X_adv1))
print(len(X_adv2))
print(len(X_adv3))

164539
20568
20568
20568
20568
20568


In [15]:
train(True, X_train, X_validation, Y_train, Y_validation, dim_input, embeddingMatrix) 

---------------
| Training...
---------------
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          32454300  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 298, 100)      1000      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (

In [16]:
test(X_test, Y_test)

---------------
| Testing...
---------------
Test loss: 0.38954063940058653
Test accuracy: 0.9013516067623953
             precision    recall  f1-score   support

          0       0.89      0.92      0.90     10363
          1       0.91      0.89      0.90     10205

avg / total       0.90      0.90      0.90     20568



In [17]:
test(X_adv, Y_adv)

---------------
| Testing...
---------------
Test loss: 0.41306706746871347
Test accuracy: 0.8885647534385167
             precision    recall  f1-score   support

          0       0.88      0.90      0.89     10363
          1       0.89      0.88      0.89     10205

avg / total       0.89      0.89      0.89     20568



In [10]:
X, Y, X_test, Y_test, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, dim_input = getDataAdvTrain(mixeddataname)

---------------
| Getting data...
---------------
text shape: (267380, 100)


In [11]:
test(X_adv1, Y_adv1)

---------------
| Testing...
---------------
Test loss: 0.3903908535594701
Test accuracy: 0.9010598912620674
             precision    recall  f1-score   support

          0       0.89      0.91      0.90     10363
          1       0.91      0.89      0.90     10205

avg / total       0.90      0.90      0.90     20568



In [12]:
test(X_adv2, Y_adv2)

---------------
| Testing...
---------------
Test loss: 0.3920514760839596
Test accuracy: 0.9007195570893392
             precision    recall  f1-score   support

          0       0.89      0.91      0.90     10363
          1       0.91      0.89      0.90     10205

avg / total       0.90      0.90      0.90     20568



In [13]:
test(X_adv3, Y_adv3)

---------------
| Testing...
---------------
Test loss: 0.39646543742518675
Test accuracy: 0.8999902688021885
             precision    recall  f1-score   support

          0       0.89      0.91      0.90     10363
          1       0.91      0.89      0.90     10205

avg / total       0.90      0.90      0.90     20568



### Using word2vec

In [9]:
modelname = 'BLSTM+CNN_word2vec.h5'

In [10]:
import gensim
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors



In [11]:
t = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)

X, Y, X_adv, Y_adv, X_adv1, Y_adv1, X_adv2, Y_adv2, X_adv3, Y_adv3, dim_input = getData(mixeddataname)
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = splitData(X, Y)

---------------
| Getting data...
---------------
text shape: (185110, 100)


In [53]:
def getEmbeddingMatrix(vocabulary_size):    
    word_vectors = KeyedVectors.load_word2vec_format('word2vec/GoogleNews-vectors-negative300.bin', binary=True)

    embedding_matrix = np.zeros((vocabulary_size, dim_embed))
    for word, i in t.word_index.items():
        try:
            embedding_vector = word_vectors[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), dim_embed)
    del(word_vectors)
    
    return embedding_matrix

In [54]:
embeddingMatrix = getEmbeddingMatrix(dim_input + 1)

In [55]:
train(True, X_train, X_validation, Y_train, Y_validation, dim_input, embeddingMatrix) 

---------------
| Training...
---------------
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          25826100  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 298, 100)      1000      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (

In [56]:
test(X_test, Y_test)

---------------
| Testing...
---------------
Test loss: 0.31010268047996786
Test accuracy: 0.9076234852407564
             precision    recall  f1-score   support

          0       0.89      0.93      0.91     10321
          1       0.92      0.89      0.91     10247

avg / total       0.91      0.91      0.91     20568



In [12]:
test(X_adv, Y_adv)

---------------
| Testing...
---------------
Test loss: 0.44240627606344657
Test accuracy: 0.8213243834103466
             precision    recall  f1-score   support

          0       0.76      0.94      0.84     10364
          1       0.92      0.70      0.80     10204

avg / total       0.84      0.82      0.82     20568



In [57]:
test(X_adv1, Y_adv1)

---------------
| Testing...
---------------
Test loss: 0.3123963754711066
Test accuracy: 0.9104920188446817
             precision    recall  f1-score   support

          0       0.90      0.93      0.91     10363
          1       0.93      0.89      0.91     10205

avg / total       0.91      0.91      0.91     20568



In [13]:
test(X_adv2, Y_adv2)

---------------
| Testing...
---------------
Test loss: 0.33615400829652736
Test accuracy: 0.8980455005600126
             precision    recall  f1-score   support

          0       0.87      0.93      0.90     10363
          1       0.93      0.86      0.89     10205

avg / total       0.90      0.90      0.90     20568



In [14]:
test(X_adv3, Y_adv3)

---------------
| Testing...
---------------
Test loss: 0.3609772014411008
Test accuracy: 0.8826332100341404
             precision    recall  f1-score   support

          0       0.85      0.93      0.89     10363
          1       0.92      0.83      0.88     10205

avg / total       0.89      0.88      0.88     20568

