In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import os

from numpy import asarray

from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, Embedding, LSTM, Bidirectional, Conv2D, MaxPooling2D
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report
from keras.preprocessing.text import hashing_trick, text_to_word_sequence
from keras.regularizers import l2

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
mixeddataname = "mixed_20568.pkl"
modelname = 'BLSTM+CNN.h5'

num_class = 2
# max_features = 100000
maxlen = 100
batch_size = 10
epochs = 5

learning_rate = 0.1
dim_embed = 300
num_hidden_unit_lstm = 300
num_filter = 100
size_batch = 10
size_pool = 2
size_window = 3
dropout_embed = 0.5
dropout_blstm = 0.2
dropout_penultimate = 0.4
l2_lambda = 0.00001

onehot_encoder = OneHotEncoder(sparse=False)
t = Tokenizer()

In [4]:
def getEmbeddingMatrix(vocabulary_size):
    embeddings_index = dict()
    f = open('glove.6b/glove.6B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    embedding_matrix = np.zeros((vocabulary_size + 1, dim_embed))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

### 2 classes(0: 1-2 star, 1: 4-5 star)

In [3]:
def getData(filename):
    print("---------------")
    print("| Getting data...")
    print("---------------")
    data = pd.read_pickle(filename)
    
    reviewTextList = data.reviewText.values
    t.fit_on_texts(reviewTextList)
    text = t.texts_to_sequences(reviewTextList)
    text = sequence.pad_sequences(text, maxlen=maxlen)
    print('text shape:', text.shape)
    
    data_X = np.array(text)
    lenData = data_X.shape[0] - 20568 - 1
    X = data_X[0: lenData]
    X_adv = data_X[lenData: -1]
    
    data_Y = np_utils.to_categorical(data['overall'], num_class)
    Y = data_Y[0: lenData]
    Y_adv = data_Y[lenData: -1]
    
    return X, Y, X_adv, Y_adv, len(t.word_index)

In [4]:
def splitData(x, y):
    X_trains, X_test, Y_trains, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    X_train, X_validation, Y_train, Y_validation = train_test_split(X_trains, Y_trains, test_size=0.2, random_state=42)
    return X_train, X_validation, X_test, Y_train, Y_validation, Y_test

In [5]:
def train(containsGlove, x_train, x_validation, y_train, y_validation, dim_input, embedding_matrix):
    print("---------------")
    print("| Training...")
    print("---------------")
    
    model = Sequential()
    if (containsGlove == False):
        model.add(Embedding(input_dim=dim_input, output_dim=dim_embed, input_length=maxlen))
    else:
        model.add(Embedding(input_dim=dim_input + 1, output_dim=dim_embed, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Dropout(0.5))
    model.add(
        Bidirectional(
            LSTM(
                num_hidden_unit_lstm, 
                return_sequences=True, 
                kernel_regularizer=l2(l2_lambda)
            ), 
            merge_mode="sum"
        ))
    model.add(Dropout(0.2))
    model.add(Reshape((maxlen, dim_embed, 1)))
    model.add(Conv2D(
        num_filter,
        kernel_size=(size_window,size_window),
        padding='valid',
        activation='relu',
        strides=1,
        kernel_regularizer=l2(l2_lambda)))
    model.add(MaxPooling2D(pool_size=size_pool))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(num_class, kernel_regularizer=l2(l2_lambda)))
    model.add(Activation('sigmoid'))

    print(model.summary())
    model.compile(
        loss='categorical_crossentropy', 
        optimizer='Adadelta',
        metrics=['categorical_accuracy'])
    
    history = model.fit(x_train, y_train,
              batch_size=size_batch,
              epochs=epochs,
              validation_data=(x_validation, y_validation),)
    
#     plt.figure('categorical_accuracy')
#     plt.xlabel('Iterations')
#     plt.ylabel('Categorical Accuracy')
#     plt.plot(history.history['categorical_accuracy'])
#     plt.show()
    
    model.save(modelname)
    del model

In [5]:
def test(x_test, y_test):
    print("---------------")
    print("| Testing...")
    print("---------------")
    model = load_model(modelname)
    
    score = model.evaluate(x_test,y_test, batch_size = size_batch)
    print('Test loss:',score[0])
    print('Test accuracy:',score[1])
    
    y_test=np.argmax(y_test,axis=1)
    y_pred=model.predict_classes(x_test)
    
    print(classification_report(y_test,y_pred))

### Test with Adversasial Dataset

In [9]:
# def run():
X, Y, X_adv, Y_adv, dim_input = getData(mixeddataname)
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = splitData(X, Y)
embeddingMatrix = getEmbeddingMatrix(dim_input)

---------------
| Getting data...
---------------
text shape: (123406, 100)
Loaded 400000 word vectors.


In [10]:
train(True, X_train, X_validation, Y_train, Y_validation, dim_input, embeddingMatrix) 

---------------
| Training...
---------------
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          22484700  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 298, 100)      1000      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (

In [11]:
test(X_test, Y_test)

---------------
| Testing...
---------------
Test loss: 0.4301742969500281
Test accuracy: 0.9002333658068897
             precision    recall  f1-score   support

          0       0.89      0.92      0.90     10392
          1       0.91      0.88      0.90     10176

avg / total       0.90      0.90      0.90     20568



In [6]:
test(X_adv, Y_adv)

---------------
| Testing...
---------------
Test loss: 0.5134533019479218
Test accuracy: 0.8537533980157068
             precision    recall  f1-score   support

          0       0.81      0.93      0.86     10330
          1       0.92      0.78      0.84     10238

avg / total       0.86      0.85      0.85     20568



### Without Glove

In [7]:
modelname = 'BLSTM+CNNWithG.h5'

In [8]:
X, Y, X_adv, Y_adv, dim_input = getData(mixeddataname)
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = splitData(X, Y)

---------------
| Getting data...
---------------
text shape: (123406, 100)


In [9]:
print(modelname)

BLSTM+CNNWithG.h5


In [10]:
train(False, X_train, X_validation, Y_train, Y_validation, dim_input, []) 

---------------
| Training...
---------------
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          22484400  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 298, 100)      1000      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (

In [12]:
test(X_test, Y_test)

---------------
| Testing...
---------------
Test loss: 0.2613612111268086
Test accuracy: 0.9067969586524877
             precision    recall  f1-score   support

          0       0.88      0.94      0.91     10392
          1       0.94      0.87      0.90     10176

avg / total       0.91      0.91      0.91     20568



In [10]:
test(X_adv, Y_adv)

---------------
| Testing...
---------------
Test loss: 0.4024347115057375
Test accuracy: 0.8443698892243082
             precision    recall  f1-score   support

          0       0.80      0.92      0.86     10330
          1       0.91      0.77      0.83     10238

avg / total       0.85      0.84      0.84     20568

