In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import os
from keras.utils import np_utils

from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, Embedding, LSTM, Bidirectional, Conv2D, MaxPooling2D
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report
from keras.preprocessing.text import hashing_trick, text_to_word_sequence
from keras.regularizers import l2

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
dataname = "rawdata_100.pkl"
modelname = 'BLSTM+CNN.h5'

num_class = 2
max_features = 100000
maxlen = 100
batch_size = 10
epochs = 5

learning_rate = 0.1
dim_embed = 300
num_hidden_unit_lstm = 300
num_filter = 100
size_batch = 10
size_pool = 2
size_window = 3
dropout_embed = 0.5
dropout_blstm = 0.2
dropout_penultimate = 0.4
l2_lambda = 0.00001

onehot_encoder = OneHotEncoder(sparse=False)
t = Tokenizer(num_words=max_features)

In [3]:
def getEmbeddingMatrix(vocabulary_size):
    embeddings_index = dict()
    f = open('glove.6b/glove.6B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    embedding_matrix = np.zeros((vocabulary_size, dim_embed))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

### 2 classes(0: 1-2 star, 1: 4-5 star)

In [3]:
def getData(filename):
    print("---------------")
    print("| Getting data...")
    print("---------------")
    data = pd.read_pickle(filename)
    
    reviewTextList = data.reviewText.values
    t.fit_on_texts(reviewTextList)
    text = t.texts_to_sequences(reviewTextList)
    text = sequence.pad_sequences(text, maxlen=maxlen)
    print('text shape:', text.shape)
    
    data_X = np.array(text)
    data_Y = np_utils.to_categorical(data['overall'], num_class)
    return data_X, data_Y

In [5]:
def splitData(x, y):
    X_trains, X_test, Y_trains, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    X_train, X_validation, Y_train, Y_validation = train_test_split(X_trains, Y_trains, test_size=0.2, random_state=42)
    return X_train, X_validation, X_test, Y_train, Y_validation, Y_test

In [6]:
def train(containsGlove, x_train, x_validation, y_train, y_validation, embedding_matrix):
    print("---------------")
    print("| Training...")
    print("---------------")
    
    model = Sequential()
#     if (containsGlove == False):
    print(x_train.shape)
    model.add(Embedding(input_dim=max_features, output_dim=dim_embed, input_length=maxlen))
#     else:
#         model.add(Embedding(input_dim=vocabulary_size, output_dim=embed_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Dropout(0.5))
    model.add(
        Bidirectional(
            LSTM(
                num_hidden_unit_lstm, 
                return_sequences=True, 
                kernel_regularizer=l2(l2_lambda)
            ), 
            merge_mode="sum"
        ))
    model.add(Dropout(0.2))
    model.add(Reshape((maxlen, dim_embed, 1)))
    model.add(Conv2D(
        num_filter,
        kernel_size=(size_window,size_window),
        padding='valid',
        activation='relu',
        strides=1,
        kernel_regularizer=l2(l2_lambda)))
    model.add(MaxPooling2D(pool_size=size_pool))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(num_class, kernel_regularizer=l2(l2_lambda)))
    model.add(Activation('sigmoid'))

    print(model.summary())
    model.compile(
        loss='categorical_crossentropy', 
        optimizer='Adadelta',
        metrics=['categorical_accuracy'])
    
    history = model.fit(x_train, y_train,
              batch_size=size_batch,
              epochs=epochs,
              validation_data=(x_validation, y_validation),)
    
#     plt.figure('categorical_accuracy')
#     plt.xlabel('Iterations')
#     plt.ylabel('Categorical Accuracy')
#     plt.plot(history.history['categorical_accuracy'])
#     plt.show()
    
    model.save(modelname)
    del model

In [4]:
def test(x_test, y_test):
    print("---------------")
    print("| Testing...")
    print("---------------")
    model = load_model(modelname)
    
    score = model.evaluate(x_test,y_test, batch_size = size_batch)
    print('Test loss:',score[0])
    print('Test accuracy:',score[1])
    
    y_test=np.argmax(y_test,axis=1)
    y_pred=model.predict_classes(x_test)
    
    print(classification_report(y_test,y_pred))

### Run without GloVe for 5 epochs

In [8]:
def run():
    X,Y = getData(dataname)
    X_train, X_validation, X_test, Y_train, Y_validation, Y_test = splitData(X, Y)
    train(False, X_train, X_validation, Y_train, Y_validation, [])
#     test(X_test, Y_test)

In [9]:
run()

---------------
| Getting data...
---------------
text shape: (102838, 100)
---------------
| Training...
---------------
(65816, 100)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          30000000  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 98, 298, 100)      1000      
_______

In [5]:
X,Y = getData(dataname)
X_trains, X_test, Y_trains, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

---------------
| Getting data...
---------------
text shape: (102838, 100)


In [6]:
# X,Y = getData(dataname)
# X_trains, X_test, Y_trains, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
test(X_test, Y_test)

---------------
| Testing...
---------------
Test loss: 0.24642248967106775
Test accuracy: 0.9070400544951186
             precision    recall  f1-score   support

          0       0.90      0.91      0.91     10330
          1       0.91      0.90      0.91     10238

avg / total       0.91      0.91      0.91     20568



In [7]:
# def continueTraining():
#     model = load_model(modelname)
#     model.fit(x_train, y_train,
#               batch_size=size_batch,
#               epochs=epochs,
#               validation_data=(x_validation, y_validation),)    
#     model.save(modelname)
#     del model

In [8]:
# test for adersasial
filename_adv = "data_adversial_wordnet.pkl"

In [9]:
def newTest(filename):
    x_newtest, y_newtest = getData(filename)
    test(x_newtest, y_newtest)

In [8]:
def testWithoutClassificationReport(x_test, y_test):
    print("---------------")
    print("| Testing...")
    print("---------------")
    model = load_model(modelname)
    
    score = model.evaluate(x_test,y_test, batch_size = size_batch)
    print('Test loss:',score[0])
    print('Test accuracy:',score[1])

In [9]:
def newTestWithoutClassificationReport(filename):
    x_newtest, y_newtest = getData(filename)
    testWithoutClassificationReport(x_newtest, y_newtest)

In [10]:
filename_adv = "data_adversial_wordnet.pkl"
newTestWithoutClassificationReport(filename_adv)

---------------
| Getting data...
---------------
text shape: (102664, 100)
---------------
| Testing...
---------------
Test loss: 0.9573833066646437
Test accuracy: 0.48363594531819604
