In [9]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk
import os
from keras.utils import np_utils

from keras.preprocessing import sequence
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, Embedding, LSTM, Bidirectional, Conv2D, MaxPooling2D
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import classification_report
from keras.preprocessing.text import hashing_trick, text_to_word_sequence
from keras.regularizers import l2

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [10]:
dataname = "rawdata_100.pkl"
modelname = 'BLSTM+CNN.h5'

num_class = 2
max_features = 100000
maxlen = 100
batch_size = 10
epochs = 5

learning_rate = 0.1
dim_embed = 300
num_hidden_unit_lstm = 300
num_filter = 100
size_batch = 10
size_pool = 2
size_window = 3
dropout_embed = 0.5
dropout_blstm = 0.2
dropout_penultimate = 0.4
l2_lambda = 0.00001

onehot_encoder = OneHotEncoder(sparse=False)
t = Tokenizer(num_words=max_features)

In [11]:
def getEmbeddingMatrix(vocabulary_size):
    embeddings_index = dict()
    f = open('glove.6b/glove.6B.300d.txt', encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Loaded %s word vectors.' % len(embeddings_index))
    embedding_matrix = np.zeros((vocabulary_size, dim_embed))
    for word, i in t.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

### 2 classes(0: 1-2 star, 1: 4-5 star)

In [12]:
def getData(filename):
    print("---------------")
    print("| Getting data...")
    print("---------------")
    data = pd.read_pickle(filename)
    
    reviewTextList = data.reviewText.values
    t.fit_on_texts(reviewTextList)
    text = t.texts_to_sequences(reviewTextList)
    text = sequence.pad_sequences(text, maxlen=maxlen)
    print('text shape:', text.shape)
    
    data_X = np.array(text)
    data_Y = np_utils.to_categorical(data['overall'], num_class)
    return data_X, data_Y

In [13]:
def splitData(x, y):
    X_trains, X_test, Y_trains, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    X_train, X_validation, Y_train, Y_validation = train_test_split(X_trains, Y_trains, test_size=0.2, random_state=42)
    return X_train, X_validation, X_test, Y_train, Y_validation, Y_test

In [14]:
def train(containsGlove, x_train, x_validation, y_train, y_validation, embedding_matrix):
    print("---------------")
    print("| Training...")
    print("---------------")
    
    model = Sequential()
#     if (containsGlove == False):
    print(x_train.shape)
    model.add(Embedding(input_dim=max_features, output_dim=dim_embed, input_length=maxlen))
#     else:
#         model.add(Embedding(input_dim=vocabulary_size, output_dim=embed_dim, input_length=maxlen, weights=[embedding_matrix], trainable=False))
    model.add(Dropout(0.5))
    model.add(
        Bidirectional(
            LSTM(
                num_hidden_unit_lstm, 
                return_sequences=True, 
                kernel_regularizer=l2(l2_lambda)
            ), 
            merge_mode="sum"
        ))
    model.add(Dropout(0.2))
    model.add(Reshape((maxlen, dim_embed, 1)))
    model.add(Conv2D(
        num_filter,
        kernel_size=(size_window,size_window),
        padding='valid',
        activation='relu',
        strides=1,
        kernel_regularizer=l2(l2_lambda)))
    model.add(MaxPooling2D(pool_size=size_pool))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(num_class, kernel_regularizer=l2(l2_lambda)))
    model.add(Activation('sigmoid'))

    print(model.summary())
    model.compile(
        loss='categorical_crossentropy', 
        optimizer='Adadelta',
        metrics=['categorical_accuracy'])
    
    history = model.fit(x_train, y_train,
              batch_size=size_batch,
              epochs=epochs,
              validation_data=(x_validation, y_validation),)
    
    plt.figure('categorical_accuracy')
    plt.xlabel('Iterations')
    plt.ylabel('Categorical Accuracy')
    plt.plot(history.history['categorical_accuracy'])
    plt.show()
    
    model.save(modelname)
    del model

In [15]:
def test(x_test, y_test):
    print("---------------")
    print("| Testing...")
    print("---------------")
    model = load_model('SVM.h5')
    
    score = model.evaluate(x_test,y_test, batch_size = size_batch)
    print('Test loss:',score[0])
    print('Test accuracy:',score[1])
    
    y_test=np.argmax(y_test,axis=1)
    y_pred=model.predict_classes(x_test)
    
    print(classification_report(y_test,y_pred))

### Run without GloVe for 5 epochs

In [16]:
def run():
    X,Y = getData(dataname)
    X_train, X_validation, X_test, Y_train, Y_validation, Y_test = splitData(X, Y)
    train(False, X_train, X_validation, Y_train, Y_validation, [])
    test(X_test, Y_test)

In [17]:
run()

---------------
| Getting data...
---------------


KeyboardInterrupt: 

In [None]:
# test for adersasial

In [None]:
def newTest(filename):
    x_newtest, y_newtest = getData(filename)
    test(x_newtest, y_newtest)

In [None]:
filename_adv = "data_adversial_wordnet.pkl"

In [None]:
newTest(filename_adv)

In [22]:
vocabulary_size = 0
num_class = 2
for filename in filenames:
    print("-----------------------------")
    print(filename + ":")
    X, Y, vocabulary_size = getXY(filename, num_class)
    for train_index, test_index in randomShuffle.split(X):
        X_train = X[train_index]
        Y_train = Y[train_index, :]
        X_test = X[test_index]
        Y_test = Y[test_index, :]
        
        bidirectionalLSTMCNNModel(False, num_class, 'categorical_crossentropy', X_train, X_test, Y_train, Y_test, 10, 5, vocabulary_size, [], 'BLSTM+CNN.h5')

-----------------------------
rawdata_100.pkl:
text shape: (102838, 100)
vocabulary size: 72438
x_train:  (82270, 100)
y_train:  (82270, 2)
x_test:  (20568, 100)
y_test:  (20568, 2)
vocabulary_size: 72438
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          21731400  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________________________________________________
con

KeyboardInterrupt: 

In [1]:
model = Sequential()
model.save('new.h5')

NameError: name 'Sequential' is not defined

In [None]:
model = load_model('BLSTM+CNN.h5')

In [7]:
def getXY(filename, num_class):
    cat = [0, 1]
    def cat_y(y):
        if y<=2.0:
            return cat[0]
        elif y>=4.0:
            return cat[1]

    def get_reviews(path, n_samples):
        dt = {}
        i=0
        with open(path) as f:
            for d in f.readlines():
                dt[i] = eval(d)
                i += 1
        df = pd.DataFrame.from_dict(dt, orient='index')[['reviewText','overall']]
        print("before: ", len(df))
        df = df.drop(df[df.overall == 3.0].index)
        print("after: ", len(df))
        df = df[df['reviewText'].apply(lambda x: len(x.split())>=45)]
        df['bucket'] = df['overall'].apply(cat_y)

        df = df.groupby('bucket').apply(lambda x: x.sample(n=n_samples))
        return df

    data = get_reviews('data/reviews_Home_and_Kitchen_5.json', 10000)
    
    good_columns = [3]
    onehot_encoder = OneHotEncoder(sparse=False)
    data_X = data.iloc[:, 0]
    data_Y = data.iloc[:, 2]
    
    reviewTextList = data.reviewText.values
    t.fit_on_texts(reviewTextList)
    text = t.texts_to_sequences(reviewTextList)
    text = sequence.pad_sequences(text, maxlen=maxlen)
    print('text shape:', text.shape)
    vocabulary_size = len(t.word_index) + 1
    print('vocabulary size:', vocabulary_size)
    
    data_X = np.array(text)
    data_Y = np_utils.to_categorical(data_Y, num_class)
    return data_X, data_Y, vocabulary_size

In [10]:
vocabulary_size = 0
num_class = 2
for filename in filenames:
    print("-----------------------------")
    print(filename + ":")
    X, Y, vocabulary_size = getXY(filename, num_class)
    for train_index, test_index in randomShuffle.split(X):
        X_train = X[train_index]
        Y_train = Y[train_index, :]
        X_test = X[test_index]
        Y_test = Y[test_index, :]
        
        bidirectionalLSTMCNNModel(False, num_class, 'categorical_crossentropy', X_train, X_test, Y_train, Y_test, 10, 5, vocabulary_size, [], 'BLSTM+CNN.h5')

-----------------------------
reviews_Home_and_Kitchen_5.json:
before:  551682
after:  506623
text shape: (20000, 100)
vocabulary size: 59641
x_train:  (16000, 100)
y_train:  (16000, 2)
x_test:  (4000, 100)
y_test:  (4000, 2)
vocabulary_size: 59641
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          17892300  
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 300)          1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 100, 300, 1)       0         
_________________________

In [32]:
model = load_model('BLSTM+CNN.h5')
filename_adv = "data_adversial_wordnet.pkl"
num_class = 2
onehot_encoder = OneHotEncoder(sparse=False)

In [37]:
data = pd.read_pickle(filename_adv)

In [34]:
def getXY(filename, num_class):
    data = pd.read_pickle(filename)
    
    data_X = data.iloc[:, 3]
    data_Y = data.iloc[:, 2]
    
    reviewTextList = data.reviewText.values
    t.fit_on_texts(reviewTextList)
    text = t.texts_to_sequences(reviewTextList)
    text = sequence.pad_sequences(text, maxlen=maxlen)
    print('text shape:', text.shape)
    vocabulary_size = len(t.word_index) + 1
    print('vocabulary size:', vocabulary_size)
    
    data_X = np.array(text)
    data_Y = np_utils.to_categorical(data_Y, num_class)
    return data_X, data_Y, vocabulary_size

In [35]:
X_test_adv, Y_test_adv, input_dim_test_adv = getXY(filename_adv, num_class)

text shape: (102664, 100)
vocabulary size: 70318


In [36]:
score=model.evaluate(X_test_adv,Y_test_adv)
print('Test loss:',score[0])
print('Test accuracy:',score[1])
Y_test_adv=np.argmax(Y_test_adv,axis=1)
Y_pred_adv=model.predict_classes(X_test_adv)
print(classification_report(Y_test_adv,Y_pred_adv))



InvalidArgumentError: indices[28,83] = 59647 is not in [0, 59641)
	 [[Node: embedding_1_3/embedding_lookup = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, _class=["loc:@dropout_1_3/cond/Switch_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](embedding_1_3/embeddings/read, embedding_1_3/Cast, embedding_1_3/embedding_lookup/axis)]]