In [1]:
%load_ext autoreload
%autoreload

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import re
from keras.layers import Input, Dense
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, TensorBoard
from keras import metrics
import json
from helper_functions import import_data
from sklearn.metrics import confusion_matrix
from os import listdir



In [42]:
len(listdir('jsons'))

900

In [43]:
test_json_path = 'jsons/part-00001-3a3b13a1-b857-4d1c-965f-ce7bf860f3bc-c000.json'
dense_matrix, y_vals = import_data(test_json_path)
X_test = dense_matrix[[0:1000, :]]
X_train = dense_matrix[1000:, :]

y_test = y_vals[0:1000]
y_train = y_vals[1000:]
X_test.shape

SyntaxError: invalid syntax (<ipython-input-43-9aa0fec60091>, line 3)

In [47]:
def autoencoder_model(X_train):
    '''
    defines autoencoder model
    input: X_train (2D np array)
    output: autoencoder (compiled autoencoder model)
    '''
    # this is our input placeholder
    input_img = Input(shape=(X_train.shape[1],))

    # first encoding layer
    encoded1 = Dense(units = 1000, activation = 'relu', name='layer1_256')(input_img)

    # second encoding layer
    # note that each layer is multiplied by the layer before
    encoded2 = Dense(units = 200, activation='relu', name='layer2_64')(encoded1)

    # first decoding layer
    decoded1 = Dense(units = 1000, activation='relu', name='layer3_256')(encoded2)

    # second decoding layer - this produces the output
    decoded2 = Dense(units = X_train.shape[1], activation='sigmoid', name='layer4_output')(decoded1)

    # this model maps an input to its reconstruction
    autoencoder = Model(input_img, decoded2)

    # compile model
    autoencoder.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics=['mse'])

    return autoencoder

In [57]:
def batch_generator(file_dir, batch_size, autoencoder_layer=True):
    #creates list of filename strings from jsons directory
    dir_list = listdir('jsons')
    
    #Determines number of batches to produce
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    file_counter = 0
    
    #Creates random shuffle index
    file_shuffle_index = np.arange(len(dir_list))
    np.random.shuffle(file_shuffle_index)
    
    #Uses first random shuffle index to select a file
    X, y = import_data('jsons/' + dir_list[file_shuffle_index[file_counter]])
    
    #Creates shuffle index arrays to randomize the data rows as well as the files used
    data_shuffle_index = np.arange(np.shape(X)[0])
    np.random.shuffle(data_shuffle_index)
    
    while 1:
        #Randomizes selection of data rows
        index_batch = data_shuffle_index[batch_size*counter:batch_size*(counter+1)]
        X = X[index_batch, :]
        y = y[index_batch]
        
        #Saves the smallest count of good/bad reviews 
        smallest_good_or_bad_count = min([y[y == 0].shape[0], y[y == 0].shape[0]])
        
        #Creates an array with a 50/50 split of good/bad reviews
        X_batch_good = X[y == 1][0:smallest_good_or_bad_count, :]
        X_batch_bad = X[y == 0][0:smallest_good_or_bad_count, :]
        y_batch_good = y[y == 1][0:smallest_good_or_bad_count]
        y_batch_bad = y[y == 0][0:smallest_good_or_bad_count]
        
        #Stacks the arrays
        X_batch = np.vstack((X_batch_good, X_batch_bad))
        y_batch = np.vstack((y_batch_good.reshape(-1, 1), y_batch_bad.reshape(-1,1)))
        
        #Randomizes the reviews so there isn't a block of good/bad reviews
        batch_shuffle_index = np.arange(X_batch.shape[0])
        np.random.shuffle(batch_shuffle_index)
        X_batch = X_batch[batch_shuffle_index, :]
        y_batch = y_batch[batch_shuffle_index]
        
        counter += 1
        
        if autoencoder_layer:
            yield X_batch, X_batch
        else:
            yield X_batch, y_batch
        
        
        if (counter >= number_of_batches):
            counter=0
            file_counter +=1
            
            if file_counter == len(dir_list):
                np.random.shuffle(file_shuffle_index)
                file_count = 0
                
            np.random.shuffle(data_shuffle_index)
            X, y = import_data('jsons/' + dir_list[file_shuffle_index[file_counter]])
            

In [58]:
X_test, y_test = import_data('jsons_test/' + listdir('jsons_test')[0])

In [None]:
autoencoder_model_created = False
model_path = 'models/basic_autoencoder1.h5'

if not autoencoder_model_created:
    model = autoencoder_model(X_test)

    batch_size = 1000
    nb_epoch = 10
    samples_per_epoch = 15

    # instantiate callbacks
    tensorboard = TensorBoard(log_dir='./autoencoder_logs', histogram_freq=2, batch_size=batch_size, write_graph=True, write_grads=True, write_images=True)
    earlystopping = EarlyStopping(monitor='val_loss', patience=2)

    # try different number of epochs - 10 gives good performanace 
    """model.fit(X_train, X_train, epochs=10, batch_size=batch_size, verbose=1,
              validation_split=0.1, callbacks = [earlystopping, tensorboard])""" # cross val to estimate test error


    model.fit_generator(generator=batch_generator('jsons', batch_size),
                        epochs=nb_epoch,
                        steps_per_epoch=samples_per_epoch)


    scores = model.evaluate(X_test, X_test)
    print('Test mse = {}'.format(scores[0]))

    X_test_decoded = model.predict(X_test)
    
    model.save(model_path)

else:
    model = load_model(model_path)
    scores = model.evaluate(X_test, X_test)
    print('Test mse = {}'.format(scores[0]))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10

In [12]:
for i in range(4):
    print(model.layers[i].name)

input_1
layer1_256
layer2_64
layer3_256


In [13]:
for i in range(4):
    print(model.layers[i].name)
    model.layers[i].trainable = False

input_1
layer1_256
layer2_64
layer3_256


In [21]:
ll = model.layers[3].output
ll = Dense(units = 64, activation='relu', name='layer4_256')(ll)
ll = Dense(1,activation="hard_sigmoid", name='positive_classification')(ll)
new_model = Model(inputs=model.input, outputs=ll)

new_model.compile(optimizer = 'adam', loss = 'mse', metrics=[metrics.categorical_accuracy])

In [22]:
batch_size = 500
nb_epoch = 30
samples_per_epoch = 15
model_path = 'models/basic_autoencoder1.h5'

# instantiate callbacks
tensorboard = TensorBoard(log_dir='./autoencoder_logs', histogram_freq=2, batch_size=batch_size, write_graph=True, write_grads=True, write_images=True)
earlystopping = EarlyStopping(monitor='val_loss', patience=2)

# try different number of epochs - 10 gives good performanace 
"""model.fit(X_train, X_train, epochs=10, batch_size=batch_size, verbose=1,
          validation_split=0.1, callbacks = [earlystopping, tensorboard])""" # cross val to estimate test error


new_model.fit_generator(generator=batch_generator('jsons', batch_size, autoencoder_layer=False),
                    epochs=nb_epoch,
                    steps_per_epoch=samples_per_epoch)


scores = new_model.evaluate(X_test, y_test)
print('Test accuracy = {}'.format(scores[1]))

X_test_decoded = new_model.predict(X_test)

new_model.save(model_path)

Epoch 1/30


  if sys.path[0] == '':


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test accuracy = 1.0


In [18]:
X_test_decoded

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

In [20]:
confusion_matrix(y_test, X_test_decoded.astype(int))

array([[  0, 116],
       [  0, 884]])

In [29]:
y_test

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [32]:
X_test_decoded.astype(int)

array([[0, 0],
       [0, 0],
       [0, 0],
       ...,
       [0, 0],
       [0, 0],
       [0, 0]])