In [1]:
# Load IPython's autoreload extension and trigger a reload of imported modules.
# NOTE(review): a bare `%autoreload` presumably reloads only once, when this
# cell runs; `%autoreload 2` would reload before every cell -- confirm which
# behaviour is wanted.
%load_ext autoreload
%autoreload

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import re
from keras.layers import Input, Dense
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, TensorBoard
from keras import metrics


# Apply the 'ggplot' stylesheet to every figure drawn in this notebook
# ('seaborn' is kept below as a disabled alternative).
#plt.style.use('seaborn')
plt.style.use('ggplot')

Reads a subcategory of Amazon reviews into a pandas DataFrame and saves the first 10,000 usable rows as a .pkl file.

Once the pickle exists, restart the kernel and run the following cell with `small_sample_pickled = True`: the full TSV is then never loaded, which allows EDA without excessive memory use.

In [13]:
# Set `small_sample_pickled` to False to (re)create 'data/small_df.pkl'.
# While it is True the full TSV is never read, so the rest of the notebook can
# run from the cached pickle alone.
small_sample_pickled = True

if not small_sample_pickled:
    # NOTE(review): `error_bad_lines` is deprecated in recent pandas releases in
    # favour of `on_bad_lines='skip'`; it is kept here for compatibility with
    # the pandas version this notebook was written against -- confirm.
    df = pd.read_csv('data/gaming_reviews.tsv', sep='\t', error_bad_lines=False)

    # Join headline and body with a space so that the last word of the headline
    # and the first word of the body are not fused into one bogus token when
    # the text is vectorized later.
    df['text'] = df.review_headline + ' ' + df.review_body

    # A missing headline or body makes the combined text NaN; drop those rows.
    df = df[~df.text.isnull()].reset_index(drop=True)
    df = df[['text', 'star_rating']]

    # Keep only the first 10,000 usable rows as the small working sample.
    df = df[0:10000]
    df.to_pickle('data/small_df.pkl')

In [None]:
# NOTE(review): unexecuted experiment (the cell has no execution count). It
# parses the whole TSV with NumPy and binds the result to `df`, which the next
# cell immediately rebinds to the pickled sample -- consider removing this cell.
df = np.genfromtxt('data/gaming_reviews.tsv', skip_header=1, dtype=None, delimiter='\t', invalid_raise=False)

In [14]:
# Load the cached sample (columns 'text' and 'star_rating') written by the
# sample-creation cell, then display its (rows, columns) shape.
df = pd.read_pickle("data/small_df.pkl")
df.shape

(10000, 2)

The `review_headline` and `review_body` columns will both be useful for prediction.
The review date might be useful as well; for example, the day of the week on which the review was written.

In [15]:
# Count-vectorize the review text into a sparse document-term matrix
# (accented characters are folded to ASCII before tokenizing).
# NOTE(review): the vocabulary is fitted on every row, i.e. before the
# train/test split performed further down -- confirm that is acceptable.
cv = CountVectorizer(strip_accents='ascii')
sparse_vec = cv.fit_transform(df.text)


In [16]:
# Vocabulary size, i.e. the number of features each review is encoded into.
# Read from `sparse_vec` because `X_train` is only created in a later cell, so
# referencing it here fails on a fresh kernel; the column count is the same.
sparse_vec.shape[1]

17275

In [17]:
def autoencoder_model(X_train):
    '''
    Build and compile a symmetric dense autoencoder (256 -> 64 -> 256).

    input: X_train (2D np array or scipy sparse matrix, samples x features);
           only its number of columns is used, to size the input and output layers
    output: autoencoder (compiled Keras Model mapping an input row to its reconstruction)
    '''
    # Number of features per sample. Taken from the matrix shape rather than
    # from row 0, so plain 2D arrays (whose rows are 1-D) work as well as
    # sparse matrices.
    n_features = X_train.shape[1]

    # input placeholder: one flat feature vector per sample
    input_img = Input(shape=(n_features,))

    # encoder: compress the features down to a 64-unit bottleneck
    encoded1 = Dense(units=256, activation='relu', name='layer1_256')(input_img)
    encoded2 = Dense(units=64, activation='relu', name='layer2_64')(encoded1)

    # decoder: expand the bottleneck back to the input width
    decoded1 = Dense(units=256, activation='relu', name='layer3_256')(encoded2)

    # NOTE(review): sigmoid bounds every output to (0, 1), so inputs larger
    # than 1 cannot be reconstructed exactly -- confirm this is intended.
    decoded2 = Dense(units=n_features, activation='sigmoid', name='layer4_output')(decoded1)

    # this model maps an input to its reconstruction
    autoencoder = Model(input_img, decoded2)

    # 'mse' is also listed as a metric so that evaluate() returns a list of scores
    autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

    return autoencoder

In [18]:
#Train test split for small sample size
# Train/test split for the small sample: the first 10% of rows are held out as
# the test set and the remaining rows are used for training.
# NOTE(review): rows are split in file order, without shuffling.
test_stop_index = int(sparse_vec.shape[0] * 0.1)
star_ratings = np.array(df.star_rating)

X_test = sparse_vec[0:test_stop_index]
y_test = star_ratings[0:test_stop_index]

X_train = sparse_vec[test_stop_index:]
y_train = star_ratings[test_stop_index:]

# One-hot encode the star rating for the transfer-learned neural net.
# The encoder is fitted on every label instead of on the test slice only: a
# rating that happens to be absent from the test slice would otherwise make
# `transform` fail on the training labels and change the number of columns.
enc = OneHotEncoder(categories='auto')
enc.fit(star_ratings.reshape(-1, 1))
y_test = enc.transform(y_test.reshape(-1, 1)).toarray()
y_train = enc.transform(y_train.reshape(-1, 1)).toarray()

In [19]:
def batch_generator(X, batch_size, y=None):
    '''
    Endlessly yield shuffled mini-batches of X as dense arrays.

    input: X (scipy sparse matrix or 2D np array, samples x features)
           batch_size (int, rows per batch; the last batch of a pass may be smaller)
           y (optional np array of targets aligned with the rows of X; when it is
              omitted or empty each batch is its own target, as an autoencoder needs)
    output: generator of (X_batch, target_batch) tuples; every row is visited
            exactly once per pass and the order is reshuffled before each pass
    raises: ValueError (on first iteration) if batch_size < 1 or X has no rows
    '''
    n_samples = X.shape[0]
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if n_samples == 0:
        raise ValueError('X must contain at least one sample')

    # `y=[]` used to be the way of saying "no targets", so an empty y is still
    # treated like None. Checking the length avoids comparing an array to [].
    has_targets = y is not None and np.shape(y)[0] > 0

    while True:
        # New random order for every pass over the data.
        shuffle_index = np.random.permutation(n_samples)

        # The pass length is derived from the data itself, so no batch is ever
        # empty and no global variable is needed.
        for start in range(0, n_samples, batch_size):
            index_batch = shuffle_index[start:start + batch_size]
            X_batch = X[index_batch]

            # Sparse input is densified one batch at a time to limit memory use;
            # dense input is passed through unchanged.
            if hasattr(X_batch, 'toarray'):
                X_batch = X_batch.toarray()

            if has_targets:
                yield X_batch, y[index_batch]
            else:
                yield X_batch, X_batch

In [21]:
# Set `autoencoder_model_created` to True to reuse the model saved at
# `model_path` instead of training a new autoencoder.
autoencoder_model_created = False
model_path = 'models/basic_autoencoder1.h5'

if not autoencoder_model_created:
    model = autoencoder_model(X_train)

    batch_size = 1000
    nb_epoch = 10
    # Number of batches drawn from the generator per epoch. The variable keeps
    # its original name because other cells may read it as a notebook global.
    samples_per_epoch = 10

    # instantiate callbacks
    # NOTE(review): these callbacks are created but not passed to training;
    # EarlyStopping monitors `val_loss` and no validation data is supplied to
    # `fit_generator` below.
    tensorboard = TensorBoard(log_dir='./autoencoder_logs', histogram_freq=2, batch_size=batch_size, write_graph=True, write_grads=True, write_images=True)
    earlystopping = EarlyStopping(monitor='val_loss', patience=2)

    # try different number of epochs - 10 gives good performance
    # `steps_per_epoch` is the Keras 2 keyword (the classifier cell already uses
    # it); the Keras 1 spelling `samples_per_epoch` is only accepted through a
    # deprecated compatibility shim.
    model.fit_generator(generator=batch_generator(X_train, batch_size),
                        epochs=nb_epoch,
                        steps_per_epoch=samples_per_epoch)

    # Save right after training so a failure during evaluation cannot lose the
    # trained weights.
    model.save(model_path)

    X_test_decoded = model.predict(X_test)

else:
    model = load_model(model_path)

# Reconstruction error on the held-out rows, for a fresh or a reloaded model.
scores = model.evaluate(X_test, X_test)
print('Test mse = {}'.format(scores[0]))




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [10]:
# Show the names of the first four layers of the autoencoder, one per line.
print(*(model.layers[idx].name for idx in range(4)), sep='\n')

input_2
layer1_256
layer2_64
layer3_256


In [None]:
# Freeze the first four layers so their weights stay fixed when the model is
# reused below; each frozen layer's name is printed as confirmation.
for position in range(4):
    frozen_layer = model.layers[position]
    print(frozen_layer.name)
    frozen_layer.trainable = False

In [None]:
# Transfer-learning head: start from the output of the autoencoder layer at
# index 3 and stack a small classifier on top of it.
frozen_features = model.layers[3].output

# NOTE(review): this layer has 64 units although its name says 256; the name is
# left unchanged in case saved weights are looked up by layer name.
hidden = Dense(units=64, activation='relu', name='layer4_256')(frozen_features)

# One output per star rating (5 classes). Softmax makes the five outputs a
# probability distribution, which is what categorical cross-entropy expects for
# single-label classification; the previous `hard_sigmoid` scored each class
# independently and has zero gradient once saturated.
star_probabilities = Dense(5, activation='softmax', name='star_classification')(hidden)

new_model = Model(inputs=model.input, outputs=star_probabilities)

new_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=[metrics.categorical_accuracy])

In [None]:
batch_size = 500
nb_epoch = 15
# Number of batches drawn from the generator per epoch. The variable keeps its
# original name because other cells may read it as a notebook global.
samples_per_epoch = 10

# The classifier gets its own file. Saving it to
# 'models/basic_autoencoder1.h5' would overwrite the trained autoencoder that
# the autoencoder cell reloads from that path.
classifier_model_path = 'models/star_classifier1.h5'

# instantiate callbacks
# NOTE(review): these callbacks are created but not passed to training;
# EarlyStopping monitors `val_loss` and no validation data is supplied to
# `fit_generator` below.
tensorboard = TensorBoard(log_dir='./autoencoder_logs', histogram_freq=2, batch_size=batch_size, write_graph=True, write_grads=True, write_images=True)
earlystopping = EarlyStopping(monitor='val_loss', patience=2)

# Train the classifier on (review, one-hot star rating) batches.
new_model.fit_generator(generator=batch_generator(X_train, batch_size, y_train),
                        epochs=nb_epoch,
                        steps_per_epoch=samples_per_epoch)

# scores[0] is the loss; scores[1] is the categorical accuracy metric.
scores = new_model.evaluate(X_test, y_test)
print('Test accuracy = {}'.format(scores[1]))

# NOTE(review): despite the name, these are the classifier's outputs for
# the test rows, not reconstructions; the name is kept for compatibility.
X_test_decoded = new_model.predict(X_test)

new_model.save(classifier_model_path)

In [17]:
# Disabled scratch code: it sits inside a string literal, so running this cell
# executes none of it. The live one-hot encoding is done in the train/test
# split cell.
"""enc = OneHotEncoder()
enc.fit(y_test.reshape(-1,1))
enc.transform(y_test.reshape(-1,1)).toarray()
enc.transform(y_train.reshape(-1,1)).toarray()"""

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])