# Search all Parameters and HyperParameters

### Load Entire Dataset
Loading the full dataset takes about 51 seconds (wall time).

In [2]:
%%time
# REVIEW_FILE_CSV = 'reviews.csv'
SHUFFLED_REVIEW_FILE_CSV = 'shuffled.reviews.csv'
import pandas as pd
df_all = pd.read_csv(SHUFFLED_REVIEW_FILE_CSV)
del pd, SHUFFLED_REVIEW_FILE_CSV
%whos

df_all	 
CPU times: user 48.1 s, sys: 7.09 s, total: 55.2 s
Wall time: 51.4 s


### Data Prep Functions

In [3]:
def get_data(size, metric):
    """Return the first `size` rows of the loaded dataset with two columns.

    Parameters
    ----------
    size : int
        Number of leading rows to keep (passed to ``DataFrame.head``).
    metric : str
        Name of the target column to keep next to ``'text'``.

    Returns
    -------
    DataFrame with the columns ``[metric, 'text']``.

    Reads the module-level ``df_all`` frame.
    """
    # Truncate first, then select columns: this avoids copying two full
    # columns of the whole dataset just to keep a handful of rows.
    return df_all.head(size)[[metric, 'text']]

def data_prep(df, metric):
    """Tokenize the review text and build a train/test split for one metric.

    Parameters
    ----------
    df : DataFrame
        Must contain a ``text`` column and the ``metric`` column.
    metric : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(X, Y, VOCAB_SIZE, X_train, X_test, Y_train, Y_test)`` where ``X``
        is the padded sequence matrix for every row, ``Y`` is the target
        divided by its maximum, and ``VOCAB_SIZE`` is the number of distinct
        tokens plus one.

    Notes
    -----
    The tokenizer is fitted on all rows of ``df`` before the split, so the
    vocabulary also covers the held-out rows.
    """
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from sklearn.model_selection import train_test_split

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df.text.values)
    # +1 because word_index starts at 1; index 0 is left for padding.
    VOCAB_SIZE = len(tokenizer.word_index) + 1

    X = tokenizer.texts_to_sequences(df.text.values)
    X = pad_sequences(X)

    # Normalize Y to be between 0 and 1.
    # Series.max() is vectorized and skips NaN, unlike the builtin max().
    peak = df[metric].max()
    if peak == 0 or peak != peak:
        # An all-zero (or all-missing) metric cannot be scaled; dividing
        # would turn every target into NaN, so keep the values as they are.
        Y = df[metric].astype(float)
    else:
        Y = df[metric] / peak

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

    return X, Y, VOCAB_SIZE, X_train, X_test, Y_train, Y_test

### Parameterized Neural Net Factory

In [4]:
def model_factory(X_train, VOCAB_SIZE, EMBED_OUTPUT_DIM, LSTM_OUT, LSTM_DROPOUT, RECURRENT_DROPOUT, USE_SPATIAL_DROPOUT, SPATIAL_DROPOUT, LSTM_LAYER_COUNT=None):
    """Build and compile an Embedding -> (SpatialDropout1D) -> LSTM stack -> Dense(1) model.

    Parameters
    ----------
    X_train : array-like
        Only ``X_train.shape[1]`` is read; it becomes the embedding's
        ``input_length``.
    VOCAB_SIZE : int
        ``input_dim`` of the embedding layer.
    EMBED_OUTPUT_DIM : int
        ``output_dim`` of the embedding layer.
    LSTM_OUT : int
        Number of units in every LSTM layer.
    LSTM_DROPOUT, RECURRENT_DROPOUT : float
        ``dropout`` and ``recurrent_dropout`` of every LSTM layer.
    USE_SPATIAL_DROPOUT : bool
        When true, a ``SpatialDropout1D(SPATIAL_DROPOUT)`` layer follows the
        embedding.
    SPATIAL_DROPOUT : float
        Rate of that optional dropout layer.
    LSTM_LAYER_COUNT : int, optional
        Total number of stacked LSTM layers. When omitted, the module-level
        ``LSTM_LAYER_COUNT`` is used if it exists, otherwise 1.

    Returns
    -------
    The compiled ``Sequential`` model (adam optimizer, mean squared error
    loss, metrics ``mae`` and ``accuracy``).
    """
    from keras.models import Sequential
    from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

    # Earlier versions read the layer count straight from a global; keep that
    # working for callers that do not pass it explicitly.
    if LSTM_LAYER_COUNT is None:
        LSTM_LAYER_COUNT = globals().get('LSTM_LAYER_COUNT', 1)

    model = Sequential()

    # TODO https://realpython.com/python-keras-text-classification/

    # https://keras.io/layers/embeddings/
    # keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=False, input_length=None)
    model.add(Embedding(VOCAB_SIZE,
                        EMBED_OUTPUT_DIM,
                        mask_zero=True,
                        input_length=X_train.shape[1]))

    if USE_SPATIAL_DROPOUT:
        model.add(SpatialDropout1D(SPATIAL_DROPOUT))

    # Every LSTM except the last must return the full sequence so that the
    # next LSTM has a sequence to consume. The final layer is added below, so
    # only LSTM_LAYER_COUNT - 1 intermediate layers are needed here (adding
    # LSTM_LAYER_COUNT of them would build one layer too many).
    for _ in range(max(LSTM_LAYER_COUNT - 1, 0)):
        model.add(LSTM(LSTM_OUT, return_sequences=True, dropout=LSTM_DROPOUT, recurrent_dropout=RECURRENT_DROPOUT))

    model.add(LSTM(LSTM_OUT, dropout=LSTM_DROPOUT, recurrent_dropout=RECURRENT_DROPOUT))

    # Single linear unit: the model predicts one continuous value.
    model.add(Dense(1, activation='linear'))
    # NOTE(review): 'accuracy' is kept because callers unpack three values
    # from model.evaluate(); it is presumably of limited use for a
    # continuous target - confirm whether it is still wanted.
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'accuracy'])
    return model

### Plot Training History and Evaluate

In [None]:
def evaluate(model, model_name, X_train, Y_train, EPOCHS, BATCH_SIZE,
             history=None, X_test=None, Y_test=None):
    """Plot the training curves, score the model on the test split and plot predictions.

    Parameters
    ----------
    model
        Fitted model; its ``evaluate`` and ``predict`` methods are called.
    model_name, X_train, Y_train, EPOCHS
        Accepted so existing call sites keep working; not used here.
    BATCH_SIZE : int
        Batch size passed to ``model.evaluate``.
    history : optional
        Object whose ``.history`` dict holds the per-epoch curves. Defaults
        to the module-level ``history``.
    X_test, Y_test : optional
        Held-out inputs and targets. Default to the module-level ``X_test``
        and ``Y_test``.

    Returns
    -------
    tuple
        ``(loss, mae, acc)`` as returned by ``model.evaluate``.
    """
    import matplotlib.pyplot as plt

    # Earlier versions read these three straight from the notebook namespace;
    # keep that as the fallback so the six-argument call still works.
    namespace = globals()
    if history is None:
        history = namespace['history']
    if X_test is None:
        X_test = namespace['X_test']
    if Y_test is None:
        Y_test = namespace['Y_test']

    curves = history.history
    # Older Keras records the accuracy metric as 'acc', newer releases as
    # 'accuracy'; accept either so the plot does not fail with a KeyError.
    acc_key = 'acc' if 'acc' in curves else 'accuracy'

    plt.plot(curves[acc_key], '.-')
    plt.plot(curves['val_' + acc_key], '.-')
    plt.plot(curves['loss'], '.-')
    plt.plot(curves['val_loss'], '.-')

    plt.title('training')
    plt.xlabel('epoch')
    plt.legend(['acc', 'val_acc', 'loss', 'val_loss'], loc='best')
    plt.show()

    # Order follows model.compile(): the loss first, then the metrics.
    loss, mae, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=BATCH_SIZE)
    print('loss(mse):' + str(loss))  # mse
    print('mae:' + str(mae))
    print('acc:' + str(acc))

    y_pred = model.predict(X_test)

    # Measured vs. predicted; the dashed diagonal marks a perfect prediction.
    fig, ax = plt.subplots()
    ax.scatter(Y_test, y_pred)
    ax.plot([Y_test.min(), Y_test.max()], [Y_test.min(), Y_test.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()

    return loss, mae, acc
    


### Parameters and Hyperparameters to Search

In [5]:
# Data selection
DATA_SIZE = 20  # number of leading rows taken from the full dataset
METRIC = 'stars'  # ['stars','funny'] # 'useful' ,'cool']

# Embedding
EMBED_OUTPUT_DIM = 8  # 128

# Optional dropout right after the embedding
USE_SPATIAL_DROPOUT = False
SPATIAL_DROPOUT = 0.1

# LSTM stack
LSTM_LAYER_COUNT = 1
LSTM_OUT = 8  # 196
LSTM_DROPOUT = 0.1
RECURRENT_DROPOUT = 0.1

# Training
EPOCHS = 1
# The search cell uses BATCH_SIZE for the model name, fit() and evaluation,
# but it was never defined. 32 is the Keras default batch size.
BATCH_SIZE = 32

### Perform Search

In [9]:
%%time

import time

df = get_data(DATA_SIZE, METRIC)
X, Y, VOCAB_SIZE, X_train, X_test, Y_train, Y_test = data_prep(df, METRIC)

model = model_factory(X_train, VOCAB_SIZE, EMBED_OUTPUT_DIM, LSTM_OUT, LSTM_DROPOUT, RECURRENT_DROPOUT, USE_SPATIAL_DROPOUT, SPATIAL_DROPOUT)
print(model.summary())

model_name = METRIC + '_epochs-' + str(EPOCHS) + '_batch_sz-' + str(BATCH_SIZE) + '_data_sz-' + str(DATA_SIZE)
start_time = time.time()
history = model.fit(X_train, Y_train, validation_split=0.333, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=2)
fit_time = time.time() - start_time

loss, mae, acc = evaluate_model(model, model_name, X_train, Y_train, EPOCHS, BATCH_SIZE)

model.save(model_name + '.' + str(fit_time) + '_loss-' +  str(loss) + '_mae-' + str(mae) + '_acc=' + str(acc) + '.h5')

%whos

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 259, 8)            5968      
_________________________________________________________________
lstm_2 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 6,521
Trainable params: 6,521
Non-trainable params: 0
_________________________________________________________________
None
Variable              Type          Data/Info
---------------------------------------------
DATA_SIZE             int           20
EMBED_OUTPUT_DIM      int           8
EPOCHS                int           1
LSTM_DROPOUT          float         0.1
LSTM_LAYER_COUNT      int           1
LSTM_OUT              int           8
METRIC                str           stars
RECURRENT_DROPOUT     float       