<a href="https://colab.research.google.com/github/tdiggelm/nn-experiments/blob/master/attention_sequential_crossval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
# Colab setup: mount Google Drive at /content/gdrive (may prompt for authorization).
from google.colab import drive
drive.mount('/content/gdrive')
import os
from sklearn.model_selection import RandomizedSearchCV
from time import time
from keras import layers, models
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from nltk.corpus import movie_reviews
import nltk
# Fetch the NLTK resources; NOTE(review): neither is referenced in the cells
# visible here -- confirm they are still needed.
nltk.download('movie_reviews')
nltk.download('punkt')
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from scipy import stats
from keras import activations
from keras import backend as K
from keras import optimizers
from keras import datasets
from IPython.core.display import HTML
from sklearn.metrics import classification_report
# keras-tcn has to be installed before `tcn` can be imported below.
# NOTE(review): the version is not pinned.
!pip install keras-tcn
from tcn import TCN
from keras.wrappers.scikit_learn import KerasClassifier
from datetime import datetime

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
# Every review is padded/truncated to this many tokens.
MAX_SEQ_LEN = 250
#MAX_NUM_WORDS = 10000

# IMDB sentiment data set as shipped with Keras.
(X_train, y_train), (X_test, y_test) = datasets.imdb.load_data()

X_train = pad_sequences(X_train, maxlen=MAX_SEQ_LEN)
X_test = pad_sequences(X_test, maxlen=MAX_SEQ_LEN)

# Shift every vocabulary id up by 3 so that ids 0-2 are free for the
# special tokens registered right after.
word_index = {
    token: rank + 3
    for token, rank in datasets.imdb.get_word_index().items()
}
word_index.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Reverse lookup: id -> token.
index_word = {idx: token for token, idx in word_index.items()}

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [0]:
import os
from keras import initializers
from keras import layers
def get_glove_embedding(word_index, input_length=None, trainable=True):
  if not os.path.isfile("glove.6B.100d.txt"):
    !wget "http://nlp.stanford.edu/data/glove.6B.zip"
    !unzip "glove.6B.zip"

  # get glove coeff matrix
  embeddings_index = {}
  with open("glove.6B.100d.txt", encoding="utf-8") as f:
      for line in f:
          values = line.split()
          word = values[0]
          coefs = np.asarray(values[1:], dtype='float32')
          embeddings_index[word] = coefs
  print('Found %s word vectors.' % len(embeddings_index))

  # prepare pre-learned embedding matrix
  embdedding_dim = 100
  #num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
  num_words = len(word_index) + 1
  embedding_matrix = np.zeros((num_words, embdedding_dim))
  for word, i in word_index.items():
      #if i > MAX_NUM_WORDS:
      #    continue
      embedding_vector = embeddings_index.get(word)
      if embedding_vector is not None:
          # words not found in embedding index will be all-zeros.
          embedding_matrix[i] = embedding_vector

  return layers.Embedding(num_words, embdedding_dim, 
                   input_length=input_length, 
                   embeddings_initializer=initializers.Constant(embedding_matrix),
                   trainable=trainable)

In [0]:
def frobenius_regularizer(weight_matrix):
  """Return sum(|A.A^T - I|^2), where A is the softmax of the input along axis 1.

  Args:
    weight_matrix: backend tensor to penalise.

  Returns:
    Scalar backend tensor holding the penalty.

  NOTE(review): in this notebook the function is passed as a
  `kernel_regularizer`, so it sees a layer kernel rather than the attention
  activations -- confirm this is the intended target.
  """
  row_softmax = K.softmax(weight_matrix, axis=1)
  gram = K.dot(row_softmax, K.transpose(row_softmax))
  # Identity of matching size, taken from the second dimension of the product.
  identity = K.eye(K.shape(gram)[1])
  deviation = gram - identity
  # The leading 1.0 is the (fixed) penalty weight.
  return 1.0 * K.sum(K.abs(deviation)**2)

In [0]:
def build_model(n_hidden=50, da=350, r=30, dropout=0.5, lr=0.001,
                clipnorm=0.1, n_dense=1024):
  """Assemble and compile the BiLSTM + self-attention binary classifier.

  Reads the module-level `MAX_SEQ_LEN` and `word_index`.

  Args:
    n_hidden: units of the CuDNNLSTM wrapped by the Bidirectional layer.
    da: units of the tanh Dense layer inside the attention block.
    r: units of the second attention Dense layer (softmax is taken over axis 1).
    dropout: rate used by every Dropout layer in the model.
    lr: learning rate of the Adam optimizer.
    clipnorm: gradient norm clipping value handed to Adam.
    n_dense: units of each of the two ReLU Dense layers in the head.

  Returns:
    A compiled `models.Model` with a single sigmoid output.
  """
  tokens = layers.Input(shape=(MAX_SEQ_LEN,))
  embedded = get_glove_embedding(word_index)(tokens)
  recurrent = layers.CuDNNLSTM(n_hidden, return_sequences=True)
  hidden_states = layers.Bidirectional(recurrent)(embedded)

  #--- BEGIN ATTENTION (arXiv:1703.03130)
  scores = layers.Dense(da, activation='tanh')(hidden_states)
  scores = layers.Dropout(dropout)(scores)
  scores = layers.Dense(r, kernel_regularizer=frobenius_regularizer)(scores)
  scores = layers.Dropout(dropout)(scores)
  attention = layers.Softmax(axis=1, name='attention_matrix')(scores)
  # Contract attention weights with the hidden states over axis 1.
  attended = layers.Dot(axes=1)([attention, hidden_states])
  #--- END ATTENTION

  # Average over axis 1 of the attended representation.
  features = layers.Lambda(lambda t: K.mean(t, axis=1))(attended)

  # Two identical Dense(ReLU) + Dropout stages.
  for _ in range(2):
    features = layers.Dense(n_dense, activation='relu')(features)
    features = layers.Dropout(dropout)(features)
  prediction = layers.Dense(1, activation='sigmoid')(features)

  model = models.Model(tokens, prediction)
  model.compile(
      optimizer=optimizers.Adam(lr=lr, clipnorm=clipnorm),
      loss=['binary_crossentropy'],
      metrics=['accuracy'],
  )
  return model

In [0]:
# Directory (on the mounted Drive) that receives the cross-validation results.
outputdir = "/content/gdrive/My Drive/ml_output/crossval"
# Create it from the variable itself so the path is defined in one place only;
# unlike the shell call, a failure here raises instead of passing silently.
os.makedirs(outputdir, exist_ok=True)


In [0]:
# Hyper-parameter candidates. Keys matching build_model() arguments are passed
# to it; 'batch_size' and 'epochs' are consumed by the KerasClassifier wrapper.
param_dist = {'n_hidden': [50, 100, 150],
              'da': [100, 350, 500],
              'r': [10, 30, 60],
              'dropout': [0.1, 0.2, 0.5],
              'lr': [0.001, 0.0025, 0.005, 0.01],
              'clipnorm': [0.0, 0.1, 0.25, 0.5],
              'n_dense': [256, 512, 1024, 2048],
              'batch_size': [8, 16, 32],
              'epochs': [3]
             }

clfr = KerasClassifier(build_fn=build_model, verbose=1)

# run randomized search
n_iter_search = 10
# A fixed random_state makes the sampled parameter combinations repeatable
# across runs. It does not seed Keras weight initialisation.
random_search = RandomizedSearchCV(clfr, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv=3,
                                   random_state=42)
random_search.fit(X_train, y_train)

# Persist the full CV table, one timestamped file per run.
df = pd.DataFrame(random_search.cv_results_)
df.to_csv(
    os.path.join(outputdir, 'attention_sequential_%s.csv' 
                 % datetime.today().isoformat(timespec='seconds')))

Found 400000 word vectors.
Epoch 1/3
Epoch 2/3
Epoch 3/3
Found 400000 word vectors.
Epoch 1/3
 3808/16667 [=====>........................] - ETA: 54s - loss: 2895.6485 - acc: 0.7188