# Seq2Seq Model

In [1]:
import numpy as np
import theano
from theano import tensor as T
from model.th.ug_utils import floatX, Dropout
from model.th.rnn import (RNN, SequenceLogisticRegression, LogisticRegression, GRULayer, GRULayerAttention, LSTMLayer,
                 LayerWrapper, seq_cat_crossent, Downscale, cross_entropy)
from model.th.encdec_shared import BiRNNEncoder, reverse_sent, RNNEncoder
from model.th.opt import get_opt_fn
from model.th.ug_utils import (glorot_init, norm_init, uniform_init,
                      get_sequence_dropout_mask, _linear_params)
from model.th.opt import optimizers
from model.th.run_utils import setup_exp
from model.th.util import load_vocab
from os.path import join as pjoin
from trident_cfg import STORY_DATA_PATH, VOCAB_PATH, EMBED_PATH
from data.story_loader import StoryLoader

%load_ext autoreload
%autoreload 2

In [2]:
# Load the two vocabulary maps from VOCAB_PATH; the vocabulary size is taken
# from the index -> word map.
word_idx_map, idx_word_map = load_vocab(VOCAB_PATH)
vocab_size = len(idx_word_map)

# Batch size 50, source length 65, target length 20. These match the array
# shapes recorded further down in this notebook, e.g. (50, 65, 300).
# NOTE(review): the meaning of mode='merged' is not visible here -- confirm
# against StoryLoader.
loader = StoryLoader(STORY_DATA_PATH,
                         batch_size=50, src_seq_len=65,
                         tgt_seq_len=20, mode='merged')

# Embedding matrix provided by the loader (presumably word2vec, judging by the
# method name), cast to float32.
embed = loader.get_w2v_embed().astype('float32')

## Examine if `RNNTargetEncoder` works

`RNNTargetEncoder` also has an output layer (`olayer`).

In [21]:
from model.th.story_model import RNNTargetEncoder

# Throwaway attribute bag standing in for parsed command-line arguments: an
# anonymous class is created with type() and instantiated immediately, so the
# settings are read as args.rnn_dim, args.rlayers, etc.
args = type('Args', (object,), {"rnn_dim":256, 'recdrop':False, 'stocdrop':0.0, 'dropout':0.0, 'rlayers':1,
                               'label_size':2, 'input_size': 300, 'src_steps': 65})()
# Symbolic inputs: a floatX scalar for pdrop, int32 matrices for the source and
# target sentences, and an int8 matrix for the space mask.
pdrop = T.scalar(dtype=floatX)
src_sent = T.imatrix('src_sent')
tgt_sent = T.imatrix('tgt_sent')
space_mask = T.bmatrix('space_mask')

# All-ones masks with the same shape as the sentences, i.e. no position is
# masked out. (Original author's note: these may have been meant for dropping
# words; that is not done here.)
src_mask = T.ones_like(src_sent).astype(floatX)
tgt_mask = T.ones_like(tgt_sent).astype(floatX)

# One int32 label per example.
labels = T.ivector('labels')

# Shared variable wrapping the embedding matrix; borrow=True lets Theano use
# the numpy buffer without copying it.
embedding = theano.shared(embed, 'embedding', borrow=True)

In [44]:
# Build the target encoder on the transposed (time-major) target sentence and
# mask. This cell needs `labels` from the setup cell; the recorded output
# below shows a NameError from a run in which `labels` was not yet defined.
tgt_encoder = RNNTargetEncoder(tgt_sent.T, tgt_mask.T, embedding, labels, pdrop, args)

NameError: name 'labels' is not defined

In [116]:
# Hand-built copy of the target-encoder graph: a stack of GRU layers over the
# time-major target sentence, followed by a logistic-regression output layer
# and a cross-entropy cost.

# Initial hidden state: zeros of shape (batch, rnn_dim).
outputs_info = [T.zeros((tgt_sent.T.shape[1], args.rnn_dim)).astype(floatX)]
rlayers = list()

# Embedding lookup on the transposed sentence, giving (time, batch, embed_dim).
inp = embedding[tgt_sent.T]

# Sequence dropout mask, requested with the same shape as the embedded input.
seqmask = get_sequence_dropout_mask((tgt_sent.T.shape[0], tgt_sent.T.shape[1], embedding.shape[1]), pdrop)
inplayer = GRULayer(inp.astype(floatX), tgt_mask.T, seqmask, embed.shape[1], outputs_info,
                    args, backwards=False)

rlayers.append(inplayer)
# Remaining layers consume the dropout-wrapped output of the previous layer.
for k in xrange(1, args.rlayers):
    inp = rlayers[-1].out
    seqmask = get_sequence_dropout_mask((tgt_sent.T.shape[0], tgt_sent.T.shape[1], args.rnn_dim), pdrop)
    rlayer = GRULayer(Dropout(inp, pdrop).out, tgt_mask.T, seqmask, args.rnn_dim,
                      outputs_info, args, backwards=False)
    rlayers.append(rlayer)

# Build the final dropout node once and reuse it for the classifier, so that
# the tensor inspected through `last_layer` is the same one `olayer` consumes.
# (Previously a second, separate Dropout node was created for `olayer`.)
last_layer = Dropout(rlayers[-1].out, pdrop)
# Classify from the last time step only: [-1, :, :] picks the final step for
# every batch element.
olayer = LogisticRegression(last_layer.out[-1, :, :], args.rnn_dim,
                            args.label_size)
cost = cross_entropy(olayer.out, labels, normalize=False)

In [52]:
# Compile a function from (tgt_sent, pdrop) to the dropout-wrapped output of
# the last recurrent layer; the result is a one-element list.
test_func = theano.function([tgt_sent, pdrop], [last_layer.out])

In [28]:
# Fetch one training batch: source stories `x`, the two story endings
# (`y`, `y_2`) and the labels.
# NOTE(review): the meaning of the second argument (2) is not visible here --
# confirm against StoryLoader.get_batch.
x, (y, y_2), real_label = loader.get_batch('train', 2)
# Run the compiled graph on the first ending with pdrop = 0.0.
a = test_func(y, 0.0)

In [126]:
# Shape of the last layer's output; the recorded run printed (20, 50, 256),
# i.e. (time steps, batch, rnn_dim).
print a[0].shape

(20, 50, 256)


In [127]:
# Number of labels in the batch once flattened; (50,) in the recorded run.
real_label.flatten().shape

(50,)

In [128]:
# Compile a function from (tgt_sent, pdrop, labels) to the cross-entropy cost.
test_cost = theano.function([tgt_sent, pdrop, labels], [cost])

In [131]:
# Evaluate the cost on the first ending with pdrop = 0.0. The recorded value
# (~0.693) is close to ln 2, the chance-level cross-entropy for two classes.
test_cost(y, 0.0, real_label)

[array(0.6927227973937988, dtype=float32)]

In [132]:
# Compile a function from (tgt_sent, pdrop) to the output layer's `y_pred`
# (the recorded output below is one 0/1 value per batch element).
test_olayer_out = theano.function([tgt_sent, pdrop], [olayer.y_pred])

In [133]:
# `preds` is a one-element list (the compiled function returns a list of
# outputs); the predictions themselves are preds[0].
preds = test_olayer_out(y, 0.0)
preds

[array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
        0, 0, 0, 0])]

In [134]:
# Batch accuracy. Index into the one-element `preds` list and flatten the
# labels so that two 1-D arrays are compared element-wise, instead of relying
# on implicit list-to-array conversion and broadcasting.
np.mean(preds[0] == real_label.flatten())

0.47999999999999998

## Explore RNNEncoder

In [24]:
# Build the source encoder on the transposed (time-major) source sentence,
# mask and space mask.
encoder = RNNEncoder(src_sent.T, src_mask.T, space_mask.T, embedding, pdrop, args)

In [101]:
# Compile a function from (src_sent, pdrop) to `encoder.out`.
# NOTE(review): `space_mask` is not listed as an input even though it was
# passed to RNNEncoder; the recorded run succeeded, so `encoder.out` evidently
# does not depend on it -- confirm.
func_outputinfo = theano.function([src_sent, pdrop], encoder.out)

In [103]:
# Run the encoder on the source batch with pdrop = 0.0.
outputinfo = func_outputinfo(x, 0.0)

In [105]:
# Shape of the first encoder output; (50, 256) in the recorded run,
# i.e. (batch, rnn_dim).
outputinfo[0].shape

(50, 256)

## Explore RNNEncoderAttention

`RNNEncoderAttention` explores an outside attention connection (attending over the source encoder's hidden states), but no inner attention.

In [7]:
from model.th.story_model import RNNEncoderAttention

# Same throwaway attribute bag as in the RNNTargetEncoder section, except that
# rlayers is 2 here instead of 1.
args = type('Args', (object,), {"rnn_dim":256, 'recdrop':False, 'stocdrop':0.0, 'dropout':0.0, 'rlayers':2,
                               'label_size':2, 'input_size': 300, 'src_steps': 65})()
# Fresh symbolic inputs: a floatX scalar for pdrop, int32 matrices for the
# sentences and an int8 matrix for the space mask. Re-running this cell
# rebinds these names, so graphs built earlier keep the old variables.
pdrop = T.scalar(dtype=floatX)
src_sent = T.imatrix('src_sent')
tgt_sent = T.imatrix('tgt_sent')
space_mask = T.bmatrix('space_mask')

# All-ones masks with the same shape as the sentences, i.e. no position is
# masked out. (Original author's note: these may have been meant for dropping
# words; that is not done here.)
src_mask = T.ones_like(src_sent).astype(floatX)
tgt_mask = T.ones_like(tgt_sent).astype(floatX)

# One int32 label per example.
labels = T.ivector('labels')

# Shared variable wrapping the embedding matrix; borrow=True lets Theano use
# the numpy buffer without copying it.
embedding = theano.shared(embed, 'embedding', borrow=True)

In [49]:
# Build the attention target encoder on top of the source `encoder`, using the
# transposed (time-major) target sentence and mask.
tgt_encoder = RNNEncoderAttention(encoder, tgt_sent.T, labels, tgt_mask.T, embedding, pdrop, args)

In [32]:
# Guts of RNNEncoderAttention, rebuilt by hand for inspection.

# Original note: target_sqn is (time_step, N).
# Source-encoder hidden states that the attention layer receives; a later cell
# in this notebook records their shape as (65, 50, 256).
hs = encoder.hs

# NOTE: this helper exists so that only the last layer uses attention.
def layer_init(attention):
    """Return the constructor to use for one recurrent layer.

    With a falsy ``attention`` this is ``GRULayer`` itself. Otherwise it is a
    wrapper that calls ``GRULayerAttention`` with the module-level ``hs``
    prepended to whatever arguments the caller supplies.
    """
    if attention:
        def build_attentive(*largs, **kwargs):
            # `hs` is looked up when the layer is built, not when this
            # wrapper is created.
            return GRULayerAttention(hs, *largs, **kwargs)
        return build_attentive
    return GRULayer

# Initial states come from the source encoder; they are indexed per layer
# below (outputs_info[0], outputs_info[k]).
outputs_info = encoder.out
rlayers = list()

# NOTE(review): unlike the RNNTargetEncoder section, `tgt_sent` is NOT
# transposed here, so callers must pass a time-major array (a later cell
# passes y.T for this reason).
inp = embedding[tgt_sent]
# The first layer uses attention only when it is also the last layer.
attention = args.rlayers == 1
# Sequence dropout mask, requested with the same shape as the embedded input.
# NOTE(review): the original comment here read "exclude last prediction", but
# nothing in this cell excludes a time step -- probably a leftover.
seqmask = get_sequence_dropout_mask((tgt_sent.shape[0], tgt_sent.shape[1], embedding.shape[1]), pdrop)
inplayer = layer_init(attention)(inp.astype(floatX), tgt_mask, seqmask, args.input_size,
                                 outputs_info[0], args, suffix='tgtenc0')
rlayers.append(inplayer)
for k in xrange(1, args.rlayers):
    # True only for the final layer of the stack.
    attention = (args.rlayers == k + 1)
    seqmask = get_sequence_dropout_mask((tgt_sent.shape[0], tgt_sent.shape[1], args.rnn_dim), pdrop)
    # NOTE(review): the suffix is 'dec%d' here but 'tgtenc0' for the first
    # layer -- confirm whether the naming mismatch is intended.
    rlayer = layer_init(attention)(Dropout(rlayers[-1].out, pdrop).out, tgt_mask,
                                   seqmask, args.rnn_dim, outputs_info[k], args, suffix='dec%d' % k)
    rlayers.append(rlayer)

# Classify from the last time step of the dropout-wrapped top-layer output.
olayer = LogisticRegression(Dropout(rlayers[-1].out, pdrop).out[-1, :, :], args.rnn_dim,
                                    args.label_size)

In [27]:
# Compile a function from (src_sent, pdrop) to the encoder's hidden states.
test_encoder_hs = theano.function([src_sent, pdrop], [encoder.hs])

In [29]:
# Run the encoder on the source batch with pdrop = 0.0.
a = test_encoder_hs(x, 0.0)

In [31]:
# Shape of the hidden states; (65, 50, 256) in the recorded run,
# i.e. (source steps, batch, rnn_dim).
a[0].shape

(65, 50, 256)

In [33]:
# Compile a function from (src_sent, tgt_sent, pdrop) to the output layer's
# `out`.
test_olayer = theano.function([src_sent, tgt_sent, pdrop], [olayer.out])

In [39]:
# `y` is transposed by hand here because the hand-built graph indexes
# `tgt_sent` without transposing it, whereas the source side is transposed
# when the encoder is constructed.
a = test_olayer(x, y.T, 0.0)
# One row per batch element, one column per label; (50, 2) in the recorded run.
a[0].shape

(50, 2)

In [40]:
# Cross-entropy between the output layer's `out` and the labels, with
# normalize=False.
cost = cross_entropy(olayer.out, labels, normalize=False)

In [42]:
# Compile a function from (src_sent, tgt_sent, pdrop, labels) to the cost.
test_cost = theano.function([src_sent, tgt_sent, pdrop, labels], [cost])

In [47]:
# Evaluate the cost with pdrop = 0.0; `y` is transposed by hand because the
# hand-built graph does not transpose `tgt_sent`. The recorded value (~0.694)
# is close to ln 2, the chance-level cross-entropy for two classes.
test_cost(x, y.T, 0.0, real_label)

[array(0.693530261516571, dtype=float32)]

## Vector Preprocessing

We preprocess the two story endings in various ways. Here we examine whether our target encoder class works.

In [3]:
# Fetch a training batch and look up the embeddings of the source tokens; the
# recorded shape is (50, 65, 300), i.e. (batch, source steps, embedding dim).
x, (y, y_2), real_label = loader.get_batch('train', 2)
embed[x].shape

(50, 65, 300)

In [6]:
# Concatenate the embeddings of the two endings along the last (feature) axis;
# the recorded shape is (50, 20, 600), twice the 300-dim embedding width.
np.concatenate((embed[y], embed[y_2]), axis=2).shape

(50, 20, 600)