In [1]:
import sys
sys.path.append("..")
from common.download_utils import download_week2_resources
import numpy as np
download_week2_resources()
from collections import defaultdict
import tensorflow as tf

File data\train.txt is already downloaded.
File data\validation.txt is already downloaded.
File data\test.txt is already downloaded.


In [2]:
def read_data(file_path):
    tokens = []
    tags = []
    
    tweet_tokens = []
    tweet_tags = []
    for line in open(file_path, encoding='utf-8'):
        line = line.strip()
        if not line:
            if tweet_tokens:
                tokens.append(tweet_tokens)
                tags.append(tweet_tags)
            tweet_tokens = []
            tweet_tags = []
        else:
            token, tag = line.split()
            # Replace all urls with <URL> token
            # Replace all users with <USR> token

            ######################################
            ######### YOUR CODE HERE #############
            ######################################
            if token.startswith('http://'):
                token='<URL>'
            if token.startswith('@'):
                token='<USR>'
            tweet_tokens.append(token)
            tweet_tags.append(tag)
            
    return tokens, tags

In [3]:
train_tokens, train_tags = read_data('data/train.txt')
validation_tokens, validation_tags = read_data('data/validation.txt')
test_tokens, test_tags = read_data('data/test.txt')

In [4]:
def build_dict(tokens_or_tags, special_tokens):
    """
        tokens_or_tags: a list of lists of tokens or tags
        special_tokens: some special tokens
    """
    # Create a dictionary with default value 0
    tok2idx = defaultdict(lambda: 0)
    idx2tok = []
    
    # Create mappings from tokens to indices and vice versa
    # Add special tokens to dictionaries
    # The first special token must have index 0
    
    ######################################
    ######### YOUR CODE HERE #############
    ######################################
    idx = 0
    for tok in special_tokens:
        tok2idx[tok] = idx
        idx += 1
    for toks in tokens_or_tags:
        for tok in toks:
            if tok in tok2idx:
                continue
            tok2idx[tok] = idx
            idx += 1
    idx2tok = sorted(tok2idx.keys(), key = tok2idx.get)
    return tok2idx, idx2tok

In [5]:
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']
# Create dictionaries 
token2idx, idx2token = build_dict(train_tokens + validation_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)

In [6]:
def words2idxs(tokens_list):
    return [token2idx[word] for word in tokens_list]

def tags2idxs(tags_list):
    return [tag2idx[tag] for tag in tags_list]

def idxs2words(idxs):
    return [idx2token[idx] for idx in idxs]

def idxs2tags(idxs):
    return [idx2tag[idx] for idx in idxs]

In [7]:
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True):
    """Generates padded batches of tokens and tags."""
    
    n_samples = len(tokens)
    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    n_batches = n_samples // batch_size
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        # print(batch_start)
        # print(batch_end)
        current_batch_size = batch_end - batch_start
        x_list = []
        y_list = []
        max_len_token = 0
        for idx in order[batch_start: batch_end]:
            # print(words2idxs(tokens[idx]))
            x_list.append(words2idxs(tokens[idx]))
            y_list.append(tags2idxs(tags[idx]))
            max_len_token = max(max_len_token, len(tags[idx]))
        # print(tag2idx['O'])
        # Fill in the data into numpy nd-arrays filled with padding indices.
        x = np.ones([current_batch_size, max_len_token], dtype=np.int32) * token2idx['<PAD>']
        print(x.shape)
        y = np.ones([current_batch_size, max_len_token], dtype=np.int32) * tag2idx['O']
        print(y.shape)
        lengths = np.zeros(current_batch_size, dtype=np.int32)
        for n in range(current_batch_size):
            utt_len = len(x_list[n])
            x[n, :utt_len] = x_list[n]
            lengths[n] = utt_len
            y[n, :utt_len] = y_list[n]
        print(x.shape)
        print(y.shape)
        yield x, y, lengths

In [8]:
g = batches_generator(32, train_tokens, train_tags)

In [9]:
for x, y, l in g:
    break

(32, 31)
(32, 31)
(32, 31)
(32, 31)


In [10]:
x.shape
y.shape

(32, 31)

In [28]:
vocabulary_size = len(idx2token)
embedding_dim = 200
n_hidden_rnn = 200
n_tags = len(idx2tag)

In [12]:
initial_embedding_matrix = np.random.randn(vocabulary_size, embedding_dim) / np.sqrt(embedding_dim)

In [13]:
embedding_matrix_variable = tf.Variable(initial_embedding_matrix, name='embedding_matrix', dtype=tf.float32)

In [14]:
embedding_matrix_variable

<tf.Variable 'embedding_matrix:0' shape=(21117, 200) dtype=float32_ref>

In [16]:
forward_rnn_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden_rnn)
forward_cell = tf.nn.rnn_cell.DropoutWrapper(forward_rnn_cell,
                                                 input_keep_prob=0.5,
                                                 output_keep_prob=0.5,
                                                 state_keep_prob=0.5)

In [17]:
backward_rnn_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden_rnn)
backward_cell = tf.nn.rnn_cell.DropoutWrapper(backward_rnn_cell,
                                              input_keep_prob=0.5,
                                              output_keep_prob=0.5,
                                              state_keep_prob=0.5)

In [20]:
embeddings =  tf.nn.embedding_lookup(embedding_matrix_variable, ids=x)

In [21]:
embeddings

<tf.Tensor 'embedding_lookup:0' shape=(32, 31, 200) dtype=float32>

In [22]:
(rnn_output_fw, rnn_output_bw), _ =  tf.nn.bidirectional_dynamic_rnn(cell_fw=forward_cell, 
                                                                         cell_bw=backward_cell,
                                                                         inputs=embeddings,
                                                                         dtype=tf.float32)

In [23]:
rnn_output_fw

<tf.Tensor 'bidirectional_rnn/fw/fw/transpose_1:0' shape=(32, 31, 200) dtype=float32>

In [24]:
rnn_output_bw

<tf.Tensor 'ReverseV2:0' shape=(32, 31, 200) dtype=float32>

In [25]:
rnn_output = tf.concat([rnn_output_fw, rnn_output_bw], axis=2)

In [26]:
rnn_output

<tf.Tensor 'concat:0' shape=(32, 31, 400) dtype=float32>

In [29]:
logits = tf.layers.dense(rnn_output, n_tags, activation=None)

In [30]:
logits

<tf.Tensor 'dense/BiasAdd:0' shape=(32, 31, 21) dtype=float32>

In [31]:
softmax_output = tf.nn.softmax(logits, axis=-1)

In [32]:
softmax_output

<tf.Tensor 'Reshape_1:0' shape=(32, 31, 21) dtype=float32>

In [37]:
predictions = tf.argmax(softmax_output, axis=-1)

In [38]:
predictions

<tf.Tensor 'ArgMax_1:0' shape=(32, 31) dtype=int64>

In [40]:
y.shape

(32, 31)

In [41]:
ground_truth_tags_one_hot = tf.one_hot(y, n_tags)

In [42]:
ground_truth_tags_one_hot

<tf.Tensor 'one_hot:0' shape=(32, 31, 21) dtype=float32>

In [74]:
sess = tf.Session()

In [75]:
init = tf.global_variables_initializer()

In [76]:
sess.run(init)
r_output = sess.run(predictions)