In [42]:
from rasa_nlu.training_data import TrainingData, Message
import io
import logging
import numpy as np
import os
import pickle
import typing
from tqdm import tqdm
from typing import Any, Dict, List, Optional, Text, Tuple

from rasa_nlu.classifiers import INTENT_RANKING_LENGTH
from rasa_nlu.components import Component

logger = logging.getLogger(__name__)

In [2]:
import spacy
# Name handed to spacy.load() below.
LANGUAGE = 'en'
# Loaded spaCy pipeline; called on each example text to build its "spacy_doc".
spacy_nlp = spacy.load(LANGUAGE)

In [3]:
# (text, intent, entities) for every example message; the "spacy_doc" entry
# is derived from the text when the Message objects are built.
_example_specs = [
    ("anywhere in the west", "restaurant_search",
     [{"start": 16, "end": 20, "value": "west", "entity": "location"}]),
    ("central indian restaurant", "restaurant_search",
     [{"start": 0, "end": 7, "value": "central",
       "entity": "location",
       "extractor": "random_extractor"},
      {"start": 8, "end": 14, "value": "indian",
       "entity": "cuisine",
       "extractor": "CRFEntityExtractor"}]),
    ("hi there!", "greet", []),
    ("good morning", "greet", []),
    ("thank you", "thanks", []),
    ("good bye", "thanks", []),
]

examples = [
    Message(text, {"intent": intent,
                   "entities": entities,
                   "spacy_doc": spacy_nlp(text)})
    for text, intent, entities in _example_specs
]

In [4]:
# Show the list of Message objects built in the previous cell.
examples

[<rasa_nlu.training_data.message.Message at 0x10971c978>,
 <rasa_nlu.training_data.message.Message at 0x109fd02b0>,
 <rasa_nlu.training_data.message.Message at 0x11a1d27f0>,
 <rasa_nlu.training_data.message.Message at 0x11a1d2208>,
 <rasa_nlu.training_data.message.Message at 0x11b088a58>,
 <rasa_nlu.training_data.message.Message at 0x11b088a20>]

In [5]:
# Wrap the example messages in a TrainingData container.
training_data = TrainingData(training_examples=examples)

  self.MIN_EXAMPLES_PER_ENTITY))


In [6]:
# Unique intent labels found across the intent examples.
distinct_intents = {msg.get("intent") for msg in training_data.intent_examples}

In [7]:
# Show the set of intent labels.
distinct_intents

{'greet', 'restaurant_search', 'thanks'}

In [8]:
# Map each intent label to its position in the alphabetically sorted label list.
intent_dict = {intent: idx for idx, intent in enumerate(sorted(distinct_intents))}

In [9]:
# Show the label -> index mapping.
intent_dict

{'greet': 0, 'restaurant_search': 1, 'thanks': 2}

In [11]:
def _create_intent_token_dict(intents,
                              intent_split_symbol):
     """Create intent token dictionary"""

     distinct_tokens = set([token
                            for intent in intents
                            for token in intent.split(intent_split_symbol)])
     return {token: idx
            for idx, token in enumerate(sorted(distinct_tokens))}

In [12]:
# Reverse lookup: index -> intent label (used to turn predicted ids back into names).
inv_intent_dict = {v: k for k, v in intent_dict.items()}

In [13]:
# Show the index -> label mapping.
inv_intent_dict

{0: 'greet', 1: 'restaurant_search', 2: 'thanks'}

In [14]:
# Hyperparameters read by the model-building and training helpers in this notebook.
defaults = {'hidden_layers_sizes_a': [256, 128],  # hidden layer sizes of the 'a' network
 'hidden_layers_sizes_b': [],  # hidden layer sizes of the 'b' network (none)
 'batch_size': [64, 256],  # [first, last] batch size; see _linearly_increasing_batch_size
 'epochs': 300,  # number of training epochs
 'embed_dim': 20,  # units of the final embedding layer
 'mu_pos': 0.8,  # margin compared against the similarity in column 0 (_tf_loss)
 'mu_neg': -0.4,  # margin added to the similarities in columns 1: (_tf_loss)
 'similarity_type': 'cosine',  # 'cosine' or 'inner'; anything else raises in _tf_sim
 'num_neg': 20,  # upper bound for the number of sampled negatives
 'use_max_sim_neg': True,  # penalise only the largest negative similarity instead of summing all
 'random_seed': None,  # presumably the RNG seed (None = unseeded) — confirm where it is consumed
 'C2': 0.002,  # scale of the l2 kernel regularizer
 'C_emb': 0.8,  # weight of the sim_emb penalty in _tf_loss
 'droprate': 0.2,  # dropout rate applied after each hidden layer
 'intent_tokenization_flag': False,  # NOTE(review): not read by any code shown in this notebook
 'intent_split_symbol': '_',  # NOTE(review): not read by any code shown in this notebook
 'evaluate_every_num_epochs': 10,  # training accuracy is recomputed every this many epochs
 'evaluate_on_num_examples': 1000}  # max examples used for the accuracy estimate; falsy disables it

In [16]:
import numpy as np
def _create_encoded_intents(intent_dict):
    """Create matrix with intents encoded in rows as bag of words.
       If intent_tokenization_flag is off, returns identity matrix.
    """
    return np.eye(len(intent_dict))

In [17]:
# One-hot row per intent; read as a module-level value by several helpers below.
encoded_all_intents = _create_encoded_intents(intent_dict)

In [18]:
# Show the encoded intents matrix.
encoded_all_intents

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [19]:
# Number of candidate stacks to build: one per training example.
size = len(training_data.training_examples)


def _create_all_Y(size):
    """Repeat ``encoded_all_intents`` ``size`` times along a new leading axis.

    The stacked array supplies the full set of candidate intents for each
    example and is used when calculating training accuracy.
    """
    copies = [encoded_all_intents for _ in range(size)]
    return np.stack(copies)

In [20]:
# One copy of the encoded intents per training example.
all_Y = _create_all_Y(size)

In [21]:
# Show the stacked candidates array.
all_Y

array([[[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]],

       [[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]])

In [22]:
def _prepare_data_for_training(training_data, intent_dict):
    """Build the feature matrix, targets and intent ids used for training.

    Returns ``(X, Y, intents_for_X)`` where ``X`` stacks the "text_features"
    of every intent example, ``intents_for_X`` holds each example's index in
    ``intent_dict`` and ``Y`` holds the matching rows of the module-level
    ``encoded_all_intents``.
    """
    features = [msg.get("text_features")
                for msg in training_data.intent_examples]
    X = np.stack(features)

    intent_ids = [intent_dict[msg.get("intent")]
                  for msg in training_data.intent_examples]
    intents_for_X = np.array(intent_ids)

    target_rows = [encoded_all_intents[idx] for idx in intents_for_X]
    Y = np.stack(target_rows)

    return X, Y, intents_for_X

In [23]:
from rasa_nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer
# Featurizer configured with a custom token pattern.
ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
# Fit on the training data; the following cells read "text_features" from the examples.
ftr.train(training_data)

In [24]:
# Inspect the feature vector of the first intent example.
training_data.intent_examples[0].get('text_features')

array([1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1], dtype=int64)

In [25]:
# Feature matrix, one-hot targets and integer intent ids for all intent examples.
X, Y, intents_for_X = _prepare_data_for_training(training_data,
                                                 intent_dict)

In [26]:
# Show the feature matrix (one row per example).
X

array([[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [27]:
# Show the one-hot targets (one row per example).
Y

array([[0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [28]:
# Show the integer intent id of each example.
intents_for_X

array([1, 1, 0, 0, 2, 2])

In [29]:
# Negatives sampled per example: the configured value, capped at (number of intents - 1).
num_neg = min(defaults['num_neg'], encoded_all_intents.shape[0]-1)

In [30]:
def _create_tf_embed_nn(x_in, is_training, layer_sizes, name):
    """Build a feed-forward embedding network on top of ``x_in``.

    One ReLU dense layer followed by dropout is added per entry of
    ``layer_sizes``; a final dense layer without activation projects to
    ``defaults['embed_dim']`` units. Every dense kernel is l2-regularized
    with scale ``defaults['C2']``. ``name`` is embedded in the layer names.
    """
    l2_reg = tf.contrib.layers.l2_regularizer(defaults['C2'])

    hidden = x_in
    for depth, units in enumerate(layer_sizes):
        dense_out = tf.layers.dense(
            inputs=hidden,
            units=units,
            activation=tf.nn.relu,
            kernel_regularizer=l2_reg,
            name='hidden_layer_{}_{}'.format(name, depth))
        # dropout is only active when `is_training` evaluates to True
        hidden = tf.layers.dropout(dense_out,
                                   rate=defaults['droprate'],
                                   training=is_training)

    return tf.layers.dense(inputs=hidden,
                           units=defaults['embed_dim'],
                           kernel_regularizer=l2_reg,
                           name='embed_layer_{}'.format(name))

In [31]:
def _create_tf_embed(a_in, b_in, is_training):
    """Create the two embedding networks and return ``(emb_a, emb_b)``.

    ``a_in`` goes through the network named 'a' and ``b_in`` through the
    network named 'b', each with its own hidden layer sizes from
    ``defaults``.
    """
    towers = (
        (a_in, defaults['hidden_layers_sizes_a'], 'a'),
        (b_in, defaults['hidden_layers_sizes_b'], 'b'),
    )
    # built in order: 'a' first, then 'b'
    emb_a, emb_b = [
        _create_tf_embed_nn(inputs, is_training, sizes, name=tag)
        for inputs, sizes, tag in towers
    ]
    return emb_a, emb_b

In [32]:
def _tf_sim(a, b):
    """Compute the two similarity tensors consumed by the loss.

    Returns ``(sim, sim_emb)``:
    sim: between embedded words and embedded intent labels
    sim_emb: between the first embedded intent label and the remaining ones

    For 'cosine' both inputs are l2-normalized along the last axis first.
    Raises ValueError when ``defaults['similarity_type']`` is neither
    'cosine' nor 'inner'.
    """
    similarity_type = defaults['similarity_type']

    # reject unsupported types up front
    if similarity_type not in {'cosine', 'inner'}:
        raise ValueError("Wrong similarity type {}, "
                         "should be 'cosine' or 'inner'"
                         "".format(similarity_type))

    if similarity_type == 'cosine':
        a = tf.nn.l2_normalize(a, -1)
        b = tf.nn.l2_normalize(b, -1)

    sim = tf.reduce_sum(tf.expand_dims(a, 1) * b, -1)

    first_label = b[:, 0:1, :]
    other_labels = b[:, 1:, :]
    sim_emb = tf.reduce_sum(first_label * other_labels, -1)

    return sim, sim_emb

In [34]:
def _tf_loss(sim, sim_emb):
    """Build the scalar training loss from the similarity tensors.

    Per example the loss adds up:
    - a hinge on ``mu_pos`` minus the similarity in column 0,
    - a hinge on ``mu_neg`` plus the similarities in columns 1: (either
      only the largest one or the sum over all, per ``use_max_sim_neg``),
    - ``C_emb`` times the largest non-negative value of ``sim_emb``.
    The mean over examples plus the graph's regularization loss is returned.
    """
    pos_similarity = sim[:, 0]
    loss = tf.maximum(0., defaults['mu_pos'] - pos_similarity)

    if defaults['use_max_sim_neg']:
        hardest_neg = tf.reduce_max(sim[:, 1:], -1)
        neg_penalty = tf.maximum(0., defaults['mu_neg'] + hardest_neg)
    else:
        per_neg_margin = tf.maximum(0., defaults['mu_neg'] + sim[:, 1:])
        neg_penalty = tf.reduce_sum(per_neg_margin, -1)
    loss = loss + neg_penalty

    emb_penalty = tf.maximum(0., tf.reduce_max(sim_emb, -1))
    loss = loss + emb_penalty * defaults['C_emb']

    return tf.reduce_mean(loss) + tf.losses.get_regularization_loss()

In [35]:
def _create_batch_b(batch_pos_b, intent_ids):
    """Assemble the intent candidates for one batch.

    For every row the correct intent encoding comes first, followed by
    ``num_neg`` wrong intent encodings drawn at random with
    ``np.random.choice`` (default sampling, so repeats are possible).
    """
    positives = batch_pos_b[:, np.newaxis, :]
    batch_len = positives.shape[0]

    negatives = np.zeros((batch_len, num_neg, positives.shape[-1]))
    for row in range(batch_len):
        # every intent index except the correct one for this row
        wrong_ids = [idx
                     for idx in range(encoded_all_intents.shape[0])
                     if idx != intent_ids[row]]
        sampled = np.random.choice(wrong_ids, size=num_neg)
        negatives[row] = encoded_all_intents[sampled]

    return np.concatenate([positives, negatives], 1)

In [36]:
def _linearly_increasing_batch_size(epoch):
    """Return the batch size to use for ``epoch``.

    A non-list ``defaults['batch_size']`` is used as a constant. A list is
    read as [first, last]: the size grows linearly from the first value at
    epoch 0 to the last value at the final epoch.
    The idea comes from https://arxiv.org/abs/1711.00489
    """
    batch_size = defaults['batch_size']
    if not isinstance(batch_size, list):
        return int(batch_size)

    total_epochs = defaults['epochs']
    if not total_epochs > 1:
        # a single epoch leaves nothing to interpolate
        return int(batch_size[0])

    return int(batch_size[0] +
               epoch * (batch_size[1] -
                        batch_size[0]) / (total_epochs - 1))

In [37]:
def _output_training_stat(a_in, b_in, X, intents_for_X, is_training,
                          session, sim_op):
    """Estimate training accuracy on a random subset of the examples.

    At most ``defaults['evaluate_on_num_examples']`` examples are drawn;
    each is scored against all intents and the share of examples whose
    highest-similarity intent equals the true intent id is returned.
    """
    max_examples = defaults['evaluate_on_num_examples']
    sample_ids = np.random.permutation(len(X))[:max_examples]
    sample_X = X[sample_ids]
    candidates = _create_all_Y(sample_X.shape[0])

    feed = {a_in: sample_X,
            b_in: candidates,
            is_training: False}
    similarities = session.run(sim_op, feed_dict=feed)

    predicted = np.argmax(similarities, -1)
    return np.mean(predicted == intents_for_X[sample_ids])

In [38]:
def _train_tf(a_in, b_in, X, Y, intents_for_X, loss, is_training,
              train_op, session, sim_op):
        """Train tf graph.

        Initialises all variables in ``session`` and runs
        ``defaults['epochs']`` epochs of mini-batch training. Each epoch
        reshuffles the examples, picks a batch size via
        ``_linearly_increasing_batch_size`` and runs ``train_op`` on every
        batch, with negatives added by ``_create_batch_b``. When
        ``defaults['evaluate_on_num_examples']`` is truthy, training
        accuracy is recomputed periodically and shown in the progress bar.
        Returns nothing.
        """

        # (re)initialise every variable of the graph before training
        session.run(tf.global_variables_initializer())

        if defaults['evaluate_on_num_examples']:
            logger.info("Accuracy is updated every {} epochs"
                        "".format(defaults['evaluate_every_num_epochs']))

        pbar = tqdm(range(defaults['epochs']), desc="Epochs")
        train_acc = 0
        last_loss = 0
        for ep in pbar:
            # new random example order for this epoch
            indices = np.random.permutation(len(X))

            batch_size = _linearly_increasing_batch_size(ep)
            # ceiling division: a partial last batch counts as a batch
            batches_per_epoch = (len(X) // batch_size +
                                 int(len(X) % batch_size > 0))

            ep_loss = 0
            for i in range(batches_per_epoch):
                end_idx = (i + 1) * batch_size
                start_idx = i * batch_size
                batch_a = X[indices[start_idx:end_idx]]
                batch_pos_b = Y[indices[start_idx:end_idx]]
                intents_for_b = intents_for_X[indices[start_idx:end_idx]]
                # add negatives
                batch_b = _create_batch_b(batch_pos_b, intents_for_b)

                sess_out = session.run(
                    {'loss': loss, 'train_op': train_op},
                    feed_dict={a_in: batch_a,
                               b_in: batch_b,
                               is_training: True}
                )
                # accumulate the mean batch loss of this epoch
                ep_loss += sess_out.get('loss') / batches_per_epoch

            if defaults['evaluate_on_num_examples']:
                # evaluate on the first epoch, every N-th epoch and the last epoch
                if (ep == 0 or
                        (ep + 1) % defaults['evaluate_every_num_epochs'] == 0 or
                        (ep + 1) == defaults['epochs']):
                    train_acc = _output_training_stat(a_in, b_in, X,
                                                      intents_for_X,
                                                      is_training,
                                                      session, sim_op)
                    # only refreshed on evaluation epochs
                    last_loss = ep_loss

                pbar.set_postfix({
                    "loss": "{:.3f}".format(ep_loss),
                    "acc": "{:.3f}".format(train_acc)
                })
            else:
                pbar.set_postfix({
                    "loss": "{:.3f}".format(ep_loss)
                })

        if defaults['evaluate_on_num_examples']:
            # reports the loss and accuracy recorded at the last evaluation
            logger.info("Finished training embedding classifier, "
                        "loss={:.3f}, train accuracy={:.3f}"
                        "".format(last_loss, train_acc))

In [43]:
# Build the training graph and train the classifier.
# NOTE(review): `tf` is not imported in any cell shown here — presumably
# `import tensorflow as tf` ran in a cell that is no longer present.
graph = tf.Graph()
with graph.as_default():
    # Take the seed from the shared config (instead of a hard-coded None) so
    # that setting defaults['random_seed'] makes a run reproducible.
    random_seed = defaults['random_seed']
    np.random.seed(random_seed)
    if random_seed is not None:
        # graph-level seed; must be set before any op is created
        tf.set_random_seed(random_seed)

    print('placeholders')
    # a: featurized messages, b: candidate intent encodings per message
    a_in = tf.placeholder(tf.float32, (None, X.shape[-1]), name='a')
    b_in = tf.placeholder(tf.float32, (None, None, Y.shape[-1]), name='b')
    # defaults to False so inference does not have to feed it
    is_training = tf.placeholder_with_default(False, shape=())

    print('Embeddings for features and intent')
    word_embed, intent_embed = _create_tf_embed(a_in, b_in, is_training)

    print('Similarity estimation')
    sim_op, sim_emb = _tf_sim(word_embed, intent_embed)

    print('loss estimation')
    loss = _tf_loss(sim_op, sim_emb)

    print('optimizer')
    train_op = tf.train.AdamOptimizer().minimize(loss)

    # created inside `graph.as_default()`, so the session runs this graph
    session = tf.Session()

    print('training begins')
    _train_tf(a_in, b_in, X, Y, intents_for_X,
              loss, is_training, train_op, session, sim_op)

placeholders
Embeddings for features and intent
Similarity estimation
loss estimation
optimizer


Epochs:   0%|          | 0/300 [00:00<?, ?it/s]

training begins


Epochs: 100%|██████████| 300/300 [00:01<00:00, 221.09it/s, loss=0.089, acc=1.000]


In [44]:
def _calculate_message_sim(X, all_Y):
    """Score a message against all intents using the trained session.

    Returns ``(intent_ids, similarities)``: the intent ids ordered from most
    to least similar and the matching scores as a plain Python list. For
    'cosine' negative scores are clipped to zero; for 'inner' the scores are
    normalized with a softmax.
    """
    feed = {a_in: X,
            b_in: all_Y}
    scores = session.run(sim_op, feed_dict=feed).flatten()  # sim is a matrix

    # ids ordered by decreasing score
    intent_ids = scores.argsort()[::-1]
    # sorting the reversed view in place leaves `scores` in descending order
    scores[::-1].sort()

    similarity_type = defaults['similarity_type']
    if similarity_type == 'cosine':
        # clip negative values to zero
        scores[scores < 0] = 0
    elif similarity_type == 'inner':
        # normalize result to [0, 1] with softmax
        scores = np.exp(scores)
        scores /= np.sum(scores)

    # transform sim to python list for JSON serializing
    return intent_ids, scores.tolist()

In [59]:
def process(message, **kwargs):
    """Predict the intent of ``message`` and attach the result to it.

    The message's "text_features" are scored against every known intent.
    The best intent and the ranking of the top ``INTENT_RANKING_LENGTH``
    intents are stored on the message under "intent" and "intent_ranking"
    and also returned as ``(intent, intent_ranking)``.

    When there is no trained session, or the feature vector is all zeros,
    the intent is ``{"name": None, "confidence": 0.0}`` and the ranking is
    empty. ``kwargs`` is accepted but not used.
    """
    intent = {"name": None, "confidence": 0.0}
    intent_ranking = []

    if session is None:
        logger.error("There is no trained tf.session: "
                     "component is either not trained or "
                     "didn't receive enough training data")

    else:
        # get features (bag of words) for a message
        # noinspection PyPep8Naming
        X = message.get("text_features").reshape(1, -1)

        # stack encoded_all_intents on top of each other
        # to create candidates for test examples
        # noinspection PyPep8Naming
        all_Y = _create_all_Y(X.shape[0])

        # score the message against every intent
        intent_ids, message_sim = _calculate_message_sim(X, all_Y)

        # if X contains all zeros do not predict some label
        if X.any() and intent_ids.size > 0:
            intent = {"name": inv_intent_dict[intent_ids[0]],
                      "confidence": message_sim[0]}

            ranking = list(zip(list(intent_ids), message_sim))
            ranking = ranking[:INTENT_RANKING_LENGTH]
            intent_ranking = [{"name": inv_intent_dict[intent_idx],
                               "confidence": score}
                              for intent_idx, score in ranking]

    # Annotate the message before returning; with the return placed first
    # these two calls could never run.
    message.set("intent", intent, add_to_output=True)
    message.set("intent_ranking", intent_ranking, add_to_output=True)

    return intent, intent_ranking

In [96]:
def persist(file_name, model_dir):
    """Save the trained model under ``model_dir``.

    Writes a TensorFlow checkpoint named ``file_name + ".ckpt"`` plus two
    pickle files holding ``inv_intent_dict`` and ``encoded_all_intents``.
    Returns the metadata needed to load the model again:
    ``{"file": file_name}``, or ``{"file": None}`` when there is no session.
    """
    if session is None:
        return {"file": None}

    checkpoint = os.path.join(model_dir, file_name + ".ckpt")
    checkpoint_dir = os.path.dirname(checkpoint)

    try:
        os.makedirs(checkpoint_dir)
    except OSError as e:
        # be happy if someone already created the path
        import errno
        if e.errno != errno.EEXIST:
            raise

    with graph.as_default():
        # register the tensors needed at load time under well-known names,
        # replacing whatever an earlier call stored there
        tensors_by_collection = (
            ('message_placeholder', a_in),
            ('intent_placeholder', b_in),
            ('similarity_op', sim_op),
            ('word_embed', word_embed),
            ('intent_embed', intent_embed),
        )
        for collection, tensor in tensors_by_collection:
            graph.clear_collection(collection)
            graph.add_to_collection(collection, tensor)

        tf.train.Saver().save(session, checkpoint)

    # the two lookup structures are stored next to the checkpoint
    for suffix, payload in (("_inv_intent_dict.pkl", inv_intent_dict),
                            ("_encoded_all_intents.pkl",
                             encoded_all_intents)):
        target = os.path.join(model_dir, file_name + suffix)
        with io.open(target, 'wb') as f:
            pickle.dump(payload, f)

    return {"file": file_name}

In [97]:
def load(meta, model_dir, model_metadata, cached_component, **kwargs):
    """Restore a persisted model from ``model_dir``.

    ``meta.get("file")`` supplies the base file name used for the
    checkpoint and the two pickle files. ``model_metadata``,
    ``cached_component`` and ``kwargs`` are accepted but not used.

    NOTE(review): both return statements call ``cls``, which is not defined
    in this function — presumably it was lifted from a classmethod. Unless a
    global ``cls`` exists, calling this raises NameError; confirm the
    intended return value.
    """

    if model_dir and meta.get("file"):
        file_name = meta.get("file")
        checkpoint = os.path.join(model_dir, file_name + ".ckpt")
        # local graph/session: these names do not rebind the notebook-level ones
        graph = tf.Graph()
        with graph.as_default():
            sess = tf.Session()
            saver = tf.train.import_meta_graph(checkpoint + '.meta')

            saver.restore(sess, checkpoint)

            # fetch the tensors by the collection names used in persist()
            a_in = tf.get_collection('message_placeholder')[0]
            b_in = tf.get_collection('intent_placeholder')[0]

            sim_op = tf.get_collection('similarity_op')[0]

            word_embed = tf.get_collection('word_embed')[0]
            intent_embed = tf.get_collection('intent_embed')[0]

        # NOTE(review): pickle.load can execute arbitrary code — only load
        # model files from a trusted source.
        with io.open(os.path.join(
                model_dir,
                file_name + "_inv_intent_dict.pkl"), 'rb') as f:
            inv_intent_dict = pickle.load(f)
        with io.open(os.path.join(
                model_dir,
                file_name + "_encoded_all_intents.pkl"), 'rb') as f:
            encoded_all_intents = pickle.load(f)

        return cls(
            component_config=meta,
            inv_intent_dict=inv_intent_dict,
            encoded_all_intents=encoded_all_intents,
            session=sess,
            graph=graph,
            message_placeholder=a_in,
            intent_placeholder=b_in,
            similarity_op=sim_op,
            word_embed=word_embed,
            intent_embed=intent_embed
        )

    else:
        # reached when model_dir is falsy or meta has no "file" entry
        logger.warning("Failed to load nlu model. Maybe path {} "
                       "doesn't exist"
                       "".format(os.path.abspath(model_dir)))
        return cls(component_config=meta)

In [98]:
# save output
# NOTE(review): absolute, user-specific path — this cell only works on a
# machine where that directory exists or can be created.
model_dir = '/Users/varunn/Documents/NLP-data/'
file_name = 'embedding_intent_classifier_exampledataset'
# Writes the checkpoint and pickle files; the returned metadata dict is shown below.
persist(file_name=file_name, model_dir=model_dir)

{'file': 'embedding_intent_classifier_exampledataset'}

In [124]:
# New message to run through the featurizer and the classifier.
test_inp = Message("show me some indian restuarants")

In [125]:
# Before featurization the message only carries its text.
test_inp.as_dict()

{'text': 'show me some indian restuarants'}

In [126]:
# Featurize the message in place with the fitted featurizer (adds "text_features").
ftr.process(test_inp)

In [127]:
# The message now also holds its feature vector.
test_inp.as_dict()

{'text_features': array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]),
 'text': 'show me some indian restuarants'}

In [128]:
# Classify the message; the result is (intent, intent_ranking).
process(test_inp)

({'name': 'restaurant_search', 'confidence': 0.9497399926185608},
 [{'name': 'restaurant_search', 'confidence': 0.9497399926185608},
  {'name': 'thanks', 'confidence': 0.09301461279392242},
  {'name': 'greet', 'confidence': 0.07009059935808182}])