In [1]:
import re
from typing import Any, List, Union, Tuple

from collections import defaultdict

import numpy as np
import spacy

from text_to_command.profiler import Profiler


from text_to_command.entities import AppsConfiguration, SystemConfiguration, SessionConfiguration, Function, \
    FunctionIndex, IndexedData, Application, ApplicationIndex
from text_to_command.main import apps_config_factory

In [2]:
from text_to_command.profiler import Profiler
class Indexer:
    word_2_vec_mapper: Any  # spacy.lang.en.English for now

    def __init__(self, word_2_vec_mapper: Any):
        self.word_2_vec_mapper = word_2_vec_mapper

    @staticmethod
    def clear_string(data: str) -> str:
        lc_cleared = data.lower()
        lc_cleared = re.sub(r"[.,?/()\[\]\'\"]", " ", lc_cleared)
        lc_cleared = re.sub(r"\s+", " ", lc_cleared)
        return lc_cleared.strip()

    def get_index_function_data(self, function: Function):
        cleared_lc_name = self.clear_string(function.name)
        cleared_lc_description = self.clear_string(function.description)

        cleared_lc_call_examples = [
            self.clear_string(call_example) for call_example in function.call_examples]

        return FunctionIndex(
            name_index=IndexedData(
                cleared_lc=cleared_lc_name,
                general_vector=self.word_2_vec_mapper(cleared_lc_name).vector
            ),
            description_index=IndexedData(
                cleared_lc=cleared_lc_description,
                general_vector=self.word_2_vec_mapper(cleared_lc_description).vector
            ),
            call_examples_index=[IndexedData(
                cleared_lc=call_example,
                general_vector=self.word_2_vec_mapper(call_example).vector
            ) for call_example in cleared_lc_call_examples]
        )

    def get_index_application_data(self, application: Application):
        cleared_lc_name = self.clear_string(application.name)
        cleared_lc_description = self.clear_string(application.description)

        cleared_lc_tags = [
            self.clear_string(tag) for tag in application.tags]

        return ApplicationIndex(
            name_index=IndexedData(
                cleared_lc=cleared_lc_name,
                general_vector=self.word_2_vec_mapper(cleared_lc_name).vector
            ),
            description_index=IndexedData(
                cleared_lc=cleared_lc_description,
                general_vector=self.word_2_vec_mapper(cleared_lc_description).vector
            ),
            tags_index=[IndexedData(
                cleared_lc=tag,
                general_vector=self.word_2_vec_mapper(tag).vector
            ) for tag in cleared_lc_tags]
        )

    def ensure_indexed(self, configuration_entries: List[Union[Function, Application]]):
        for configuration_entry in configuration_entries:

            if isinstance(configuration_entry, Function):
                if not configuration_entry.indexed_data:
                    configuration_entry.indexed_data = self.get_index_function_data(configuration_entry)
            elif isinstance(configuration_entry, Application):
                if not configuration_entry.indexed_data:
                    configuration_entry.indexed_data = self.get_index_application_data(configuration_entry)
                self.ensure_indexed(configuration_entry.functions)

In [3]:
indexer = Indexer(spacy.load("en_core_web_md"))

In [8]:
profiler = Profiler()

In [9]:
class IntentResolver:  # recommendation system

    indexer: Indexer

    _min_query_ngram_size = 1
    _max_query_ngram_size = 3

    def __init__(self, indexer: Indexer):
        self.indexer = indexer

    @staticmethod
    @profiler.profile("default")
    def _spawn_variants(indices: List[int], min_size: int, max_size: int) -> List[Tuple[int, int]]:
        result = []
        for size in range(min_size, max_size + 1):
            for i in range(len(indices) - size + 1):
                result.append(
                    (indices[i], indices[i + size - 1])
                )
        return result

    @staticmethod
    @profiler.profile("default")
    def get_candidates_vectors(apps_configuration: AppsConfiguration, *other_configs) -> List[Tuple[str, np.array]]:
        candidates_vectors = []
        for app in apps_configuration.applications:
            candidates_vectors.append((app.get_identifier(), app.indexed_data.name_index.general_vector))
            candidates_vectors.append((app.get_identifier(), app.indexed_data.description_index.general_vector))
            candidates_vectors.extend([(app.get_identifier(), t.general_vector) for t in app.indexed_data.tags_index])

            for function in app.functions:
                candidates_vectors.append((function.get_identifier(), function.indexed_data.name_index.general_vector))
                candidates_vectors.append(
                    (function.get_identifier(), function.indexed_data.description_index.general_vector))
                candidates_vectors.extend(
                    [(function.get_identifier(), ce.general_vector) for ce in function.indexed_data.call_examples_index]
                )

        return candidates_vectors

    @staticmethod
    @profiler.profile("default")
    def pairwise_cosine_similarity(f_vector, s_vector):
        return np.sum((f_vector * s_vector), axis=1) / (np.linalg.norm(f_vector, axis=1)*np.linalg.norm(s_vector, axis=1))

    @staticmethod
    @profiler.profile("default")
    def get_cartesian_product(
            query_tokens_parts: List[Tuple[Tuple[int, int], np.array]],
            candidates_vectors: List[Tuple[str, np.array]]
    ) -> Tuple[List[Tuple], List[Tuple]]:
        x = np.tile(query_tokens_parts, (len(candidates_vectors), 1))
        y = np.array(candidates_vectors, dtype=tuple).repeat(len(query_tokens_parts), 0)
        return x, y

    @profiler.scope_controller("default")
    def resolve_intent_recommendations(
            self,
            query: str,
            apps_configuration: AppsConfiguration,
            system_configuration: SystemConfiguration,
            session_configurations: SessionConfiguration
    ):
        with profiler.profile_code("default", "ensure_indexed"):
            self.indexer.ensure_indexed(apps_configuration.applications)

        query = self.indexer.clear_string(query)
        query_parts = query.split(" ")

        query_tokens = self.indexer.word_2_vec_mapper(query)
        variants = self._spawn_variants(
            list(range(len(query_parts))),
            min(self._min_query_ngram_size, len(query_parts) - 1),
            min(self._max_query_ngram_size, len(query_parts) - 1)
        )

        query_tokens_parts, candidates_vectors = self.get_cartesian_product(
            [(v, query_tokens[v[0]: v[1] + 1].vector) for v in variants],
             self.get_candidates_vectors(apps_configuration)
        )

        with profiler.profile_code("default", "rest"):
            query_parts_indices, query_parts_vectors = query_tokens_parts[::, 0], query_tokens_parts[::, 1]
            candidates_vectors_indices, candidates_vectors_vectors = candidates_vectors[::, 0], candidates_vectors[::, 1]

            query_parts_vectors = np.array(list(map(lambda x: list(x), query_parts_vectors)))
            candidates_vectors_vectors = np.array(list(map(lambda x: list(x), candidates_vectors_vectors)))

            similarities = self.pairwise_cosine_similarity(query_parts_vectors, candidates_vectors_vectors)

            grouping = defaultdict(lambda: [[]] * len(query_parts))
            for query_part_indices, candidate_id, similarity in zip(
                    query_parts_indices, candidates_vectors_indices, similarities):
                for i in range(query_part_indices[0], query_part_indices[1] + 1):
                    grouping[candidate_id][i].append(similarity)

            functions_mapping = {}
            for app in apps_configuration.applications:
                for func in app.functions:
                    functions_mapping[func.get_identifier()] = func

            for identifier, func in functions_mapping.items():
                if func.application and func.application.get_identifier() in grouping:
                    for i in range(len(query_parts)):
                        grouping[identifier][i] += grouping[func.application.get_identifier()][i]

            priorities = {key: [sum(d) / len(d) for d in dist] for key, dist in grouping.items()}

        for key, dist in sorted(priorities.items(), key=lambda x: sum(x[1]) / len(x[1]), reverse=True):
            print(key, dist)

In [10]:
intent_resolver = IntentResolver(indexer)
intent_resolver.resolve_intent_recommendations(
    "Send Ann a message 'Buy some potato' with Telegram",
    apps_config_factory(),
    None, None
)

telegram.unread_messages [0.3918569024344795, 0.3918569024344795, 0.3918569024344795, 0.3918569024344795, 0.3918569024344795, 0.3918569024344795, 0.3918569024344795, 0.3918569024344795, 0.3918569024344795]
telegram.send_message [0.3915675294782082, 0.3915675294782082, 0.3915675294782082, 0.3915675294782082, 0.3915675294782082, 0.3915675294782082, 0.3915675294782082, 0.3915675294782082, 0.3915675294782082]
telegram [0.37843263112902775, 0.37843263112902775, 0.37843263112902775, 0.37843263112902775, 0.37843263112902775, 0.37843263112902775, 0.37843263112902775, 0.37843263112902775, 0.37843263112902775]
email.unread_mails [0.37772105157954117, 0.37772105157954117, 0.37772105157954117, 0.37772105157954117, 0.37772105157954117, 0.37772105157954117, 0.37772105157954117, 0.37772105157954117, 0.37772105157954117]
email.send_mail [0.3732615831842059, 0.3732615831842059, 0.3732615831842059, 0.3732615831842059, 0.3732615831842059, 0.3732615831842059, 0.3732615831842059, 0.3732615831842059, 0.3732

In [11]:
profiler.visualize_aggregated()

=== Scope 'default' ===
--- resolve_intent_recommendations ---
called:
    count(1.000000000)
    avg(1.0)/session
execution time
    sum(1.455601931)
    avg(1.455601931)/session
    avg(1.455601931)/call

--- ensure_indexed ---
called:
    count(1.000000000)
    avg(1.0)/session
execution time
    sum(1.010313988)
    avg(1.010313988)/session
    avg(1.010313988)/call

--- _spawn_variants ---
called:
    count(1.000000000)
    avg(1.0)/session
execution time
    sum(0.000005960)
    avg(0.000005960)/session
    avg(0.000005960)/call

--- get_candidates_vectors ---
called:
    count(1.000000000)
    avg(1.0)/session
execution time
    sum(0.000095367)
    avg(0.000095367)/session
    avg(0.000095367)/call

--- get_cartesian_product ---
called:
    count(1.000000000)
    avg(1.0)/session
execution time
    sum(0.000181198)
    avg(0.000181198)/session
    avg(0.000181198)/call

--- rest ---
called:
    count(1.000000000)
    avg(1.0)/session
execution time
    sum(0.437892914)
    avg(