In [6]:
import re
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

from bert_serving.client import BertClient

In [11]:
from typing import NamedTuple, Optional, Dict, List, Tuple
from abc import ABCMeta, abstractmethod

class Command(NamedTuple):
    code: str
    title: str
    description: str


class IndexedCommand(NamedTuple):
    command: Command
    index: np.ndarray

        
#     parameters: Optional[List['Parameter']] = None

# class Parameter(metaclass=ABCMeta):
#     key: str
    
#     def __init__(self, key: str):
#         if not key:
#             raise Exception("Parameter key is mandatory")
#         self.key = key
    
#     @abstractmethod
#     def is_appropriate(self, token: str) -> bool:
#         pass
    
#     @abstractmethod
#     def transform(self, tokem: str) -> any:
#         pass

    
# class IntegerParameter(Parameter):
    
#     def is_appropriate(self, token: str) -> bool:
#         return token.isnumeric()
    
#     def transform(self, token: str) -> int:
#         return int(token)

# class ExactStringMatchParameter(Parameter):
    
#     vocab: Dict[str, str]  # key - String that appears in the text, value - key
        
#     def __init__(self, key: str, vocab: Dict[str, str]):
#         super(ExactStringMatchParameter, self).__init__(key)
#         if not vocab:
#             raise Exception("Parameter vocab is mandatory")
#         self.vocab = {k.lower(): v for k, v in vocab.items()}
    
#     def is_appropriate(self, token: str) -> bool:
#         return token.lower() in self.vocab
    
#     def transform(self, token: str) -> int:
#         return self.vocab.get(token)

In [12]:

class Preprocessor:
    
    def preprocess(self, text: str) -> str:
        lc_cleared = text.lower()
        lc_cleared = re.sub(r"[0-9.,?/()\[\]\'\":#№$\t;<>!+\-_=%{}><~`|]", " ", lc_cleared)
        lc_cleared = re.sub(r"\s+", " ", lc_cleared)
        return lc_cleared.strip()


class Indexer(metaclass=ABCMeta):
    
    @abstractmethod
    def get_index(self, text: str) -> np.ndarray:
        pass

    
class BertIndexer(Indexer):
    
    def __init__(self, mapper):
        self.mapper = mapper
    
    def get_index(self, text: str) -> np.ndarray:
        return self.mapper.encode([text])[0]


class Predictor:
    
    def rate_commands(self, indexed_commands: List[IndexedCommand], query_index: np.ndarray) -> List[Tuple[str, float]]:
        target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))
        
        commands, commands_indexes = target_vocab[::, 0], target_vocab[::, 1]
        
        a = np.array([np.array(x) for x in commands_indexes])
        b = query_index

        predict = (np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b)) + 1) / 2

        return list(sorted(zip(commands, predict), key=lambda x: x[1], reverse=True))
        

class Resolver:
    
    def resolve(self, prediction: List[Tuple[str, float]]) -> str:
        return prediction[0][0]


class Pipeline:
    preprocessor: Preprocessor
    indexer: Indexer
    predictor: Predictor
    resolver: Resolver
    indexed_commands: List[IndexedCommand]
    
    def __init__(
        self, 
        preprocessor: Preprocessor,
        indexer: Indexer,
        predictor: Predictor,
        resolver: Resolver,
        indexed_commands: List[IndexedCommand]
    ):
        self.preprocessor = preprocessor
        self.indexer = indexer
        self.predictor = predictor
        self.resolver = resolver
        self.indexed_commands = indexed_commands
    
    def predict(self, query: str):
        clean_query = self.preprocessor.preprocess(query)
        indexed_query = self.indexer.get_index(clean_query)
        rating = self.predictor.rate_commands(self.indexed_commands, indexed_query)
        return self.resolver.resolve(rating)


In [13]:
commands = [
    Command(
        code="PlayMusic",
        title="Play Music",
        description="Allows to listen music.",
    ),
    Command(
        code="AddToPlaylist",
        title="Add to playlist",
        description="Adds track to playlist."
    ),
    Command(
        code="RateBook",
        title="Rate Book",
        description="Rates book.",
    ),
    Command(
        code="SearchScreeningEvent",
        title="Search Screening Event",
        description="Searches for screening events",
    ),
    Command(
        code="BookRestaurant",
        title="Book Restaurant",
        description="Books restaurant",
    ),
    Command(
        code="GetWeather",
        title="Get Weather",
        description="Weather information",
    ),
    Command(
        code="SearchCreativeWork",
        title="Search Creative Work",
        description="Searches for creative works, such as films or books.",
    ),
]


# Parameters parsing (Raw)
# Preprocess
# Indexing
# Predicting
# Resolving

In [14]:
bc = BertClient()

preprocessor = Preprocessor()

In [15]:
indexed_commands = []

for command in commands:
    text_to_index = preprocessor.preprocess(command.title + " " + command.description)
    indexed_commands.append(IndexedCommand(
        command=command,
        index=bc.encode([text_to_index])[0]
    ))


In [16]:
train = pd.read_csv("test_data/snips/train.csv")
test = pd.read_csv("test_data/snips/test.csv")

In [17]:
train['intent'].unique()

array(['PlayMusic', 'AddToPlaylist', 'RateBook', 'SearchScreeningEvent',
       'BookRestaurant', 'GetWeather', 'SearchCreativeWork'], dtype=object)

In [18]:
train['intent'].value_counts()

PlayMusic               2014
GetWeather              1996
BookRestaurant          1981
RateBook                1976
SearchScreeningEvent    1952
SearchCreativeWork      1947
AddToPlaylist           1918
Name: intent, dtype: int64

In [19]:
test['intent'].value_counts()

AddToPlaylist           124
SearchScreeningEvent    107
SearchCreativeWork      107
GetWeather              104
BookRestaurant           92
PlayMusic                86
RateBook                 80
Name: intent, dtype: int64

In [20]:
pipeline = Pipeline(preprocessor, BertIndexer(bc), Predictor(), Resolver(), indexed_commands)

In [21]:
prediction = test['text'].map(pipeline.predict)

  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


In [25]:
print("=== Bert with cosine distance ===")
print("f1_score_micro", round(f1_score(test['intent'], prediction, average='micro'), 2), '\n')

print("--- Detailed ---\n")
for intent in train['intent'].unique():
    print(f"{intent}")
    
    TP_FN = (test['intent'] == intent)
    TP = (prediction[TP_FN] == intent)
    
    print("  Recall:", round(TP.astype(int).sum() / TP_FN.astype(int).sum(), 2))
    
    TP_FP = prediction == intent
    print("  Precision:", round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 2))
    
    print()


=== Bert with cosine distance ===
f1_score_micro 0.48 

--- Detailed ---

PlayMusic
  Recall: 0.47
  Precision: 0.33

AddToPlaylist
  Recall: 0.96
  Precision: 0.61

RateBook
  Recall: 0.74
  Precision: 0.4

SearchScreeningEvent
  Recall: 0.21
  Precision: 0.37

BookRestaurant
  Recall: 0.95
  Precision: 0.54

GetWeather
  Recall: 0.06
  Precision: 1.0

SearchCreativeWork
  Recall: 0.02
  Precision: 0.67

