In [1]:
# docker start bert-as-service


# try with examples encoded into 1 vector
# try with cont learning (update indexes)
# try with SGDRegressor

import re
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics import f1_score
from bert_serving.client import BertClient
from timeit import default_timer as timer
from functools import reduce

In [2]:
from typing import NamedTuple, Optional, Dict, List, Tuple, Callable
from abc import ABCMeta, abstractmethod

class Command(NamedTuple):
    code: str
    title: str = None
    description: str = None
    examples: Optional[List[str]] = None


class IndexedCommand(NamedTuple):
    command: Command
    index: np.ndarray

In [3]:

class Preprocessor:
    
    def __init__(self, remove_stop_words: bool):
        self.remove_stop_words = remove_stop_words
    
    stop_words = set(STOP_WORDS)
    
    def preprocess(self, text: str) -> str:
        lc_cleared = text.lower()
        lc_cleared = re.sub(r"[0-9.,?/()\[\]\'\":#№$\t;<>!+\-_=%{}><~`|]", " ", lc_cleared)
        lc_cleared = re.sub(r"\s+", " ", lc_cleared)
        
        lc_cleared = lc_cleared.strip()
        
        if self.remove_stop_words:
            return " ".join(list(filter(lambda x: x not in self.stop_words, lc_cleared.split(" "))))
        else:
            return lc_cleared


class Indexer(metaclass=ABCMeta):
    
    @abstractmethod
    def get_index(self, text: str) -> np.ndarray:
        pass

    
class Word2VecIndexer(Indexer):
    
    def __init__(self, mapper):
        self.mapper = mapper
    
    def get_index(self, text: str) -> np.ndarray:
        embedding = self.mapper(text)
        if embedding.has_vector:
            return embedding.vector


    
class BertIndexer(Indexer):
    
    def __init__(self, mapper):
        self.mapper = mapper
    
    def get_index(self, text: str) -> np.ndarray:
        return self.mapper.encode([text])[0]

        

class Predictor:

    
    def rate_commands(self, indexed_commands: List[IndexedCommand], query_index: np.ndarray) -> List[Tuple[str, float]]:
        target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))
        
        commands, commands_indexes = target_vocab[::, 0], target_vocab[::, 1]
        
        a = np.array([np.array(x) for x in commands_indexes])
        b = query_index

        predict = (np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b)) + 1) / 2

        return list(sorted(zip(commands, predict), key=lambda x: x[1], reverse=True))
        


class Resolver:
    
    def resolve(self, prediction: List[Tuple[str, float]]) -> str:
        return prediction[0][0]


def title_description_commands_indexer(commands: List[Command], preprocessor: Preprocessor, indexer: Indexer):
    indexed_commands = []

    for command in commands:
        text_to_index = preprocessor.preprocess(command.title + " " + command.description)
        indexed_commands.append(IndexedCommand(
            command=command,
            index=indexer.get_index(text_to_index)
        ))
    return indexed_commands


def examples_commands_indexer(commands: List[Command], preprocessor: Preprocessor, indexer: Indexer):
    indexed_commands = []

    for command in commands:
        # for w2v mean of vectors for each sentence is the same that vector for concatenated message (99%)
        # for bert (<87%)
        examples = preprocessor.preprocess(" ".join(command.examples))
        indexed_commands.append(IndexedCommand(
            command=command,
            index=indexer.get_index(examples)
        ))
    return indexed_commands


def mean_examples_commands_indexer(commands: List[Command], preprocessor: Preprocessor, indexer: Indexer):
    indexed_commands = []

    for command in commands:
        indexed_commands.append(IndexedCommand(
            command=command,
            index=reduce(lambda x, acc: x + acc, [indexer.get_index(t) for t in command.examples]) / len(command.examples)
        ))
    return indexed_commands


def bert_pair_sentances_examples_commands_indexer(commands: List[Command], preprocessor: Preprocessor, indexer: Indexer):
    indexed_commands = []

    for command in commands:
        indexed_commands.append(IndexedCommand(
            command=command,
            index=indexer.get_index(" ||| ".join(command.examples))
        ))
    return indexed_commands


class IndexUpdater:
    coef: float
        
    def __init__(self, coef: float):
        assert 0 <= coef <= 1
        self.coef = coef
        
    def update_index(self, command_index: np.array, query_index: np.array):
        return (1 - self.coef) * command_index + self.coef * query_index
    

class Pipeline:
    preprocessor: Preprocessor
    indexer: Indexer
    predictor: Predictor
    resolver: Resolver
    indexed_commands: List[IndexedCommand]
    index_updater: Optional[IndexUpdater]
    
    def __init__(
        self, 
        preprocessor: Preprocessor,
        indexer: Indexer,
        predictor: Predictor,
        resolver: Resolver,
        commands: List[Command],
        commands_indexer: Callable[[List[Command], Preprocessor, Indexer], List[IndexedCommand]],
        index_updater: Optional[IndexUpdater] = None
    ):
        self.preprocessor = preprocessor
        self.indexer = indexer
        self.predictor = predictor
        self.resolver = resolver
        self.index_updater = index_updater
 
        self.indexed_commands = commands_indexer(commands, preprocessor, indexer)
    
    def train(self, query: str, target: str) -> 'Pipeline':
        clean_query = self.preprocessor.preprocess(query)
        indexed_query = self.indexer.get_index(clean_query)
        rating = self.predictor.rate_commands(self.indexed_commands, indexed_query)
        prediction = self.resolver.resolve(rating)
        if prediction != target and self.index_updater is not None:
            mapping = {c.command.code: (c.command, c.index) for c in self.indexed_commands}
            mapping[target] = (
                mapping[target][0],
                self.index_updater.update_index(mapping[target][1], indexed_query)
            )
            self.indexed_commands = [IndexedCommand(
                command=c,
                index=i
            ) for c, i in mapping.values()]
        
    
    def predict(self, query: str):
        clean_query = self.preprocessor.preprocess(query)
        indexed_query = self.indexer.get_index(clean_query)
        rating = self.predictor.rate_commands(self.indexed_commands, indexed_query)
        return self.resolver.resolve(rating)


In [4]:
train = pd.read_csv("test_data/snips/train.csv")
test = pd.read_csv("test_data/snips/test.csv")

In [5]:
train['intent'].value_counts(), test['intent'].value_counts()

(PlayMusic               2014
 GetWeather              1996
 BookRestaurant          1981
 RateBook                1976
 SearchScreeningEvent    1952
 SearchCreativeWork      1947
 AddToPlaylist           1918
 Name: intent, dtype: int64,
 AddToPlaylist           124
 SearchScreeningEvent    107
 SearchCreativeWork      107
 GetWeather              104
 BookRestaurant           92
 PlayMusic                86
 RateBook                 80
 Name: intent, dtype: int64)

In [6]:
def test_pipeline(pipeline: Pipeline, df):
    stats = {}
    
    start = timer()
    
    prediction = df['text'].map(pipeline.predict)
    
    stats['prediction_time'] = timer() - start
    
    stats['f1_score_micro'] = round(f1_score(test['intent'], prediction, average='micro'), 3)
    stats['f1_score_macro'] = round(f1_score(test['intent'], prediction, average='macro'), 3)
    
    stats['detailed'] = {}
    
    for intent in df['intent'].unique():
        TP_FN = (df['intent'] == intent)
        TP = (prediction[TP_FN] == intent)
        TP_FP = prediction == intent
        stats['detailed'][intent] = {
            "recall": round(TP.astype(int).sum() / TP_FN.astype(int).sum(), 3),
            "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
        }
    return stats



def spawn_named_combinations(*args: List[List[Dict]]):
    result = args[0]
    
    for candidates in args[1:]:
        new_result = []
        for r in result:
            for c in candidates:
                name_r = r['name']
                name_c = c["name"]
                new_name = name_r + "/" + name_c
                new_result.append({
                    **r,
                    **c,
                    "name": new_name
                })
        result = new_result
    return result
    

def test_pipeline_configs(configs: List[Dict], df, train_df = None):
    result = {}
    
    for config in configs:
        name = config['name']
        pipeline = Pipeline(**{k: v for k, v in config.items() if k != "name"})
        
        if train_df is not None:
            print(f"Trainin pipeline {name}")
            train_df.apply(lambda row: pipeline.train(row['text'], row['intent']), axis=1)
            print(f"Training finished")
        
        result[name] = test_pipeline(pipeline, df)
        print(f"Pipeline {name} was trained and estimated.")
    
    print("\n ===== Results ==== ")
    for name, score in sorted(map(lambda x: (x[0], x[1]['f1_score_micro']), result.items()), key = lambda x: x[1], reverse=True):
        print(f"{score:.3f} | {name}")
        
    return result

In [7]:
short_commands_description = [
        Command(
            code="PlayMusic",
            title="Play Music",
            description="Allows to listen music.",
        ),
        Command(
            code="AddToPlaylist",
            title="Add to playlist",
            description="Adds track to playlist."
        ),
        Command(
            code="RateBook",
            title="Rate Book",
            description="Rates book.",
        ),
        Command(
            code="SearchScreeningEvent",
            title="Search Screening Event",
            description="Searches for screening events",
        ),
        Command(
            code="BookRestaurant",
            title="Book Restaurant",
            description="Books restaurant",
        ),
        Command(
            code="GetWeather",
            title="Get Weather",
            description="Weather information",
        ),
        Command(
            code="SearchCreativeWork",
            title="Search Creative Work",
            description="Searches for creative works, such as films or books.",
        ),
    ]



extended_commands_description = [
        Command(
            code="PlayMusic",
            title="Play Music",
            description="Starts selected song from media.",
        ),
        Command(
            code="AddToPlaylist",
            title="Add to playlist",
            description="Adds soundtrack to your media playlist."
        ),
        Command(
            code="RateBook",
            title="Rate Book",
            description="Adds your review about selected book.",
        ),
        Command(
            code="SearchScreeningEvent",
            title="Search Screening Event",
            description="Searches for screening events",
        ),
        Command(
            code="BookRestaurant",
            title="Book Restaurant",
            description="Books a selected restaurant for specific date and time",
        ),
        Command(
            code="GetWeather",
            title="Get Weather",
            description="Provides information about the weather conditions, temperature, humidity for specific date.",
        ),
        Command(
            code="SearchCreativeWork",
            title="Search Creative Work",
            description="Searches for creative works, such as films or books.",
        ),
    ]


commands_with_5_examples = [
        Command(
            code="PlayMusic",
            examples=list(train[train['intent'] == "PlayMusic"]['text'][:5].values)
        ),
        Command(
            code="AddToPlaylist",
            examples=list(train[train['intent'] == "AddToPlaylist"]['text'][:5].values)
        ),
        Command(
            code="RateBook",
            examples=list(train[train['intent'] == "RateBook"]['text'][:5].values)
        ),
        Command(
            code="SearchScreeningEvent",
            examples=list(train[train['intent'] == "SearchScreeningEvent"]['text'][:5].values)
        ),
        Command(
            code="BookRestaurant",
            examples=list(train[train['intent'] == "BookRestaurant"]['text'][:5].values)
        ),
        Command(
            code="GetWeather",
            examples=list(train[train['intent'] == "GetWeather"]['text'][:5].values)
        ),
        Command(
            code="SearchCreativeWork",
            examples=list(train[train['intent'] == "SearchCreativeWork"]['text'][:5].values)
        ),
    ]


commands_with_10_examples = [
        Command(
            code="PlayMusic",
            examples=list(train[train['intent'] == "PlayMusic"]['text'][:10].values)
        ),
        Command(
            code="AddToPlaylist",
            examples=list(train[train['intent'] == "AddToPlaylist"]['text'][:10].values)
        ),
        Command(
            code="RateBook",
            examples=list(train[train['intent'] == "RateBook"]['text'][:10].values)
        ),
        Command(
            code="SearchScreeningEvent",
            examples=list(train[train['intent'] == "SearchScreeningEvent"]['text'][:10].values)
        ),
        Command(
            code="BookRestaurant",
            examples=list(train[train['intent'] == "BookRestaurant"]['text'][:10].values)
        ),
        Command(
            code="GetWeather",
            examples=list(train[train['intent'] == "GetWeather"]['text'][:10].values)
        ),
        Command(
            code="SearchCreativeWork",
            examples=list(train[train['intent'] == "SearchCreativeWork"]['text'][:10].values)
        ),
    ]

In [8]:
preprocessor_with_st_removing = Preprocessor(remove_stop_words=True)
preprocessor_without_st_removing = Preprocessor(remove_stop_words=False)

In [9]:
w2v_indexer = Word2VecIndexer(spacy.load("en_core_web_md"))
bert_indexer = BertIndexer(BertClient())



In [10]:
index_updater_01 = IndexUpdater(0.1)
index_updater_03 = IndexUpdater(0.3)

In [12]:
configs = spawn_named_combinations(
    [
        {"name": "BERT", "indexer": bert_indexer},
        {"name": "Word2Vec", "indexer": w2v_indexer}
    ],
    [
        {"name": "RemoveST", "preprocessor": preprocessor_with_st_removing}, 
        {"name": "SaveST", "preprocessor": preprocessor_without_st_removing}
    ],
    [
        {
            "name": "5ExamplesCommands",
            "commands": commands_with_5_examples,
            "commands_indexer": examples_commands_indexer
        },
        {
            "name": "10ExamplesCommands",
            "commands": commands_with_10_examples,
            "commands_indexer": examples_commands_indexer
        }, 
        {
            "name": "5MeanExamplesCommands",
            "commands": commands_with_5_examples,
            "commands_indexer": mean_examples_commands_indexer
        },

        {
            "name": "10MeanExamplesCommands",
            "commands": commands_with_10_examples,
            "commands_indexer": mean_examples_commands_indexer
        },
    ],
    [
        {
            "name": "NoneIndexUpdater",
            "index_updater": None
        },
        {
            "name": "IndexUpdater0.1",
            "index_updater": index_updater_01
        },
        {
            "name": "IndexUpdater0.3",
            "index_updater": index_updater_03
        },
    ],
    [
        {
            "name": "CosineDist",
            "predictor": Predictor(),
        }
    ],
    [
        {
            "name": "default",
            "resolver": Resolver()
        }
    ]
)

In [13]:
details = test_pipeline_configs(configs, test, train_df=pd.concat([train[train.intent == intent].head(70) for intent in train.intent.unique()]))

here is what you can do:
- or, start a new server with a larger "max_seq_len"


Trainin pipeline BERT/RemoveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/RemoveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/RemoveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/RemoveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/RemoveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)


Pipeline BERT/RemoveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)


Pipeline BERT/RemoveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline BERT/RemoveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)


Pipeline BERT/RemoveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline BERT/RemoveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline BERT/RemoveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/RemoveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/RemoveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/SaveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/SaveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/SaveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/SaveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
here is what you can do:
- or, start a new server with a larger "max_seq_len"


Pipeline BERT/SaveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)


Pipeline BERT/SaveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)


Pipeline BERT/SaveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline BERT/SaveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)


Pipeline BERT/SaveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline BERT/SaveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline BERT/SaveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline BERT/SaveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline BERT/SaveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/RemoveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/RemoveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/5ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/5ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/10ExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished


  "precision": round(TP.astype(int).sum() / TP_FP.astype(int).sum(), 3)
  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Pipeline Word2Vec/SaveST/5MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default
Training finished
Pipeline Word2Vec/SaveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/10MeanExamplesCommands/NoneIndexUpdater/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default was trained and estimated.
Trainin pipeline Word2Vec/SaveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


  target_vocab = np.array(list(map(lambda c: np.array((c.command.code, c.index)), indexed_commands)))


Training finished
Pipeline Word2Vec/SaveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default was trained and estimated.

 ===== Results ==== 
0.820 | Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default
0.789 | Word2Vec/RemoveST/5MeanExamplesCommands/IndexUpdater0.1/CosineDist/default
0.779 | Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default
0.741 | Word2Vec/SaveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default
0.740 | Word2Vec/RemoveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default
0.734 | Word2Vec/RemoveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default
0.726 | Word2Vec/SaveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default
0.726 | Word2Vec/SaveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default
0.724 | BERT/SaveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default
0.721 | Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default
0.721 | Word2Vec/RemoveST/10MeanExamplesCo

In [13]:
# snips f1_micro(по 70) = 0.79
# snips f1_micro(по 286 - всего 2000) = 0.93


# With index updating 2002 (uniform) training samples
# 0.779 | Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default
# 0.759 | Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.1/CosineDist/default


# With index updating 70 (uniform) training samples
# 0.770 | Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default
# 0.759 | Word2Vec/RemoveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default


# With index updating 150 (random) training samples
# 0.819 | Word2Vec/SaveST/10ExamplesCommands/IndexUpdater0.3/CosineDist/default
# 0.804 | Word2Vec/SaveST/10MeanExamplesCommands/IndexUpdater0.3/CosineDist/default
# 0.791 | Word2Vec/SaveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default
# 0.754 | Word2Vec/SaveST/5MeanExamplesCommands/IndexUpdater0.3/CosineDist/default


# 0.683 | Word2Vec/RemoveST/10ExamplesCommands/CosineDist/default
# 0.64  | Word2Vec/RemoveST/ExtendedCommands/CosineDist/default
# 0.624 | Word2Vec/RemoveST/5ExamplesCommands/CosineDist/default
# 0.617 | BERT/SaveST/5MeanExamplesCommands/CosineDist/default

In [16]:
details['Word2Vec/RemoveST/5ExamplesCommands/IndexUpdater0.3/CosineDist/default']

{'prediction_time': 2.990687382000033,
 'f1_score_micro': 0.77,
 'f1_score_macro': 0.784,
 'detailed': {'AddToPlaylist': {'recall': 0.798, 'precision': 0.971},
  'BookRestaurant': {'recall': 0.674, 'precision': 0.969},
  'GetWeather': {'recall': 0.952, 'precision': 0.884},
  'PlayMusic': {'recall': 0.767, 'precision': 0.857},
  'SearchScreeningEvent': {'recall': 0.785, 'precision': 0.609},
  'SearchCreativeWork': {'recall': 0.598, 'precision': 0.457},
  'RateBook': {'recall': 0.812, 'precision': 0.97}}}

In [17]:
details['BERT/RemoveST/10MeanExamplesCommands/IndexUpdater0.1/CosineDist/default']

{'prediction_time': 65.00556309500098,
 'f1_score_micro': 0.701,
 'f1_score_macro': 0.684,
 'detailed': {'AddToPlaylist': {'recall': 0.831, 'precision': 0.786},
  'BookRestaurant': {'recall': 0.891, 'precision': 0.573},
  'GetWeather': {'recall': 0.644, 'precision': 0.957},
  'PlayMusic': {'recall': 0.221, 'precision': 0.95},
  'SearchScreeningEvent': {'recall': 0.748, 'precision': 0.784},
  'SearchCreativeWork': {'recall': 0.794, 'precision': 0.489},
  'RateBook': {'recall': 0.688, 'precision': 0.917}}}

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import random

In [17]:
def _get_commands(indexed_commands_mapping, valid_intent, imballance_coefficient):
    commands = [indexed_commands_mapping[valid_intent]]
    
    other_commands_keys = set(indexed_commands_mapping.keys()) - {valid_intent}
    
    for key in random.sample(other_commands_keys, imballance_coefficient):
        commands.append(indexed_commands_mapping[key])
    
    return commands
        
    

def main(from_df, commands, indexer, preprocessor, commands_indexer, imballance_coefficient = 1):
    
    indexed_commands = commands_indexer(commands, preprocessor, indexer)
    indexed_commands_mapping = {c.command.code: c for c in indexed_commands}

    X = []
    y = []
    
    # fill dataset
    for _, row in from_df.iterrows():
        
        text_index = w2v_indexer.get_index(preprocessor.preprocess(row["text"]))  # add preprocessing
        for ic in _get_commands(indexed_commands_mapping, row["intent"], imballance_coefficient):
            X.append([*text_index, *ic.index, (np.dot(text_index, ic.index)/(np.linalg.norm(text_index)*np.linalg.norm(ic.index)) + 1) / 2])
            y.append(int(row["intent"] == ic.command.code))
    
    X, y = pd.DataFrame(X), pd.DataFrame(y)
    return X, y

In [19]:
%%time
X, y = main(pd.concat([train[train.intent == intent].head(1000) for intent in train.intent.unique()]), commands_with_5_examples, w2v_indexer, preprocessor_with_st_removing, mean_examples_commands_indexer)

  X.append([*text_index, *ic.index, (np.dot(text_index, ic.index)/(np.linalg.norm(text_index)*np.linalg.norm(ic.index)) + 1) / 2])


CPU times: user 34.1 s, sys: 270 ms, total: 34.3 s
Wall time: 34.3 s


In [48]:
mask = X.isna().astype(int).sum(axis=1) == 0
X = X[mask]
y = y[mask]

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42)

In [59]:
%%time

config = {
    "est__loss": ['log',],
    "est__alpha": [1e-3, 1e-2,],
    "pca__n_components": [200, 300, 400, 500]
}

model = Pipeline([
    ("pca", PCA()),
    ("est", SGDClassifier())
])


gs_cv = GridSearchCV(
    estimator=model,
    param_grid=config,
    scoring="f1",
    n_jobs=-1,
    cv=5,
    verbose=2
)


gs_cv.fit(X_train, y_train.values.reshape(-1))

print(f"Best CV score: {gs_cv.best_score_}")

prediction = gs_cv.predict(X_test)

print(f"f1 {round(f1_score(y_test, prediction), 3)}")

print(f"precision {round(precision_score(y_test, prediction), 3)}")
print(f"recall {round(recall_score(y_test, prediction), 3)}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best CV score: 0.7714667209697723
f1 0.785
precision 0.731
recall 0.848
CPU times: user 15.3 s, sys: 11.9 s, total: 27.3 s
Wall time: 47 s


In [60]:
gs_cv.best_params_

{'est__alpha': 0.001, 'est__loss': 'log', 'pca__n_components': 300}