In [2]:
import json
from typing import List

import numpy as np
import pandas as pd
import spacy
from Levenshtein import distance

from text_to_command.indexer import Indexer

In [3]:
# Load the test index from disk. JSON text is UTF-8 by specification, so the
# encoding is pinned rather than left to the platform's default locale encoding.
with open("test_index_data.json", encoding="utf-8") as f:
    data = json.load(f)

# The top-level keys of the index are the config ids iterated by later cells.
config_ids = list(data.keys())

In [4]:
# The spaCy model has to be installed before this cell can run:
#   python3 -m spacy download en_core_web_md
nlp_model = spacy.load("en_core_web_md")
indexer = Indexer(nlp_model)

In [60]:
# Share of a function's final score that is taken from its parent skill's score.
skill_score_weight = 0.2

# Run the raw query through the indexer's clear_string step, then embed the result.
raw_query = "Is there any messengers?"
query = indexer.clear_string(raw_query)
query_embedding = indexer.get_embedding(query)
# Later cells need an actual embedding, so stop here if none was returned.
assert query_embedding is not None

In [61]:
# Flatten the per-config 'exact' entries into one record per entry. Each entry is
# unpacked after the config id, so its first item lands in "weight" and its
# second in "variants".
exact_columns = ("id", "weight", "variants")
exact_data = []

for config_id in config_ids:
    for exact_entry in data[config_id]['exact']:
        exact_record = dict(zip(exact_columns, (config_id, *exact_entry)))
        exact_data.append(exact_record)

exact_df = pd.DataFrame(exact_data)

In [62]:
def calculate_exact_similarity(query: str):
    """Build a scorer that rates a list of variants against ``query``.

    The similarity of two strings is ``1 - edit_distance / len(longer string)``,
    where the edit distance comes from ``Levenshtein.distance``.

    Args:
        query: The string every variant is compared to.

    Returns:
        A function taking a list of variant strings and returning the highest
        similarity found among them, or ``0`` when the list is empty.
    """
    def internal(variants: List[str]):
        similarity_score = 0
        for variant in variants:
            longest = max(len(query), len(variant))
            if longest == 0:
                # Both strings are empty: they are identical, and the normalised
                # distance below would otherwise divide by zero.
                candidate_score = 1.0
            else:
                candidate_score = 1 - distance(query, variant) / longest
            if candidate_score > similarity_score:
                similarity_score = candidate_score
        return similarity_score
    return internal

In [63]:
# Score every row's variant list against the cleaned query.
score_exact_variants = calculate_exact_similarity(query)
exact_df['similarity'] = exact_df['variants'].map(score_exact_variants)

In [64]:
# Flatten the per-config 'embeddings' entries into one record per entry. Each
# entry is unpacked after the config id, so its first item lands in "weight" and
# its second in "embedding".
embedding_columns = ("id", "weight", "embedding")
embedding_data = []

for config_id in config_ids:
    for embedding_entry in data[config_id]['embeddings']:
        embedding_record = dict(zip(embedding_columns, (config_id, *embedding_entry)))
        embedding_data.append(embedding_record)

embedding_df = pd.DataFrame(embedding_data)

In [65]:
def calculate_embedding_similarity(query_embedding: np.ndarray):
    """Build a scorer that rates stored embeddings against ``query_embedding``.

    The score is the cosine similarity between the two vectors, clamped at 0 so
    that vectors pointing in opposite directions do not yield a negative score.

    Args:
        query_embedding: Vector of the query; anything ``np.asarray`` accepts.

    Returns:
        A function taking one embedding (a list of numbers) and returning its
        clamped cosine similarity to the query. ``0`` is returned when either
        vector has zero norm, since the cosine is undefined in that case.
    """
    # The query side does not change between rows, so convert and measure it once.
    query_vector = np.asarray(query_embedding)
    query_norm = np.linalg.norm(query_vector)

    def internal(embedding: list):
        np_embedding = np.array(embedding)
        denominator = query_norm * np.linalg.norm(np_embedding)
        if denominator == 0:
            # Avoid the 0/0 division (NaN plus a RuntimeWarning) for zero vectors.
            return 0
        return max(0, np.dot(query_vector, np_embedding) / denominator)

    return internal

In [66]:
# Score every stored embedding against the query embedding.
score_embedding = calculate_embedding_similarity(query_embedding)
embedding_df['similarity'] = embedding_df['embedding'].map(score_embedding)

In [67]:
# Stack the exact-match rows and the embedding rows into a single frame,
# keeping only the columns that both frames share.
common_columns = ['id', 'weight', 'similarity']
common_frames = [frame[common_columns] for frame in (exact_df, embedding_df)]
common_df = pd.concat(common_frames, ignore_index=True)

In [68]:
# Scale each similarity by its row weight, then total the weighted values per id.
weighted_similarity = common_df['weight'] * common_df['similarity']
common_df['weighted_similarity'] = weighted_similarity
weighted_by_id = common_df.groupby("id")['weighted_similarity']
grouped = weighted_by_id.sum()

In [69]:
calculated_dict = grouped.to_dict()

# Blend each function's score with its skill's score. Ids containing a dot are
# treated as "<skill>.<function>"; ids without a dot are skills and stay as-is.
# Only values of existing keys are reassigned, so iterating the dict is safe.
for key in calculated_dict:
    if "." not in key:
        continue
    # Split on the first dot only, so an id with several dots no longer raises.
    # NOTE(review): assumes the first segment is always the skill id - confirm.
    skill_id = key.split(".", 1)[0]
    # A skill with no scored rows of its own is missing from the grouped
    # result; count it as 0 instead of failing with a KeyError.
    skill_score = calculated_dict.get(skill_id, 0.0)
    calculated_dict[key] = skill_score_weight * skill_score + (1 - skill_score_weight) * calculated_dict[key]

In [70]:
# Rank all ids from highest to lowest score and show the five best.
ranked_scores = sorted(calculated_dict.items(), key=lambda pair: pair[1], reverse=True)
for key, similarity in ranked_scores[:5]:
    print(key, similarity)

telegram.unread_messages 0.5649771574149363
telegram.send_message 0.5231825345644661
calendar.list 0.4787061854854008
timer.start_new 0.4648475555761865
timer.list 0.46406250862207543
