# Multi-attribute search with vector embeddings

In [1]:
# pip install ipykernel pandas scikit-learn sentence-transformers superlinked tabulate

## Dataset

In [2]:
import json
import pandas as pd
# Raise the column width limit so the long text attributes shown below are not truncated.
pd.set_option("display.max_colwidth", 1000)

In [3]:
filename = "monsters.json"

# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# platform's default locale encoding (which `open` uses when none is given).
with open(filename, encoding="utf-8") as f:
    df = pd.DataFrame(json.load(f))

df.head()

Unnamed: 0,name,look,habitat,behavior
0,Luminoth,Moth-like creature with glowing wings and antenna,Dense forests and jungles with bioluminescent flora,Emits soothing light patterns to communicate and attract prey
1,Aqua Wraith,Translucent humanoid figure made of flowing water,"Rivers, lakes, and coastal areas",Shapeshifts to blend with water bodies and controls currents
2,Stoneheart Golem,Massive humanoid composed of interlocking rock formations,Rocky mountains and ancient ruins,"Hibernates for centuries, awakens to protect its territory"
3,Whispering Shade,"Shadowy, amorphous being with glowing eyes",Dark forests and abandoned buildings,Feeds on fear and whispers unsettling truths
4,Zephyr Dancer,Graceful avian creature with iridescent feathers,High mountain peaks and wind-swept plains,Creates mesmerizing aerial displays to attract mates


In [4]:
filename = "queries.json"

# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# platform's default locale encoding (which `open` uses when none is given).
with open(filename, encoding="utf-8") as f:
    queries = json.load(f)

queries

[{'look': 'glowing',
  'habitat': 'dark places',
  'behavior': 'light manipulation'},
 {'look': 'elemental',
  'habitat': 'extreme environments',
  'behavior': 'environmental control'},
 {'look': 'shapeshifting',
  'habitat': 'varied landscapes',
  'behavior': 'illusion creation'},
 {'look': 'crystalline',
  'habitat': 'mineral-rich areas',
  'behavior': 'energy absorption'},
 {'look': 'ethereal', 'habitat': 'atmospheric', 'behavior': 'mind influence'}]

## Retrieval

In [5]:
# Settings shared by both retrieval approaches below.
# LIMIT: default number of results per search.
LIMIT = 3
# MODEL_NAME: embedding model used by the naive retriever and by every text-similarity space.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

### Naive approach

In [6]:
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [7]:
class NaiveRetriever:
    """Baseline retriever that embeds and indexes every attribute separately.

    One nearest-neighbour index (cosine metric) is fitted per column of the
    input frame. A multi-attribute query is answered by searching each
    attribute on its own and averaging the per-attribute similarity scores.
    """

    def __init__(self, data: pd.DataFrame):
        """Embed each column of ``data`` and fit one kNN index per column.

        Args:
            data: Frame whose index holds the item ids and whose columns hold
                the text attributes to search over.
        """
        self.model = SentenceTransformer(MODEL_NAME)
        # Keep a private copy so later edits to the caller's frame cannot get
        # out of sync with the fitted indices.
        self.data = data.copy()
        self.ids = self.data.index.to_list()
        self.knns = {}
        for key in self.data:
            # `encode` is documented for a string or a list of strings.
            embeddings = self.model.encode(self.data[key].tolist())
            self.knns[key] = NearestNeighbors(metric="cosine").fit(embeddings)

    def search_key(
        self, key: str, value: str, limit: int = LIMIT
    ) -> pd.DataFrame:
        """Find the items whose ``key`` attribute is most similar to ``value``.

        Args:
            key: Name of an indexed attribute (a column of the input frame).
            value: Query text for that attribute.
            limit: Maximum number of items to return; capped at the number of
                indexed items.

        Returns:
            Frame indexed by ``id`` with the columns ``score_<key>`` (cosine
            similarity) and ``<key>`` (the matched attribute text), in the
            order returned by ``kneighbors``.

        Raises:
            KeyError: If ``key`` is not an indexed attribute.
        """
        embedding = self.model.encode(value)
        knn = self.knns[key]
        # kneighbors raises when asked for more neighbours than fitted samples.
        n_neighbors = min(limit, len(self.ids))
        distances, indices = knn.kneighbors(
            [embedding], n_neighbors=n_neighbors, return_distance=True
        )
        positions = indices[0]
        ids = [self.ids[i] for i in positions]

        # by definition:
        # cosine distance = 1 - cosine similarity
        similarities = (1 - distances).flatten()

        # kneighbors returns row positions, so fetch the texts positionally;
        # a label-based lookup would return extra rows for duplicate ids.
        texts = self.data[key].iloc[positions].to_numpy()

        result = pd.DataFrame(
            {"id": ids, f"score_{key}": similarities, key: texts}
        )
        return result.set_index("id")

    def search(self, query: dict, limit: int = LIMIT) -> pd.DataFrame:
        """Search every indexed attribute in ``query`` and merge the results.

        Args:
            query: Mapping of attribute name to query text. Keys without an
                index (for example ``name``) are ignored.
            limit: Number of neighbours fetched per attribute.

        Returns:
            Frame indexed by ``id`` with one ``score_<key>`` column per
            searched attribute plus an overall ``score`` column, sorted by
            ``score`` in descending order. The frame is empty when no key of
            ``query`` is indexed.
        """
        results = [
            self.search_key(key, value, limit=limit).drop(columns=[key])
            for key, value in query.items()
            if key in self.knns
        ]
        if not results:
            # pd.concat raises ValueError on an empty list; return an empty
            # result with the same index name and score column instead.
            return pd.DataFrame({"score": pd.Series(dtype=float)}).rename_axis("id")

        merged_results = pd.concat(results, axis=1)
        # skipna=False: the overall score is NaN for any item that is missing
        # from at least one attribute's result list, so only items returned
        # for every attribute receive a numeric overall score.
        merged_results["score"] = merged_results.mean(axis=1, skipna=False)
        return merged_results.sort_values("score", ascending=False)

In [8]:
# Index the frame by monster name so results are identified by name;
# every remaining column gets its own embedding index.
naive_retriever = NaiveRetriever(df.set_index("name"))

### Search monsters by features

In [9]:
# Use the first sample query: one short keyword description per attribute.
query = queries[0]
query

{'look': 'glowing', 'habitat': 'dark places', 'behavior': 'light manipulation'}

In [10]:
# Search each attribute on its own and print its top matches as a markdown table.
for key, value in query.items():
    result = naive_retriever.search_key(key, value)
    # Heading line, table, then a trailing blank line between attributes.
    content = "\n".join([f"**{key}**", f"{result.to_markdown()}", ""])
    print(content)

**look**
| id               |   score_look | look                                              |
|:-----------------|-------------:|:--------------------------------------------------|
| Whispering Shade |     0.503578 | Shadowy, amorphous being with glowing eyes        |
| Sandstorm Djinn  |     0.407344 | Swirling vortex of sand with glowing symbols      |
| Luminoth         |     0.378619 | Moth-like creature with glowing wings and antenna |

**habitat**
| id                  |   score_habitat | habitat                              |
|:--------------------|----------------:|:-------------------------------------|
| Whispering Shade    |        0.609567 | Dark forests and abandoned buildings |
| Fungal Network      |        0.438856 | Underground caverns and damp forests |
| Thornvine Elemental |        0.423421 | Overgrown ruins and dense jungles    |

**behavior**
| id                |   score_behavior | behavior                                                       |
|:-----------

In [11]:
# Combined search: each attribute contributes its own top 6, merged by id.
# The overall score is NaN for items missing from any attribute's list
# (the mean is taken with skipna=False).
naive_retriever.search(query, limit=6)

Unnamed: 0_level_0,score_look,score_habitat,score_behavior,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Whispering Shade,0.503578,0.609567,,
Sandstorm Djinn,0.407344,0.365061,,
Luminoth,0.378619,,0.345566,
Nebula Jellyfish,0.36627,,0.259969,
Dreamweaver Octopus,0.315679,,,
Quantum Firefly,0.288577,,,
Fungal Network,,0.438856,,
Thornvine Elemental,,0.423421,,
Mist Phantom,,0.366815,0.236649,
Stoneheart Golem,,0.342287,,


### Search monsters similar to existing one

In [12]:
# Use the last monster in the dataset as the query to look for monsters similar to it.
query = df.iloc[-1].to_dict()
query

{'name': 'Harmonic Coral',
 'look': 'Branching, musical instrument-like structure with vibrating tendrils',
 'habitat': 'Shallow seas and tidal pools',
 'behavior': 'Creates complex melodies to communicate and influence emotions'}

In [13]:
# The `name` entry of `query` is skipped by `search`, which only uses keys
# that have an index (look, habitat, behavior).
naive_retriever.search(query)

Unnamed: 0_level_0,score_look,score_habitat,score_behavior,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Harmonic Coral,1.0,1.0,1.0,1.0
Thornvine Elemental,0.437504,,,
Plasma Serpent,0.421306,,,
Dreamweaver Octopus,,0.527561,,
Aqua Wraith,,0.506463,,
Zephyr Dancer,,,0.429848,
Luminoth,,,0.42838,


## Superlinked

In [14]:
from superlinked.framework.common.schema.id_schema_object import IdField
from superlinked.framework.common.schema.schema import schema
from superlinked.framework.common.schema.schema_object import String
from superlinked.framework.dsl.index.index import Index
from superlinked.framework.dsl.space.text_similarity_space import TextSimilaritySpace
from superlinked.framework.dsl.query.param import Param

from superlinked.framework.dsl.executor.in_memory.in_memory_executor import (
    InMemoryExecutor,
)
from superlinked.framework.dsl.source.in_memory_source import InMemorySource
from superlinked.framework.dsl.query.query import Query

from superlinked.framework.common.parser.dataframe_parser import DataFrameParser

In [15]:
# Schema of one monster record: an id plus the three text attributes used for search.
@schema
class Monster:
    id: IdField
    look: String
    habitat: String
    behavior: String


# Schema instance; its fields are referenced below when defining the spaces,
# the parser and the queries.
monster = Monster()

In [16]:
# One text-similarity space per attribute, all using the same embedding model.
look_space = TextSimilaritySpace(text=monster.look, model=MODEL_NAME)
habitat_space = TextSimilaritySpace(text=monster.habitat, model=MODEL_NAME)
behavior_space = TextSimilaritySpace(text=monster.behavior, model=MODEL_NAME)
# A single index built over the three spaces.
monster_index = Index([look_space, habitat_space, behavior_space])

In [17]:
# Parser for DataFrame input; the schema's id field is read from the "name" column.
# NOTE(review): the other fields presumably map to same-named columns — confirm against the DataFrameParser docs.
monster_parser = DataFrameParser(monster, mapping={monster.id: "name"})

In [18]:
# Wire an in-memory source (fed through the parser) and the index into an
# in-memory executor; `run()` returns the app object that is queried below.
source: InMemorySource = InMemorySource(monster, parser=monster_parser)
executor = InMemoryExecutor(sources=[source], indices=[monster_index])
app = executor.run()

In [19]:
# Ingest the monsters DataFrame through the source.
source.put([df])

In [20]:
# Query template: similarity against all three spaces in one query. The text
# for each attribute is supplied at run time through a named Param, and at
# most LIMIT results are requested.
monster_query = (
    Query(monster_index)
    .find(monster)
    .similar(look_space.text, Param("look"))
    .similar(habitat_space.text, Param("habitat"))
    .similar(behavior_space.text, Param("behavior"))
    .limit(LIMIT)
)

In [21]:
def format_result(result) -> pd.DataFrame:
    """Convert a query result into a frame indexed by ``id`` with a leading ``score`` column.

    The scores are read from ``entry.entity.score`` for each entry of
    ``result.entries`` and inserted in front of the columns produced by
    ``result.to_pandas()``.
    """
    entity_scores = [hit.entity.score for hit in result.entries]
    table = result.to_pandas()
    table.insert(loc=0, column="score", value=entity_scores)
    table.set_index(keys="id", inplace=True)
    return table

### Search monsters by features

In [22]:
# Same keyword query as in the naive section: one short description per attribute.
query = queries[0]
query

{'look': 'glowing', 'habitat': 'dark places', 'behavior': 'light manipulation'}

In [23]:
# Fill the query's Params from the `query` dict (its keys match the Param names).
# NOTE(review): `limit=LIMIT` is passed although the query declares no Param
# named "limit" (its limit is fixed via `.limit(LIMIT)`) — confirm it is needed.
result = app.query(monster_query, limit=LIMIT, **query)

format_result(result)

Unnamed: 0_level_0,score,look,habitat,behavior
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Whispering Shade,0.376738,"Shadowy, amorphous being with glowing eyes",Dark forests and abandoned buildings,Feeds on fear and whispers unsettling truths
Luminoth,0.340084,Moth-like creature with glowing wings and antenna,Dense forests and jungles with bioluminescent flora,Emits soothing light patterns to communicate and attract prey
Living Graffiti,0.330587,"Two-dimensional, colorful creature that inhabits flat surfaces","Urban areas, particularly walls and billboards",Shapeshifts to blend with surroundings and absorbs pigments


### Search monsters similar to existing one

In [24]:
# Use the last monster in the dataset as the query to look for monsters similar to it.
query = df.iloc[-1].to_dict()
query

{'name': 'Harmonic Coral',
 'look': 'Branching, musical instrument-like structure with vibrating tendrils',
 'habitat': 'Shallow seas and tidal pools',
 'behavior': 'Creates complex melodies to communicate and influence emotions'}

In [25]:
# Fill the query's Params from the monster's own attribute texts.
# NOTE(review): `query` also carries a `name` key, and `limit=LIMIT` is passed,
# although the query declares no Param for either — confirm they are ignored.
result = app.query(monster_query, limit=LIMIT, **query)

format_result(result)

Unnamed: 0_level_0,score,look,habitat,behavior
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Harmonic Coral,1.0,"Branching, musical instrument-like structure with vibrating tendrils",Shallow seas and tidal pools,Creates complex melodies to communicate and influence emotions
Dreamweaver Octopus,0.402288,Cephalopod with tentacles that shimmer like auroras,Deep ocean trenches and underwater caves,Influences the dreams of nearby creatures
Aqua Wraith,0.330869,Translucent humanoid figure made of flowing water,"Rivers, lakes, and coastal areas",Shapeshifts to blend with water bodies and controls currents


### Search monsters with attribute weights

In [26]:
# Same structure as `monster_query`, plus one weight per space. Each weight is
# a named Param, so it can be set per query at run time.
monster_query_with_weights = (
    Query(monster_index, weights={
        look_space: Param("look_weight"),
        habitat_space: Param("habitat_weight"),
        behavior_space: Param("behavior_weight")
    })
    .find(monster)
    .similar(look_space.text, Param("look"))
    .similar(habitat_space.text, Param("habitat"))
    .similar(behavior_space.text, Param("behavior"))
    .limit(LIMIT)
)

In [27]:
# Use the last monster in the dataset as the query again, this time for the weighted search.
query = df.iloc[-1].to_dict()
query

{'name': 'Harmonic Coral',
 'look': 'Branching, musical instrument-like structure with vibrating tendrils',
 'habitat': 'Shallow seas and tidal pools',
 'behavior': 'Creates complex melodies to communicate and influence emotions'}

In [28]:
# Values for the weight Params declared in `monster_query_with_weights`
# (keys match the Param names).
# NOTE(review): the negative habitat weight presumably penalises habitat
# similarity — confirm against the Superlinked documentation.
weights = {
    "look_weight": 0.5,
    "habitat_weight": -1.0,
    "behavior_weight": 1.0
}

In [29]:
# Run the weighted query: weight values and attribute texts are both passed as named parameters.
# NOTE(review): `query` also carries a `name` key, and `limit=LIMIT` is passed,
# although the query declares no Param for either — confirm they are ignored.
result = app.query(monster_query_with_weights, limit=LIMIT, **weights, **query)

format_result(result)

Unnamed: 0_level_0,score,look,habitat,behavior
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Harmonic Coral,0.19245,"Branching, musical instrument-like structure with vibrating tendrils",Shallow seas and tidal pools,Creates complex melodies to communicate and influence emotions
Luminoth,0.149196,Moth-like creature with glowing wings and antenna,Dense forests and jungles with bioluminescent flora,Emits soothing light patterns to communicate and attract prey
Zephyr Dancer,0.136456,Graceful avian creature with iridescent feathers,High mountain peaks and wind-swept plains,Creates mesmerizing aerial displays to attract mates
