In [2]:
from typing import Any

import pandas as pd

import cbrkit
# Index label of the case used as the query, and the CSV file holding the cases.
query_name = 42
casebase_file = "data/cars-1k.csv"

# Read the CSV with pandas, then wrap the frame with cbrkit's dataframe loader
# so that a single case can be looked up by its index label.
df = pd.read_csv(casebase_file)
casebase = cbrkit.loaders.dataframe(df)
# `casebase` and `query` are consumed by the retrieval cell that follows.
query = casebase[query_name]
print(query)
# Bare last expression: the frame is rendered as the cell's output.
df

price             13686
year               2011
manufacturer     subaru
make            outback
fuel             diesel
miles           6024800
title_status      clean
transmission     manual
drive               rwd
type            compact
paint_color       black
Name: 42, dtype: object


Unnamed: 0,price,year,manufacturer,make,fuel,miles,title_status,transmission,drive,type,paint_color
0,22168,2011,mercedes-benz,viano,diesel,203593,rebuilt,manual,fwd,van,black
1,9437,2011,ford,s-max,diesel,137316,rebuilt,manual,fwd,van,black
2,1073,2002,hyundai,matrix,gas,182000,rebuilt,manual,fwd,van,black
3,1846,2012,chrysler,town-country,gas,122800,clean,manual,fwd,van,black
4,3515,2006,fiat,doblo,diesel,155623,clean,manual,4wd,van,black
...,...,...,...,...,...,...,...,...,...,...,...
994,462,2007,citroen,c3,gas,764905,clean,manual,fwd,compact,black
995,1665,2000,volvo,v70,gas,300000,rebuilt,manual,fwd,compact,black
996,3663,2006,opel,zafira,gas,94000,clean,manual,fwd,compact,black
997,1295,1360,citroen,xsara,gas,150000,clean,manual,rwd,compact,black


In [4]:
# Per-attribute similarity measures for the flat (CSV-based) cases.
attribute_sims = {
    "price": cbrkit.sim.numeric.linear(max=100000),
    "year": cbrkit.sim.numeric.linear(max=50),
    "manufacturer": cbrkit.sim.taxonomy.load(
        "./data/cars-taxonomy.yaml",
        measure=cbrkit.sim.taxonomy.wu_palmer(),
    ),
    # TODO: needs nlp extra to be available during tests
    # "make": cbrkit.data_sim.strings.levenshtein(),
    # NOTE(review): the other cells of this notebook use the `cbrkit.sim.*`
    # namespace, so the path above is presumably stale -- confirm before enabling.
    # NOTE(review): the query's `miles` value (6024800) lies far beyond this
    # max, and the recorded neighbours all score 0.0 on `miles` -- confirm
    # that the bound is intended.
    "miles": cbrkit.sim.numeric.linear(max=1000000),
}

# Global similarity: mean over the per-attribute scores. The recorded output
# also lists attributes without an entry above (e.g. fuel, drive), presumably
# scored through `types_fallback` -- verify against the cbrkit docs.
case_sim = cbrkit.global_sim.attribute_value(
    attributes=attribute_sims,
    types_fallback=cbrkit.sim.generic.equality(),
    aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
)

# Keep the five best matches and run the retrieval for `query`.
retriever = cbrkit.retrieval.build(case_sim, limit=5)
result = cbrkit.retrieval.apply(casebase, query, retriever)
result

Result(final=_Result(similarities={42: AttributeValueSim(value=1.0, by_attribute={'manufacturer': 1.0, 'price': 1.0, 'fuel': 1.0, 'drive': 1.0, 'transmission': 1.0, 'title_status': 1.0, 'paint_color': 1.0, 'miles': 1.0, 'type': 1.0, 'year': 1.0, 'make': 1.0}), 204: AttributeValueSim(value=0.726909090909091, by_attribute={'manufacturer': 0.0, 'price': 0.996, 'fuel': 1.0, 'drive': 1.0, 'transmission': 1.0, 'title_status': 1.0, 'paint_color': 1.0, 'miles': 0.0, 'type': 1.0, 'year': 1.0, 'make': 0.0}), 714: AttributeValueSim(value=0.7268754545454545, by_attribute={'manufacturer': 0.0, 'price': 0.99563, 'fuel': 1.0, 'drive': 1.0, 'transmission': 1.0, 'title_status': 1.0, 'paint_color': 1.0, 'miles': 0.0, 'type': 1.0, 'year': 1.0, 'make': 0.0}), 343: AttributeValueSim(value=0.7234418181818182, by_attribute={'manufacturer': 0.0, 'price': 0.97786, 'fuel': 1.0, 'drive': 1.0, 'transmission': 1.0, 'title_status': 1.0, 'paint_color': 1.0, 'miles': 0.0, 'type': 1.0, 'year': 0.98, 'make': 0.0}), 406

In [5]:
# Same query index as in the CSV example, now against the YAML case base.
query_name = 42
casebase_file = "data/cars-1k.yaml"

# Rebinds `casebase` and `query`, which previously held the CSV-based data.
casebase: dict[int, Any] = cbrkit.loaders.yaml(casebase_file)
query = casebase[query_name]

# Measures for the top-level numeric attributes.
price_sim = cbrkit.sim.numeric.linear(max=100000)
year_sim = cbrkit.sim.numeric.linear(max=50)

# "model" is compared with its own attribute-value measure over the nested
# "make" and "manufacturer" entries.
model_sim = cbrkit.global_sim.attribute_value(
    attributes={
        "make": cbrkit.sim.generic.equality(),
        "manufacturer": cbrkit.sim.taxonomy.load(
            "./data/cars-taxonomy.yaml",
            measure=cbrkit.sim.taxonomy.wu_palmer(),
        ),
    }
)

# Mean over price, year and the nested model similarity; no fallback is
# configured here, and the recorded output lists only these three attributes.
case_sim = cbrkit.global_sim.attribute_value(
    attributes={
        "price": price_sim,
        "year": year_sim,
        "model": model_sim,
    },
    aggregator=cbrkit.global_sim.aggregator(pooling="mean"),
)

# Keep the five best matches and run the retrieval for `query`.
retriever = cbrkit.retrieval.build(case_sim, limit=5)
result = cbrkit.retrieval.apply(casebase, query, retriever)
result

Result(final=_Result(similarities={42: AttributeValueSim(value=1.0, by_attribute={'price': 1.0, 'year': 1.0, 'model': AttributeValueSim(value=1.0, by_attribute={'manufacturer': 1.0, 'make': 1.0})}), 499: AttributeValueSim(value=0.8137366666666667, by_attribute={'price': 0.96121, 'year': 0.98, 'model': AttributeValueSim(value=0.5, by_attribute={'manufacturer': 1.0, 'make': 0.0})}), 221: AttributeValueSim(value=0.7765633333333333, by_attribute={'price': 0.86969, 'year': 0.96, 'model': AttributeValueSim(value=0.5, by_attribute={'manufacturer': 1.0, 'make': 0.0})}), 512: AttributeValueSim(value=0.7744733333333333, by_attribute={'price': 0.98342, 'year': 0.84, 'model': AttributeValueSim(value=0.5, by_attribute={'manufacturer': 1.0, 'make': 0.0})}), 600: AttributeValueSim(value=0.7673333333333333, by_attribute={'price': 0.902, 'year': 0.9, 'model': AttributeValueSim(value=0.5, by_attribute={'manufacturer': 1.0, 'make': 0.0})})}, ranking=[42, 499, 221, 512, 600], casebase={42: {'engine': {'dr

In [6]:
# Linear numeric similarity with max=100; the recorded result for a
# difference of 10 (50 vs. 60) is 0.9.
sim = cbrkit.sim.numeric.linear(max=100)
sim(50, 60)

0.9

In [8]:
# Threshold numeric similarity with a threshold of 10; 50 vs. 61 differ by 11
# and the recorded result is 0.0.
sim = cbrkit.sim.numeric.threshold(10)
sim(50, 61)

0.0

In [9]:
# Exponential numeric similarity with parameter 0.1; the recorded result for
# 50 vs. 60 is ~0.3679, which equals exp(-1) = exp(-0.1 * 10).
sim = cbrkit.sim.numeric.exponential(0.1)
sim(50, 60)

0.36787944117144233

In [19]:
# Sigmoid numeric similarity built with the positional arguments (1, 10).
# NOTE(review): presumably these are the curve's tilt and its 0.5 point --
# confirm against the cbrkit API.
sim = cbrkit.sim.numeric.sigmoid(1, 10)
# A cell displays only its last expression, so exactly one pair is evaluated
# here; the recorded result for 50 vs. 58 is ~0.8808.
sim(50, 58)

0.8807970779778823

In [22]:
# Table-based similarity with explicit scores for the pairs (a, b) and (b, c).
# NOTE(review): the positional `True` and `0.0` are presumably the symmetry
# flag and the score for unlisted pairs -- confirm against the cbrkit API.
sim = cbrkit.sim.generic.table(
    [
        ("a", "b", 0.5),
        ("b", "c", 0.7),
    ],
    True,
    0.0,
)
# ("a", "c") has no entry in the table; the recorded result is 0.0.
sim("a", "c")

0.0

In [6]:
# Jaro string similarity; the recorded score for this pair is ~0.746.
sim = cbrkit.sim.strings.jaro()
sim("kitten", "sitting")

0.746031746031746