In [4]:
import weaviate
from weaviate.classes.config import Property, DataType, Tokenization, Configure
from weaviate.classes.query import Filter

# Open a connection to a locally running Weaviate instance. No arguments are
# passed, so the client library's default connection settings are used.
client = weaviate.connect_to_local()

In [5]:
# Tokenization methods to compare. One TEXT property is created per entry.
tkn_options = [
    Tokenization.WORD,
    Tokenization.LOWERCASE,
    Tokenization.WHITESPACE,
    Tokenization.FIELD,
]

# Derive the properties from `tkn_options` so that list is the single source
# of truth (previously it was defined but never used, and the four properties
# were written out by hand). The property name comes from the enum member
# name, e.g. Tokenization.WORD -> "text_word".
properties = [
    Property(
        name=f"text_{tokenization.name.lower()}",
        data_type=DataType.TEXT,
        tokenization=tokenization,
    )
    for tokenization in tkn_options
]

# Show which tokenization each property was configured with.
for p in properties:
    print(p.name, p.tokenization)

text_word Tokenization.WORD
text_lowercase Tokenization.LOWERCASE
text_whitespace Tokenization.WHITESPACE
text_field Tokenization.FIELD


In [6]:
# Delete the collection first so the create call below always starts from a
# fresh "TokenExample" collection.
client.collections.delete("TokenExample")

# Create the collection from the `properties` list, with no vectorizer configured.
no_vectorizer = Configure.Vectorizer.none()
collection = client.collections.create(
    name="TokenExample",
    properties=properties,
    vectorizer_config=no_vectorizer,
)

print("Created TokenExample collection")

Created TokenExample collection


In [7]:
property_names = [p.name for p in properties]

# Sample phrases; each one is inserted as its own object.
sample_phrases = [
    "Lois & Clark: The New Adventures of Superman",
    "SW1A 1AA",
    "15-30",
    "30-15",
    "Beyoncé - Single Ladies (Put a Ring on It)",
]

for phrase in sample_phrases:
    # Store the identical text under every property name, so the properties
    # of an object differ only in how they were configured.
    obj_properties = dict.fromkeys(property_names, phrase)
    print(obj_properties)
    collection.data.insert(properties=obj_properties)

{'text_word': 'Lois & Clark: The New Adventures of Superman', 'text_lowercase': 'Lois & Clark: The New Adventures of Superman', 'text_whitespace': 'Lois & Clark: The New Adventures of Superman', 'text_field': 'Lois & Clark: The New Adventures of Superman'}
{'text_word': 'SW1A 1AA', 'text_lowercase': 'SW1A 1AA', 'text_whitespace': 'SW1A 1AA', 'text_field': 'SW1A 1AA'}
{'text_word': '15-30', 'text_lowercase': '15-30', 'text_whitespace': '15-30', 'text_field': '15-30'}
{'text_word': '30-15', 'text_lowercase': '30-15', 'text_whitespace': '30-15', 'text_field': '30-15'}
{'text_word': 'Beyoncé - Single Ladies (Put a Ring on It)', 'text_lowercase': 'Beyoncé - Single Ladies (Put a Ring on It)', 'text_whitespace': 'Beyoncé - Single Ladies (Put a Ring on It)', 'text_field': 'Beyoncé - Single Ladies (Put a Ring on It)'}


In [8]:
def token_test_query(query_term):
    """Filter every property for ``query_term`` and print what was found.

    For each name in the module-level ``property_names`` list, fetches up to
    five objects from the module-level ``collection`` using a ``like`` filter
    on that property, then prints one line per returned object, or a
    "No matches" line when nothing was returned.

    Args:
        query_term: The value passed to the ``like`` filter.

    Returns:
        None. Results are written to standard output only.
    """
    print(f"\nHits for: '{query_term}'")

    # One independent query per property.
    for name in property_names:
        like_filter = Filter.by_property(name).like(query_term)
        matches = collection.query.fetch_objects(
            filters=like_filter,
            limit=5,
        ).objects

        if not matches:
            print(f"No matches for {name}")
            continue

        for obj in matches:
            print(f"'{obj.properties[name]}' found in {name}")

In [9]:
# Baseline: a single word taken from the stored title, with its original casing.
token_test_query(query_term="Superman")


Hits for: 'Superman'
'Lois & Clark: The New Adventures of Superman' found in text_word
'Lois & Clark: The New Adventures of Superman' found in text_lowercase
'Lois & Clark: The New Adventures of Superman' found in text_whitespace
No matches for text_field


In [10]:
# The same word with different casing, then split in two by a space.
for term in ("SUPERman", "Super man"):
    token_test_query(term)


Hits for: 'SUPERman'
'Lois & Clark: The New Adventures of Superman' found in text_word
'Lois & Clark: The New Adventures of Superman' found in text_lowercase
No matches for text_whitespace
No matches for text_field

Hits for: 'Super man'
No matches for text_word
No matches for text_lowercase
No matches for text_whitespace
No matches for text_field


In [11]:
# Words from the stored title joined by "&", skipping the words in between.
token_test_query(query_term="Lois & Superman")


Hits for: 'Lois & Superman'
'Lois & Clark: The New Adventures of Superman' found in text_word
'Lois & Clark: The New Adventures of Superman' found in text_lowercase
'Lois & Clark: The New Adventures of Superman' found in text_whitespace
No matches for text_field


In [12]:
# The start of the stored title, without and then with the trailing colon.
for term in ("Lois & Clark", "Lois & Clark:"):
    token_test_query(term)


Hits for: 'Lois & Clark'
'Lois & Clark: The New Adventures of Superman' found in text_word
No matches for text_lowercase
No matches for text_whitespace
No matches for text_field

Hits for: 'Lois & Clark:'
'Lois & Clark: The New Adventures of Superman' found in text_word
'Lois & Clark: The New Adventures of Superman' found in text_lowercase
'Lois & Clark: The New Adventures of Superman' found in text_whitespace
No matches for text_field


In [13]:
# The stored "SW1A 1AA" value: the whole value, its second part only, and
# both parts in reversed order.
for term in ("SW1A 1AA", "1AA", "1AA SW1A"):
    token_test_query(term)


Hits for: 'SW1A 1AA'
'SW1A 1AA' found in text_word
'SW1A 1AA' found in text_lowercase
'SW1A 1AA' found in text_whitespace
'SW1A 1AA' found in text_field

Hits for: '1AA'
'SW1A 1AA' found in text_word
'SW1A 1AA' found in text_lowercase
'SW1A 1AA' found in text_whitespace
No matches for text_field

Hits for: '1AA SW1A'
'SW1A 1AA' found in text_word
'SW1A 1AA' found in text_lowercase
'SW1A 1AA' found in text_whitespace
No matches for text_field


In [14]:
# The two stored values that contain the same numbers in opposite order.
for term in ("15-30", "30-15"):
    token_test_query(term)


Hits for: '15-30'
'15-30' found in text_word
'30-15' found in text_word
'15-30' found in text_lowercase
'15-30' found in text_whitespace
'15-30' found in text_field

Hits for: '30-15'
'15-30' found in text_word
'30-15' found in text_word
'30-15' found in text_lowercase
'30-15' found in text_whitespace
'30-15' found in text_field
