In [2]:
# _ve/bin/pip install nomic[local]
# note:
#   plain `nomic` doesn't install gpt4all, which is required for local mode; `nomic[local]` does

# Reference:
# https://blog.nomic.ai/posts/local-nomic-embed


In [3]:
from nomic import embed
import numpy as np
import gpt4all
import subprocess
import numpy as np
import pprint

In [4]:
# On first use, nomic's local mode downloads the model weights (262MiB) and
# stores them at:
#     ~/.cache/gpt4all/nomic-embed-text-v1.5.f16.gguf
#
# Expected checksum of that file:
#     shasum -a 256 ~/.cache/gpt4all/nomic-embed-text-v1.5.f16.gguf
#     f7af6f66802f4df86eda10fe9bbcfc75c39562bed48ef6ace719a251cf1c2fdb
#
# Under the hood that work is done by gpt4all, so trigger it by hand here by
# constructing the model directly.
model = gpt4all.GPT4All(model_name='nomic-embed-text-v1.5.f16.gguf')


Failed to load libllamamodel-mainline-cuda-avxonly.so: dlopen: libcuda.so.1: cannot open shared object file: No such file or directory
Failed to load libllamamodel-mainline-cuda.so: dlopen: libcuda.so.1: cannot open shared object file: No such file or directory


In [5]:
# Show the repr of the object built in the previous cell to confirm construction succeeded.
model

<gpt4all.gpt4all.GPT4All at 0x7f530420ba10>

In [6]:
# Smoke test: embed two sample sentences with inference running locally.
# The result is a dict; later cells read its 'usage' and 'embeddings' entries.
output = embed.text(
    model="nomic-embed-text-v1.5",
    inference_mode="local",
    task_type="search_document",
    dimensionality=768,
    texts=[
        "Nomic Embed now supports local and dynamic inference to save you inference latency and cost!",
        "Hey Nomic, why don't you release a multimodal model soon?",
    ],
)


In [7]:
# Token accounting reported by embed.text for the two sample texts.
output['usage']

{'prompt_tokens': 36, 'total_tokens': 36}

In [8]:
# Length of the second sample's embedding vector; should equal the requested dimensionality (768).
len(output['embeddings'][1])


768

# embed and search manpages

In [9]:
# Commands whose manual pages make up the searchable corpus.
# NOTE(review): `man trunc` presumably resolves to the trunc(3) math function rather
# than the truncate(1) utility -- confirm which page was intended.
manpage_commands = ['ls', 'du', 'dd', 'id', 'ps', 'tr', 'cat', 'mv', 'cp', 'trunc']

# Render each page by invoking `man <command>` and keep the UTF-8 decoded text,
# in the same order as manpage_commands. A failing `man` call raises
# subprocess.CalledProcessError (check=True).
manpage_texts = []
for c in manpage_commands:
    manpage_texts.append(
        subprocess.run(['man', c], stdout=subprocess.PIPE, check=True).stdout.decode('utf8')
    )

# One embedding per manpage, tagged as documents (queries are embedded separately).
# Takes about 22 seconds for 10 commands on an i5-8500 CPU @ 3.00GHz.
manpage_embeddings = embed.text(
    model='nomic-embed-text-v1.5',
    inference_mode='local',
    task_type='search_document',
    dimensionality=768,
    texts=manpage_texts,
)


In [10]:
# Turn every manpage embedding into a numpy array so distances can be computed with numpy.
vect_texts = list(map(np.array, manpage_embeddings['embeddings']))


In [12]:
# Natural-language queries; each one is matched against the embedded manpages.
queries = [
    'list running processes',
    'show files in a directory of the filesystem',
    'write data to a block device',
    'find out how much space is used or free',
    'get the uid and gid of the current user',
    'show running processes',
    'substitute characters in a stream',
    'read data from a file and print it to standard output',
    'move or rename a file or directory',
    'copy a file',
    'truncate',
]

# How many of the nearest manpages to report per query
# (interesting to look at, especially when the top result is wrong).
closest_n = 4

# Loop-invariant: stack the per-manpage vectors into one 2-D array (one row per
# manpage) once, instead of letting numpy re-convert the Python list for every query.
manpage_matrix = np.stack(vect_texts)

for query in queries:
    # Embed the query. Note the task_type is "search_query" here, whereas the
    # manpages were embedded with "search_document".
    search_embed = embed.text(
        texts=[
            query
        ],
        model='nomic-embed-text-v1.5',
        task_type="search_query",
        inference_mode='local',
        dimensionality=768,
    )
    vect_query = np.array(search_embed['embeddings'][0])

    # Euclidean (L2) distance between the query and every manpage embedding,
    # vectorized over rows; same values as np.linalg.norm(vect_query - v) for each v.
    # TODO: try cosine, dot product
    distances = np.linalg.norm(manpage_matrix - vect_query, axis=1)

    # Rank once, nearest first. A stable sort keeps the lowest index first on ties,
    # so the reported result is always the first entry of the top-N list below.
    indices_asc_by_distance = np.argsort(distances, kind='stable')
    result_cmd = manpage_commands[indices_asc_by_distance[0]]
    scores_str = ', '.join([
        f'{manpage_commands[ii]}: {distances[ii]:.2f}'
        for ii in indices_asc_by_distance[0:closest_n]])

    print(f'query:  {query}')
    print(f'result: {result_cmd}')
    print(f'top {closest_n} scores: {scores_str}')
    print()

query:  list running processes
result: ps
top 4 scores: ps: 0.85, ls: 0.90, id: 0.95, mv: 0.97

query:  show files in a directory of the filesystem
result: ls
top 4 scores: ls: 0.78, du: 0.83, dd: 0.87, mv: 0.89

query:  write data to a block device
result: dd
top 4 scores: dd: 0.87, du: 0.92, id: 0.93, cp: 0.95

query:  find out how much space is used or free
result: du
top 4 scores: du: 0.82, ls: 0.91, trunc: 0.94, dd: 0.95

query:  get the uid and gid of the current user
result: id
top 4 scores: id: 0.81, du: 0.91, ls: 0.94, dd: 0.95

query:  show running processes
result: ps
top 4 scores: ps: 0.84, ls: 0.90, id: 0.91, cat: 0.93

query:  substitute characters in a stream
result: tr
top 4 scores: tr: 0.91, id: 0.96, mv: 0.96, dd: 0.97

query:  read data from a file and print it to standard output
result: cat
top 4 scores: cat: 0.76, id: 0.83, dd: 0.83, ls: 0.85

query:  move or rename a file or directory
result: mv
top 4 scores: mv: 0.69, cp: 0.78, dd: 0.81, ls: 0.83

query:  copy a 