In [1]:
import sys

!{sys.executable} -W ignore:DEPRECATION -m pip install --quiet duckdb==0.7.1 \
duckdb-engine \
watermark \
jupysql \
sqlalchemy \
python-snappy \
pyarrow \
memray \
pandas \
ipywidgets  \
matplotlib \
gensim \
nltk \
plotly \
redis==4.5.3 \
jupyter-black

In [2]:
# Autoformat cells on run
import jupyter_black

jupyter_black.load()

In [3]:
import duckdb
import re
import pandas as pd
import string
from time import time

In [6]:
%load_ext sql
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [9]:
# connect with pyscopg
%sql duckdb:///viberary.duckdb
con = duckdb.connect("viberary.duckdb")

In [107]:
# set log level for model training
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

In [10]:
%sql DESCRIBE select * from read_json_auto('/Users/vicki/viberary/viberary/data/goodreads_books.json',lines='true');

Unnamed: 0,column_name,column_type,null,key,default,extra
0,isbn,VARCHAR,YES,,,
1,text_reviews_count,VARCHAR,YES,,,
2,series,BIGINT[],YES,,,
3,country_code,VARCHAR,YES,,,
4,language_code,VARCHAR,YES,,,
5,popular_shelves,"STRUCT(count BIGINT, ""name"" VARCHAR)[]",YES,,,
6,asin,VARCHAR,YES,,,
7,is_ebook,VARCHAR,YES,,,
8,average_rating,VARCHAR,YES,,,
9,kindle_asin,VARCHAR,YES,,,


In [11]:
%sql select book_id, title, popular_shelves as ps, description from goodreads limit 10;

Unnamed: 0,book_id,title,ps,description
0,5333265,W.C. Fields: A Life on Film,"[{'count': 3, 'name': 'to-read'}, {'count': 1,...",
1,1333909,Good Harbor,"[{'count': 2634, 'name': 'to-read'}, {'count':...","Anita Diamant's international bestseller ""The ..."
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[{'count': 58, 'name': 'to-read'}, {'count': 1...",Omnibus book club edition containing the Ladie...
3,6066819,Best Friends Forever,"[{'count': 7615, 'name': 'to-read'}, {'count':...",Addie Downs and Valerie Adler were eight when ...
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,"[{'count': 32, 'name': 'to-read'}, {'count': 3...",
5,287141,The Aeneid for Boys and Girls,"[{'count': 56, 'name': 'to-read'}, {'count': 1...","Relates in vigorous prose the tale of Aeneas, ..."
6,378460,The Wanting of Levine,"[{'count': 14, 'name': 'to-read'}, {'count': 1...",
7,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,"[{'count': 515, 'name': 'to-read'}, {'count': ...","To Kara's astonishment, she discovers that a p..."
8,34883016,Playmaker: A Venom Series Novella,"[{'count': 4, 'name': 'to-read'}, {'count': 1,...",Secrets. Sometimes keeping them in confidence ...
9,287149,The Devil's Notebook,"[{'count': 961, 'name': 'to-read'}, {'count': ...","Wisdom, humor, and dark observations by the fo..."


In [None]:
# Build input for Redis

# Let's start with title and description as our sentence features
sentences = con.sql(
    """select title, concat_ws(' ' , lower(regexp_replace(title, '[[:^alpha:]]',' ','g')), \
                    lower(regexp_replace(description, '[[:^alpha:]]',' ','g'))) as sentence from goodreads_en;"""
).df()

In [13]:
sentences

Unnamed: 0,title,sentence
0,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",the unschooled wizard sun wolf and starhawk ...
1,Best Friends Forever,best friends forever addie downs and valerie a...
2,The House of Memory (Pluto's Snitch #2),the house of memory pluto s snitch
3,The Bonfire of the Vanities,the bonfire of the vanities
4,Heaven,heaven what is heaven really going to be like ...
...,...,...
866178,"Ondine (Ondine Quartet, #0.5)",ondine ondine quartet i shouldn t get...
866179,Different Breeds,different breeds derek and blake have an under...
866180,"This Sceptred Isle, Vol. 10: The Age of Victor...",this sceptred isle vol the age of victor...
866181,Sherlock Holmes and the July Crisis,sherlock holmes and the july crisis sir arthur...


In [30]:
sentence_sample = sentences[:10]

In [31]:
corpus = sentence_sample["sentence"].tolist()

In [None]:
from sentence_transformers import SentenceTransformer, util

#  A common value for BERT & Co. are 512 word pieces, which corresponde to about 300-400 words (for English).
# Longer texts than this are truncated to the first x word pieces.
# By default, the provided methods use a limit fo 128 word pieces, longer inputs will be truncated
# the runtime and the memory requirement grows quadratic with the input length - we'll have to play around with this

# Change the length to 200
model = SentenceTransformer("all-MiniLM-L6-v2")
model.max_seq_length = 200


corpus_embeddings = embedder.encode(
    corpus, show_progress_bar=True, convert_to_numpy=False
)

In [76]:
embedding_list = [element.tolist() for element in corpus_embeddings]

In [77]:
sentence_sample["embeddings"] = embedding_list

In [84]:
sentence_embeddings = sentence_sample[["title", "embeddings"]]

In [None]:
sentence_embeddings

sentence_embeddings.loc[sentence_embeddings["embeddings"]] = sentence_embeddings[
    ["embeddings"]
].to_numpy()

In [163]:
sentence_embeddings

Unnamed: 0,title,embeddings
0,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[-0.01984275132417679, -0.0362430103123188, -0..."
1,Best Friends Forever,"[-0.06455366313457489, -0.021649153903126717, ..."
2,The House of Memory (Pluto's Snitch #2),"[0.011168530210852623, -0.008854340761899948, ..."
3,The Bonfire of the Vanities,"[0.04934627190232277, 0.094296395778656, -0.04..."
4,Heaven,"[-0.011473871767520905, -0.03340819850564003, ..."
5,Dog Heaven,"[-0.012331654317677021, 0.03569839149713516, -..."
6,Glimmering Light,"[-0.04284968227148056, -0.021880364045500755, ..."
7,"The 30s (Fantastic Films of the Decades, #2)","[-0.005945943295955658, 0.020102590322494507, ..."
8,Crude World: The Violent Twilight of Oil,"[-0.010484326630830765, -0.021200427785515785,..."
9,Untold Secrets: Fire & Ice,"[-0.13027681410312653, -0.03882315009832382, -..."


In [183]:
import numpy as np

embedding_dict = dict(
    zip(
        sentence_embeddings.index,
        sentence_embeddings["embeddings"],
    )
)

In [184]:
embedding_dict

{0: array([-1.98427513e-02, -3.62430103e-02, -2.44700387e-02,  5.97363450e-02,
        -4.14715149e-02,  3.65304500e-02, -2.37880573e-02, -1.07979342e-01,
        -9.82399564e-03,  4.50386330e-02,  2.43876479e-03,  3.70797627e-02,
         5.50738014e-02, -7.12135807e-02, -5.25632314e-02,  2.37911493e-02,
        -9.51675326e-02,  2.69943150e-03,  5.43346144e-02, -2.69930065e-02,
        -9.57429409e-03, -2.83143092e-02,  5.44970706e-02,  8.85461792e-02,
        -1.60235278e-02, -6.61060810e-02,  1.04392497e-02,  2.23759841e-02,
        -8.91439244e-02, -3.16970497e-02, -9.69246775e-02,  2.99694948e-02,
        -7.20835775e-02, -5.48197664e-02, -2.31242273e-02, -3.06527526e-03,
         6.05091415e-02,  4.01089676e-02,  7.26415683e-03,  3.56343612e-02,
        -4.52741422e-02, -4.95026708e-02, -4.10252474e-02,  3.94657776e-02,
        -7.40466500e-03,  1.17625110e-02, -1.79529539e-04, -6.39409646e-02,
        -3.63795981e-02,  4.75370847e-02,  1.99519508e-02, -3.77626047e-02,
        -

In [215]:
array = embedding_dict[1]
print(array.size)

384


In [185]:
# Add vectors to Redis
from redis import Redis
from redis.commands.search.field import VectorField, TextField
from redis.commands.search.query import Query

# Examples https://github.com/RediSearch/RediSearch/blob/master/docs/docs/vecsim-hybrid_queries_examples.ipynb
# More: https://lablab.ai/t/efficient-vector-similarity-search-with-redis-a-step-by-step-tutorial

# Redis client connection
host = "localhost"
port = 6379

redis_conn = Redis(host=host, port=port)

In [216]:
# Index fields and configurations
# Params  https://redis.io/docs/stack/search/reference/vectors/

n_vec = 10
dim = 384
M = 40  # Optional Number of maximum allowed outgoing edges for each node in the graph in each layer.
EF = 200  # Number of maximum allowed potential outgoing edges candidates for each node in the graph
vector_field = "vector"
token_field_name = "token"
index_name = "viberary"
distance_metric = "COSINE"
float_type = "FLOAT64"

In [219]:
# Delete and rebuild index
delete_data(redis_conn)
# docker run -p 6379:6379 redislabs/redisearch:latest - need to run through the container

In [220]:
def delete_data(client: Redis):
    client.flushall()


# Vectors in redis are stored in a binary blob
def load_docs(client: Redis, n, d, vector_dict):
    # an input dictionary from a dictionary
    for i, (k, v) in enumerate(embedding_dict.items()):
        logging.info(f"Inserting {i} vector into Redis index {index_name}")
        np_vector = v.astype(np.float64)
        try:
            # try storing vector, log exceptions if fails to map
            client.hset(k, mapping={vector_field: np_vector.tobytes()})
            logging.info(f"Set {k} vector into Redis index as {vector_field}")
        except Exception as e:
            logging.error("An exception occurred: {}".format(e))


def print_results(res):
    docs = [int(doc.id) for doc in res.docs]
    dists = [float(doc.dist) if hasattr(doc, "dist") else "-" for doc in res.docs]
    print(f"got {len(docs)} doc ids: ", docs)
    print("\ndistances: ", dists)

In [221]:
schema = (
    VectorField(
        vector_field,
        "HNSW",
        {"TYPE": "FLOAT64", "DIM": dim, "DISTANCE_METRIC": "COSINE"},
    ),
    TextField(token_field_name),
)
redis_conn.ft(index_name).create_index(schema)
redis_conn.ft(index_name).config_set("default_dialect", 2)

# load vectors with meta-data
load_docs(redis_conn, n_vec, dim, embedding_dict)

INFO:root:Inserting 0 vector into Redis index viberary
INFO:root:Set 0 vector into Redis index as vector
INFO:root:Inserting 1 vector into Redis index viberary
INFO:root:Set 1 vector into Redis index as vector
INFO:root:Inserting 2 vector into Redis index viberary
INFO:root:Set 2 vector into Redis index as vector
INFO:root:Inserting 3 vector into Redis index viberary
INFO:root:Set 3 vector into Redis index as vector
INFO:root:Inserting 4 vector into Redis index viberary
INFO:root:Set 4 vector into Redis index as vector
INFO:root:Inserting 5 vector into Redis index viberary
INFO:root:Set 5 vector into Redis index as vector
INFO:root:Inserting 6 vector into Redis index viberary
INFO:root:Set 6 vector into Redis index as vector
INFO:root:Inserting 7 vector into Redis index viberary
INFO:root:Set 7 vector into Redis index as vector
INFO:root:Inserting 8 vector into Redis index viberary
INFO:root:Set 8 vector into Redis index as vector
INFO:root:Inserting 9 vector into Redis index viberary


In [222]:
print("index meta: ", redis_conn.ft("viberary").info())

index meta:  {'index_name': 'viberary', 'index_options': [], 'index_definition': [b'key_type', b'HASH', b'prefixes', [b''], b'default_score', b'1'], 'attributes': [[b'identifier', b'vector', b'attribute', b'vector', b'type', b'VECTOR'], [b'identifier', b'token', b'attribute', b'token', b'type', b'TEXT', b'WEIGHT', b'1']], 'num_docs': '10', 'max_doc_id': '10', 'num_terms': '0', 'num_records': '10', 'inverted_sz_mb': '0', 'vector_index_sz_mb': '3.1714706420898438', 'total_inverted_index_blocks': '0', 'offset_vectors_sz_mb': '0', 'doc_table_size_mb': '0.0006389617919921875', 'sortable_values_size_mb': '0', 'key_table_size_mb': '0.0002765655517578125', 'records_per_doc_avg': '1', 'bytes_per_record_avg': '0', 'offsets_per_term_avg': '0', 'offset_bits_per_record_avg': '-nan', 'hash_indexing_failures': '0', 'total_indexing_time': '0.59199999999999997', 'indexing': '0', 'percent_indexed': '1', 'number_of_uses': 1, 'gc_stats': [b'bytes_collected', b'0', b'total_ms_run', b'0', b'total_cycles', b

In [223]:
# Check to make sure we indexed shit
print("index size: ", redis_conn.ft("viberary").info()["num_docs"])

index size:  10


In [226]:
# Vectorize our query by hand for now
# Understand why harry is so low
import pprint

string_query = "dog"
query = embedder.encode(string_query, convert_to_numpy=True)

query_vector = query.astype(np.float64).tobytes()
topK = 10
q = (
    Query(f"*=>[KNN {topK} @{vector_field} $vec_param AS vector_score]")
    .sort_by("vector_score")
    .paging(0, topK)
    .return_fields("token", "vector_score")
    .dialect(2)
)
params_dict = {"vec_param": query_vector}

# Return results
results = redis_conn.ft("viberary").search(q, query_params=params_dict)

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(results.docs)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[   Document {'id': '5', 'payload': None, 'vector_score': '0.681756722132'},
    Document {'id': '2', 'payload': None, 'vector_score': '0.768918196623'},
    Document {'id': '3', 'payload': None, 'vector_score': '0.839949631983'},
    Document {'id': '6', 'payload': None, 'vector_score': '0.847654081991'},
    Document {'id': '7', 'payload': None, 'vector_score': '0.899700937653'},
    Document {'id': '4', 'payload': None, 'vector_score': '0.916752917127'},
    Document {'id': '9', 'payload': None, 'vector_score': '0.929005508633'},
    Document {'id': '0', 'payload': None, 'vector_score': '0.945339063262'},
    Document {'id': '1', 'payload': None, 'vector_score': '0.947969736061'},
    Document {'id': '8', 'payload': None, 'vector_score': '0.955747120524'}]
