In [None]:
!pip install duckdb==0.7.1 \
duckdb-engine \
watermark \
jupysql \
sqlalchemy \
python-snappy \
pyarrow \
memray \
pandas  \
ipywidgets  \
matplotlib \
gensim \
nltk \
plotly \
redis==4.5.3 \
jupyter-black \
sentence_transformers \
redis \
jupyter_black

In [None]:
import sys
from tqdm import tqdm
import numpy as np
import pyarrow as pa
import pandas as pd

In [None]:
# Autoformat cells on run: load the jupyter_black extension for this session.
import jupyter_black
import pandas as pd  # NOTE(review): pandas is already imported in an earlier cell; harmless repeat

# Activate the formatter for the rest of the notebook session.
jupyter_black.load()

In [None]:
# Surface INFO-level (and above) log records while the model runs.
import logging

logger = logging.getLogger()  # no name given -> the root logger
logger.setLevel(logging.INFO)

# Message layout: "<LEVEL> - <HH:MM:SS>: <message>".
logging.basicConfig(
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="%(levelname)s - %(asctime)s: %(message)s",
)

In [None]:
# Load the training frame from Parquet. Later cells read its "sentence",
# "title", "author", "link" and "text_reviews_count" columns and its index.
embeddings = pd.read_parquet(path="~/ssl/2023071112_training.parquet")

In [None]:
# Bare last expression: the notebook renders the loaded frame for inspection.
embeddings

In [None]:
# Pull every column needed downstream out of the frame as a plain Python list.
# The order of the names on the left matches the order of the column labels.
corpus, titles, author, link, review_count = (
    embeddings[column].tolist()
    for column in ("sentence", "title", "author", "link", "text_reviews_count")
)

# Keep the frame's index values so each row can be identified after encoding.
indices = embeddings.index.tolist()

In [None]:
import torch
from sentence_transformers import SentenceTransformer, util

# BERT-family models commonly accept up to 512 word pieces, which corresponds
# to roughly 300-400 English words; longer texts are truncated to the first
# N word pieces. By default the provided methods use a limit of 128 word
# pieces. Runtime and memory requirements grow quadratically with the input
# length, so this value is worth experimenting with.

# Raise the truncation limit to 200 word pieces.
model = SentenceTransformer("sentence-transformers/msmarco-distilbert-base-v3")
model.max_seq_length = 200

# Prefer the GPU, but fall back to the CPU so the cell still runs (more
# slowly) on a machine without CUDA instead of raising.
device = "cuda" if torch.cuda.is_available() else "cpu"

# convert_to_numpy=False: the result is consumed element-wise via `.tolist()`
# in the next cell rather than as a single numpy array.
corpus_embeddings = model.encode(
    corpus, show_progress_bar=True, device=device, convert_to_numpy=False
)

In [None]:
# Turn each encoded vector into a plain Python list so it can be stored in a
# DataFrame cell.
embeddings_list = [vector.tolist() for vector in corpus_embeddings]

# One record per input row: metadata fields followed by that row's embedding.
embedding_tuple = [
    *zip(titles, indices, author, link, review_count, embeddings_list)
]

In [None]:
# Assemble the records into a frame; the column names follow the field order
# used when the records were zipped together.
df = pd.DataFrame(
    data=embedding_tuple,
    columns=[
        "title",
        "index",
        "author",
        "link",
        "review_count",
        "embeddings",
    ],
)

In [None]:
# Introspect: bare last expression, so the notebook renders the assembled
# frame of metadata plus embeddings for a visual check.
df

In [None]:
# Explicit Arrow schema for the output file, one entry per DataFrame column.
# The third element on the "index" entry is False — presumably forwarded as
# `nullable=False`; confirm against the pyarrow docs.
# NOTE(review): "review_count" is declared as a string — confirm that matches
# the dtype of the source "text_reviews_count" column.
# NOTE(review): embeddings are stored as float64 lists — confirm this width is
# intended for the model's output.
fields = [
    ("title", pa.string()),
    ("index", pa.int64(), False),
    ("author", pa.string()),
    ("link", pa.string()),
    ("review_count", pa.string()),
    ("embeddings", pa.large_list(pa.float64())),
]
schema = pa.schema(fields)

# Persist the frame as snappy-compressed Parquet using the schema above.
df.to_parquet(
    path="20230711_learned_embeddings.snappy",
    schema=schema,
    compression="snappy",
    engine="pyarrow",
)