In [1]:
import sys

# Install the notebook's dependencies with the pip that belongs to the
# interpreter running this kernel (sys.executable).
# `-W ignore:DEPRECATION` is passed to Python as a warnings filter and
# `--quiet` reduces pip's own output.
# NOTE(review): only duckdb and redis are pinned; every other package
# resolves to whatever version is current at install time.
!{sys.executable} -W ignore:DEPRECATION -m pip install --quiet duckdb==0.7.1 \
duckdb-engine \
watermark \
jupysql \
sqlalchemy \
python-snappy \
pyarrow \
memray \
pandas \
ipywidgets  \
matplotlib \
gensim \
nltk \
plotly \
redis==4.5.3 \
jupyter-black

In [2]:
import duckdb
import re
import pandas as pd
import string
from time import time

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from gensim import utils
from gensim.test.utils import common_texts

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import multiprocessing

import redis

In [3]:
# Autoformat cells on run: jupyter_black.load() activates the extension
# for the current kernel session.
import jupyter_black

jupyter_black.load()

In [4]:
# Set the log level and message format for model training output.
import logging

# Root logger: INFO is the level at which gensim reports vocabulary and
# training progress.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# basicConfig() silently does nothing when the root logger already has
# handlers attached, in which case the format below is never applied and
# records appear in the default "LEVEL:name:message" layout.
# force=True (Python 3.8+) removes any existing root handlers first so this
# configuration always takes effect, including when the cell is re-run.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
    force=True,
)

In [5]:
# Load the `sql` IPython extension so the %sql magic used in the next cell is available.
%load_ext sql

In [6]:
# Create new DB or load existing https://duckdb.org/docs/guides/python/jupyter.html
# (points the %sql magic at the on-disk file viberary.duckdb)
%sql duckdb:///viberary.duckdb

# Also open a native duckdb Python connection to the same database file;
# `con` is what the con.sql(...) query in the next cell runs against.
con = duckdb.connect("viberary.duckdb")

In [7]:
# Build one lowercase "sentence" per row of goodreads_en: title and
# description are each passed through regexp_replace (every match of
# '[[:^alpha:]]' becomes a space, 'g' = all matches), lowercased, and joined
# with a single space. The result is fetched as a DataFrame via .df().
# NOTE(review): `sentences` is rebound to a CorpusReader instance in a later
# cell, so this DataFrame is not what the visible training cells consume —
# confirm whether it is exported to sentences_en.csv elsewhere.
sentences = con.sql(
    """select concat_ws(' ' , lower(regexp_replace(title, '[[:^alpha:]]',' ','g')), \
                    lower(regexp_replace(description, '[[:^alpha:]]',' ','g'))) as sentence from goodreads_en;"""
).df()

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

In [None]:
# The input for training Doc2Vec is a list of lists (or any iterable) of tokens.
# It needs to be streamable; see https://radimrehurek.com/gensim/models/word2vec.html


# class CorpusReader:
#     """A class that returns lists of lists of strings."""

#     def get_doc_words() -> list:
#         doc_words = []
#         corpus_path = datapath("sentences_en.csv")
#         for line in open(corpus_path):
#             processed_line = utils.simple_preprocess(line)
#             doc_words.append(processed_line)

#         return doc_words


class CorpusReader:
    """Re-iterable corpus that streams tokenized lines of ``sentences_en.csv``.

    Every call to ``__iter__`` opens the file afresh, so one instance can be
    iterated several times without holding the corpus in memory.
    """

    def __iter__(self):
        """Yield one token list per line of the corpus file.

        Yields:
            The value returned by ``utils.simple_preprocess(line)`` for each
            line, in file order.
        """
        # NOTE(review): gensim's ``datapath`` resolves file names inside
        # gensim's bundled test-data directory — confirm the CSV lives there.
        corpus_path = datapath("sentences_en.csv")
        # The context manager closes the handle as soon as iteration finishes
        # or the generator is discarded; a bare open() left one handle open
        # per pass until garbage collection.
        with open(corpus_path) as corpus_file:
            for line in corpus_file:
                yield utils.simple_preprocess(line)

In [14]:
# Stream the corpus once and materialize it as a list of tagged documents,
# each tagged with its 0-based position in the stream.
sentences = CorpusReader()
# TODO: improve list creation
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(sentences)
]
# Model created with no arguments, i.e. the library's default hyperparameters.
d2v_model = Doc2Vec()

INFO:gensim.utils:Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d100,n5,w5,mc5,s0.001,t3>', 'datetime': '2023-04-08T14:05:46.700508', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [16]:
%%time

t = time()
d2v_model.build_vocab(documents, progress_per=10000)
print("Time to build vocab: {} mins".format(round((time() - t) / 60, 2)))

INFO:gensim.models.doc2vec:collecting all words and their counts
INFO:gensim.models.doc2vec:PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #10000, processed 1367451 words (6416799 words/s), 52723 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #20000, processed 2719501 words (6311225 words/s), 74831 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #30000, processed 4088179 words (6155862 words/s), 92439 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #40000, processed 5440523 words (6265373 words/s), 107566 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #50000, processed 6795839 words (5931043 words/s), 121370 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #60000, processed 8163934 words (5695783 words/s), 133976 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #70000, processed 9525001 words (614

INFO:gensim.models.doc2vec:PROGRESS: at example #650000, processed 88400910 words (5807436 words/s), 482421 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #660000, processed 89752928 words (5882070 words/s), 486118 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #670000, processed 91110673 words (5646269 words/s), 490223 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #680000, processed 92466771 words (5729999 words/s), 494046 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #690000, processed 93828408 words (5841079 words/s), 498149 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #700000, processed 95174888 words (5154168 words/s), 501856 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #710000, processed 96537390 words (5669910 words/s), 505667 word types, 0 tags
INFO:gensim.models.doc2vec:PROGRESS: at example #720000, processed 97884448 words (5604956 words/s), 509550 wor

Time to build vocab: 0.39 mins
CPU times: user 23.2 s, sys: 143 ms, total: 23.4 s
Wall time: 23.4 s


In [21]:
# Train for two epochs over the tagged documents and report the elapsed
# wall-clock time in minutes.
train_start = time()

d2v_model.train(
    documents,
    total_examples=d2v_model.corpus_count,
    epochs=2,
    report_delay=1,
)

train_minutes = round((time() - train_start) / 60, 2)
print("Time to train the model: {} mins".format(train_minutes))

INFO:gensim.utils:Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 158872 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-04-08T14:28:03.947449', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'train'}
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 1.09% examples, 1002209 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 2.20% examples, 1021182 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 3.33% examples, 1028583 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 4.42% examples, 1022899 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 5.43% examples, 1004264 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 6.44% 

INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 78.98% examples, 989909 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 80.07% examples, 990070 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 81.16% examples, 990267 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 82.25% examples, 990425 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 83.33% examples, 990277 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 84.42% examples, 990621 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 85.49% examples, 990736 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 86.58% examples, 990850 words/s, in_qsize 4, out_qsize 1
INFO:gensim.models.word2vec:EPOCH 0 - PROGRESS: at 87.67% examples, 991106 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:

INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 60.43% examples, 1000996 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 61.57% examples, 1001891 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 62.60% examples, 1001367 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 63.70% examples, 1001599 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 64.80% examples, 1002005 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 65.87% examples, 1001775 words/s, in_qsize 6, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 66.89% examples, 1000967 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 67.87% examples, 999474 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.word2vec:EPOCH 1 - PROGRESS: at 68.91% examples, 999029 words/s, in_qsize 5, out_qsize 0
INFO:gensim.models.wo

Time to train the model: 3.13 mins


In [22]:
# Saving and checkpointing: persist the trained model as "doc2vec.model"
# (relative path, so it lands in the kernel's working directory).
# The log output below shows the large arrays being written to sibling
# doc2vec.model.*.npy files, so those must be kept next to the main file.
d2v_model.save("doc2vec.model")

INFO:gensim.utils:Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-04-08T14:37:29.396610', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'saving'}
INFO:gensim.utils:storing np array 'vectors' to doc2vec.model.dv.vectors.npy
INFO:gensim.utils:storing np array 'vectors' to doc2vec.model.wv.vectors.npy
INFO:gensim.utils:storing np array 'syn1neg' to doc2vec.model.syn1neg.npy
INFO:gensim.utils:not storing attribute cum_table
INFO:gensim.utils:saved doc2vec.model


In [23]:
# Reload the saved model from disk under a separate name (`doc2vec`), so the
# cells below use the persisted copy rather than the in-memory `d2v_model`.
doc2vec = Doc2Vec.load("doc2vec.model")

INFO:gensim.utils:loading Doc2Vec object from doc2vec.model
INFO:gensim.utils:loading dv recursively from doc2vec.model.dv.* with mmap=None
INFO:gensim.utils:loading vectors from doc2vec.model.dv.vectors.npy with mmap=None
INFO:gensim.utils:loading wv recursively from doc2vec.model.wv.* with mmap=None
INFO:gensim.utils:loading vectors from doc2vec.model.wv.vectors.npy with mmap=None
INFO:gensim.utils:loading syn1neg from doc2vec.model.syn1neg.npy with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Doc2Vec lifecycle event {'fname': 'doc2vec.model', 'datetime': '2023-04-08T14:38:19.271922', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'loaded'}


In [None]:
# Infer an embedding for an already-tokenized query that was not part of
# the tagged training documents.
# NOTE(review): tokens are given lowercase, presumably to match the
# simple_preprocess output the model was trained on — confirm.
vector = doc2vec.infer_vector(["beauty", "and", "beast"])

In [25]:
# NOTE(review): this expression is incomplete (`doc2vec.` with no attribute)
# and is a SyntaxError as written. The saved output below is an array, so the
# cell presumably displayed a vector — restore the intended attribute or call.
doc2vec.

array([ 5.96549101e-02,  2.81587075e-02,  1.68257509e-03,  3.99970170e-03,
        3.25109474e-02,  2.97387242e-02, -1.48083493e-02,  2.33485755e-02,
       -1.96287036e-02,  6.48077391e-03,  3.23602669e-02,  6.90585840e-03,
       -1.14664752e-02,  1.41324485e-02,  1.78273730e-02,  4.93371412e-02,
       -3.17523889e-02,  2.58993711e-02,  4.18002345e-02, -2.90519521e-02,
        1.19369654e-02, -4.80981870e-03,  1.06778555e-02,  1.89127382e-02,
        1.88990764e-03,  3.68619757e-03, -2.82443734e-03,  2.90792249e-02,
       -4.93242219e-02, -1.16954558e-02, -1.81634109e-02, -3.02275643e-02,
       -2.44297795e-02, -4.53602523e-02, -5.11832302e-03, -1.70268933e-03,
        8.01484957e-02,  1.69352796e-02,  4.79333941e-03, -8.11960490e-04,
       -3.39033790e-02, -3.06029655e-02,  5.92718497e-02,  3.49485241e-02,
        1.53242722e-02,  1.71042737e-02,  8.32467485e-05,  7.59258587e-03,
       -4.83281619e-04,  1.44411232e-02,  3.68658034e-03, -5.00562647e-03,
        2.15299986e-02, -