In [39]:
import sys

!{sys.executable} -W ignore:DEPRECATION -m pip install --quiet duckdb==0.7.1 \
duckdb-engine \
watermark \
jupysql \
sqlalchemy \
python-snappy \
pyarrow \
memray \
pandas \
ipywidgets  \
matplotlib \
gensim \
nltk

In [108]:
import duckdb
import re
import pandas as pd
import shlex
import string
from time import time

from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim import utils

import multiprocessing


In [41]:
cores = multiprocessing.cpu_count()
cores

16

In [42]:
import logging 
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [43]:
%load_ext watermark
# Duckdb 0.7.0 offers a bunch of new JSON stuff that I want to test out, checking to see I have the latest
# https://duckdb.org/2023/03/03/json.html
%watermark --iversions

duckdb : 0.7.1
sys    : 3.9.12 (main, Mar 26 2022, 15:51:13) 
[Clang 12.0.0 (clang-1200.0.32.29)]
re     : 2.2.1
pandas : 1.5.3
logging: 0.5.1.2



In [23]:
%load_ext sql

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [24]:
%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

In [25]:
# Create new DB or load existing https://duckdb.org/docs/guides/python/jupyter.html
%sql duckdb:///viberary.duckdb

In [26]:
%sql DESCRIBE select * from read_json_auto('/Users/vicki/viberary/viberary/data/goodreads_books.json',lines='true');

Unnamed: 0,column_name,column_type,null,key,default,extra
0,isbn,VARCHAR,YES,,,
1,text_reviews_count,VARCHAR,YES,,,
2,series,BIGINT[],YES,,,
3,country_code,VARCHAR,YES,,,
4,language_code,VARCHAR,YES,,,
5,popular_shelves,"STRUCT(count BIGINT, ""name"" VARCHAR)[]",YES,,,
6,asin,VARCHAR,YES,,,
7,is_ebook,VARCHAR,YES,,,
8,average_rating,VARCHAR,YES,,,
9,kindle_asin,VARCHAR,YES,,,


In [27]:
# Create table in DuckDB
%sql CREATE TABLE goodreads as select * from read_json_auto('/Users/vicki/viberary/viberary/data/goodreads_books.json',lines='true');

(duckdb.CatalogException) Catalog Error: Table with name "goodreads" already exists!
[SQL: CREATE TABLE goodreads as select * from read_json_auto('/Users/vicki/viberary/viberary/data/goodreads_books.json',lines='true');]
(Background on this error at: https://sqlalche.me/e/14/f405)


In [9]:
%sql select book_id, title, popular_shelves as ps, description from goodreads limit 10;

Unnamed: 0,book_id,title,ps,description
0,5333265,W.C. Fields: A Life on Film,"[{'count': 3, 'name': 'to-read'}, {'count': 1,...",
1,1333909,Good Harbor,"[{'count': 2634, 'name': 'to-read'}, {'count':...","Anita Diamant's international bestseller ""The ..."
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...","[{'count': 58, 'name': 'to-read'}, {'count': 1...",Omnibus book club edition containing the Ladie...
3,6066819,Best Friends Forever,"[{'count': 7615, 'name': 'to-read'}, {'count':...",Addie Downs and Valerie Adler were eight when ...
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,"[{'count': 32, 'name': 'to-read'}, {'count': 3...",
5,287141,The Aeneid for Boys and Girls,"[{'count': 56, 'name': 'to-read'}, {'count': 1...","Relates in vigorous prose the tale of Aeneas, ..."
6,378460,The Wanting of Levine,"[{'count': 14, 'name': 'to-read'}, {'count': 1...",
7,6066812,All's Fairy in Love and War (Avalon: Web of Ma...,"[{'count': 515, 'name': 'to-read'}, {'count': ...","To Kara's astonishment, she discovers that a p..."
8,34883016,Playmaker: A Venom Series Novella,"[{'count': 4, 'name': 'to-read'}, {'count': 1,...",Secrets. Sometimes keeping them in confidence ...
9,287149,The Devil's Notebook,"[{'count': 961, 'name': 'to-read'}, {'count': ...","Wisdom, humor, and dark observations by the fo..."


In [44]:
# Let's start with title and description as our sentence feature

# we have to connect with pyscopg because the sql magic doesn't allow string literal concatenation due to Posix=
con = duckdb.connect('viberary.duckdb')

# Query the existing goodreads table and clean up formatting
sentences = con.sql("""select concat_ws(' ' , lower(regexp_replace(title, '[[:^alpha:]]',' ','g')), \
                    lower(regexp_replace(description, '[[:^alpha:]]',' ','g'))) as sentence from goodreads;""").df()

FloatProgress(value=0.0, layout=Layout(width='100%'), style=ProgressStyle(bar_color='black'))

In [4]:
# Each sentence is a single book
sentences.head()

Unnamed: 0,sentence
0,w c fields a life on film
1,good harbor anita diamant s international best...
2,the unschooled wizard sun wolf and starhawk ...
3,best friends forever addie downs and valerie a...
4,runic astrology starcraft and timekeeping in ...


In [5]:
# Check for nulls
sentences.isnull().sum()

sentence    0
dtype: int64

In [101]:
sentences.to_csv('sentences.csv', index=False)

In [102]:
! cat sentences.csv | head

sentence
w c  fields  a life on film 
good harbor anita diamant s international bestseller  the red tent  brilliantly re created the ancient world of womanhood  diamant brings her remarkable storytelling skills to  good harbor     offering insight to the precarious balance of marriage and career  motherhood and friendship in the world of modern women  the seaside town of gloucester  massachusetts is a place where the smell of the ocean lingers in the air and the rocky coast glistens in the atlantic sunshine  when longtime gloucester resident kathleen levine is diagnosed with breast cancer  her life is thrown into turmoil  frightened and burdened by secrets  she meets joyce tabachnik    a freelance writer with literary aspirations    and a once in a lifetime friendship is born  joyce has just bought a small house in gloucester  where she hopes to write as well as vacation with her family  like kathleen  joyce is at a fragile place in her life  a mutual love for books  humor  and the b

In [109]:
# input for training Word2Vec is a list of lists or iterable
# needs to be streamable
# https://radimrehurek.com/gensim/models/word2vec.html

class CorpusReader:
    """An iterator that yields sentences (lists of str)."""

    def __iter__(self):
        corpus_path = datapath('sentences.csv')
        for line in open(corpus_path):
            yield utils.simple_preprocess(line)

In [113]:
# Move to where word2vec processes them
!cp sentences.csv /usr/local/lib/python3.9/site-packages/gensim/test/test_data/sentences.csv

In [110]:
corpus = CorpusReader()

In [111]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

INFO - 15:27:32: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-03-24T15:27:32.466744', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


In [114]:
# building vocab context window

t = time()

w2v_model.build_vocab(corpus, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 15:28:18: collecting all words and their counts
INFO - 15:28:18: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 15:28:19: PROGRESS: at sentence #10000, processed 1171668 words, keeping 104898 word types
INFO - 15:28:20: PROGRESS: at sentence #20000, processed 2353603 words, keeping 162062 word types
INFO - 15:28:22: PROGRESS: at sentence #30000, processed 3517410 words, keeping 208425 word types
INFO - 15:28:23: PROGRESS: at sentence #40000, processed 4696496 words, keeping 248496 word types
INFO - 15:28:24: PROGRESS: at sentence #50000, processed 5885849 words, keeping 285840 word types
INFO - 15:28:26: PROGRESS: at sentence #60000, processed 7053388 words, keeping 319222 word types
INFO - 15:28:27: PROGRESS: at sentence #70000, processed 8232576 words, keeping 351037 word types
INFO - 15:28:28: PROGRESS: at sentence #80000, processed 9421715 words, keeping 381934 word types
INFO - 15:28:30: PROGRESS: at sentence #90000, processed 10575500 words, keepi

INFO - 15:30:14: PROGRESS: at sentence #820000, processed 95801186 words, keeping 1494632 word types
INFO - 15:30:16: PROGRESS: at sentence #830000, processed 96978075 words, keeping 1505264 word types
INFO - 15:30:17: PROGRESS: at sentence #840000, processed 98162537 words, keeping 1515379 word types
INFO - 15:30:18: PROGRESS: at sentence #850000, processed 99314203 words, keeping 1525388 word types
INFO - 15:30:20: PROGRESS: at sentence #860000, processed 100492055 words, keeping 1535075 word types
INFO - 15:30:22: PROGRESS: at sentence #870000, processed 101658200 words, keeping 1544652 word types
INFO - 15:30:23: PROGRESS: at sentence #880000, processed 102838636 words, keeping 1554753 word types
INFO - 15:30:25: PROGRESS: at sentence #890000, processed 103994332 words, keeping 1564766 word types
INFO - 15:30:26: PROGRESS: at sentence #900000, processed 105179916 words, keeping 1574052 word types
INFO - 15:30:28: PROGRESS: at sentence #910000, processed 106355504 words, keeping 158

INFO - 15:32:15: PROGRESS: at sentence #1620000, processed 189338233 words, keeping 2161630 word types
INFO - 15:32:17: PROGRESS: at sentence #1630000, processed 190517019 words, keeping 2168478 word types
INFO - 15:32:18: PROGRESS: at sentence #1640000, processed 191691575 words, keeping 2175799 word types
INFO - 15:32:20: PROGRESS: at sentence #1650000, processed 192865318 words, keeping 2182683 word types
INFO - 15:32:21: PROGRESS: at sentence #1660000, processed 194063003 words, keeping 2189633 word types
INFO - 15:32:23: PROGRESS: at sentence #1670000, processed 195238257 words, keeping 2196926 word types
INFO - 15:32:25: PROGRESS: at sentence #1680000, processed 196419566 words, keeping 2203851 word types
INFO - 15:32:26: PROGRESS: at sentence #1690000, processed 197577157 words, keeping 2210649 word types
INFO - 15:32:27: PROGRESS: at sentence #1700000, processed 198737272 words, keeping 2217323 word types
INFO - 15:32:29: PROGRESS: at sentence #1710000, processed 199897088 word

INFO - 15:34:05: Word2Vec lifecycle event {'msg': 'effective_min_count=20 leaves 268992999 word corpus (97.53% of original 275794650, drops 6801651)', 'datetime': '2023-03-24T15:34:05.403901', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 15:34:06: deleting the raw counts dictionary of 2635551 items
INFO - 15:34:06: sample=6e-05 downsamples 690 most-common words
INFO - 15:34:06: Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 158340100.72092313 word corpus (58.9%% of prior 268992999)', 'datetime': '2023-03-24T15:34:06.676646', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'prepare_vocab'}
INFO - 15:34:08: estimated required memory for 245474 words and 300 dimensions: 711874600 bytes
INFO - 15:34:08: resetting layer weigh

Time to build vocab: 5.85 mins


In [126]:
t = time()

w2v_model.train(sentence_iterator, total_examples=w2v_model.corpus_count, epochs=10, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 15:36:32: Word2Vec lifecycle event {'msg': 'training model with 15 workers on 245474 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2 shrink_windows=True', 'datetime': '2023-03-24T15:36:32.788763', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'train'}
INFO - 15:36:33: EPOCH 0 - PROGRESS: at 50.83% examples, 0 words/s, in_qsize 0, out_qsize 0
INFO - 15:36:34: EPOCH 0: training on 2360655 raw words (0 effective words) took 2.0s, 0 effective words/s
INFO - 15:36:35: EPOCH 1 - PROGRESS: at 49.56% examples, 0 words/s, in_qsize 0, out_qsize 0
INFO - 15:36:36: EPOCH 1 - PROGRESS: at 97.01% examples, 0 words/s, in_qsize 0, out_qsize 0
INFO - 15:36:36: EPOCH 1: training on 2360655 raw words (0 effective words) took 2.1s, 0 effective words/s
INFO - 15:36:37: EPOCH 2 - PROGRESS: at 47.87% examples, 0 words/s, in_qsize 0, out_qsize 0
INFO - 15:36:

Time to train the model: 0.34 mins


In [127]:
# Saving and checkpointing
w2v_model.save("word2vec.model")

INFO - 15:38:06: Word2Vec lifecycle event {'fname_or_handle': 'word2vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-24T15:38:06.631816', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'saving'}
INFO - 15:38:06: storing np array 'vectors' to word2vec.model.wv.vectors.npy
INFO - 15:38:07: storing np array 'syn1neg' to word2vec.model.syn1neg.npy
INFO - 15:38:07: not storing attribute cum_table
INFO - 15:38:07: saved word2vec.model


In [128]:
word_vectors = w2v_model.wv
word_vectors.save('vectors.kv')

INFO - 15:38:09: KeyedVectors lifecycle event {'fname_or_handle': 'vectors.kv', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-03-24T15:38:09.623181', 'gensim': '4.3.1', 'python': '3.9.12 (main, Mar 26 2022, 15:51:13) \n[Clang 12.0.0 (clang-1200.0.32.29)]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'saving'}
INFO - 15:38:09: storing np array 'vectors' to vectors.kv.vectors.npy
INFO - 15:38:10: saved vectors.kv


In [131]:
w2v_model.wv.most_similar(["language"])

[('plataformas', 0.2486429214477539),
 ('ruminate', 0.24250322580337524),
 ('tenues', 0.2424517124891281),
 ('himari', 0.24068190157413483),
 ('mtrf', 0.23997090756893158),
 ('viimeiset', 0.23300020396709442),
 ('wlqrt', 0.23264293372631073),
 ('capables', 0.2300754338502884),
 ('parvulescu', 0.22820627689361572),
 ('yalda', 0.22673505544662476)]

In [130]:
vocab_len = len(w2v_model.wv)
vocab_len

245474

In [122]:
w2v_model.wv.index_to_key

['the',
 'and',
 'of',
 'to',
 'in',
 'is',
 'her',
 'for',
 'his',
 'with',
 'that',
 'he',
 'she',
 'as',
 'on',
 'it',
 'but',
 'from',
 'an',
 'de',
 'this',
 'by',
 'has',
 'their',
 'one',
 'be',
 'who',
 'are',
 'life',
 'when',
 'at',
 'you',
 'they',
 'will',
 'all',
 'was',
 'new',
 'can',
 'la',
 'book',
 'have',
 'world',
 'what',
 'into',
 'him',
 'not',
 'or',
 'love',
 'more',
 'about',
 'how',
 'out',
 'no',
 'up',
 'only',
 'time',
 'story',
 'man',
 'un',
 'di',
 'first',
 'them',
 'en',
 'most',
 'your',
 'que',
 'been',
 'than',
 'so',
 'do',
 'now',
 'family',
 'just',
 'there',
 'years',
 'after',
 'own',
 'if',
 'two',
 'find',
 'its',
 'way',
 'through',
 'like',
 'even',
 'we',
 'other',
 'never',
 'had',
 'se',
 'my',
 'old',
 'which',
 'where',
 'over',
 'back',
 'young',
 'people',
 'el',
 'make',
 'series',
 'best',
 'war',
 'our',
 'author',
 'before',
 'una',
 'woman',
 'home',
 'ever',
 'get',
 'these',
 'work',
 'must',
 'heart',
 'between',
 'day',
 'n

In [124]:
w2v_model.wv.key_to_index

{'the': 0,
 'and': 1,
 'of': 2,
 'to': 3,
 'in': 4,
 'is': 5,
 'her': 6,
 'for': 7,
 'his': 8,
 'with': 9,
 'that': 10,
 'he': 11,
 'she': 12,
 'as': 13,
 'on': 14,
 'it': 15,
 'but': 16,
 'from': 17,
 'an': 18,
 'de': 19,
 'this': 20,
 'by': 21,
 'has': 22,
 'their': 23,
 'one': 24,
 'be': 25,
 'who': 26,
 'are': 27,
 'life': 28,
 'when': 29,
 'at': 30,
 'you': 31,
 'they': 32,
 'will': 33,
 'all': 34,
 'was': 35,
 'new': 36,
 'can': 37,
 'la': 38,
 'book': 39,
 'have': 40,
 'world': 41,
 'what': 42,
 'into': 43,
 'him': 44,
 'not': 45,
 'or': 46,
 'love': 47,
 'more': 48,
 'about': 49,
 'how': 50,
 'out': 51,
 'no': 52,
 'up': 53,
 'only': 54,
 'time': 55,
 'story': 56,
 'man': 57,
 'un': 58,
 'di': 59,
 'first': 60,
 'them': 61,
 'en': 62,
 'most': 63,
 'your': 64,
 'que': 65,
 'been': 66,
 'than': 67,
 'so': 68,
 'do': 69,
 'now': 70,
 'family': 71,
 'just': 72,
 'there': 73,
 'years': 74,
 'after': 75,
 'own': 76,
 'if': 77,
 'two': 78,
 'find': 79,
 'its': 80,
 'way': 81,
 'throu

In [123]:
for index, word in enumerate(w2v_model.wv.index_to_key):
    if index == 10:
        break
    print(f"word #{index}/{len(w2v_model.wv.index_to_key)} is {word}")

word #0/245474 is the
word #1/245474 is and
word #2/245474 is of
word #3/245474 is to
word #4/245474 is in
word #5/245474 is is
word #6/245474 is her
word #7/245474 is for
word #8/245474 is his
word #9/245474 is with
