In [11]:
# CONFIG -- Define the target word and number of results
targetWord = "king"
numResults = 10000

In [12]:
# import modules & set up logging
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [13]:
# Load Google's pre-trained Word2Vec model
path = './google-word2vec.bin.gz'
model = KeyedVectors.load_word2vec_format(path, binary=True)

2024-02-29 23:44:02,130 : INFO : loading projection weights from ./google-word2vec.bin.gz
2024-02-29 23:44:30,327 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ./google-word2vec.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-02-29T23:44:30.327946', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [14]:
# Check the size of the vocab
print("vocab_size:",len(model.key_to_index.keys()))

vocab_size: 3000000


In [15]:
model = KeyedVectors.load_word2vec_format(path, binary=True, limit=200000)

2024-02-29 23:44:30,351 : INFO : loading projection weights from ./google-word2vec.bin.gz
2024-02-29 23:44:32,101 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (200000, 300) matrix of type float32 from ./google-word2vec.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-02-29T23:44:32.101620', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-5.15.0-91-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [16]:
print("vocab_size:",len(model.key_to_index.keys()))

vocab_size: 200000


In [17]:
similarityResults=model.most_similar(targetWord,topn=numResults)
print(similarityResults)

[('kings', 0.7138045430183411), ('queen', 0.6510956883430481), ('monarch', 0.6413194537162781), ('crown_prince', 0.6204220056533813), ('prince', 0.6159993410110474), ('sultan', 0.5864824056625366), ('ruler', 0.5797567367553711), ('princes', 0.5646552443504333), ('throne', 0.5422105193138123), ('royal', 0.5239794254302979), ('kingdom', 0.5210405588150024), ('princess', 0.5161998867988586), ('King', 0.5158917903900146), ('emperor', 0.5083796977996826), ('monarchy', 0.4929794371128082), ('royal_palace', 0.49247992038726807), ('Crown_Prince', 0.49117735028266907), ('undisputed_king', 0.4895281195640564), ('palace', 0.4878290593624115), ('constitutional_monarch', 0.4762479066848755), ('queens', 0.47384950518608093), ('monarchs', 0.4711732566356659), ('royals', 0.4668191969394684), ('Gyanendra', 0.4608598053455353), ('His_Majesty', 0.4479662775993347), ('god', 0.4467619061470032), ('deity', 0.4433119297027588), ('King_Bhumibol_Adulyadej', 0.4406883418560028), ('Sultan', 0.4318269193172455), 

In [18]:
# Clean and format the results before writing to a file
from formatter import formatResults

formattedResults=formatResults(targetWord, similarityResults)

# Print to check
formattedResults

[['', 3609],
 ['#', 3683],
 ['###BC', 852],
 ['###cc', 9308],
 ['###hp', 9511],
 ['###lb', 9384],
 ['##:#', 9800],
 ['##:##', 6940],
 ['##:##-##', 7864],
 ['##cc', 6776],
 ['##st', 7253],
 ['##th', 2356],
 ['#.#', 9402],
 ['#.#', 9711],
 ['#.#-#.#', 8974],
 ['#/#-year-old', 8344],
 ['#/#th', 5909],
 ['#:#-#', 8977],
 ['#X#', 4155],
 ['#x#', 8042],
 ['%', 5559],
 ['%', 7665],
 ['%', 7833],
 ['%', 9221],
 ['**', 4531],
 ['**', 7691],
 ['****', 8776],
 ['******', 6134],
 ['*******', 6128],
 ['..........', 8941],
 ['2Pac', 7927],
 ['3i', 6349],
 [':-)', 9610],
 ['=', 1048],
 ['=', 8413],
 ['AD', 7809],
 ['AJ', 6557],
 ['AJ', 9839],
 ['AL', 6519],
 ['AMMAN', 2595],
 ['AOL', 7566],
 ['APJ', 7836],
 ['Aa', 6294],
 ['Aamir', 3313],
 ['Aamir', 6118],
 ['Aaron', 3499],
 ['Abas', 8608],
 ['Abashidze', 6382],
 ['Abba', 8446],
 ['Abbas', 2587],
 ['Abd', 7608],
 ['Abdallah', 563],
 ['Abdallahi', 1835],
 ['Abdel', 2876],
 ['Abdelaziz', 4211],
 ['Abdul', 727],
 ['Abdul', 6798],
 ['Abdul', 7518],
 ['Ab

In [19]:
# Write the results to a file
pathToResults="./results/" + targetWord + ".txt"
with(open(pathToResults, 'w')) as f:
    f.write(str(formattedResults))

In [ ]:
# Next, write a script to search for any given word, with numResults++ as the default
# Need to do more validation, given the whacko results I'm seeing too