In [1]:
# Import modules & set up logging
import warnings
# Installed before gensim is imported below. Ignores UserWarnings whose
# originating module name starts with 'gensim' (presumably to silence noise
# emitted while gensim loads -- TODO confirm it is still needed).
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

import logging
# INFO-level logging in "timestamp : LEVEL : message" form; this is what
# produces the "loading projection weights" lines shown under the load cells.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load Google's pre-trained Word2Vec vectors from the binary word2vec file.
# The log output below shows the full file holds a (3000000, 300) float32
# matrix and takes roughly half a minute to read.
path = './model.bin.gz'
model = KeyedVectors.load_word2vec_format(
    fname=path,
    binary=True,
)

2024-03-01 11:39:16,550 : INFO : loading projection weights from ./model.bin.gz
2024-03-01 11:39:48,283 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ./model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T11:39:48.283145', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [3]:
# Report how many keys the loaded model holds (len of the key -> index mapping)
print(f"vocab_size: {len(model.key_to_index)}")

vocab_size: 3000000


In [4]:
# Reload the same file, reading at most VOCAB_LIMIT vectors from the start of
# it. This rebinds `model`, replacing the full 3,000,000-entry model loaded
# above; every later cell works on this smaller vocabulary, so a target word
# outside the first VOCAB_LIMIT entries will not be found.
VOCAB_LIMIT = 200_000
model = KeyedVectors.load_word2vec_format(path, binary=True, limit=VOCAB_LIMIT)

2024-03-01 11:39:48,407 : INFO : loading projection weights from ./model.bin.gz
2024-03-01 11:39:50,547 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (200000, 300) matrix of type float32 from ./model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T11:39:50.547795', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [5]:
# Confirm the size of the reloaded (limited) vocabulary
print(f"vocab_size: {len(model.key_to_index)}")

vocab_size: 200000


In [6]:
# CONFIG -- Define the target word and number of results
# targetWord: the query passed to model.most_similar; it is also used to build
# the output file name (./results/<targetWord>.txt).
targetWord = "paintbrush"
# numResults: passed as `topn` to model.most_similar, i.e. how many neighbours
# are requested.
numResults = 3000

In [7]:
# Run the model: ask for the numResults entries most similar to targetWord.
# With an integer topn, most_similar returns a list of (key, similarity)
# tuples; gensim raises KeyError if targetWord is not in the loaded vocabulary.
similarityResults=model.most_similar(targetWord,topn=numResults)
# Show only a short preview -- printing all numResults tuples floods the
# notebook output. The full list stays available in similarityResults.
print(similarityResults[:10])



In [8]:
# Clean and format the results before writing to a file
# NOTE(review): `formatter` is presumably a project-local formatter.py (its
# source is not visible here). The name also matches an old standard-library
# module that was removed in Python 3.10 -- confirm the intended module is the
# one being imported when running on older interpreters.
from formatter import formatResults

# formatResults receives the query word and the raw most_similar output.
# Judging by the output below it yields [word, int] pairs in alphabetical
# order; what the integer represents (a rank?) is not shown -- TODO confirm.
formattedResults=formatResults(targetWord, similarityResults)

# Display the full formatted list to check it
formattedResults

[['abacus', 969],
 ['abstract', 750],
 ['abstracted', 767],
 ['abstraction', 1028],
 ['abstractions', 892],
 ['accordion', 1549],
 ['accordionist', 1693],
 ['acetone', 1548],
 ['acorn', 1144],
 ['acrobatically', 829],
 ['acrylic', 49],
 ['acrylics', 10],
 ['acupressure', 1746],
 ['adhesive', 1876],
 ['adjectives', 1902],
 ['aerosol', 522],
 ['afghan', 928],
 ['afro', 989],
 ['aioli', 1204],
 ['airbrush', 45],
 ['airbrushing', 1383],
 ['alabaster', 760],
 ['allegorical', 2181],
 ['almond', 982],
 ['aloe', 1251],
 ['alphabet', 1674],
 ['alphabets', 741],
 ['amaryllis', 1037],
 ['ambidextrous', 373],
 ['amethyst', 1782],
 ['amulet', 817],
 ['androgynous', 1951],
 ['angelic', 1367],
 ['angular', 2158],
 ['animate', 1381],
 ['animation', 851],
 ['animator', 377],
 ['animators', 508],
 ['anise', 1810],
 ['annotate', 1323],
 ['ant', 1934],
 ['anthropomorphic', 2101],
 ['antique', 1328],
 ['antlers', 1262],
 ['anus', 1768],
 ['anvil', 1751],
 ['apertures', 1975],
 ['apple', 1200],
 ['applesauc

In [9]:
# Write the results to a file
import os

pathToResults="./results/" + targetWord + ".txt"

# Create the output directory on first run; without this, open() raises
# FileNotFoundError when ./results/ does not exist yet.
os.makedirs(os.path.dirname(pathToResults), exist_ok=True)

# The file is only written, never read back, so plain 'w' is enough.
# UTF-8 is set explicitly so non-ASCII vocabulary entries are written the
# same way regardless of the platform's default encoding.
with open(pathToResults, 'w', encoding='utf-8') as f:
    f.write(str(formattedResults))

print("Writing to file",pathToResults)

Writing to file ./results/paintbrush.txt


In [10]:
# TODO: Write a script that searches for any given word, using numResults (or more) as the default.
# TODO: Add more validation -- the results produced so far look questionable.