In [1]:
# import modules & set up logging
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load Google's pre-trained Word2Vec model
path = './model.bin.gz'
model = KeyedVectors.load_word2vec_format(path, binary=True)

2024-03-01 21:21:25,366 : INFO : loading projection weights from ./model.bin.gz
2024-03-01 21:22:06,962 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ./model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T21:22:06.962462', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [3]:
# Check the size of the vocab
print("vocab_size:",len(model.key_to_index.keys()))

vocab_size: 3000000


In [4]:
model = KeyedVectors.load_word2vec_format(path, binary=True, limit=200000)

2024-03-01 21:22:07,521 : INFO : loading projection weights from ./model.bin.gz
2024-03-01 21:22:11,244 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (200000, 300) matrix of type float32 from ./model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T21:22:11.244812', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [5]:
print("vocab_size:",len(model.key_to_index.keys()))

vocab_size: 200000


In [41]:
# CONFIG -- Define the target word and number of results
targetWord = "keyboard"
numResults = 5000

In [42]:
# Run the model
similarityResults=model.most_similar(targetWord,topn=numResults)

In [43]:
# Clean and format the results before writing to a file
from formatter import formatResults

formattedResults=formatResults(targetWord, similarityResults)

# Print to check
formattedResults

[['abacus', 2637],
 ['aberration', 1494],
 ['abrasion', 1463],
 ['abstracted', 3555],
 ['accelerator', 2214],
 ['accelerometer', 47],
 ['accelerometers', 837],
 ['accent', 1749],
 ['accented', 3241],
 ['accenting', 3321],
 ['accents', 1750],
 ['accompaniment', 400],
 ['accompaniments', 510],
 ['accompanist', 518],
 ['accordion', 276],
 ['accordionist', 419],
 ['accordions', 977],
 ['accountant', 523],
 ['ace', 583],
 ['acer', 1283],
 ['acer', 1284],
 ['ache', 3713],
 ['acid', 412],
 ['acoustic', 369],
 ['acoustical', 2695],
 ['acoustically', 209],
 ['acoustics', 370],
 ['acrobat', 2394],
 ['acrylic', 1728],
 ['acrylics', 2820],
 ['actionscript', 1928],
 ['activesync', 235],
 ['activex', 2745],
 ['actuation', 1660],
 ['actuator', 822],
 ['actuators', 1132],
 ['acuteness', 3690],
 ['adam', 1482],
 ['adapter', 341],
 ['adapters', 3381],
 ['addicting', 3184],
 ['adhesive', 3594],
 ['adjustability', 542],
 ['adjustable', 1559],
 ['adorable', 3402],
 ['adorably', 2499],
 ['aesthetically', 13

In [44]:
# Write the results to a file
pathToResults="../server/src/main/resources/results/" + targetWord + ".txt"
with(open(pathToResults, 'w+')) as f:
    for result in formattedResults:
        line = f"{result[0]},{result[1]}\n"
        f.write(line)
    
print("Writing to file",pathToResults, "Association size", len(formattedResults))

Writing to file ../server/src/main/resources/results/keyboard.txt Association size 3790


In [45]:
# Next, write a script to search for any given word, with numResults++ as the default
# Need to do more validation, given the whacko results I'm seeing too