In [46]:
# import modules & set up logging
import warnings
# Ignore UserWarning for modules whose fully qualified name starts with
# 'gensim' (the `module` argument is a regex matched against the start of the
# module name). The filter is installed before gensim is imported, presumably
# so that warnings raised during the import itself are covered too.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

import logging
# Root logger at INFO level, each record rendered as
# "<timestamp> : <level> : <message>".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [47]:
# Load Google's pre-trained Word2Vec model.
# `path` points at a gzip-compressed file in the binary word2vec format and is
# reused by a later cell; no `limit` is given here, so every vector in the
# file is read.
path = './models/model.bin.gz'
model = KeyedVectors.load_word2vec_format(
    path,
    binary=True,
)

2024-03-01 21:36:08,265 : INFO : loading projection weights from ./models/model.bin.gz
2024-03-01 21:36:48,451 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ./models/model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T21:36:48.450906', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [48]:
# Report the vocabulary size, i.e. the number of entries in the model's
# key_to_index mapping.
print("vocab_size:", len(model.key_to_index))

vocab_size: 3000000


In [49]:
# Reload from the same file, this time capping the number of vectors read at
# 200000 via `limit`. This rebinds `model`, replacing the full-size model
# loaded earlier.
model = KeyedVectors.load_word2vec_format(
    path,
    binary=True,
    limit=200000,
)

2024-03-01 21:36:48,989 : INFO : loading projection weights from ./models/model.bin.gz
2024-03-01 21:36:52,066 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (200000, 300) matrix of type float32 from ./models/model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T21:36:52.066488', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [50]:
# Report the vocabulary size of the model currently bound to `model`.
print("vocab_size:", len(model.key_to_index))

vocab_size: 200000


In [51]:
# CONFIG -- the word to query and how many results to ask for
numResults = 5_000
targetWord = "casino"

In [52]:
# Query the model for the `numResults` entries most similar to `targetWord`
similarityResults = model.most_similar(
    targetWord,
    topn=numResults,
)

In [53]:
# Post-process the raw similarity results ahead of writing them to a file.
# NOTE(review): `formatter` is presumably a project-local module sitting next
# to this notebook. The name matches a standard-library module that was
# removed in Python 3.10 -- confirm the local file is the one imported if this
# is ever run on an older interpreter.
from formatter import formatResults

formattedResults = formatResults(targetWord, similarityResults)

# Leave the value as the cell's last expression so it is echoed for a visual check
formattedResults

[['abalone', 1033],
 ['abatement', 1839],
 ['abatements', 1841],
 ['abattoir', 1410],
 ['abba', 2296],
 ['abcc', 1285],
 ['abercrombie', 2881],
 ['abertis', 3045],
 ['abhorrence', 2096],
 ['aboriginal', 1876],
 ['aborigine', 1877],
 ['abortionist', 2591],
 ['abramoff', 491],
 ['absher', 511],
 ['absinth', 2680],
 ['absinthe', 2679],
 ['accor', 1709],
 ['accuride', 2286],
 ['ace', 2420],
 ['aces', 2421],
 ['acme', 415],
 ['acoma', 2214],
 ['acropolis', 2879],
 ['activision', 904],
 ['address', 2706],
 ['adelson', 62],
 ['administration', 364],
 ['adrift', 1392],
 ['aeg', 2545],
 ['aehi', 1748],
 ['aerosmith', 1739],
 ['affliction', 2520],
 ['aga', 1868],
 ['aga', 1869],
 ['agate', 2033],
 ['agilysys', 899],
 ['agoda', 1974],
 ['agreeableness', 1089],
 ['agriprocessors', 1978],
 ['agritourism', 985],
 ['aircard', 2657],
 ['airline', 615],
 ['airlines', 2844],
 ['airport', 318],
 ['airports', 1259],
 ['airshow', 2870],
 ['airstrip', 108],
 ['airtran', 1544],
 ['akwesasne', 1227],
 ['aladd

In [65]:
import os

# Write the results to a file
pathToGames="../server/src/main/resources/games/"

# Make this unique with a number: one past the highest numeric "<n>-" prefix
# already present in the directory. Counting directory entries (the previous
# approach) gives the same answer for a clean, contiguous directory, but
# repeats an already-used number as soon as a file has been deleted or an
# unrelated entry (hidden file, sub-directory) is present -- and opening in a
# truncating mode would then silently overwrite a same-named file.
usedIndices = [
    int(prefix)
    for prefix, dash, _ in (entry.partition("-") for entry in os.listdir(pathToGames))
    if dash and prefix.isdecimal()
]
currentCount = max(usedIndices) + 1 if usedIndices else 0
finalPath = pathToGames + str(currentCount) + "-" + targetWord + ".txt"

# One line per entry: the entry's first two fields joined by a comma.
# The encoding is pinned to UTF-8 so the bytes written do not depend on the
# platform's default locale; plain 'w' is enough because the file is never
# read back through this handle.
with open(finalPath, 'w', encoding='utf-8') as f:
    f.writelines(f"{result[0]},{result[1]}\n" for result in formattedResults)

print("Writing to file",finalPath, "with", len(formattedResults), "associations")

Writing to file ../server/src/main/resources/games/9-casino.txt with 3110 associations


In [55]:
# Next, write a script to search for any given word, with numResults++ as the default
# Need to do more validation, given the whacko results I'm seeing too