In [1]:
# import modules & set up logging
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load Google's pre-trained Word2Vec model
path = './model.bin.gz'
model = KeyedVectors.load_word2vec_format(path, binary=True)

2024-03-01 19:15:45,804 : INFO : loading projection weights from ./model.bin.gz
2024-03-01 19:16:20,730 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ./model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T19:16:20.730734', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [3]:
# Check the size of the vocab
print("vocab_size:",len(model.key_to_index.keys()))

vocab_size: 3000000


In [4]:
model = KeyedVectors.load_word2vec_format(path, binary=True, limit=200000)

2024-03-01 19:16:21,280 : INFO : loading projection weights from ./model.bin.gz
2024-03-01 19:16:24,715 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (200000, 300) matrix of type float32 from ./model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-01T19:16:24.715509', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [5]:
print("vocab_size:",len(model.key_to_index.keys()))

vocab_size: 200000


In [6]:
# CONFIG -- Define the target word and number of results
targetWord = "entrance"
numResults = 5000

In [7]:
# Run the model
similarityResults=model.most_similar(targetWord,topn=numResults)
print(similarityResults)

[('entrances', 0.7411327958106995), ('entranceway', 0.6982936859130859), ('entryway', 0.6561002731323242), ('Entrance', 0.6514840722084045), ('gate', 0.5731144547462463), ('archway', 0.5638287663459778), ('walkway', 0.5483123064041138), ('courtyard', 0.5436859726905823), ('gates', 0.5363094806671143), ('plaza', 0.5159693956375122), ('doorway', 0.515014111995697), ('gatehouse', 0.5037886500358582), ('foyer', 0.4984862208366394), ('entry', 0.4859205186367035), ('exit', 0.4840930998325348), ('passageway', 0.4806051254272461), ('adjacent', 0.48031914234161377), ('entryways', 0.47445228695869446), ('concourse', 0.4723183512687683), ('stairway', 0.4715364873409271), ('parking', 0.4700607359409332), ('portico', 0.4666491448879242), ('admittance', 0.46636319160461426), ('pedestrian_walkway', 0.46541106700897217), ('loading_dock', 0.4551159739494324), ('staircase', 0.4523359537124634), ('fenced', 0.44362953305244446), ('archways', 0.4393758475780487), ('main_thoroughfare', 0.4366554319858551), 

In [8]:
# Clean and format the results before writing to a file
from formatter import formatResults

formattedResults=formatResults(targetWord, similarityResults)

# Print to check
formattedResults

[nltk_data] Downloading package wordnet to /home/chris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[['Ab', 2367],
 ['Acre', 1074],
 ['Anchorage', 2821],
 ['Angle', 3193],
 ['Annunciation', 2710],
 ['Ark', 3334],
 ['Bedlam', 1707],
 ['Bell', 1892],
 ['Bench', 778],
 ['Booth', 506],
 ['Bosporus', 891],
 ['Boulder', 1449],
 ['Bush', 895],
 ['Cage', 2199],
 ['Capitol', 2565],
 ['Colosseum', 3483],
 ['Confederacy', 882],
 ['Crater', 765],
 ['Creek', 1605],
 ['Cross', 2454],
 ['Dhahran', 3603],
 ['Dumpster', 966],
 ['East', 300],
 ['Eden', 3062],
 ['English', 663],
 ['Euphrates', 3586],
 ['Exodus', 3635],
 ['Freemasonry', 2364],
 ['Gable', 404],
 ['Galleria', 1377],
 ['Gates', 9],
 ['Giza', 1718],
 ['Gothic', 2532],
 ['Grozny', 3295],
 ['Guest', 1660],
 ['Hall', 61],
 ['Hart', 3347],
 ['Hell', 964],
 ['Hill', 786],
 ['Humvee', 3336],
 ['ID', 1117],
 ['Idaho', 1116],
 ['Inchon', 3638],
 ['Ingres', 768],
 ['J', 3467],
 ['Jalalabad', 2001],
 ['Jeddah', 2922],
 ['Kerbala', 2867],
 ['Lanai', 501],
 ['Lodge', 1042],
 ['Loos', 631],
 ['Lot', 1714],
 ['Louvre', 1898],
 ['Luxor', 3163],
 ['March',

In [9]:
# Write the results to a file
pathToResults="./results/" + targetWord + ".txt"
with(open(pathToResults, 'w+')) as f:
    f.write(str(formattedResults))
    
print("Writing to file",pathToResults, "Association size", len(formattedResults))

Writing to file ./results/entrance.txt Association size 3701


In [10]:
# Next, write a script to search for any given word, with numResults++ as the default
# Need to do more validation, given the whacko results I'm seeing too