In [1]:
%load_ext autoreload
%autoreload 2

# import modules & set up logging
import warnings
import gensim
from dotenv import load_dotenv
import importlib
from gensim.models.keyedvectors import KeyedVectors
import logging

# Pull in any variables declared in a local .env file before anything reads the environment.
load_dotenv()

# Silence UserWarning noise raised from gensim, then emit timestamped INFO-level log lines.
warnings.filterwarnings('ignore', category=UserWarning, module='gensim')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)
logger = logging.getLogger()  # root logger; later passed to formatResults

In [2]:
# Load the pretrained vectors stored in word2vec binary format.
# The captured log for this cell shows a (3000000, 300) float32 matrix and
# roughly 40 seconds of load time.
path = './models/model.bin.gz'
model = KeyedVectors.load_word2vec_format(fname=path, binary=True)

2024-04-13 15:22:36,514 : INFO : loading projection weights from ./models/model.bin.gz
2024-04-13 15:23:13,807 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ./models/model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-04-13T15:23:13.807349', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [3]:
# CONFIG -- the word whose associations are looked up
targetWord = 'park'

# How many nearest neighbours to request from the model (passed as `topn`)
numResults = 10

In [4]:
# Ask the model for the `numResults` closest neighbours of the target word.
# The result is a list of (key, similarity score) tuples, best match first.
similarityResults = model.most_similar(positive=[targetWord], topn=numResults)
print(similarityResults)

[('parks', 0.7697824835777283), ('Park', 0.613426685333252), ('superintendent_Dave_Uberuaga', 0.5954588055610657), ('skate_park', 0.5911567211151123), ('parkland', 0.5799823999404907), ('Kohler_Andrae', 0.5719486474990845), ('campground', 0.5696098804473877), ('Taraji_Henson_knocked', 0.5658068060874939), ('Castaway_Cove', 0.5577318668365479), ('skateboard_park', 0.5547470450401306)]


In [6]:
# Clean and format the results before writing to a file
from formatterrevision import formatResults
from resultwriter import write_results

In [16]:
# Run the project formatter over the raw neighbours. Judging by the logged
# output, it lowercases and validates each candidate and reports progress
# through the logger it is given. The printed result is a list of
# [word, number] pairs -- NOTE(review): the meaning of the number is not
# visible from this notebook; confirm against formatterrevision.
results = formatResults(
    targetWord,
    similarityResults,
    model,
    logger,
)
print(results)

2024-04-13 16:08:57,186 : INFO : Staring with ['parks', 'Park', 'superintendent_Dave_Uberuaga', 'skate_park', 'parkland', 'Kohler_Andrae', 'campground', 'Taraji_Henson_knocked', 'Castaway_Cove', 'skateboard_park']
2024-04-13 16:08:57,187 : INFO : Considering parks
2024-04-13 16:08:57,188 : INFO : [Validation] parks is the target!
2024-04-13 16:08:57,188 : INFO : Considering park
2024-04-13 16:08:57,188 : INFO : [Validation] park is the target!
2024-04-13 16:08:57,189 : INFO : Considering superintendent_dave_uberuaga
2024-04-13 16:08:57,189 : INFO : [Validation] superintendent_dave_uberuaga is not a valid word!
2024-04-13 16:08:57,190 : INFO : Considering skate_park
2024-04-13 16:08:57,190 : INFO : [Validation] skate_park is not a valid word!
2024-04-13 16:08:57,190 : INFO : Considering parkland
2024-04-13 16:08:57,191 : INFO : [Validation] parkland is the target!
2024-04-13 16:08:57,191 : INFO : Considering kohler_andrae
2024-04-13 16:08:57,191 : INFO : [Validation] kohler_andrae is no

[['campground', 4], ['campgrounds', 2], ['campsite', 1], ['campsites', 3], ['park', 0]]
