In [1]:
# Import modules and set up logging.
# The UserWarning filter for gensim is registered before gensim itself is imported
# (presumably so warnings raised during the import are suppressed too -- TODO confirm).
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

from gensim.models.keyedvectors import KeyedVectors

# INFO-level logging with timestamps; the loader's progress messages in the cell
# outputs below are emitted through this configuration.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Load Google's pre-trained Word2Vec vectors from the gzipped binary file.
# `path` is kept as a notebook-level name because the limited reload further down reuses it.
path = './models/model.bin.gz'
model = KeyedVectors.load_word2vec_format(
    path,
    binary=True,
)

2024-03-03 22:29:57,513 : INFO : loading projection weights from ./models/model.bin.gz
2024-03-03 22:30:30,657 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from ./models/model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-03T22:30:30.657927', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [3]:
# Report how many keys the loaded vectors contain
print("vocab_size:", len(model.key_to_index))

vocab_size: 3000000


In [4]:
# Reload the same file, this time capping the number of vectors read at 150000
model = KeyedVectors.load_word2vec_format(
    path,
    binary=True,
    limit=150_000,
)

2024-03-03 22:30:30,852 : INFO : loading projection weights from ./models/model.bin.gz
2024-03-03 22:30:32,436 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (150000, 300) matrix of type float32 from ./models/model.bin.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-03-03T22:30:32.436364', 'gensim': '4.3.2', 'python': '3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]', 'platform': 'Linux-6.6.10-76060610-generic-x86_64-with-glibc2.35', 'event': 'load_word2vec_format'}


In [5]:
# Confirm the vocabulary size after the limited reload
print("vocab_size:", len(model.key_to_index))

vocab_size: 150000


In [6]:
from wordfinder import generate_target_words

# CONFIG -- number of associations to try and get for each target word
numResults = 5000

# CONFIG -- target words to process (presumably the argument is how many words
# to generate; verify against wordfinder.generate_target_words)
targetWords = generate_target_words(4)
print(targetWords)

Avoiding existing words:[]


2024-03-03 22:30:34,971 : INFO : HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


['elephant', 'mountain', 'pizza', 'guitar']


In [7]:
# For each target word: look up its nearest neighbours in the model, generate
# hints, then format the neighbours and write everything out.
from hints import generate_hints
from formatter import formatResults
from resultwriter import write_results

for word in targetWords:
    targetWord = word.lower()
    # The vocabulary was loaded with a limit, and most_similar() raises KeyError
    # for a key it does not know. Skip such words so a single miss does not abort
    # the remaining targets.
    if targetWord not in model.key_to_index:
        logging.warning("Skipping %r: not present in the loaded vocabulary", targetWord)
        continue
    # Run the model: the numResults most similar keys to the target
    similarityResults = model.most_similar(targetWord, topn=numResults)
    # Generate results
    hints = generate_hints(targetWord)
    formattedResults = formatResults(targetWord, similarityResults)
    write_results(targetWord, hints, formattedResults)

[nltk_data] Downloading package wordnet to /home/chris/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
2024-03-03 22:30:37,121 : INFO : HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[['abalone', 1725], ['abattoir', 605], ['abattoirs', 2323], ['abdomen', 1668], ['abhorrence', 2359], ['abode', 2811], ['abomination', 2360], ['aboriginals', 3543], ['aborigine', 1850], ['aborigines', 1849], ['aborting', 2259], ['aboveground', 3848], ['abscess', 2332], ['accompaniment', 706], ['accomplice', 4051], ['ace', 2035], ['achuthanandan', 4265], ['acid', 1476], ['acorn', 374], ['acorns', 1346], ['acrobat', 874], ['acrobats', 873], ['actionaid', 4168], ['adage', 3598], ['adb', 2627], ['addle-head', 528], ['adjoining', 3871], ['adjutant', 2785], ['admirer', 3495], ['ado', 4012], ['adolescent', 2047], ['adoptable', 1882], ['adoptee', 2642], ['adoptees', 2643], ['adoption', 2646], ['adoptions', 2645], ['adoptive', 2313], ['adorable', 1051], ['adult', 3057], ['adulteress', 758], ['adventure', 3553], ['afghan', 3271], ['afghan', 3272], ['africa', 4469], ['africa', 4470], ['african', 1508], ['african', 1509], ['africans', 2596], ['afrikaner', 3205], ['afrikaner', 4409], ['afro', 1809],

2024-03-03 22:30:40,500 : INFO : HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-03-03 22:30:40,808 : INFO : HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2024-03-03 22:30:40,809 : INFO : Retrying request to /chat/completions in 20.000000 seconds


[['abbey', 1595], ['abbot', 2138], ['abbottabad', 1513], ['aberdare', 1237], ['aberdare', 1238], ['abode', 3958], ['abruzzo', 2184], ['abstainer', 3869], ['abyss', 1115], ['acclimatization', 706], ['acid', 3104], ['acme', 42], ['acorn', 1997], ['acorns', 1996], ['acropolis', 789], ['address', 2657], ['adirondack', 1859], ['adirondacks', 274], ['adirondacks', 275], ['adriatic', 3889], ['adriatic', 3890], ['adulteress', 3832], ['advantage', 4041], ['adventure', 741], ['adventurer', 506], ['adventurers', 505], ['adventuring', 1647], ['adventurous', 3148], ['aerial', 3050], ['aerials', 1201], ['afar', 1098], ['afghan', 3756], ['afghan', 3757], ['afghanistan', 3969], ['afghanistan', 3970], ['aggrandizement', 151], ['ain', 3931], ['airbase', 3299], ['airfield', 1421], ['airfields', 1422], ['airlift', 1340], ['airliner', 4069], ['airplane', 1751], ['airship', 4169], ['airstrip', 611], ['airstrips', 2153], ['alamosa', 2728], ['alaska', 3564], ['alaska', 3565], ['alaskan', 1726], ['alaskan', 17

2024-03-03 22:31:02,618 : INFO : HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[['ab', 3120], ['ab', 3121], ['abattoir', 2523], ['abdomen', 1913], ['abdominal', 3118], ['abs', 3117], ['absolut', 1594], ['accent', 3591], ['accompaniment', 2813], ['accompaniments', 2815], ['acid', 2798], ['acme', 1298], ['acquaintance', 1787], ['acquaintances', 3206], ['acrylonitrile-butadiene-styrene', 3119], ['ad', 3307], ['addict', 2742], ['adieu', 598], ['administration', 2002], ['adolescent', 1477], ['adult', 2007], ['adventureland', 2686], ['advertising', 3308], ['afterschool', 1924], ['agustawestland', 1686], ['ahwatukee', 3519], ['airiness', 376], ['airplane', 2332], ['albacore', 1896], ['albanian', 1687], ['albanian', 1688], ['albanians', 3515], ['albertsons', 2816], ['aldi', 1398], ['aldo', 3735], ['ale', 707], ['ales', 1210], ['alfano', 3230], ['alfredo', 3452], ['alibi', 3351], ['ally', 1786], ['aloha', 1185], ['ambiance', 2666], ['ambrosia', 935], ['americana', 1840], ['americana', 1841], ['americanism', 3648], ['americanism', 3649], ['americanized', 2416], ['amore', 2

2024-03-03 22:31:03,130 : INFO : HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 429 Too Many Requests"
2024-03-03 22:31:03,131 : INFO : Retrying request to /chat/completions in 20.000000 seconds
2024-03-03 22:31:24,532 : INFO : HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[['aaj', 806], ['ab', 3091], ['ab', 3092], ['abba', 1441], ['abdominal', 3089], ['aberration', 1728], ['abrasive', 3796], ['abs', 3088], ['accelerator', 1926], ['accent', 2704], ['accented', 2518], ['accents', 2820], ['accident', 645], ['accompaniment', 576], ['accompaniments', 823], ['accompanist', 306], ['accordion', 67], ['accordionist', 106], ['accountant', 3441], ['ace', 206], ['acetate', 3262], ['acid', 640], ['acolyte', 3606], ['acoustic', 88], ['acoustical', 1369], ['acoustically', 335], ['acoustics', 706], ['acquaintance', 3407], ['acrobatics', 2361], ['acrylic', 1047], ['acrylics', 1046], ['acrylonitrile-butadiene-styrene', 3090], ['actor', 1224], ['addicted', 4090], ['addicting', 3610], ['adele', 1261], ['administration', 751], ['adorable', 2696], ['adores', 3433], ['adventuring', 2572], ['aerobatics', 2362], ['aerosmith', 458], ['affectation', 1491], ['affray', 690], ['afro', 971], ['afro', 972], ['afrobeat', 488], ['aiden', 2358], ['air', 1854], ['airboat', 2633], ['airbru

In [8]:
# TODO: write a script to search for any given word, with numResults (or higher) as the default
# TODO: add more validation of the output -- the results seen so far look off