Documentation:
https://radimrehurek.com/gensim/auto_examples/howtos/run_downloader_api.html#sphx-glr-auto-examples-howtos-run-downloader-api-py

https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

https://github.com/piskvorky/gensim-data?tab=readme-ov-file

https://medium.com/@manansuri/a-dummys-guide-to-word2vec-456444f3c673

https://radimrehurek.com/gensim_3.8.3/models/keyedvectors.html

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [8]:
# Useful Libraries
import os
import json
import importlib

def check_package(package_name):
    """Ensure ``package_name`` can be imported, installing it with pip if not.

    Args:
        package_name: Dotted import name to probe, e.g. ``"sklearn"`` or
            ``"google.cloud"``.

    Returns:
        None. Progress is reported with ``print``; a failed installation is
        reported (pip's stderr is printed) rather than raised.
    """
    # Import names whose PyPI distribution name differs. The notebook probes
    # "google.cloud" but needs the "google-cloud-storage" distribution;
    # installing a distribution literally named "google.cloud" would be wrong.
    pip_names = {
        "sklearn": "scikit-learn",
        "google.cloud": "google-cloud-storage",
        "google.cloud.storage": "google-cloud-storage",
        "google-cloud-storage": "google-cloud-storage",
    }

    try:
        importlib.import_module(package_name)
    except ImportError:
        print(f"{package_name} is not installed.")
    else:
        print(f"{package_name} is already installed.")
        return

    # Local imports keep this helper self-contained within the cell.
    import subprocess
    import sys

    pip_name = pip_names.get(package_name, package_name)
    # Run pip through the kernel's own interpreter so the package lands in the
    # environment this notebook is actually running in.
    command = [sys.executable, "-m", "pip", "install", "--upgrade", pip_name]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.stdout:
        print(result.stdout)
    if result.returncode != 0:
        print(f"pip failed to install {pip_name}:")
        print(result.stderr)

    # Let the import system notice the freshly installed package.
    importlib.invalidate_caches()

!python --version
packages = ["numpy","pandas","gensim","sklearn","psutil","spacy","google.cloud"]
for package in packages:
  check_package(package)

# Third-Party Libraries
import numpy as np
import pandas as pd
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
import spacy
import google.cloud.storage as gcs
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans

# !pip install --upgrade gensim
# !pip install --upgrade scikit-learn
# !pip install --upgrade psutil
# !pip install --upgrade spacy

Python 3.10.12
numpy is already installed.
pandas is already installed.
gensim is already installed.
sklearn is already installed.
psutil is already installed.
spacy is already installed.
google.cloud is already installed.


In [9]:
# Load a pre-trained embedding model through the gensim downloader
# (catalogue: https://github.com/piskvorky/gensim-data).
#   Model name: "word2vec-google-news-300"
#   Training data: Google News (about 100 billion words)
#   Size: 1662 MB, 3,000,000 vectors
# Downloading and loading takes roughly 1-2 minutes.
# A much smaller alternative for quick experiments is "glove-twitter-25".
word2VecModel = api.load("word2vec-google-news-300")
# NOTE: the message interpolates the loaded model object itself (its repr),
# not a filesystem path.
print(f"The model has been downloaded to\n{word2VecModel} and loaded!")

The model has been downloaded to
KeyedVectors<vector_size=300, 3000000 keys> and loaded!


In [14]:
# All NYT puzzles collected so far, stored in a single pre-cleaned JSON file.
# Each record holds a puzzle ID and a dictionary of category -> words.
filePath = "simplePuzzles.json"

# Only fetch the file from our GCS bucket when no local copy exists.
if not os.path.exists(filePath):
  # The file can be uploaded at https://console.cloud.google.com/storage/browser/cs221team
  # and is accessible at https://storage.googleapis.com/cs221team/simplePuzzles.json
  !gsutil cp gs://cs221team/simplePuzzles.json simplePuzzles.json
  # To inspect the raw file, run: !cat simplePuzzles.json

# Load the JSON file into a pandas DataFrame and preview the first rows.
raw_data = pd.read_json(filePath,lines=False,orient='columns')
raw_data.head(5)

Unnamed: 0,id,wordCategories
0,1,"{'WET WEATHER': ['HAIL', 'RAIN', 'SLEET', 'SNO..."
1,2,"{'FOOTWEAR': ['BOOT', 'LOAFER', 'PUMP', 'SNEAK..."
2,3,"{'FACIAL FEATURES': ['CHEEK', 'EYE', 'MOUTH', ..."
3,4,"{'SNEAKER BRANDS': ['ADIDAS', 'NIKE', 'PUMA', ..."
4,5,"{'STREAMING SERVICES': ['HULU', 'NETFLIX', 'PE..."


In [15]:
# Dict: word -> category (both lower-cased).
# A word that appears under several categories keeps the last one seen.
wordCategory = {}

# Dict: category -> list of its words (both lower-cased).
categoryWord = {}

# Read the puzzles, then close the file before processing them.
with open(filePath) as f:
    data = json.load(f)

# Sample record:
# {'id': 1, 'wordCategories': {'WET WEATHER': ['HAIL', 'RAIN', 'SLEET', 'SNOW'],
#                              'NBA TEAMS': ['BUCKS', 'HEAT', 'JAZZ', 'NETS'], ...}}
for puzzle in data:
    for category, words in puzzle['wordCategories'].items():
        category_key = category.lower()
        # Accumulate instead of overwriting: a category name can recur in a
        # later puzzle with different words, and replacing the list would make
        # len(categoryWord[category]) disagree with the words that
        # wordCategory maps to that category.
        members = categoryWord.setdefault(category_key, [])
        for word in words:
            word_key = word.lower()
            wordCategory[word_key] = category_key
            if word_key not in members:
                members.append(word_key)

# Report sizes rather than dumping both dictionaries into the cell output.
print(f"Parsed {len(data)} puzzles: {len(wordCategory)} distinct words "
      f"in {len(categoryWord)} distinct categories.")


# Word embeddings for the puzzle words that exist in the Word2Vec vocabulary.
wordEmbeddings = {key: word2VecModel[key] for key in wordCategory if key in word2VecModel}
# Derive the vector list from wordEmbeddings so both share the same order;
# code that zips the words with per-vector results relies on this alignment.
wordVectors = list(wordEmbeddings.values())
print('Loaded embeddings for words.')
print('Loaded embeddings for words in categories.')


Loaded embeddings for words.
Loaded embeddings for words in categories.


In [16]:
def process(word):
    """Score a query word against every puzzle category.

    The query embedding is dotted with the embedding of each known puzzle
    word; each dot product is divided by the number of words listed for that
    word's category and added to the category's running total.

    Args:
        word: Query word to look up in ``word2VecModel``.

    Returns:
        A dict mapping category name to its accumulated score, or an
        explanatory string when ``word`` is not in ``word2VecModel``.
    """
    if word not in word2VecModel:
        return f"Word '{word}' not in the Word2Vec model."

    query_vector = word2VecModel[word]
    category_scores = {}
    # Distinct loop names so the ``word`` parameter is not shadowed.
    for candidate, candidate_vector in wordEmbeddings.items():
        label = wordCategory[candidate]
        contribution = np.dot(query_vector, candidate_vector) / len(categoryWord[label])
        category_scores[label] = category_scores.get(label, 0) + contribution
    return category_scores

# Quick check: print the category scores for a few sample words.
for query in ('heat', 'shift', 'snow'):
    print(process(query))

{'___ mary': 0.36239259969443083, 'wet weather': 1.4652650952339172, 'analog tv interference': 1.616879552602768, 'nba teams': 0.39121757447719574, 'elements of cooking,  per samin nosrat': 3.7144994735717773, 'music genres': 1.0341646447777748, 'mac keyboard keys': 0.8054496049880981, 'something gained from hard work': 0.2818536013364792, 'container closures': 0.8410280495882034, 'palindromes': 1.5322185307741165, 'on the same plane': 0.8758143186569214, 'throw out': 1.25147046148777, 'comfy shoes': 0.06094127148389816, 'excite, with “up”': 1.5290096998214722, 'footwear': 0.09384234249591827, 'units of length': 1.0143671669065952, 'inherent nature': 0.14790025353431702, 'magazines': 0.7972580268979073, 'duration': 0.7612651810050011, 'words abbreviated with letters': 0.4189169555902481, 'letter homophones': 0.24289904534816742, 'large amount': 0.5760527029633522, 'w + vowel sound': 0.46251989528536797, 'chutzpah': 0.8247515708208084, 'parts of a river': 0.09413501992821693, 'parts of 

In [17]:
# Stack the embedding vectors into one array, one row per word.
X = np.array(wordVectors)

# How many clusters K-means should produce.
num_clusters = 4

# Fit K-means; the fixed random_state keeps the assignment reproducible.
kmeans = KMeans(n_clusters=num_clusters, random_state=0, n_init="auto").fit(X)

# Cluster index assigned to each row of X.
labels = kmeans.labels_

# Group the words by their assigned cluster.
# The rows of X are assumed to be in the same order as wordEmbeddings.
clustered_words = {cluster_id: [] for cluster_id in range(num_clusters)}
for word, label in zip(wordEmbeddings.keys(), labels):
    clustered_words[label].append(word)

# Print the members of each cluster.
for cluster in clustered_words:
    words = clustered_words[cluster]
    print(f"Cluster {cluster}: {words}")

Cluster 0: ['agency', 'bureau']
Cluster 1: ['sleet', 'snow', 'jazz', 'kayak', 'loafer', 'sneaker', 'mouth', 'nose', 'chow', 'scarf', 'wolf', 'puma', 'cabaret', 'cats', 'bat', 'iron', 'spider', 'peacock', 'ketchup', 'mayo', 'tartar', 'blue', 'mustard', 'plum', 'scarlet', 'boardwalk', 'baby', 'powder', 'ankle', 'knee', 'shin', 'thigh', 'calf', 'cub', 'joey', 'jelly', 'silver', 'backgammon', 'queen', 'cherry', 'ruby', 'piano', 'polo', 'squash', 'cami', 'halter', 'tee', 'beet', 'carrot', 'corn', 'onion', 'ant', 'beetle', 'mantis', 'termite', 'kiwi', 'lemon', 'orange', 'crane', 'turkey', 'fish', 'goat', 'cardamom', 'clove', 'coriander', 'cumin', 'honey', 'sugar', 'sweetie', 'angel', 'bird', 'ginger', 'pod', 'loris', 'sloth', 'snail', 'tortoise', 'cot', 'tan', 'virgin', 'gnat', 'gnocchi', 'gnome', 'canopy', 'cluck', 'brown', 'pink', 'turquoise', 'violet', 'rod', 'sparrow', 'cappuccino', 'espresso', 'latte', 'almond', 'cashew', 'pecan', 'walnut', 'emerald', 'forest', 'olive', 'bean', 'fox', '