In [4]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from sklearn.cluster import KMeans
import numpy as np

# Load the text8 corpus
corpus = api.load('text8')

# Define the hyperparameter combinations
win_sizes = [3, 7, 13, 25]
vector_sizes = [20, 70, 100, 300]

# Words to be used in the tasks
words_to_transform = ['man', 'woman', 'daughter']
words_to_cluster = ['yen', 'yuan', 'france', 'brazil', 'africa', 'asia']

best_params = None
best_similarity = None
best_clusters = None
best_transform_result = None

for win_size in win_sizes:
    for vector_size in vector_sizes:
        print(f"Training model with window size {win_size} and vector size {vector_size}")
        
        # Train Word2Vec model
        model = Word2Vec(sentences=corpus, window=win_size, vector_size=vector_size, min_count=1)
        
        # Define Transform as Embedding('man') - Embedding('woman')
        transform = model.wv['man'] - model.wv['woman']
        
        # Find an embedding most similar to (Transform + Embedding('daughterâ€™))
        transform_result = model.wv.most_similar(positive=['daughter', transform], topn=1)
        print(f"Most similar to (Transform + 'daughter'): {transform_result}")
        
        # Cluster the following embeddings using K-means
        embeddings = np.array([model.wv[word] for word in words_to_cluster])
        kmeans = KMeans(n_clusters=2)
        kmeans.fit(embeddings)
        
        clusters = {}
        for i, label in enumerate(kmeans.labels_):
            clusters[words_to_cluster[i]] = label
        print(f"Cluster assignments: {clusters}")
        
        # Assess the results to determine the best set of hyperparameters
        if best_similarity is None or transform_result[0][1] > best_similarity:
            best_similarity = transform_result[0][1]
            best_params = (win_size, vector_size)
            best_clusters = clusters
            best_transform_result = transform_result

print(f"Best hyperparameters: Window size = {best_params[0]}, Vector size = {best_params[1]}")
print(f"Best transformation result: {best_transform_result}")
print(f"Best cluster assignments: {best_clusters}")

Training model with window size 3 and vector size 20
Most similar to (Transform + 'daughter'): [('bratis', 0.8006808161735535)]
Cluster assignments: {'yen': 1, 'yuan': 1, 'france': 0, 'brazil': 0, 'africa': 0, 'asia': 0}
Training model with window size 3 and vector size 70
Most similar to (Transform + 'daughter'): [('expansion', 0.45818030834198)]
Cluster assignments: {'yen': 1, 'yuan': 1, 'france': 0, 'brazil': 1, 'africa': 1, 'asia': 1}
Training model with window size 3 and vector size 100
Most similar to (Transform + 'daughter'): [('man', 0.4254422187805176)]
Cluster assignments: {'yen': 0, 'yuan': 0, 'france': 0, 'brazil': 0, 'africa': 1, 'asia': 1}
Training model with window size 3 and vector size 300
Most similar to (Transform + 'daughter'): [('man', 0.5618419647216797)]
Cluster assignments: {'yen': 0, 'yuan': 0, 'france': 0, 'brazil': 0, 'africa': 1, 'asia': 1}
Training model with window size 7 and vector size 20
Most similar to (Transform + 'daughter'): [('suffren', 0.741582274