In [4]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from sklearn.cluster import KMeans
import numpy as np

# Load the text8 corpus
corpus = api.load('text8')

# Define the hyperparameter combinations
win_sizes = [3, 7, 13, 25]
vector_sizes = [20, 70, 100, 300]

# Words to be used in the tasks
words_to_transform = ['man', 'woman', 'daughter']
words_to_cluster = ['yen', 'yuan', 'france', 'brazil', 'africa', 'asia']

best_params = None
best_similarity = None
best_clusters = None
best_transform_result = None

for win_size in win_sizes:
    for vector_size in vector_sizes:
        print(f"Training model with window size {win_size} and vector size {vector_size}")
        
        # Train Word2Vec model
        model = Word2Vec(sentences=corpus, window=win_size, vector_size=vector_size, min_count=1)
        
        # Define Transform as Embedding('man') - Embedding('woman')
        transform = model.wv['man'] - model.wv['woman']
        
        # Find an embedding most similar to (Transform + Embedding('daughter’))
        transform_result = model.wv.most_similar(positive=['daughter', transform], topn=1)
        print(f"Most similar to (Transform + 'daughter'): {transform_result}")
        
        # Cluster the following embeddings using K-means
        embeddings = np.array([model.wv[word] for word in words_to_cluster])
        kmeans = KMeans(n_clusters=2)
        kmeans.fit(embeddings)
        
        clusters = {}
        for i, label in enumerate(kmeans.labels_):
            clusters[words_to_cluster[i]] = label
        print(f"Cluster assignments: {clusters}")
        
        # Assess the results to determine the best set of hyperparameters
        if best_similarity is None or transform_result[0][1] > best_similarity:
            best_similarity = transform_result[0][1]
            best_params = (win_size, vector_size)
            best_clusters = clusters
            best_transform_result = transform_result

print(f"Best hyperparameters: Window size = {best_params[0]}, Vector size = {best_params[1]}")
print(f"Best transformation result: {best_transform_result}")
print(f"Best cluster assignments: {best_clusters}")

Training model with window size 3 and vector size 20
Most similar to (Transform + 'daughter'): [('bratis', 0.8006808161735535)]
Cluster assignments: {'yen': 1, 'yuan': 1, 'france': 0, 'brazil': 0, 'africa': 0, 'asia': 0}
Training model with window size 3 and vector size 70
Most similar to (Transform + 'daughter'): [('expansion', 0.45818030834198)]
Cluster assignments: {'yen': 1, 'yuan': 1, 'france': 0, 'brazil': 1, 'africa': 1, 'asia': 1}
Training model with window size 3 and vector size 100
Most similar to (Transform + 'daughter'): [('man', 0.4254422187805176)]
Cluster assignments: {'yen': 0, 'yuan': 0, 'france': 0, 'brazil': 0, 'africa': 1, 'asia': 1}
Training model with window size 3 and vector size 300
Most similar to (Transform + 'daughter'): [('man', 0.5618419647216797)]
Cluster assignments: {'yen': 0, 'yuan': 0, 'france': 0, 'brazil': 0, 'africa': 1, 'asia': 1}
Training model with window size 7 and vector size 20
Most similar to (Transform + 'daughter'): [('suffren', 0.741582274

In [4]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from sklearn.cluster import KMeans
import numpy as np

corpus = api.load('text8')

# Train Word2Vec model
model = Word2Vec(sentences=corpus, window=5, vector_size=70)

print("Embedding vector for Paris is: ", model.wv['paris'])

print('Similar to France: ', model.wv.similar_by_vector (model.wv['france'],topn=3))
print('Similar to Paris: ', model.wv.similar_by_vector (model.wv['paris'],topn=3))

# Find most similar embeddings to a transformed embedding
transform = model.wv['france'] - model.wv['paris']
print('Transform: ', model.wv.similar_by_vector ( transform + model.wv['madrid'] ,topn=3))

# Some word embeddings
embeddings =np.array([
model.wv['paris'] , model.wv['he'],
model.wv['vienna'] , model.wv['she']
])

# K-means clustering
kmeans = KMeans(n_clusters=2)
kmeans.fit(embeddings)

# Print cluster assignments
for i, label in enumerate(kmeans.labels_):
    print("Embedding ", i, " is in cluster ", label)

Embedding vector for Paris is:  [-0.19954316 -0.46359336  1.4442711  -0.53030515  0.30910125 -3.21829
 -0.80756897  1.9091936  -3.6736145   0.7791089   0.69424033  0.77002895
 -1.4870739  -0.55771106 -1.1213987  -0.13891496  0.16801234  0.37173176
 -0.79405314 -1.9095919   3.2849033  -2.4157815   0.22409369 -1.5381286
 -1.0676508  -1.7486839  -2.0391848   1.6171279  -1.0335131   3.9903588
 -0.981133   -0.3469918   1.3421159   2.2093349   0.56718886  1.3755047
  1.8072823   0.31739727  0.08437763  1.350574    1.3138663  -1.2692137
  0.8514873   0.9939338  -2.9037025   0.77163196  0.10107175 -0.09788461
  3.04712    -2.8497846   2.0633981   0.98394775 -1.0383935  -1.5304278
 -1.6788906  -1.3294111   4.028691    0.73798347 -1.1571455   0.49761516
 -0.84745634  0.29187015  0.09646317  0.07188845 -0.00774641  1.7129927
  0.20880346  0.77843153  2.0442166  -0.38502645]
Similar to France:  [('france', 0.9999998211860657), ('spain', 0.8404896855354309), ('italy', 0.8255295753479004)]
Similar t