In [1]:
import numpy as np
from os.path import expanduser

In [3]:
# Load the pre-trained GloVe embeddings
def load_glove_embeddings(file_path):
    """
    Read a GloVe-style text file into a dictionary of word vectors.

    Each non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as its float32 coefficients.

    :param file_path: Path to the UTF-8 encoded embeddings text file.
    :return: A dictionary mapping each word to a float32 NumPy array.
    """
    embeddings_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. a trailing newline at the
                # end of the file): there is no word to index, so skip it
                # instead of failing on values[0].
                continue
            word, *coefficients = values
            embeddings_index[word] = np.asarray(coefficients, dtype='float32')
    return embeddings_index

# Load the GloVe embeddings. We use the 300-dimensional vectors from the "6B"
# release (6 billion tokens). expanduser() expands the leading "~" to the
# current user's home directory, so the file is expected under ~/data.
PATH = expanduser('~/data/glove.6B.300d.txt')
embeddings_index = load_glove_embeddings(PATH)

In [10]:
# Look up the embedding vector of a single word
WORD = 'brush'
def get_embedding_vector(embeddings_index, word):
    """
    Fetch the vector stored for a word.

    :param embeddings_index: A dictionary of word embeddings.
    :param word: The word to look up.
    :return: The value stored under ``word``, or None when the word is absent.
    """
    vector = embeddings_index.get(word)
    return vector

# Look up WORD and print the shape of its vector.
# NOTE(review): get_embedding_vector() returns None for a word that is not in
# embeddings_index, in which case `.shape` below raises AttributeError.
embedding_vector = get_embedding_vector(embeddings_index, WORD)
print(embedding_vector.shape)

(300,)


In [19]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_words(word, embeddings_index, top_n=5):
    """
    Find the top N most similar words to a given word based on cosine similarity.

    The similarities are computed with batched NumPy matrix products rather
    than one library call per vocabulary entry, so a full pass over a large
    vocabulary stays fast while only one batch of vectors is copied at a time.

    :param word: The word for which similar words are to be found.
    :param embeddings_index: A dictionary of word embeddings.
    :param top_n: The number of similar words to return.
    :return: A tuple of two lists - similar words and their cosine similarity
        scores, ordered from most to least similar. Returns (None, None) when
        ``word`` is not in ``embeddings_index`` and ([], []) when there is no
        other word to compare against.
    """
    # Same lookup that get_embedding_vector() performs: None means "unknown word".
    word_embedding = embeddings_index.get(word)
    if word_embedding is None:
        return None, None

    # Every word except the query itself, in dictionary order.
    candidates = [key for key in embeddings_index if key != word]
    if not candidates:
        return [], []

    query = np.asarray(word_embedding)
    # A zero-length vector has no direction; dividing by 1 instead of 0 makes
    # its similarity to everything 0 rather than NaN.
    query_norm = np.linalg.norm(query) or 1.0
    unit_query = query / query_norm

    # Work in batches so only a slice of the vocabulary is copied into a matrix
    # at once (50,000 rows of 300 float32 values is roughly 60 MB).
    batch_size = 50000
    score_chunks = []
    for start in range(0, len(candidates), batch_size):
        batch_words = candidates[start:start + batch_size]
        batch = np.stack([embeddings_index[key] for key in batch_words])
        norms = np.linalg.norm(batch, axis=1)
        norms[norms == 0] = 1  # same zero-vector guard as for the query
        score_chunks.append(batch @ unit_query / norms)
    scores = np.concatenate(score_chunks)

    # Stable descending sort: words with equal scores keep dictionary order.
    order = np.argsort(-scores, kind='stable')[:top_n]

    similar_words = [candidates[i] for i in order]
    similar_scores = [scores[i] for i in order]
    return similar_words, similar_scores

# Example usage
# NOTE: find_similar_words() returns (None, None) when WORD is not in
# embeddings_index, so both lines below would print None in that case.
WORD = 'brush'
similar_words, similar_scores = find_similar_words(WORD, embeddings_index)
print(f"Similar words to '{WORD}': {similar_words}")
print(f"Similarity scores: {similar_scores}")

Similar words to 'brush': ['brushes', 'scrub', 'dry', 'bristle', 'spray']
Similarity scores: [0.58875906, 0.5571601, 0.47539973, 0.45540044, 0.45286047]


# Fine-tuning GloVe embeddings

We can fine-tune the GloVe embeddings by training them on a specific dataset. To do this, we use the GloVe vectors as the initial weights of an embedding layer in a neural network, leave that layer trainable rather than frozen, and then train the network on the dataset. The embeddings are updated during training, so the model learns task-specific representations.
