In [None]:
import string

import matplotlib.pyplot as plt
import numpy as np
from gensim.downloader import load
from sklearn.decomposition import PCA

# Load the pre-trained GloVe embeddings through gensim's downloader.
# The model identifier is kept in a named constant so it is easy to spot.
GLOVE_MODEL_NAME = "glove-wiki-gigaword-50"
glove_model = load(GLOVE_MODEL_NAME)

# Sample texts
texts = [
    "The cat sits on the mat.",
    "A cat is on the mat.",
    "Dogs are running in the park.",
    "Children are playing in the park.",
]


def _tokenize(text):
    """Split *text* into lowercase tokens with edge punctuation removed.

    Splitting on whitespace alone leaves tokens such as "mat." or "park.".
    Those are unlikely to be in the GloVe vocabulary, so they would fall
    back to the zero vector and distort the sentence averages.

    Args:
        text: Raw sentence.

    Returns:
        List of non-empty lowercase tokens; a piece that consists only of
        punctuation is dropped.
    """
    stripped = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [piece for piece in stripped if piece]


# Tokenize texts
tokenized_texts = [_tokenize(text) for text in texts]

# Function to get GloVe embeddings for a token
def get_glove_embedding(token):
    """Return the GloVe vector for *token*, or a zero vector if it is unknown.

    Args:
        token: Word to look up in the module-level ``glove_model``.

    Returns:
        The vector ``glove_model[token]``. If that lookup raises
        ``KeyError``, a zero vector of length ``glove_model.vector_size``
        is returned instead, so callers always get a same-length array.
    """
    try:
        return glove_model[token]
    except KeyError:
        # Take the length from the model instead of hard-coding 50, so the
        # fallback still matches if a different embedding size is loaded.
        return np.zeros(glove_model.vector_size)

# Get embeddings for all unique tokens in texts
all_tokens = {token for text in tokenized_texts for token in text}
# Iterate the vocabulary in sorted order: the iteration order of a set of
# strings can change between interpreter runs, which would shuffle the rows
# of glove_vectors (and the plot colours) from one run to the next.
glove_embeddings = {token: get_glove_embedding(token) for token in sorted(all_tokens)}
# Keys and values come from the same dict, so glove_tokens[i] labels
# glove_vectors[i].
glove_tokens = list(glove_embeddings)
glove_vectors = np.array(list(glove_embeddings.values()))

# Represent each sentence by the element-wise mean of its token embeddings
def _mean_token_embedding(tokens):
    """Return the mean, taken across tokens, of the embeddings of *tokens*."""
    token_matrix = np.asarray([get_glove_embedding(word) for word in tokens])
    return token_matrix.mean(axis=0)


# One row per entry of tokenized_texts, in the same order.
sentence_vectors = np.array(list(map(_mean_token_embedding, tokenized_texts)))

# Stack the token vectors first and the sentence vectors after them, and
# build the labels in the same order so that row i matches label i.
all_vectors = np.concatenate([glove_vectors, sentence_vectors])
# NOTE: this rebinds all_tokens to the list of plot labels
# (token strings followed by the original sentences).
all_tokens = [*glove_tokens, *texts]

# Reduce every vector to two dimensions with PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(all_vectors)

# Plot every reduced vector as its own point with its label next to it
plt.figure(figsize=(12, 8))
for (x_coord, y_coord), label in zip(reduced_vectors, all_tokens):
    # One scatter call per point, as before, so each point is drawn separately.
    plt.scatter(x_coord, y_coord)
    # Small offset keeps the text from sitting on top of the marker.
    plt.text(x_coord + 0.01, y_coord + 0.01, label, fontsize=9)

plt.title('Embeddings Visualization using PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()