In [1]:
import numpy as np
import os

In [2]:
def load_glove_model(glove_file_path):
    """
    Loads GloVe word vectors from a specified file.

    Every non-blank line is expected to look like ``word v1 v2 ... vd``
    (whitespace separated). The first parsable line fixes the vector dimension.
    Blank lines are ignored. Lines with non-numeric components, with a different
    dimension, or repeating a word that was already loaded are skipped with a
    warning, so the returned maps are exact inverses of each other and each
    matrix row belongs to exactly one word.

    Args:
        glove_file_path (str): Path to the GloVe file (e.g., "glove.6B.50d.txt").

    Returns:
        tuple: (
            W_embedding_norm (np.ndarray): Normalized word embedding matrix of
                shape (vocab_size, vector_dim); rows are in file order.
            word_to_idx_map (dict): Mapping from word to its index in the embedding matrix.
            idx_to_word_map (dict): Mapping from index to word.
            vector_dim (int): Dimension of the word vectors.
        )
    Raises:
        FileNotFoundError: If the GloVe file is not found.
        ValueError: If the first parsable line carries no vector components, or
            if no valid word vector could be parsed at all.
    """
    if not os.path.exists(glove_file_path):
        raise FileNotFoundError(f"GloVe file not found: {glove_file_path}")

    words_list = []
    vectors_list = []      # vectors_list[i] is the raw vector of words_list[i]
    seen_words = set()     # O(1) duplicate detection
    expected_dim = None

    print(f"Loading GloVe model from {glove_file_path}...")
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, start=1):
            parts = line.strip().split()
            if not parts:
                # Blank line (commonly a trailing newline): nothing to parse.
                continue

            word = parts[0]
            # Only the float conversion is guarded, so that the deliberate
            # ValueError raised below is not swallowed by this handler.
            try:
                vector = np.array([float(val) for val in parts[1:]])
            except ValueError:
                print(f"Warning: Skipping line {line_num} due to non-numeric vector component for word '{word}'.")
                continue

            if expected_dim is None:  # First valid vector sets the expected dimension
                if len(vector) == 0:  # Should not happen with valid GloVe files
                    print(f"Warning: Word '{word}' in line {line_num} has zero dimension. Skipping file.")
                    raise ValueError("Vector dimension is zero.")
                expected_dim = len(vector)

            if len(vector) != expected_dim:
                print(f"Warning: Skipping line {line_num} due to inconsistent vector dimension. Word: '{word}', Dim: {len(vector)}, Expected: {expected_dim}")
                continue

            if word in seen_words:
                # Keep the first occurrence so row order keeps following file order.
                print(f"Warning: Skipping line {line_num}: duplicate entry for word '{word}' (keeping first occurrence).")
                continue

            seen_words.add(word)
            words_list.append(word)
            vectors_list.append(vector)

    if not words_list:
        raise ValueError("GloVe file is empty or no valid word vectors could be parsed.")

    vocab_size = len(words_list)
    vector_dim = expected_dim

    word_to_idx_map = {word: idx for idx, word in enumerate(words_list)}
    idx_to_word_map = {idx: word for idx, word in enumerate(words_list)}

    # All vectors share the same length, so this is a (vocab_size, vector_dim) matrix.
    W_embedding = np.array(vectors_list, dtype=float)

    # Normalize rows to unit length so a dot product equals cosine similarity.
    norms = np.linalg.norm(W_embedding, axis=1, keepdims=True)
    # All-zero rows would divide by zero; dividing them by a tiny constant
    # instead leaves them as zero vectors.
    safe_norms = np.where(norms == 0, 1e-10, norms)
    W_embedding_norm = W_embedding / safe_norms

    print(f"Loaded {vocab_size} word vectors with dimension {vector_dim}.")
    return W_embedding_norm, word_to_idx_map, idx_to_word_map, vector_dim

In [3]:
def find_closest_words_to_vector(query_vector, n_words, W_embedding_norm, idx_to_word_map):
    """
    Rank vocabulary words by cosine similarity to an arbitrary query vector.

    Args:
        query_vector (np.ndarray): Vector to compare against; it is scaled to
                                   unit length internally.
        n_words (int): How many of the best matches to return.
        W_embedding_norm (np.ndarray): Embedding matrix whose rows are already
                                      unit-length word vectors.
        idx_to_word_map (dict): Row index -> word lookup for that matrix.

    Returns:
        list: Up to n_words tuples (word, cosine_similarity_score), best match
              first. An empty list is returned when n_words <= 0 or when the
              query vector's norm is (near) zero.
    """
    if n_words <= 0:
        print("Warning: n_words must be positive.")
        return []

    magnitude = np.linalg.norm(query_vector)
    if magnitude < 1e-9:  # direction is undefined for an (effectively) zero vector
        print("Warning: Query vector has near-zero norm. Cannot compute meaningful similarity.")
        return []

    # Rows and query are unit length, so the matrix-vector product is the
    # cosine similarity of every word with the query: shape (vocab_size,).
    unit_query = query_vector / magnitude
    scores = np.dot(W_embedding_norm, unit_query)

    # argsort is ascending; negating the scores puts the largest first.
    best_rows = np.argsort(-scores)[:n_words]

    return [(idx_to_word_map[row], scores[row]) for row in best_rows]

In [4]:
# NOTE(review): this cell executes before W_normalized_embeddings is assigned
# (the assignment happens in the GloVe loading cell further down), so on a fresh
# kernel it fails with the NameError recorded in this cell's output. Run the
# loading cell first, or move this shape check after it.
W_normalized_embeddings.shape

NameError: name 'W_normalized_embeddings' is not defined

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Helper for NLTK resources, placed here for use by select_glove_subset
# Set to True once the resources have been found or downloaded, so later calls return immediately.
_NLTK_RESOURCES_DOWNLOADED = False
def _ensure_nltk_resources():
    """
    Checks for NLTK and required resources, attempts download if missing.

    The positive result is cached in the module-level flag
    ``_NLTK_RESOURCES_DOWNLOADED``; a negative result is not cached, so a later
    call retries.

    Returns:
        bool: True if NLTK is importable and both resources were found or
              downloaded; False otherwise (a warning is printed and callers
              are expected to skip POS filtering).
    """
    global _NLTK_RESOURCES_DOWNLOADED
    if _NLTK_RESOURCES_DOWNLOADED:
        return True

    try:
        import nltk
    except ImportError:
        print("Warning: NLTK library not found. POS filtering will be skipped.")
        print("Please install it: pip install nltk")
        return False

    try:
        nltk.data.find('taggers/averaged_perceptron_tagger')
        nltk.data.find('tokenizers/punkt')
        _NLTK_RESOURCES_DOWNLOADED = True
        return True
    except LookupError:
        print("Warning: NLTK resources 'averaged_perceptron_tagger' or 'punkt' not found for POS filtering.")
        print("Attempting to download...")

    manual_hint = ("You may need to download them manually: import nltk; "
                   "nltk.download('averaged_perceptron_tagger'); nltk.download('punkt')")
    try:
        # nltk.download() signals failure through its boolean return value
        # instead of raising, so every result must be checked explicitly.
        failed = [name for name in ('averaged_perceptron_tagger', 'punkt')
                  if not nltk.download(name, quiet=True)]
    except Exception as e:
        print(f"Warning: Failed to download NLTK resources: {e}. POS filtering will be skipped.")
        print(manual_hint)
        return False

    if failed:
        print(f"Warning: Failed to download NLTK resources: {', '.join(failed)}. POS filtering will be skipped.")
        print(manual_hint)
        return False

    # NOTE(review): newer NLTK releases may look for differently named resources
    # when tagging; confirm against the installed NLTK version.
    _NLTK_RESOURCES_DOWNLOADED = True
    print("NLTK resources downloaded successfully.")
    return True

In [None]:
def select_glove_subset(W_embedding_norm_full, idx_to_word_map_full, n1, n2, pos_filter=None):
    """
    Down-selects words from GloVe embeddings by frequency rank and optionally by POS.

    The rank range is clamped to the vocabulary: n1 below 0 is treated as 0 and
    n2 above the vocabulary size is treated as the vocabulary size.

    Args:
        W_embedding_norm_full (np.ndarray): Full normalized GloVe embedding matrix
            of shape (vocab_size, vector_dim).
        idx_to_word_map_full (dict): Full mapping from index to word.
        n1 (int): Starting rank (0-indexed, inclusive).
        n2 (int): Ending rank (0-indexed, exclusive).
        pos_filter (str, optional): None, "noun", or "verb". Defaults to None.
            Any other value prints a warning and the POS filter is skipped. The
            filter is also skipped when the NLTK resources are unavailable.

    Returns:
        tuple: (
            W_subset (np.ndarray): Embeddings for the selected subset, as a copy
                (never a view of the full matrix).
            subset_word_to_idx_map (dict): Word to new index map for the subset.
            subset_idx_to_word_map (dict): New index to word map for the subset.
        )
        If the rank range or the POS filter leaves nothing, W_subset is an empty
        array of shape (0, vector_dim) and both maps are empty dicts.
    """
    vocab_size_full = W_embedding_norm_full.shape[0]
    actual_n1 = max(0, n1)
    actual_n2 = min(vocab_size_full, n2)

    if actual_n1 >= actual_n2:
        print(f"Warning: Invalid rank range n1={n1}, n2={n2}. Results in empty selection.")
        return np.array([]).reshape(0, W_embedding_norm_full.shape[1]), {}, {}

    candidate_words = [idx_to_word_map_full[i] for i in range(actual_n1, actual_n2)]
    # Explicit copy so callers can modify the subset without touching the full matrix.
    candidate_vectors = W_embedding_norm_full[actual_n1:actual_n2, :].copy()

    final_selected_words = candidate_words
    W_subset = candidate_vectors
    # Tracks whether the POS filter actually ran, so the summary line below
    # does not claim a filter that was rejected or skipped.
    pos_filter_applied = False

    if pos_filter and candidate_words:  # Only attempt POS if filter requested and candidates exist
        if pos_filter not in ("noun", "verb"):
            print(f"Warning: Invalid pos_filter '{pos_filter}'. Must be 'noun' or 'verb'. Skipping POS filter.")
        elif _ensure_nltk_resources():
            import nltk  # Import here after resource check
            tag_prefix = "NN" if pos_filter == "noun" else "VB"
            tagged_words = nltk.pos_tag(candidate_words)

            # Boolean row mask: True where the tag matches the requested POS.
            keep_mask = np.array([tag.startswith(tag_prefix) for _, tag in tagged_words], dtype=bool)
            final_selected_words = [word for word, keep in zip(candidate_words, keep_mask) if keep]
            W_subset = candidate_vectors[keep_mask]  # shape (0, vector_dim) when nothing matches
            pos_filter_applied = True

            if not final_selected_words:
                print(f"Warning: POS filter '{pos_filter}' resulted in 0 words from the rank selection.")

    subset_word_to_idx_map = {word: i for i, word in enumerate(final_selected_words)}
    subset_idx_to_word_map = {i: word for i, word in enumerate(final_selected_words)}

    print(f"Selected {len(final_selected_words)} words from ranks {actual_n1}-{actual_n2-1}" +
          (f" with POS filter '{pos_filter}'." if pos_filter_applied else "."))
    return W_subset, subset_word_to_idx_map, subset_idx_to_word_map

In [None]:
# Path to your downloaded GloVe file
GLOVE_FILE_PATH = "glove.6B/glove.6B.50d.txt" 
# Ensure this file exists at the specified path or change the path.
# You can download GloVe vectors from: https://nlp.stanford.edu/projects/glove/

# Fixed seed so the "random vector" examples print the same neighbours on every
# Restart & Run All; change it to explore other random directions.
RANDOM_SEED = 0
rng = np.random.default_rng(RANDOM_SEED)

try:
    # Load the GloVe model (this might take a few seconds to a minute)
    W_normalized_embeddings, word_to_index, index_to_word, embedding_dim = load_glove_model(GLOVE_FILE_PATH)
    
    # --- Example 1: Find closest words to a purely random vector ---
    print("\n--- Example 1: Closest words to a purely random vector ---")
    # Uniform samples in [0, 1) with the same dimension as the GloVe embeddings
    random_vec = rng.random(embedding_dim)
    num_results = 10
    
    print(f"Finding {num_results} closest words to a random vector (dim={embedding_dim})...")
    closest_to_random = find_closest_words_to_vector(random_vec, num_results, W_normalized_embeddings, index_to_word)
    
    if closest_to_random:
        print(f"\nTop {num_results} closest words to the random vector:")
        for word, score in closest_to_random:
            print(f"- {word}: {score:.4f}")
    else:
        print("Could not find closest words for the random vector.")

    # --- Example 2: Find closest words to an average of known words ---
    # This demonstrates using a more structured, though still arbitrary, vector.
    print("\n--- Example 2: Closest words to the average of 'king' and 'royal' ---")
    target_words = ['king', 'royal']
    
    if all(w in word_to_index for w in target_words):
        word_vectors = [W_normalized_embeddings[word_to_index[w]] for w in target_words]
        average_vector = np.mean(word_vectors, axis=0)
        
        num_results_avg = 5
        print(f"Finding {num_results_avg} closest words to the average vector of {target_words}...")
        closest_to_average = find_closest_words_to_vector(average_vector, num_results_avg, W_normalized_embeddings, index_to_word)

        if closest_to_average:
            print(f"\nTop {num_results_avg} closest words to the average of {target_words}:")
            for word, score in closest_to_average:
                # An input word is hidden only when it is an essentially exact
                # match (score >= 0.999); every other result, including input
                # words with a lower score, is printed.
                if word not in target_words or score < 0.999:
                        print(f"- {word}: {score:.4f}")
        else:
            print(f"Could not find closest words for the average of {target_words}.")
    else:
        print(f"One or more words from {target_words} not found in vocabulary for Example 2.")

    # --- Example 3: Using the new select_glove_subset function ---
    print("\n--- Example 3: Select subset of GloVe words by rank and POS ---")
    # Select words ranked between 1000 and 1100 (exclusive of 1100), and filter for nouns
    rank_n1, rank_n2 = 1000, 1100 # 0-indexed ranks
    
    W_noun_subset, noun_subset_to_idx, idx_to_noun_subset = select_glove_subset(
        W_normalized_embeddings, index_to_word, rank_n1, rank_n2, pos_filter="noun"
    )

    if W_noun_subset.shape[0] > 0:
        print(f"Successfully created a subset of {W_noun_subset.shape[0]} nouns.")
        # Now you can use this subset with find_closest_words_to_vector
        # Note: the query vector should still match the original embedding dimension
        if embedding_dim > 0: # Ensure embedding_dim is valid
            random_vec_for_subset = rng.random(embedding_dim)
            num_results_subset = 3
            closest_in_noun_subset = find_closest_words_to_vector(
                random_vec_for_subset, num_results_subset, W_noun_subset, idx_to_noun_subset
            )
            print(f"\nTop {num_results_subset} closest words in the noun subset to a random vector:")
            for word, score in closest_in_noun_subset:
                print(f"- {word}: {score:.4f}")
    else:
        print(f"The selection of nouns from ranks {rank_n1}-{rank_n2-1} resulted in an empty set.")

except FileNotFoundError as e:
    print(f"\nERROR: {e}")
    print("Please ensure the GloVe file path is correct and the file exists.")
    print(f"Expected at: {os.path.abspath(GLOVE_FILE_PATH)}")
except ValueError as e:
    print(f"\nERROR during GloVe loading or processing: {e}")
except Exception as e:
    # Deliberate best-effort for the demo: report anything unexpected instead of
    # aborting the notebook run. Note that this hides the traceback.
    print(f"\nAn unexpected error occurred: {e}")

Loading GloVe model from glove.6B/glove.6B.50d.txt...
Loaded 400000 word vectors with dimension 50.

--- Example 1: Closest words to a purely random vector ---
Finding 10 closest words to a random vector (dim=50)...

Top 10 closest words to the random vector:
- mesnel: 0.5981
- kitchenette: 0.5100
- kareena: 0.5094
- treats: 0.4995
- ljubijankic: 0.4976
- meals: 0.4951
- birthweight: 0.4896
- anoushka: 0.4886
- imron: 0.4868
- tasty: 0.4866

--- Example 2: Closest words to the average of 'king' and 'royal' ---
Finding 5 closest words to the average vector of ['king', 'royal']...

Top 5 closest words to the average of ['king', 'royal']:
- king: 0.9055
- royal: 0.9055
- queen: 0.8439
- prince: 0.8210
- imperial: 0.8151

--- Example 3: Select subset of GloVe words by rank and POS ---
Attempting to download...
NLTK resources downloaded successfully.
Selected 47 words from ranks 1000-1099 with POS filter 'noun'.
Successfully created a subset of 47 nouns.

Top 3 closest words in the noun sub

In [None]:
def cartesian_to_spherical_ndim(cartesian_coords):
    """
    Converts n-dimensional Cartesian coordinates to n-dimensional spherical coordinates.

    The spherical coordinates are (r, phi_1, phi_2, ..., phi_{n-1}), where:
    - r is the radial distance.
    - phi_1, ..., phi_{n-2} are inclination angles in [0, pi].
    - phi_{n-1} is the azimuthal angle, computed with arctan2 (range [-pi, pi]).

    The transformation maps (x_0, x_1, ..., x_{n-1}) to (r, phi_1, ..., phi_{n-1}) such that:
    x_0 = r * cos(phi_1)
    x_1 = r * sin(phi_1) * cos(phi_2)
    ...
    x_{n-2} = r * sin(phi_1) * ... * sin(phi_{n-2}) * cos(phi_{n-1})
    x_{n-1} = r * sin(phi_1) * ... * sin(phi_{n-2}) * sin(phi_{n-1})

    Each inclination angle is phi_{i+1} = arccos(x_i / ||(x_i, ..., x_{n-1})||).
    The tail norms are accumulated from the last coordinate backwards rather than
    by subtracting squares from r**2, so small trailing components are not lost
    to floating-point cancellation.

    Args:
        cartesian_coords (array-like): A 1D array or list of n Cartesian coordinates
                                       (x_0, x_1, ..., x_{n-1}).

    Returns:
        np.ndarray: A 1D numpy array of n spherical coordinates
                    (r, phi_1, ..., phi_{n-1}).
                    Returns an empty array if input is empty.
                    If input is 1D with one element, returns [|x_0|].
                    If input is (effectively) the origin, angles are set to 0.
                    An inclination angle whose tail norm is below 1e-12 is set to 0.
    """
    coords = np.asarray(cartesian_coords, dtype=float)
    n = coords.shape[0]

    if n == 0:
        return np.array([])

    # Radial distance
    r = np.linalg.norm(coords)

    spherical_coords = np.zeros(n)
    spherical_coords[0] = r

    if n == 1:
        return spherical_coords  # Only r, which is |x_0|

    if r < 1e-12:
        # Effectively at the origin: all angles are conventionally 0, which is
        # what np.zeros already provides.
        return spherical_coords

    epsilon = 1e-12  # Tail norms below this are treated as zero (angle := 0)

    # tail_norms[i] = sqrt(x_i^2 + ... + x_{n-1}^2), built by a cumulative sum
    # over the reversed squares (small terms are added first).
    tail_norms = np.sqrt(np.cumsum(coords[::-1] ** 2)[::-1])

    # Inclination angles phi_1 .. phi_{n-2} use x_0 .. x_{n-3}; empty when n == 2.
    leading = coords[:n - 2]
    denominators = tail_norms[:n - 2]
    well_defined = denominators >= epsilon

    inclinations = np.zeros(n - 2)
    # Clip guards arccos against ratios that drift just outside [-1, 1].
    ratios = np.clip(leading[well_defined] / denominators[well_defined], -1.0, 1.0)
    inclinations[well_defined] = np.arccos(ratios)
    spherical_coords[1:n - 1] = inclinations

    # Azimuth phi_{n-1}: arctan2(y, x) with y = x_{n-1} and x = x_{n-2};
    # arctan2(0, 0) is 0, matching the zero-angle convention above.
    spherical_coords[n - 1] = np.arctan2(coords[n - 1], coords[n - 2])

    return spherical_coords

In [None]:
def process_and_save_spherical_embeddings(glove_file_path,
                                          nouns_output_file,
                                          verbs_output_file,
                                          start_rank=1000,
                                          target_count=8192):
    """
    Exports angle-only spherical embeddings for nouns and for verbs.

    The GloVe model is loaded once. For each part of speech, words from
    start_rank to the end of the vocabulary are POS-filtered, then walked in
    order until target_count words have been written. A word is written only
    if it is longer than 2 characters; verbs ending in "ing" are additionally
    rejected as a rough way to favour infinitive-like forms. Each output line
    is the word followed by its spherical angles (radius omitted), formatted
    with 8 decimals and separated by single spaces.

    Args:
        glove_file_path (str): Path to the GloVe text file.
        nouns_output_file (str): Destination text file for nouns.
        verbs_output_file (str): Destination text file for verbs.
        start_rank (int): First rank (0-indexed) considered. Defaults to 1000.
        target_count (int): Maximum number of words written per file.

    Returns:
        None. Progress and summary notes are printed.
    """
    print("Starting processing and saving spherical embeddings...")

    # Load the full model once; both POS passes reuse it.
    embeddings, _, index_to_word, dim = load_glove_model(glove_file_path)
    total_words = embeddings.shape[0]

    if dim == 0:
        print("Error: Embedding dimension is 0. Cannot proceed.")
        return

    for pos_name, out_path in (("noun", nouns_output_file), ("verb", verbs_output_file)):
        is_verb = pos_name == "verb"
        print(f"\nProcessing {pos_name}s...")

        # Everything from start_rank onward, restricted to this part of speech.
        subset_vectors, _, subset_words = select_glove_subset(
            embeddings, index_to_word, start_rank, total_words, pos_filter=pos_name
        )
        n_candidates = subset_vectors.shape[0]

        if n_candidates == 0:
            print(f"No {pos_name}s found after POS filtering from rank {start_rank}. Skipping file generation for {out_path}")
            continue

        criteria = "longer than 2 characters"
        if is_verb:
            criteria = criteria + " and not ending in '-ing' (heuristic for infinitive form)"

        print(f"Found {n_candidates} {pos_name}s after POS filtering. "
              f"Will process up to {target_count} words meeting criteria ({criteria}), excluding radius.")

        n_written = 0
        with open(out_path, 'w', encoding='utf-8') as out:
            for row in range(n_candidates):
                if n_written >= target_count:
                    break

                word = subset_words[row]

                if len(word) <= 2:
                    continue
                # Crude gerund/participle filter; it also drops base forms such
                # as "sing" or "bring".
                if is_verb and word.endswith("ing"):
                    continue

                spherical = cartesian_to_spherical_ndim(subset_vectors[row, :])
                if len(spherical) <= 1:
                    continue  # no angles to write

                # spherical[0] is the radius; only the angles are written.
                angle_text = "".join(f" {angle:.8f}" for angle in spherical[1:])
                out.write(word + angle_text + "\n")
                n_written += 1

        if n_written > 0:
            saved_desc = "longer than 2 chars"
            if is_verb:
                saved_desc = saved_desc + ", non '-ing' form"
            print(f"Saved {n_written} {pos_name}s ({saved_desc}, excluding radius) to {out_path}")

            fell_short = n_written < target_count
            if fell_short and n_candidates >= target_count:
                note_desc = "length > 2 chars criterion"
                if is_verb:
                    note_desc = note_desc + " and non '-ing' verb form criterion"
                print(f"  (Note: Target was {target_count}, but only {n_written} words met the filtering criteria ({note_desc}) "
                      f"from the {n_candidates} available candidates after POS filtering.)")
            elif n_candidates < target_count and n_written < n_candidates:
                note_desc = "length and (if applicable) verb form filtering"
                print(f"  (Note: Initial POS selection had {n_candidates} words. "
                      f"After {note_desc}, {n_written} words were saved.)")
        elif n_candidates > 0:
            details = "longer than 2 characters"
            if is_verb:
                details = details + " and not ending in '-ing'"
            print(f"No {pos_name}s meeting criteria ({details}) found for processing from the selection of {n_candidates} words. "
                  f"File {out_path} is empty or not created with content.")

    print("\nProcessing complete.")