# Part 1: Word Embeddings with Word Co-occurrence Matrix

In [1]:
!pip install torchtext



In [23]:
import nltk
from nltk.corpus import brown
nltk.download('brown')

corpus = ' '.join(brown.words())  # Join all words into a single text

[nltk_data] Downloading package brown to /Users/User2/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [24]:
def get_word_frequencies(text):
    # Convert the entire text to lowercase to normalize case
    #text = text.lower()
    
    # Split the text into words
    words = text.split()
    
    # Initialize an empty dictionary for word frequencies
    word_frequencies = {}
    
    # Count each word
    for word in words:
        if word in word_frequencies:
            word_frequencies[word] += 1
        else:
            word_frequencies[word] = 1
    
    return word_frequencies

# Example usage
word_frequencies = get_word_frequencies(corpus)

# Print word frequencies
for word, freq in word_frequencies.items():
    print(f"'{word}': {freq}")

'The': 7258
'Fulton': 17
'County': 85
'Grand': 18
'Jury': 4
'said': 1943
'Friday': 60
'an': 3542
'investigation': 43
'of': 36080
'Atlanta's': 4
'recent': 167
'primary': 93
'election': 72
'produced': 90
'``': 8837
'no': 1781
'evidence': 201
'''': 8789
'that': 10237
'any': 1301
'irregularities': 8
'took': 425
'place': 528
'.': 49346
'jury': 63
'further': 194
'in': 19536
'term-end': 1
'presentments': 1
'the': 62713
'City': 134
'Executive': 9
'Committee': 88
',': 58334
'which': 3540
'had': 5102
'over-all': 35
'charge': 120
'deserves': 16
'praise': 17
'and': 27915
'thanks': 27
'Atlanta': 35
'for': 8841
'manner': 124
'was': 9777
'conducted': 55
'September-October': 1
'term': 79
'been': 2470
'charged': 57
'by': 5103
'Superior': 16
'Court': 110
'Judge': 39
'Durwood': 1
'Pye': 1
'to': 25732
'investigate': 11
'reports': 78
'possible': 373
'hard-fought': 2
'won': 68
'Mayor-nominate': 1
'Ivan': 4
'Allen': 20
'Jr.': 75
'Only': 102
'a': 21881
'relative': 44
'handful': 13
'such': 1192
'received': 163

In [25]:
def create_vocab(word_freq, V=20000):
    # Sort the word frequencies dictionary by frequency in descending order
    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    
    # Take the top V most frequent words
    most_common_words = sorted_words[:V]
    
    # Create a vocabulary dictionary mapping words to unique indices
    vocab = {word: i for i, (word, _) in enumerate(most_common_words)}
    return vocab

# Example usage
vocab = create_vocab(word_frequencies)

In [26]:
print(vocab)



In [27]:
def windowizer(text):
    words = text.split()
    windows = [f"{words[i]} {words[i + 1]}" for i in range(len(words) - 1)]
    return windows

windows = windowizer(corpus)

In [28]:
print(windows[:5])

['The Fulton', 'Fulton County', 'County Grand', 'Grand Jury', 'Jury said']


In [29]:
import numpy as np

def build_co_occurrence_matrix(vocab, windows):
    vocab_size = len(vocab)
    matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    for window in windows:
        word1, word2 = window.split()
        if word1 in vocab and word2 in vocab:
            idx1, idx2 = vocab[word1], vocab[word2]
            matrix[idx1, idx2] += 1
            matrix[idx2, idx1] += 1  # Symmetric matrix
    return matrix

co_occurrence_matrix = build_co_occurrence_matrix(vocab, windows)

In [30]:
print(co_occurrence_matrix)

[[   0 3758    2 ...    4    0    1]
 [3758  146    1 ...    0    0    0]
 [   2    1    0 ...    1    2    0]
 ...
 [   4    0    1 ...    0    0    0]
 [   0    0    2 ...    0    0    0]
 [   1    0    0 ...    0    0    0]]


In [31]:
from scipy.spatial.distance import cosine

def cosine_similarity(vec1, vec2):
    # Calculate cosine similarity manually
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

def get_most_similar_words(matrix, vocab, test_word, top_n=5):
    if test_word not in vocab:
        return []
    
    word_idx = vocab[test_word]
    word_vector = matrix[word_idx]
    
    # Calculate similarities directly with cosine similarity
    similarities = [
        (word, cosine_similarity(word_vector, matrix[idx]))
        for word, idx in vocab.items() if word != test_word
    ]
    
    # Sort by similarity in descending order and get the top_n
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

# Example usage
test_word = 'government'
similar_words = get_most_similar_words(co_occurrence_matrix, vocab, test_word)
print(similar_words)

test_word2 = 'investigation'
similar_words2 = get_most_similar_words(co_occurrence_matrix, vocab, test_word2)
print(similar_words2)

test_word3 = 'county'
similar_words3 = get_most_similar_words(co_occurrence_matrix, vocab, test_word3)
print(similar_words3)

test_word4 = 'jury'
similar_words4 = get_most_similar_words(co_occurrence_matrix, vocab, test_word4)
print(similar_words4)

[('water', 0.9192449021679032), ('light', 0.9170041849389351), ('land', 0.9129898977324399), ('direction', 0.9106910402994063), ('law', 0.910225066628261)]
[('interpretation', 0.8759447578528412), ('destruction', 0.8728507057970645), ('application', 0.8719779764954689), ('expression', 0.8693387231611925), ('absurdity', 0.8636869876790825)]
[('French', 0.8375300809329476), ('city', 0.8356498827161863), ('X-region', 0.8350790119161771), ('Bosphorus', 0.8293899053581143), ('woods', 0.8273224348236593)]
[('congregation', 0.8843715766701069), ('historian', 0.8801438321272178), ('police', 0.8777896985059777), ('road', 0.8756309518966021), ('former', 0.8731587864462476)]


# Part 2: Applications of Dense Representations of Words

## Task A - Fine-grained Supermarket Product Segmentation

In [32]:
#Data Loading
import pandas as pd

products_df = pd.read_csv('products.csv')
aisles_df = pd.read_csv('aisles.csv')

In [33]:
#Preprocessing
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def cleanup_text(text):
    # Clean text here if necessary
    return text.lower()

def process_product_name(name):
    words = cleanup_text(name).split()
    return [word for word in words if word not in stop_words]

products_df['processed_name'] = products_df['product_name'].apply(process_product_name)

[nltk_data] Downloading package stopwords to /Users/User2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
#Embedding and Clustering
import torchtext
from torchtext.vocab import FastText

fasttext = FastText(language='en')

def get_mean_embedding(words):
    embeddings = [fasttext[word] for word in words if word in fasttext.stoi]
    return sum(embeddings) / len(embeddings) if embeddings else None

products_df['embedding'] = products_df['processed_name'].apply(get_mean_embedding)
embeddings = products_df['embedding'].dropna().tolist()

In [35]:
#K-Means Clustering
import numpy as np
from sklearn.cluster import KMeans

# Ensure all embeddings are numpy arrays and have the same shape
products_df['embedding'] = products_df['embedding'].apply(lambda x: np.array(x) if x is not None else None)

# Drop any products with missing or inconsistent embeddings
products_df = products_df.dropna(subset=['embedding'])

# Check that all embeddings have the same shape
embedding_length = len(products_df['embedding'].iloc[0])
products_df = products_df[products_df['embedding'].apply(lambda x: len(x) == embedding_length)]

# Convert the list of embeddings to a 2D numpy array
embeddings_array = np.vstack(products_df['embedding'].values)

# Run K-means clustering
n_clusters = 5  # Adjust based on experimentation
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
products_df['cluster'] = kmeans.fit_predict(embeddings_array)

# Display cluster members
for cluster in range(n_clusters):
    cluster_members = products_df[products_df['cluster'] == cluster]['product_name']
    print(f"Cluster {cluster}: {cluster_members.tolist()}")

  super()._check_params_vs_input(X, default_n_init=10)


Cluster 0: ['Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce', 'Green Chile Anytime Sauce', "Cut Russet Potatoes Steam N' Mash", 'Rendered Duck Fat', 'Pizza for One Suprema  Frozen Pizza', 'Pomegranate Cranberry & Aloe Vera Enrich Drink', 'Salted Caramel Lean Protein & Fiber Bar', 'Fancy Feast Trout Feast Flaked Wet Cat Food', 'Fresh Cut Golden Sweet No Salt Added Whole Kernel Corn', 'White Pearl Onions', 'European Cucumber', 'School Glue, Washable, No Run', 'Pumpkin Muffin Mix', 'Mirabelle Brut Rose', 'Medium Taqueria Style Chipotle Salsa', "Autumn Vegetable & Turkey Dinner with Lil' Bits Purees Dinner", 'European Style Spring Mix', "Artisan Chick'n & Apple Sausage", 'Wild Albacore Tuna No Salt Added', 'French  Tarragon Wine Vinegar', 'Lamb Shank', 'Classics Earl Grey Tea', 'Meat In The Middle Large Rawhide Chews', 'Organic Blueberry Blitz Fruit & Veggie Smoothie Mashups', '2% Yellow American Cheese', 'Bread, Healthy Whole Grain', 'Sprouted Kale Cracker', 'Organic Yu

## Task B - Genre Classification with Movie Titles

In [36]:
#Genre and Title Embeddings
genres = ['action', 'adventure', 'comedy', 'drama', 'fantasy', 'horror', 'romance', 'thriller']
movies = ['The Hangover', 'Shutter Island', 'Fight Club', 'Jumanji', 'Narcos', 'The Matrix',
          'Rush Hour', 'The Mummy', 'Iron Man', 'Silence of the Lambs', 'Batman Begins', 
          'Spider Man', 'The Hobbit', 'Troy', 'Jurassic Park', 'Scary Movie', 
          'Mission Impossible', 'Ted', 'Eat Pray Love', 'The Notebook', 'Love Actually', 
          'The Terminal', 'Crazy Stupid Love', 'Twilight', 'The Martian', 'Pursuit of Happyness']

genre_embeddings = {genre: fasttext[genre] for genre in genres}
movie_embeddings = {movie: get_mean_embedding(process_product_name(movie)) for movie in movies}

In [37]:
#Genre Assignment Using Cosine Similarity
def cosine_similarity(vec1, vec2):
    return 1 - cosine(vec1, vec2)

def assign_genre(movie_embedding, genre_embeddings):
    similarities = {genre: cosine_similarity(movie_embedding, genre_vec) for genre, genre_vec in genre_embeddings.items()}
    return max(similarities, key=similarities.get)

movie_genres = {movie: assign_genre(embedding, genre_embeddings) for movie, embedding in movie_embeddings.items()}
print(movie_genres)

{'The Hangover': 'comedy', 'Shutter Island': 'adventure', 'Fight Club': 'adventure', 'Jumanji': 'adventure', 'Narcos': 'thriller', 'The Matrix': 'action', 'Rush Hour': 'drama', 'The Mummy': 'horror', 'Iron Man': 'adventure', 'Silence of the Lambs': 'thriller', 'Batman Begins': 'adventure', 'Spider Man': 'adventure', 'The Hobbit': 'fantasy', 'Troy': 'romance', 'Jurassic Park': 'adventure', 'Scary Movie': 'horror', 'Mission Impossible': 'adventure', 'Ted': 'comedy', 'Eat Pray Love': 'romance', 'The Notebook': 'fantasy', 'Love Actually': 'romance', 'The Terminal': 'fantasy', 'Crazy Stupid Love': 'comedy', 'Twilight': 'fantasy', 'The Martian': 'fantasy', 'Pursuit of Happyness': 'adventure'}
