# Text Representation

1. Note: for this part, you are free to use all or a subset of the metadata file,
e.g., only using items that are in the train/test splits or using the first n
characters of the text content. Please justify your choice.

In [36]:
import pandas as pd
import numpy as np
# Load the pickled train/test interaction frames and the raw item metadata.
df_train = pd.read_pickle("train_dataframe.pkl")
df_test = pd.read_pickle("test_dataframe.pkl")
meta_file = 'metadata.tsv'
df_meta = pd.read_csv(meta_file, sep='\t', low_memory=False)
print("Total items in metadata: {}".format(len(np.unique(df_meta['item_id']))))

# Restrict the metadata to items that occur in the train or the test split.
train_items = set(df_train['item_id'].unique())
test_items = set(df_test['item_id'].unique())
all_items = train_items | test_items

keep_mask = df_meta['item_id'].isin(all_items)
df_meta = df_meta[keep_mask]
print("Filtered items: {}".format(len(np.unique(df_meta['item_id']))))

# Persist the filtered metadata as a tab-separated file.
df_meta.to_csv('filtered_metadata.tsv', sep='\t', index=False)


Total items in metadata: 23984
Filtered items: 518


2. Select the column description from the metadata file and apply appropriate
preprocessing to clean up the data, for example: tokenization,
transformation to lowercase, stopword removal, or stemming. Motivate
your preprocessing choices and report the vocabulary size before and after
preprocessing. There are many libraries you can use, including but not
limited to, NLTK, spaCy or CoreNLP (requires Java).

In [37]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# from nltk.stem import PorterStemmer
import string

# Fetch the NLTK resources used for tokenization and stopword filtering.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Keep only rows that have a description. Rows whose description is an
# empty or whitespace-only string are NOT removed by dropna and stay in.
df_meta = df_meta.dropna(subset=['description'])
print("Filtered items: {}".format(len(np.unique(df_meta['item_id']))))

# Baseline vocabulary: distinct tokens of the lowercased raw descriptions
# (case variants of the same word are therefore already counted once).
original_vocab = {
    token
    for desc in df_meta['description']
    for token in word_tokenize(desc.lower())
}
print("Vocabulary size before preprocessing: {}".format(len(original_vocab)))

# Pre processing
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Lowercase, strip punctuation, tokenize and drop English stopwords.

    No stemming is applied, so tokens keep their surface form.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` on the lowercased,
        punctuation-free text, excluding NLTK's English stopwords.
    """
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    # The stopword set is cached: the function runs once per row, and
    # rebuilding the set on every call is loop-invariant work.
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(cleaned) if word not in stop_words]


# Tokenize and clean every description; each cell then holds a list of tokens.
df_meta['processed_description'] = df_meta['description'].apply(preprocess_text)

# Vocabulary after preprocessing: distinct tokens over all processed rows.
processed_vocab = {
    token
    for token_list in df_meta['processed_description']
    for token in token_list
}
print("Vocabulary size after preprocessing: {}".format(len(processed_vocab)))

# Persist the frame including the new processed_description column.
df_meta.to_csv('preprocessed_metadata.tsv', sep='\t', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Voiresa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Voiresa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Filtered items: 421
Vocabulary size before preprocessing: 5996
Vocabulary size after preprocessing: 5802


3. Represent each item in the TF-IDF vector space. You can use your
preferred library for text representation, for example, scikit-learn or
gensim.

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Renumber the rows 0..n-1 so that row labels coincide with matrix row positions.
df_meta = df_meta.reset_index(drop=True)

# Join token lists into space-separated strings; values that are not lists
# are passed through unchanged.
df_meta['processed_description'] = df_meta['processed_description'].apply(
    lambda entry: entry if not isinstance(entry, list) else ' '.join(entry)
)

# Fit the TF-IDF model on the processed descriptions, one row per item.
vectorizer = TfidfVectorizer()
corpus = df_meta['processed_description']
tfidf_matrix = vectorizer.fit_transform(corpus)

print("TF-IDF matrix shape: {}".format(tfidf_matrix.shape))


TF-IDF matrix shape: (421, 5743)


4. Represent each item using pretrained word embeddings (e.g., GloVe,
word2vec).

In [39]:
import numpy as np
import gensim.downloader
# Load the pretrained vectors registered as 'word2vec-google-news-300' through
# gensim's downloader API. The name suggests 300-dimensional Google News
# word2vec vectors — TODO confirm. This step may be slow on first run.
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

In [40]:
def get_item_embedding(text, word2vec_vectors):
    """Return the mean vector of all in-vocabulary tokens of ``text``.

    ``text`` is split on whitespace; tokens that are not contained in
    ``word2vec_vectors`` are ignored. When no token is found, ``None`` is
    returned instead of a vector.
    """
    known_vectors = [
        word2vec_vectors[word]
        for word in text.split()
        if word in word2vec_vectors
    ]

    # Nothing to average: every token was out of vocabulary (or text was empty).
    if len(known_vectors) == 0:
        return None

    return np.mean(known_vectors, axis=0)

# Map item_id -> averaged word2vec embedding of its processed description.
# Items for which no token is in the word2vec vocabulary are left out.
item_embeddings = {}
for item_id, description in zip(df_meta["item_id"], df_meta["processed_description"]):
    if isinstance(description, list):
        description = ' '.join(description)  # Join tokens into a string
    # Computed once per item (the embedding was previously computed twice).
    embedding = get_item_embedding(description, word2vec_vectors)
    if embedding is not None:
        item_embeddings[item_id] = embedding
len(item_embeddings)


419

5. Explore the similarity between items within the vector spaces by computing
their cosine similarity. Compare results obtained with TF-IDF and
the word embeddings. Discuss what you find. Note that you can select
a subset of items to better highlight the differences between the two text
representations.

In [41]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows (items) of the TF-IDF matrix.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

print("TF-IDF: Cosine similarity matrix : {}".format(cosine_sim_matrix))

TF-IDF: Cosine similarity matrix : [[1.         0.04214795 0.03344171 ... 0.         0.03967194 0.02087948]
 [0.04214795 1.         0.         ... 0.         0.         0.        ]
 [0.03344171 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.03967194 0.         0.         ... 0.         1.         0.00521881]
 [0.02087948 0.         0.         ... 0.         0.00521881 1.        ]]


In [42]:
# Three example items whose pairwise similarities are inspected.
asin1, asin2, asin3 = 'B0B8M5FJ9W', 'B0002E2G5Q', 'B07HGRFG5J'

# Row label of each item in df_meta; it is used directly as a row position
# into tfidf_matrix below.
index1, index2, index3 = (
    df_meta[df_meta['item_id'] == asin].index[0]
    for asin in (asin1, asin2, asin3)
)

vector1, vector2, vector3 = (
    tfidf_matrix[row] for row in (index1, index2, index3)
)

# cosine_similarity returns a 1x1 matrix for two single-row inputs.
similarity_1_2 = cosine_similarity(vector1, vector2)[0][0]
similarity_1_3 = cosine_similarity(vector1, vector3)[0][0]
similarity_2_3 = cosine_similarity(vector2, vector3)[0][0]

print("TF-IDF")
print("Cosine similarity between {} and {}: {:.6f}".format(asin1, asin2, similarity_1_2))
print("Cosine similarity between {} and {}: {:.6f}".format(asin1, asin3, similarity_1_3))
print("Cosine similarity between {} and {}: {:.6f}".format(asin2, asin3, similarity_2_3))




TF-IDF
Cosine similarity between B0B8M5FJ9W and B0002E2G5Q: 0.000000
Cosine similarity between B0B8M5FJ9W and B07HGRFG5J: 0.033442
Cosine similarity between B0002E2G5Q and B07HGRFG5J: 0.000000


In [43]:
import pandas as pd

# Fix an item order, then stack the embeddings row by row in that order.
item_ids = list(item_embeddings)

embedding_matrix = np.array(list(map(item_embeddings.__getitem__, item_ids)))
similarity_matrix = cosine_similarity(embedding_matrix)

print("Word2Vec Cosine Similarity:", similarity_matrix, sep="\n")


Word2Vec Cosine Similarity:
[[0.9999997  0.62766576 0.5723729  ... 0.52413344 0.7663236  0.74699277]
 [0.62766576 1.0000002  0.48242587 ... 0.46918514 0.5178212  0.5695845 ]
 [0.5723729  0.48242587 1.0000001  ... 0.3600452  0.5467724  0.5513881 ]
 ...
 [0.52413344 0.46918514 0.3600452  ... 1.0000001  0.46827385 0.43967092]
 [0.7663236  0.5178212  0.5467724  ... 0.46827385 1.         0.70941985]
 [0.74699277 0.5695845  0.5513881  ... 0.43967092 0.70941985 1.        ]]


In [44]:
# Look up the word2vec embedding of each example item (None when absent).
embedding1, embedding2, embedding3 = map(
    item_embeddings.get, (asin1, asin2, asin3)
)

def compute_cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors, or None if either is None."""
    if any(vec is None for vec in (vec1, vec2)):
        return None
    # Each vector is wrapped as a one-row matrix; the result is a 1x1 matrix.
    pairwise = cosine_similarity([vec1], [vec2])
    return pairwise[0][0]

# Pairwise similarities of the three example items. A value is None when one
# of the two items has no embedding (see compute_cosine_similarity).
similarity_1_2 = compute_cosine_similarity(embedding1, embedding2)
similarity_1_3 = compute_cosine_similarity(embedding1, embedding3)
similarity_2_3 = compute_cosine_similarity(embedding2, embedding3)

print("Word2Vec")
for left_asin, right_asin, score in (
    (asin1, asin2, similarity_1_2),
    (asin1, asin3, similarity_1_3),
    (asin2, asin3, similarity_2_3),
):
    # Formatting None with ':.6f' raises TypeError, so a missing similarity
    # is reported explicitly instead of crashing the cell.
    shown = "n/a (missing embedding)" if score is None else f"{score:.6f}"
    print(f"Cosine similarity between {left_asin} and {right_asin}: {shown}")

Word2Vec
Cosine similarity between B0B8M5FJ9W and B0002E2G5Q: 0.331785
Cosine similarity between B0B8M5FJ9W and B07HGRFG5J: 0.572373
Cosine similarity between B0002E2G5Q and B07HGRFG5J: 0.419767


6. [Optional] Represent each item rated by the users using the word vectors
from the last layer of BERT. Here, you can directly use the vectors from
the pretrained version of BERT available in Hugging Face. To load the
model, install the Transformers library and PyTorch.

In [45]:
# LOAD TRANSFORMER
import torch
import transformers
# Compare the version numerically. A plain string comparison is lexicographic,
# so e.g. '10.0.0' > '4.0.0' would evaluate to False. Only the leading purely
# numeric components (at most three) are used; suffixes such as 'dev0' or
# 'rc1' are ignored.
installed_version = tuple(
    int(part) for part in transformers.__version__.split('.')[:3] if part.isdigit()
)
assert installed_version > (4, 0, 0), (
    f"transformers > 4.0.0 is required, found {transformers.__version__}"
)

from transformers import BertModel, BertTokenizerFast

# Choose the compute device: the first CUDA GPU when available, else the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"
print("Using device: {}".format(DEVICE))


# Load the pretrained tokenizer and model, then move the model to DEVICE.
modelname = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(modelname)
model = BertModel.from_pretrained(modelname)
model = model.to(DEVICE)

# Report the tokenizer vocabulary size and the model's hidden size.
vocab_size = tokenizer.vocab_size
input_dimension = model.config.hidden_size
print("Vocabulary size of {}. Input dimension: {}.".format(vocab_size, input_dimension))

Using device: cpu
Vocabulary size of 30522. Input dimension: 768.


In [46]:
# Represent products in a vector space

def batch_encoding(sentences, batch_size=32):
    """Encode ``sentences`` in consecutive batches with the global tokenizer/model.

    Each batch is tokenized with padding and truncation to at most 128 tokens,
    moved to ``DEVICE`` and passed through ``model`` without gradient tracking.

    Returns
    -------
    tuple of (list, list)
        The per-batch tokenizer outputs (dicts of tensors on ``DEVICE``) and
        the per-batch ``last_hidden_state`` tensors, in batch order.
    """
    encoded_batches, hidden_batches = [], []

    for offset in range(0, len(sentences), batch_size):
        chunk = sentences[offset:offset + batch_size]
        encoded = tokenizer(chunk, return_tensors='pt', padding=True, truncation=True, max_length=128)
        encoded = dict((name, tensor.to(DEVICE)) for name, tensor in encoded.items())

        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            model_output = model(**encoded)

        encoded_batches.append(encoded)
        hidden_batches.append(model_output.last_hidden_state)

    return encoded_batches, hidden_batches

# Encode the raw (unprocessed) descriptions batch by batch.
encoded_inputs, description_last_hidden_states = batch_encoding(df_meta['description'].tolist())

# Mean-pool each item's token vectors over the positions whose attention
# mask equals 1, i.e. padding positions are excluded from the average.
average_embeddings = []

for batch_inputs, batch_hidden in zip(encoded_inputs, description_last_hidden_states):
    for hidden_states, mask in zip(batch_hidden, batch_inputs['attention_mask']):
        masked_hidden_states = hidden_states[mask == 1]
        if masked_hidden_states.size(0) == 0:
            # No attended position at all: fall back to a zero vector.
            avg_embedding = torch.zeros(model.config.hidden_size).to(DEVICE)
        else:
            avg_embedding = masked_hidden_states.mean(dim=0)

        average_embeddings.append(avg_embedding)

average_embeddings_tensor = torch.stack(average_embeddings)

print("Average embeddings shape: {}".format(average_embeddings_tensor.shape))

Average embeddings shape: torch.Size([421, 768])


In [47]:

# Print floats with 8 decimals and without scientific notation.
np.set_printoptions(precision=8, suppress=True)

# Move the pooled embeddings to host memory and compare all item pairs.
average_embeddings_np = average_embeddings_tensor.cpu().numpy()
cosine_sim_matrix = cosine_similarity(average_embeddings_np)

print("BERT： Cosine Similarity:", cosine_sim_matrix, sep="\n")

BERT： Cosine Similarity:
[[1.         0.79508483 0.711087   ... 0.5681307  0.8713387  0.8303369 ]
 [0.79508483 1.         0.72398275 ... 0.55087686 0.762762   0.7621898 ]
 [0.711087   0.72398275 1.0000001  ... 0.48726416 0.7183287  0.7013814 ]
 ...
 [0.5681307  0.55087686 0.48726416 ... 1.         0.58081186 0.59511703]
 [0.8713387  0.762762   0.7183287  ... 0.58081186 0.9999999  0.83941567]
 [0.8303369  0.7621898  0.7013814  ... 0.59511703 0.83941567 1.0000001 ]]


In [48]:

# Pooled BERT vectors of the three example items, selected by the row
# positions index1..index3 and converted to NumPy arrays.
embedding1, embedding2, embedding3 = (
    average_embeddings_tensor[row].cpu().detach().numpy()
    for row in (index1, index2, index3)
)

# Each vector is wrapped as a one-row matrix; the result is a 1x1 matrix.
similarity_1_2 = cosine_similarity([embedding1], [embedding2])[0][0]
similarity_1_3 = cosine_similarity([embedding1], [embedding3])[0][0]
similarity_2_3 = cosine_similarity([embedding2], [embedding3])[0][0]

print("BERT")
print("Cosine similarity between {} and {}: {:.6f}".format(asin1, asin2, similarity_1_2))
print("Cosine similarity between {} and {}: {:.6f}".format(asin1, asin3, similarity_1_3))
print("Cosine similarity between {} and {}: {:.6f}".format(asin2, asin3, similarity_2_3))

BERT
Cosine similarity between B0B8M5FJ9W and B0002E2G5Q: 0.544365
Cosine similarity between B0B8M5FJ9W and B07HGRFG5J: 0.711087
Cosine similarity between B0002E2G5Q and B07HGRFG5J: 0.573865


In [49]:
import pickle

# Guard against stale kernel state: the ASIN -> row mapping saved below is
# only meaningful if df_meta and tfidf_matrix describe the same rows.
assert tfidf_matrix.shape[0] == len(df_meta), (
    "tfidf_matrix and df_meta are out of sync; re-run the TF-IDF cell"
)

with open('tfidf.pickle', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# Map each ASIN to its row position in df_meta, which is also its row in
# tfidf_matrix. Enumerating the column itself (rather than its unique values)
# keeps the positions aligned even if an item_id occurs more than once; in
# that case the last occurrence wins.
map_asin_id = {asin: row for row, asin in enumerate(df_meta['item_id'])}
with open('map_asin_id.pickle', 'wb') as handle:
    pickle.dump(map_asin_id, handle, protocol=pickle.HIGHEST_PROTOCOL)