### This notebook creates the embeddings using Sentence Transformers for Semantic IDs.

*In particular, it loads the .item file, creates a template to be fed to a sentence transformer and then save the embedding for each item in a .itememb file.*

This notebook currently support:
- RPG

**Load Libraries**

In [9]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import tiktoken
import torch
from tqdm import tqdm
from openai import OpenAI
import numpy as np
import re
import html
from typing import Union

**set the configuration**

In [10]:
data_path = "/home/recsysdatasets"
dataset = "Amazon_Sports_and_Outdoors"
sentence_trf_bs = 2048 # set the highest bs that fits in your GPU memory
sentence_trf_model = "sentence-transformers/sentence-t5-base" # size 768
apply_pca = True
sentence_trf_pca_components = 128
openai_api_key = None
separator = ' '

**specify which columns to use as metadata**


*they depend on the dataset. In this case, it's for ml-100k.*

In [11]:
# la colonna feature non l'ho trovata nei metadata amazon.
columns_to_concatenate = ['title', 'price', 'brand','categories','sales_type','sales_rank','description']

**set the device used by the sentence transformer**

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"

**load .item file used to retrieve item's metadata**

In [13]:
item_feat_file = os.path.join(data_path, dataset, f'{dataset}_updated.item')
item_feat_df = pd.read_csv(item_feat_file, sep='\t')

**Rename the columns**

In [14]:
# Rename the columns by removing everything after ':'
new_columns = {}
for col in item_feat_df.columns:
    new_columns[col] = col.split(':')[0]
item_feat_df.rename(columns=new_columns, inplace=True)

**concatenate values and create the final dict**

In [15]:
# utilities functions used to preprocess the feature data of an item.
def list_to_str(l: Union[list, str], remove_blank=False) -> str:
    """
    Converts a list or a string to a string representation.

    Args:
        l (Union[list, str]): The input list or string.

    Returns:
        str: The string representation of the input.
    """
    ret = ''
    if isinstance(l, list):
        ret = ', '.join(map(str, l))
    else:
        ret = l
    if remove_blank:
        ret = ret.replace(' ', '')
    return ret

def clean_text(raw_text: str) -> str:
    """
    Cleans the raw text by removing HTML tags, special characters, and extra spaces.

    Args:
        raw_text (str): The raw text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    text = list_to_str(raw_text)
    text = html.unescape(text)
    text = text.strip()
    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'[\n\t]', ' ', text)
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'[^\x00-\x7F]', ' ', text)
    return text

def _sent_process(raw: str) -> str:
    """
    Process the raw input according to the raw data type and return a processed sentence.

    Args:
        raw (str): The raw input to be processed.

    Returns:
        str: The processed sentence.
    """
    sentence = ""
    if isinstance(raw, float):
        sentence += str(raw)
        sentence += '.'
    elif len(raw) > 0 and isinstance(raw[0], list):
        for v1 in raw:
            for v in v1:
                sentence += clean_text(v)[:-1]
                sentence += ', '
        sentence = sentence[:-2]
        sentence += '.'
    elif isinstance(raw, list):
        for v1 in raw:
            sentence += clean_text(v1)
    else:
        sentence = clean_text(raw)
    return sentence + ' '

In [16]:
if 'amazon' in dataset.lower():
    item_dict = dict()
    for index, row in item_feat_df.iterrows():
        item_id = row['item_id']
        meta_sentence = ''
        # Iterate through each column identified for concatenation
        for col in columns_to_concatenate:
            # Convert each value to string and handle potential NaN values by replacing them with an empty string
            meta_sentence += _sent_process(row[col])
        
        # Join the collected parts with a space and remove any leading/trailing whitespace
        item_dict[item_id] = meta_sentence
else:
    item_dict = dict()
    for index, row in item_feat_df.iterrows():
        item_id = row['item_id']
        concatenated_parts = []
        # Iterate through each column identified for concatenation
        for col in columns_to_concatenate:
            # Convert each value to string and handle potential NaN values by replacing them with an empty string
            value = row[col] if pd.notna(row[col]) else ''
            concatenated_parts.append(value)

        # Join the collected parts with a space and remove any leading/trailing whitespace
        item_dict[item_id] = separator.join(concatenated_parts).strip()

**item dict is a dictionary where the key is the item and the value is the description that will be used to obtain from the sentence trasformer the corresponding representation.**

In [17]:
item_dict

{'0000032069': "Adult Ballet Tutu Cheetah Pink 7.89. BubuBibi 'Sports & Outdoors', 'Skirts', 'Clothing', 'Girls', 'Other Sports', 'Dance' nan. nan. nan. ",
 '0000031909': "Girls Ballet Tutu Neon Pink 7.0. Unknown 'Other Sports', 'Dance', 'Sports & Outdoors' Toys & Games 201847.0. High quality 3 layer ballet tutu. 12 inches in length ",
 '0000032034': "Adult Ballet Tutu Yellow 7.87. BubuBibi 'Sports & Outdoors', 'Skirts', 'Clothing', 'Girls', 'Other Sports', 'Dance' nan. nan. nan. ",
 '0000031852': "Girls Ballet Tutu Zebra Hot Pink 3.17. Coxlures 'Other Sports', 'Dance', 'Sports & Outdoors' Toys & Games 211836.0. TUtu ",
 '0000032050': "Adult Ballet Tutu Purple 12.85. BubuBibi 'Sports & Outdoors', 'Skirts', 'Clothing', 'Girls', 'Other Sports', 'Dance' nan. nan. nan. ",
 '0000031895': "Girls Ballet Tutu Neon Blue 2.99. BubuBibi 'Other Sports', 'Dance', 'Sports & Outdoors' Toys & Games 36575.0. Dance tutu for girls ages 2-8 years. Perfect for dance practice, recitals and performances, cos

**Load the Sentence Transformer**

In [18]:
if 'sentence-transformer' in sentence_trf_model:
    print(f"Using {sentence_trf_model} for sentence embeddings")
    sentence_trf = SentenceTransformer(sentence_trf_model).to(device)

    sentence_embeddings = sentence_trf.encode(
        list(item_dict.values()),
        convert_to_numpy=True,
        batch_size=sentence_trf_bs,
        show_progress_bar=True,
        device=device
    )
else:
    print(f"Using OpenAI {sentence_trf_model} for sentence embeddings")
    client = OpenAI(api_key=openai_api_key)

    sentence_embeddings = []
    for i in tqdm(range(0, len(item_dict.values()), sentence_trf_bs), desc='Encoding: '):
        try:
            responses = client.embeddings.create(
                input=list(item_dict.values())[i: i + sentence_trf_bs],
                model=sentence_trf_model
            )
        except:
            print(f'Failed to encode sentence embeddings for {i} - {i + sentence_trf_bs}')
            batch = list(item_dict.values())[
                i: i + sentence_trf_bs]

            new_batch = []
            for sent in batch:
                
                encoding = tiktoken.get_encoding('cl100k_base')
                num_tokens = len(encoding.encode(sent))

                if num_tokens < 8192:
                    new_batch.append(sent)
                else:
                    n_chars = 8192 / num_tokens * len(sent) - 100
                    new_batch.append(sent[:int(n_chars)])

            print(f'Retrying with {len(new_batch)} sentences')
            responses = client.embeddings.create(
                input=new_batch,
                model=sentence_trf_model
            )

        for response in responses.data:
            sentence_embeddings.append(response.embedding)
    sentence_embeddings = np.array(sentence_embeddings, dtype=np.float32)

# for Amazon_Sports_and_Outdoors is about 31minutes

Using sentence-transformers/sentence-t5-base for sentence embeddings


Batches: 100%|██████████| 260/260 [22:45<00:00,  5.25s/it]


In [19]:
if apply_pca:
    from sklearn.decomposition import PCA
    pca = PCA(n_components=sentence_trf_pca_components, whiten=True)
    sentence_embeddings = pca.fit_transform(sentence_embeddings)

**Save the representations**

In [20]:
data = {
    'item_embedding_id:token': item_dict.keys(),
    'item_embedding:float_seq': sentence_embeddings.tolist(),
}
df = pd.DataFrame(data)
df['item_embedding:float_seq'] = df['item_embedding:float_seq'].apply(lambda x: ' '.join(map(str, x)))

In [21]:
df

Unnamed: 0,item_embedding_id:token,item_embedding:float_seq
0,0000032069,-1.446521520614624 1.3857653141021729 0.044562...
1,0000031909,-0.790679395198822 0.2329411506652832 0.123875...
2,0000032034,-1.3797770738601685 1.5031390190124512 0.29644...
3,0000031852,-1.0126373767852783 0.4025305211544037 -0.1421...
4,0000032050,-1.3150359392166138 1.252893328666687 0.236028...
...,...,...
532192,B00LOGH6JM,-0.45534712076187134 0.38706275820732117 -1.20...
532193,B00LOWZGJS,0.6186310052871704 -0.6663661599159241 1.15993...
532194,B00LR2PQ68,-0.554757833480835 1.7835862636566162 0.057923...
532195,B00LUEBKKY,0.13550160825252533 -2.500231981277466 -0.2662...


**Save the final file**

In [22]:
df.to_csv(os.path.join(data_path,dataset,f'{dataset}.itememb'),sep="\t", index=False, header=True)

In [23]:
os.listdir(os.path.join(data_path,dataset))

['Amazon_Sports_and_Outdoors.inter',
 'meta_Sports_and_Outdoors.json',
 'Amazon_Sports_and_Outdoors.itememb',
 'Amazon_Sports_and_Outdoors.item',
 'Amazon_Sports_and_Outdoors_updated.item']