In [7]:
!pip install --upgrade streamlit


Collecting streamlit
  Downloading streamlit-1.38.0-py2.py3-none-any.whl.metadata (8.5 kB)
Downloading streamlit-1.38.0-py2.py3-none-any.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 0.1/8.7 MB 1.3 MB/s eta 0:00:07
   -- ------------------------------------- 0.5/8.7 MB 4.5 MB/s eta 0:00:02
   ---- ----------------------------------- 1.0/8.7 MB 6.5 MB/s eta 0:00:02
   ------ --------------------------------- 1.5/8.7 MB 7.3 MB/s eta 0:00:01
   ------- -------------------------------- 1.7/8.7 MB 6.2 MB/s eta 0:00:02
   -------- ------------------------------- 1.8/8.7 MB 6.0 MB/s eta 0:00:02
   -------- ------------------------------- 1.8/8.7 MB 6.0 MB/s eta 0:00:02
   --------- ------------------------------ 2.1/8.7 MB 5.4 MB/s eta 0:00:02
   ---------- ----------------------------- 2.4/8.7 MB 5.3 MB/s eta 0:00:02
   ----------- -------------

In [19]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [8]:
!pip list | grep openpyxl



# import libraries

In [21]:
from typing import Dict, Text 
import pandas as pd
import numpy as np

import tensorflow as tf
import tensorflow_recommenders as tfrs


# Prepare Dataset

Firstly, we have to prepare the dataset and convert it the tensor style.

In [22]:
class PrepareDataset():

    def __init__(self, df):
        self.df_ = df[:100000]
        self.users_df = pd.DataFrame

    def feature_selection(self):
        # The first 100,000 records were selected because the original dataset was too large to train
        self.df_ = self.df_[self.df_['Quantity']>=1].dropna().reset_index(drop = True)

        # Select  essential features 
        self.users_df = self.df_[['Customer ID', 'Description']]
        help_ = self.users_df[['Description']].drop_duplicates()
        help_['product_id'] = [i+1 for i in range(help_.shape[0])]  
        self.users_df = self.users_df.merge(help_, on='Description'
                            ).drop('Description', axis=1
                            ).rename(columns = {'Description' : 'product_id', 'Customer ID': 'user_id'})
        return self.users_df
        
    def create_tensor_dataset(self):
        # Convert the dataframe to tensor format
        self.users_df.user_id = self.users_df.user_id.apply(lambda x: str(int(x)))
        self.users_df.product_id = self.users_df.product_id.apply(lambda x: str(x))

        self.users_df = self.users_df.sample(frac=1).reset_index(drop=True)
        self.products_df = self.users_df[['product_id']]   

        self.users_dataset = tf.data.Dataset.from_tensor_slices(dict(self.users_df))
        self.products_dataset = tf.data.Dataset.from_tensor_slices(dict(self.products_df))
        return self.users_dataset, self.products_dataset

# Implement Recommender System

After converting the dataset, it's time to implement the recommender system on the generated dataset.

In [23]:
class data_manipulation:
    
    def __init__(self, users, products):
        self.users = users
        self.products = products
        
    # keep useful elements
    def keep_useful_elements(self):
        self.users = self.users.map(lambda x: {
                         'product_id' : x['product_id'],
                         'user_id' : x['user_id'],
                    })
        self.products = self.products.map(lambda x: x['product_id'])
        return self.users, self.products 
    
    # Train test split
    def train_test_generator(self, train_range=80_000, all_range=100_000):
        tf.random.set_seed(42)
        shuffled = self.users.shuffle(all_range, seed=42, reshuffle_each_iteration=False)
        train = shuffled.take(train_range)
        test = shuffled.skip(train_range).take(all_range - train_range)
        return train, test
    
    # Create a list of unique products and users
    def pass_unique(self):
        product_ids = self.products.batch(1_000)
        user_ids = self.users.batch(1_000_000).map(lambda x: x["user_id"])

        unique_product_ids = np.unique(np.concatenate(list(product_ids)))
        unique_user_ids = np.unique(np.concatenate(list(user_ids)))
        return unique_product_ids, unique_user_ids

In [46]:
class modelAndLoss:
    
    def __init__(self, unique_product_ids, unique_user_ids, products):
        self.unique_product_ids = unique_product_ids
        self.unique_user_ids = unique_user_ids
        self.products = products

    # Here, we're going to use Keras preprocessing layers to first convert user ids to integers, and then convert those
    # to user embeddings via an Embedding layer.
    def implement_model(self, embedding_dimension = 32):
        user_model = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
              vocabulary=self.unique_user_ids, mask_token=None),
          # Add an additional embedding to account for unknown tokens.
          tf.keras.layers.Embedding(len(self.unique_user_ids) + 1, embedding_dimension)
        ])
        
        # the candidate tower
        self.product_model = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
          vocabulary=self.unique_product_ids, mask_token=None),
        tf.keras.layers.Embedding(len(self.unique_product_ids) + 1, embedding_dimension)
        ])
        return user_model, self.product_model
    
    def metrics_loss(self, batch_size = 128):
        metrics = tfrs.metrics.FactorizedTopK(
          candidates= self.products.batch(batch_size).map(self.product_model)
        )
        task = tfrs.tasks.Retrieval(
            metrics=metrics)
        return task
        # Precompute product embeddings
        # product_embeddings = self.products.batch(batch_size).map(lambda x: self.product_model(x))

        # # Use the precomputed embeddings for the FactorizedTopK metric
        # metrics = tfrs.metrics.FactorizedTopK(
        #     candidates=tf.data.Dataset.from_tensor_slices(self.unique_product_ids).batch(batch_size).map(self.product_model)
        # )
        
        # # Define the retrieval task
        # task = tfrs.tasks.Retrieval(
        #     metrics=metrics
        # )
        
        # return task

In [30]:
# import tensorflow as tf

# class modelAndLoss:
    
#     def __init__(self, unique_product_ids, unique_user_ids, products):
#         self.unique_product_ids = unique_product_ids
#         self.unique_user_ids = unique_user_ids
#         self.products = products

#     # Create the user and product models
#     def implement_model(self, embedding_dimension=32):
#         # User model
#         self.user_model = tf.keras.Sequential([
#             tf.keras.layers.StringLookup(vocabulary=self.unique_user_ids, mask_token=None),
#             tf.keras.layers.Embedding(len(self.unique_user_ids) + 1, embedding_dimension)
#         ])
        
#         # Product model (candidate tower)
#         self.product_model = tf.keras.Sequential([
#             tf.keras.layers.StringLookup(vocabulary=self.unique_product_ids, mask_token=None),
#             tf.keras.layers.Embedding(len(self.unique_product_ids) + 1, embedding_dimension)
#         ])
        
#         return self.user_model, self.product_model

#     # Manual Top-K retrieval function
#     def manual_top_k(self, user_embeddings, product_embeddings, k=10):
#         # Compute the dot product (similarity) between user and product embeddings
#         similarity_scores = tf.matmul(user_embeddings, product_embeddings, transpose_b=True)
        
#         # Get top-k product indices based on similarity scores
#         top_k_values, top_k_indices = tf.nn.top_k(similarity_scores, k=k)
        
#         return top_k_indices, top_k_values

#     # Custom loss and metrics function
#     def metrics_loss(self, batch_size=128, k=10):
#         # Precompute product embeddings for all products in the dataset
#         product_embeddings = tf.concat([self.product_model(tf.constant([prod_id])) for prod_id in self.unique_product_ids], axis=0)
        
#         # Example user embedding (you would replace this with the actual user input)
#         user_embeddings = self.user_model(tf.constant(self.unique_user_ids))  # Assuming all users are considered
        
#         # Compute the top-k products for each user
#         top_k_indices, top_k_values = self.manual_top_k(user_embeddings, product_embeddings, k=k)

#         # Define your retrieval task (if necessary, you can compute a custom loss here)
#         task = {
#             "top_k_indices": top_k_indices,
#             "top_k_values": top_k_values
#         }

#         return task


In [42]:
class userProductModel(tfrs.Model):

    def __init__(self, user_model, product_model):
        super().__init__()
        self.product_model: tf.keras.Model = product_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    # Now it's time to implement the full model
    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        self.user_embeddings = self.user_model(features["user_id"])
        # And pick out the product features and pass them into the product model,
        # getting embeddings back.
        self.positive_product_embeddings = self.product_model(features["product_id"])

        # The task computes loss and the metrics.
        return self.task(self.user_embeddings, self.positive_product_embeddings)

In [43]:
# fitting and evaluating
class fitAndEvaluateModel:

    # As the final stage, we create, compile, fit, and evaluate our model
    
    def __init__(self, user_model, product_model, train, test):
        self.user_model = user_model
        self.product_model = product_model
        self.model = None
        self.train = train
        self.test = test
        
    def create_model(self):
        self.model = userProductModel(self.user_model, self.product_model) 
    
    def compile_model(self):
        self.model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
    
    def fit_model(self):
        cached_train = train.shuffle(200_000).batch(8192).cache()
        self.cached_test = test.batch(4096).cache()
        self.model.fit(cached_train, epochs=10)
        return self.model
    
    def evaluate_model(self):
        self.model.evaluate(self.cached_test, return_dict=True)
        return self.model

# Recommend products to users

After fitting and evaluating the model, it's time to make suggestions to users.
In this example, we considered user numnber "15865" as our sample, however, it could be any user number.

In [44]:
def make_predictions(model, products, product_model):
    # Create a model that takes in raw query features, and
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
    # recommends products out of the entire products dataset.
    index.index_from_dataset(
      tf.data.Dataset.zip((products.batch(100), products.batch(100).map(model.product_model)))
)

    _, ids = index(tf.constant(['15865']))
    return ids, index

In [18]:
# def make_predictions(model, products, product_model):
#     # Create a model that takes in raw query features, and
#     index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
    
#     # Ensure that the product_model processes batched products correctly
#     product_embeddings = products.map(lambda x: (x, model.product_model(x)))
    
#     # Index the product embeddings into the BruteForce model
#     index.index_from_dataset(product_embeddings.batch(100))

#     # Test the prediction with a constant user id
#     _, ids = index(tf.constant(['15865']))
    
#     return ids, index


In [16]:
import os


# Build, compile, and evaluate the model

In [37]:
path = './Dataset/online_retail_II.xlsx'
df = pd.read_excel(path)

dataset = PrepareDataset(df)
users_df = dataset.feature_selection()
users, products = dataset.create_tensor_dataset()

data = data_manipulation(users, products)
users, products = data.keep_useful_elements()
train, test = data.train_test_generator()
unique_product_ids, unique_user_ids = data.pass_unique()


In [48]:

pre_model = modelAndLoss(unique_product_ids, unique_user_ids, products)
user_model, product_model = pre_model.implement_model()
# task = pre_model.metrics_loss()
# product_model

model = fitAndEvaluateModel(user_model, product_model, train, test)
model.create_model()
model.compile_model()
model_ = model.fit_model()

Epoch 1/10


TypeError: 'TrackedDict' object is not callable

# Torch model 

In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset

# Mock dataset
class InteractionDataset(Dataset):
    def __init__(self, user_ids, item_ids):
        self.user_ids = user_ids
        self.item_ids = item_ids

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, idx):
        return self.user_ids[idx], self.item_ids[idx]

# Define the User and Item Embedding Towers
class TwoTowerRecommender(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=32):
        super(TwoTowerRecommender, self).__init__()
        # User embedding tower
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.user_fc = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, embedding_dim)
        )
        # Item embedding tower
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.item_fc = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, embedding_dim)
        )

    def forward(self, user_ids, item_ids):
        # Forward pass for users and items
        user_embeds = self.user_fc(self.user_embedding(user_ids))
        item_embeds = self.item_fc(self.item_embedding(item_ids))
        
        return user_embeds, item_embeds

    def get_user_embedding(self, user_id):
        # Get user embedding for a single user
        user_embedding = self.user_fc(self.user_embedding(user_id))
        return user_embedding

    def get_all_item_embeddings(self):
        # Get embeddings for all items
        all_items = torch.arange(self.item_embedding.num_embeddings).to(device)
        item_embeddings = self.item_fc(self.item_embedding(all_items))
        return item_embeddings

    def recommend(self, user_id, top_n=5):
        # Get the user embedding for the specific user
        user_embedding = self.get_user_embedding(user_id)

        # Get all item embeddings
        item_embeddings = self.get_all_item_embeddings()

        # Compute similarity (dot product) between the user and all items
        scores = torch.matmul(item_embeddings, user_embedding.T).squeeze()

        # Get top-N item indices based on the scores
        top_n_scores, top_n_indices = torch.topk(scores, top_n)

        return top_n_indices, top_n_scores

# Loss function (contrastive loss using cosine similarity)
def cosine_similarity_loss(user_embeds, item_embeds):
    similarity = torch.cosine_similarity(user_embeds, item_embeds)
    loss = 1 - similarity.mean()
    return loss

# Example training loop
def train_model(model, data_loader, epochs=10, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for user_ids, item_ids in data_loader:
            user_ids, item_ids = user_ids.to(device), item_ids.to(device)
            
            # Forward pass
            user_embeds, item_embeds = model(user_ids, item_ids)
            
            # Compute loss
            loss = cosine_similarity_loss(user_embeds, item_embeds)
            total_loss += loss.item()

            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(data_loader)}")

# Mock Data
users = ['user1', 'user2', 'user3', 'user4']
items = ['item1', 'item2', 'item3', 'item4', 'item5']

# Encoding user and item IDs to integers
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_ids = user_encoder.fit_transform(users)
item_ids = item_encoder.fit_transform(items)

# Dataset and DataLoader
dataset = InteractionDataset(torch.tensor(user_ids), torch.tensor(item_ids))
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define model parameters
num_users = len(user_encoder.classes_)
num_items = len(item_encoder.classes_)
embedding_dim = 32

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model = TwoTowerRecommender(num_users, num_items, embedding_dim).to(device)

# Train the model
train_model(model, data_loader, epochs=10)

# Example inference: Recommend top-N items for a specific user
user_id = torch.tensor([user_encoder.transform(['user1'])[0]]).to(device)
top_n_items, top_n_scores = model.recommend(user_id, top_n=3)

# Decode item indices to item labels
recommended_items = item_encoder.inverse_transform(top_n_items.cpu().numpy())

print(f"Top 3 recommended items for user1: {recommended_items}")


Epoch 1/10, Loss: 0.9895404577255249
Epoch 2/10, Loss: 0.6394259035587311
Epoch 3/10, Loss: 0.377838134765625
Epoch 4/10, Loss: 0.20817303657531738
Epoch 5/10, Loss: 0.11031857132911682
Epoch 6/10, Loss: 0.06631645560264587
Epoch 7/10, Loss: 0.04353860020637512
Epoch 8/10, Loss: 0.03541871905326843
Epoch 9/10, Loss: 0.027033984661102295
Epoch 10/10, Loss: 0.021920382976531982
Top 3 recommended items for user1: ['item3' 'item1' 'item5']
