In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Example data
users_df = pd.read_csv("user_table.csv")         # user_id, gender, occupation
items_df = pd.read_csv("item_table.csv")         # item_id, category
interactions_df = pd.read_csv("interaction.csv") # user_id, item_id, interaction

# Every user/item must appear exactly once, otherwise a raw ID cannot be
# mapped to a single row of its lookup table.
assert users_df['user_id'].is_unique, "duplicate user_id in user_table.csv"
assert items_df['item_id'].is_unique, "duplicate item_id in item_table.csv"

# Sort by raw ID so that, after label encoding (which ranks the sorted unique
# values), the encoded ID equals the row position. Later cells index these
# tables with `.iloc[encoded_id]` and use the encoded ID as embedding index.
users_df = users_df.sort_values('user_id').reset_index(drop=True)
items_df = items_df.sort_values('item_id').reset_index(drop=True)

# Translate the raw IDs of the interaction log BEFORE the lookup tables are
# encoded: once `users_df['user_id']` holds encoded values, the raw IDs in the
# log can no longer be matched against it. `get_indexer` returns the row
# position of each raw ID, or -1 when the ID is not present in the table.
interactions_df['user_id'] = pd.Index(users_df['user_id']).get_indexer(interactions_df['user_id'])
interactions_df['item_id'] = pd.Index(items_df['item_id']).get_indexer(interactions_df['item_id'])

# Label encode categorical fields
for col in ['user_id', 'gender', 'occupation']:
    users_df[col] = LabelEncoder().fit_transform(users_df[col])

for col in ['item_id', 'category']:
    items_df[col] = LabelEncoder().fit_transform(items_df[col])

# Sanity check: encoded ID == row position (guaranteed by the sort above).
assert (users_df['user_id'] == users_df.index).all()
assert (items_df['item_id'] == items_df.index).all()

# Map interaction types to labels/weights
event_weight = {'view': 0.5, 'click': 1.0, 'buy': 2.0}
interactions_df['weight'] = interactions_df['interaction'].map(event_weight)

# Drop rows that cannot be trained on: unknown user/item (-1 would silently
# select the LAST row through `.iloc`) or an event type without a weight
# (`.map` yields NaN, which would turn the loss into NaN).
usable = (
    interactions_df['user_id'].ge(0)
    & interactions_df['item_id'].ge(0)
    & interactions_df['weight'].notna()
)
interactions_df = interactions_df.loc[usable].reset_index(drop=True)
interactions_df['label'] = 1  # All these are positive interactions


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import random

class RecSysDataset(Dataset):
    """Positive interactions plus randomly sampled negatives for each of them.

    Every row of ``interactions_df`` becomes one positive sample
    ``(user_id, item_id, label, weight)`` followed by up to ``num_negatives``
    negative samples ``(user_id, neg_item, 0, 1.0)``. Negatives are drawn from
    the items the user has never interacted with anywhere in
    ``interactions_df``, so a known positive is never labelled as a negative.
    When a user has interacted with every item, no negative is emitted.

    Args:
        users_df: user table with columns ``user_id``, ``gender``,
            ``occupation``; indexed positionally by the interaction user id.
        items_df: item table with columns ``item_id``, ``category``; indexed
            positionally by the interaction item id.
        interactions_df: table with columns ``user_id``, ``item_id``,
            ``label`` and ``weight``.
        num_negatives: negatives to sample per positive interaction.
        seed: optional seed for reproducible negative sampling. When ``None``
            the global ``random`` module state is used.
    """

    # Random draws attempted before falling back to listing valid candidates.
    _MAX_REJECTION_TRIES = 20

    def __init__(self, users_df, items_df, interactions_df, num_negatives=1, seed=None):
        self.users_df = users_df
        self.items_df = items_df
        self.interactions = interactions_df
        self.num_negatives = num_negatives

        # Built once: table order makes seeded sampling reproducible, and the
        # list avoids re-materialising the candidate pool for every draw.
        self._item_pool = items_df['item_id'].drop_duplicates().tolist()
        self.item_ids = set(self._item_pool)

        rng = random if seed is None else random.Random(seed)

        # Column-wise access keeps each column's own dtype (row-wise iteration
        # would upcast integer ids to float when all columns are numeric).
        user_col = interactions_df['user_id'].tolist()
        item_col = interactions_df['item_id'].tolist()
        label_col = interactions_df['label'].tolist()
        weight_col = interactions_df['weight'].tolist()

        # All items each user interacted with; none of them may be a negative.
        seen_by_user = {}
        for uid, iid in zip(user_col, item_col):
            seen_by_user.setdefault(uid, set()).add(iid)

        self.data = []
        for uid, iid, label, weight in zip(user_col, item_col, label_col, weight_col):
            # Positive sample
            self.data.append((uid, iid, label, weight))

            # Negative samples
            for _ in range(num_negatives):
                neg_item = self._sample_negative(rng, seen_by_user[uid])
                if neg_item is None:
                    break  # the user has interacted with every item
                self.data.append((uid, neg_item, 0, 1.0))  # label=0, weight=1

    def _sample_negative(self, rng, excluded):
        """Return a random item id not in ``excluded``, or ``None`` if none exists."""
        if not self._item_pool:
            return None
        # Rejection sampling is O(1) per draw in the common case where a user
        # has touched only a small fraction of the catalogue.
        for _ in range(self._MAX_REJECTION_TRIES):
            candidate = rng.choice(self._item_pool)
            if candidate not in excluded:
                return candidate
        # Dense user: enumerate what is left instead of retrying forever.
        remaining = [item for item in self._item_pool if item not in excluded]
        return rng.choice(remaining) if remaining else None

    def __len__(self):
        """Number of samples (positives plus sampled negatives)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the features, label and weight of sample ``idx`` as tensors."""
        uid, iid, label, weight = self.data[idx]
        user = self.users_df.iloc[int(uid)]
        item = self.items_df.iloc[int(iid)]
        # Embedding lookups need integer indices, so force `long` explicitly
        # instead of relying on whatever dtype the row Series ended up with.
        return {
            "user_id": torch.tensor(int(user['user_id']), dtype=torch.long),
            "gender": torch.tensor(int(user['gender']), dtype=torch.long),
            "occupation": torch.tensor(int(user['occupation']), dtype=torch.long),
            "item_id": torch.tensor(int(item['item_id']), dtype=torch.long),
            "category": torch.tensor(int(item['category']), dtype=torch.long),
            "label": torch.tensor(label, dtype=torch.float32),
            "weight": torch.tensor(weight, dtype=torch.float32)
        }


In [None]:
import torch.nn as nn

class TwoTowerRecSys(nn.Module):
    """Two-tower model scoring a (user, item) pair by a dot product.

    The user tower concatenates the embeddings of user id, gender and
    occupation and projects them to ``embedding_dim``; the item tower does the
    same with item id and category. ``forward`` returns the raw dot product of
    the two projections (no sigmoid is applied).

    Args:
        num_users, num_genders, num_occupations: sizes of the user-side
            embedding tables.
        num_items, num_categories: sizes of the item-side embedding tables.
        embedding_dim: width of every embedding and of both tower outputs.
    """

    def __init__(self, num_users, num_genders, num_occupations,
                 num_items, num_categories, embedding_dim=32):
        super().__init__()
        self.user_id_embed = nn.Embedding(num_users, embedding_dim)
        self.gender_embed = nn.Embedding(num_genders, embedding_dim)
        self.occupation_embed = nn.Embedding(num_occupations, embedding_dim)

        self.item_id_embed = nn.Embedding(num_items, embedding_dim)
        self.category_embed = nn.Embedding(num_categories, embedding_dim)

        self.user_proj = nn.Linear(embedding_dim * 3, embedding_dim)
        self.item_proj = nn.Linear(embedding_dim * 2, embedding_dim)

    def forward(self, user_id, gender, occupation, item_id, category):
        """Return the dot product between the user and item tower outputs."""
        # Reuse the tower methods so training and retrieval cannot drift apart.
        u = self.get_user_embedding(user_id, gender, occupation)
        i = self.get_item_embedding(item_id, category)
        return (u * i).sum(dim=-1)  # dot product

    def get_user_embedding(self, user_id, gender, occupation):
        """Return the projected user-tower embedding for the given features."""
        u = torch.cat([
            self.user_id_embed(user_id),
            self.gender_embed(gender),
            self.occupation_embed(occupation)
        ], dim=-1)
        return self.user_proj(u)

    def get_item_embedding(self, item_id, category):
        """Return the projected item-tower embedding for the given features."""
        i = torch.cat([
            self.item_id_embed(item_id),
            self.category_embed(category)
        ], dim=-1)
        return self.item_proj(i)

    def get_all_item_embeddings(self, categories=None):
        """Return item-tower embeddings for every id of the item embedding table.

        Args:
            categories: optional sequence/tensor holding the category index of
                each item, ordered by item id. When omitted, category 0 is
                used for every item (the historical behaviour), which does NOT
                match the categories the model saw during training.

        Raises:
            ValueError: if ``categories`` does not have one entry per item.
        """
        # Build the indices on the model's device so this also works after
        # the module has been moved off the CPU.
        device = self.item_id_embed.weight.device
        all_ids = torch.arange(self.item_id_embed.num_embeddings, device=device)
        if categories is None:
            all_cats = torch.zeros_like(all_ids)
        else:
            all_cats = torch.as_tensor(categories, dtype=torch.long, device=device)
            if all_cats.shape != all_ids.shape:
                raise ValueError(
                    f"categories must have shape {tuple(all_ids.shape)}, "
                    f"got {tuple(all_cats.shape)}"
                )
        return self.get_item_embedding(all_ids, all_cats)


In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F

# Setup
dataset = RecSysDataset(users_df, items_df, interactions_df, num_negatives=3)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

model = TwoTowerRecSys(
    num_users=len(users_df),
    num_genders=users_df['gender'].nunique(),
    num_occupations=users_df['occupation'].nunique(),
    num_items=len(items_df),
    num_categories=items_df['category'].nunique()
)

optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training
model.train()  # explicit: a later cell may have switched the model to eval mode
for epoch in range(5):
    total_loss = 0
    for batch in dataloader:
        user_id = batch['user_id']
        gender = batch['gender']
        occupation = batch['occupation']
        item_id = batch['item_id']
        category = batch['category']
        label = batch['label']
        weight = batch['weight']

        # The model outputs raw dot products (logits). Feeding them straight
        # into the logits variant of BCE is numerically stable; applying
        # sigmoid first saturates to exactly 0/1 for large scores and loses
        # the gradient.
        logits = model(user_id, gender, occupation, item_id, category)
        loss = F.binary_cross_entropy_with_logits(logits, label, weight=weight)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Sum of the per-batch mean losses over the epoch.
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")


In [None]:
class ReRanker(nn.Module):
    """Small MLP that scores a (user embedding, item embedding) pair.

    The two embeddings are concatenated along the last axis and passed through
    Linear(2 * embedding_dim, 128) -> ReLU -> Linear(128, 1).
    """

    def __init__(self, embedding_dim):
        super().__init__()
        # Layers are created in network order and wrapped in one Sequential.
        layers = [
            nn.Linear(2 * embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, user_embed, item_embed):
        """Return one score per pair; the trailing size-1 axis is removed."""
        pair_features = torch.cat((user_embed, item_embed), dim=-1)
        scores = self.model(pair_features)
        return scores.squeeze(-1)  # shape: [batch]


In [None]:
# Retrieve the top-K candidates for ONE user with the two-tower model, then
# re-rank them. Everything this cell needs is derived from `model`,
# `users_df` and `items_df`, so it does not depend on variables left over
# from the training loop (those hold a whole batch, not a single user).
TOP_K = 100
QUERY_USER_POSITION = 0  # row of users_df to produce recommendations for

device = next(model.parameters()).device
query_user = users_df.iloc[QUERY_USER_POSITION]
query_user_id = torch.tensor(int(query_user['user_id']), dtype=torch.long, device=device)
query_gender = torch.tensor(int(query_user['gender']), dtype=torch.long, device=device)
query_occupation = torch.tensor(int(query_user['occupation']), dtype=torch.long, device=device)

# Category of every item, keyed by item id rather than by row position.
category_by_item = items_df.set_index('item_id')['category']
all_item_ids = torch.tensor(category_by_item.index.to_numpy(), dtype=torch.long, device=device)
all_item_categories = torch.tensor(category_by_item.to_numpy(), dtype=torch.long, device=device)

# NOTE(review): the re-ranker is freshly initialised here and never trained in
# this notebook, so `final_scores` are arbitrary until it has been fitted.
reranker = ReRanker(embedding_dim=model.user_proj.out_features).to(device)  # same dim as TwoTower

model.eval()
reranker.eval()
with torch.no_grad():
    user_embed = model.get_user_embedding(query_user_id, query_gender, query_occupation)  # shape: [embedding_dim]

    # Retrieval: dot product of the user embedding with every item embedding.
    all_item_embeds = model.get_item_embedding(all_item_ids, all_item_categories)
    retrieval_scores = all_item_embeds @ user_embed
    top_positions = torch.topk(retrieval_scores, k=min(TOP_K, len(all_item_ids))).indices
    topk_indices = all_item_ids[top_positions]
    topk_item_ids = topk_indices

    # Get item features for re-ranking
    topk_item_categories = all_item_categories[top_positions]
    topk_item_embeds = model.get_item_embedding(topk_item_ids, topk_item_categories)

    # Expand user embedding to match top-k
    user_embed_expanded = user_embed.unsqueeze(0).repeat(len(topk_item_ids), 1)

    # Re-rank
    final_scores = reranker(user_embed_expanded, topk_item_embeds)

# Get final sorted items
sorted_indices = torch.argsort(final_scores, descending=True)
final_top_items = topk_item_ids[sorted_indices]
