<a href="https://colab.research.google.com/github/yguo005/Recommendation_System/blob/main/Retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet torch torchvision torchaudio faiss-cpu

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load interaction data. Every column is read as a string so user/item IDs
# keep their exact textual form.
df = pd.read_csv("amazon-beauty-train.inter", sep="\t", dtype=str)

# Keep positive interactions. Labels that fail to parse are coerced to 0 and
# therefore dropped by the filter.
df["label"] = pd.to_numeric(df["label"], errors="coerce").fillna(0).astype(int)
# .copy() detaches the filtered frame from the raw one, so the column
# assignments below write to a real frame instead of a slice (avoids pandas'
# SettingWithCopyWarning).
df = df[df["label"] == 1].copy()

# Map user/item IDs to contiguous integer indices (usable as embedding rows).
user2idx = {u: idx for idx, u in enumerate(df["user_id"].unique())}
item2idx = {i: idx for idx, i in enumerate(df["item_id"].unique())}

df["user_idx"] = df["user_id"].map(user2idx)
df["item_idx"] = df["item_id"].map(item2idx)

num_users = len(user2idx)
num_items = len(item2idx)
print(f"#users: {num_users}, #items: {num_items}")


Using device: cuda
#users: 1210271, #items: 212506


In [None]:
class InterDataset(Dataset):
    """Map-style dataset yielding ``(user index, item index)`` pairs.

    The ``user_idx`` and ``item_idx`` columns of the given frame are converted
    once, at construction time, into ``torch.long`` tensors.
    """

    def __init__(self, df):
        # Pull the raw index columns out first, then turn each into a tensor.
        user_column = df["user_idx"].values
        item_column = df["item_idx"].values
        self.users = torch.tensor(user_column, dtype=torch.long)
        self.items = torch.tensor(item_column, dtype=torch.long)

    def __len__(self):
        """Number of interaction rows held by the dataset."""
        return self.users.shape[0]

    def __getitem__(self, idx):
        """Return the ``(user, item)`` index pair stored at position ``idx``."""
        user, item = self.users[idx], self.items[idx]
        return user, item

# Wrap the positive interactions and serve them in shuffled mini-batches of 256.
dataset = InterDataset(df)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=256)


In [None]:
import torch.nn as nn
import torch.nn.functional as F

# Width of every user and item embedding vector.
embedding_dim = 64

class TwoTowerModel(nn.Module):
    """Two-tower retrieval model with one embedding table per side.

    A (user, item) pair is scored by the dot product of the two looked-up
    embedding vectors.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        # One learnable vector per user and per item, both of width embedding_dim.
        self.user_emb = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_emb = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)

    def forward(self, user_idx, item_idx):
        """Return one dot-product retrieval score per (user, item) pair."""
        user_vecs = self.user_emb(user_idx)
        item_vecs = self.item_emb(item_idx)
        elementwise = user_vecs * item_vecs
        return elementwise.sum(dim=1)

    def get_user_embedding(self, user_idx):
        """Look up the embedding vectors for the given user indices."""
        return self.user_emb(user_idx)

    def get_item_embedding(self, item_idx):
        """Look up the embedding vectors for the given item indices."""
        return self.item_emb(item_idx)


In [None]:
class TwoTower(nn.Module):
    """Minimal two-tower scorer: a user table, an item table and a dot product."""

    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, embedding_dim)
        self.item_emb = nn.Embedding(num_items, embedding_dim)

    def forward(self, user_idx, item_idx):
        """Score each (user, item) pair by the dot product of its embeddings."""
        user_vectors = self.user_emb(user_idx)
        item_vectors = self.item_emb(item_idx)
        # Multiply element-wise, then reduce over the embedding axis.
        return user_vectors.mul(item_vectors).sum(dim=1)

# Build the model on the selected device together with its optimiser and loss.
# NOTE: the next cell rebinds `model`, `optimizer` and `criterion`.
model = TwoTower(num_users, num_items, embedding_dim)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Train on the selected device: the model, its inputs and the labels must all
# live on the same device.
model = TwoTowerModel(num_users, num_items, embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(3):
    for batch_users, batch_items in dataloader:
        batch_users = batch_users.to(device)
        batch_items = batch_items.to(device)

        # The dataset holds only positive pairs. With nothing but label-1
        # targets the loss is minimised by inflating every score, which teaches
        # the model nothing about ranking. Pair each positive with one
        # uniformly sampled item as a negative (label 0). A sampled item may
        # occasionally be a true positive; that noise is accepted here.
        negative_items = torch.randint(0, num_items, batch_items.shape, device=device)

        optimizer.zero_grad()
        pos_scores = model(batch_users, batch_items)
        neg_scores = model(batch_users, negative_items)
        scores = torch.cat([pos_scores, neg_scores])
        # Labels are sized from the scores, so a short final batch is handled.
        batch_labels = torch.cat([torch.ones_like(pos_scores), torch.zeros_like(neg_scores)])
        loss = criterion(scores, batch_labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} done")

# Hand the model back on the CPU: later cells export the embedding weights
# through NumPy, which only works for CPU tensors.
model = model.cpu()


Epoch 1 done


In [None]:
# Persist each embedding table's state dict to its own checkpoint file.
for emb_path, emb_layer in (("user_emb.pt", model.user_emb), ("item_emb.pt", model.item_emb)):
    torch.save(emb_layer.state_dict(), emb_path)

In [None]:
# Imported here so this cell runs on a fresh kernel; the notebook's other
# `import faiss` only appears in a later cell.
import faiss

# Snapshot the trained item table as a float32 NumPy matrix. .cpu() makes this
# work when the model lives on the GPU, and astype() returns a fresh copy, so
# the in-place normalisation below leaves the model's weights untouched.
item_embeddings = model.item_emb.weight.detach().cpu().numpy().astype("float32")

index = faiss.IndexFlatIP(embedding_dim)  # Inner product (dot product)
faiss.normalize_L2(item_embeddings)       # normalize for cosine similarity (in place)
index.add(item_embeddings)
print("FAISS index size:", index.ntotal)

In [None]:
import faiss
import numpy as np

# Convert embeddings to numpy. .cpu() is needed because .numpy() raises on
# CUDA tensors; it is a no-op when the model is already on the CPU.
user_embeddings = model.user_emb.weight.detach().cpu().numpy().astype("float32")
item_embeddings = model.item_emb.weight.detach().cpu().numpy().astype("float32")

# Build Faiss index on items. The model scores a pair by the dot product of
# its embeddings, so retrieval uses an inner-product index; an L2 index would
# rank by Euclidean distance, which is a different ordering when item norms
# differ.
index = faiss.IndexFlatIP(embedding_dim)
index.add(item_embeddings)

# Example: get top 5 items for first user
query_emb = user_embeddings[0:1]  # one-row slice keeps the 2-D shape FAISS expects
# With an inner-product index the first return value holds similarity scores
# (higher is better); the name `distances` is kept for compatibility.
distances, indices = index.search(query_emb, k=5)
print("Top 5 recommended item indices:", indices)


In [None]:
# Reverse lookup from embedding row back to the original item ID.
idx2item = {idx: item_id for item_id, idx in item2idx.items()}


def recommend_for_user(user_id, k=10):
    """Return up to ``k`` recommended item IDs for ``user_id``.

    The user's embedding row is used as the query against the FAISS ``index``
    built over the item embeddings. Items the user has already interacted
    with are not filtered out.

    Args:
        user_id: Raw user ID as it appears in ``df["user_id"]``.
        k: Number of items to retrieve.

    Returns:
        list: Item IDs in the order returned by the index (best match first).

    Raises:
        KeyError: If ``user_id`` is not a key of ``user2idx``.
    """
    if user_id not in user2idx:
        raise KeyError(f"Unknown user_id: {user_id!r}")
    row = user2idx[user_id]
    # A one-row slice keeps the 2-D (n_queries, dim) shape FAISS expects.
    _, neighbours = index.search(user_embeddings[row:row + 1], k)
    # FAISS pads with -1 when fewer than k results exist; skip those slots.
    return [idx2item[int(j)] for j in neighbours[0] if j >= 0]


# Example: recommend for the first user in dataset
sample_user = df["user_id"].iloc[0]
print("Recommendations for user", sample_user, ":", recommend_for_user(sample_user, k=10))