<a href="https://colab.research.google.com/github/zrghassabi/System_Design_Coding/blob/main/accountEmbeddingsInInstagram.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

✅ Goal:

Train embeddings of account IDs from user sessions using Skip-gram with negative sampling.

📦 Required Libraries

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import defaultdict


🧾 Sample Sessions

In [None]:
# Simulated user sessions: each inner list is the ordered sequence of
# accounts one user interacted with during a single session.
sessions = [
    ["catsdaily", "meowworld", "funnycats"],
    ["travelgram", "naturepics", "worldexplorer"],
    ["catsdaily", "catvideos", "meowworld"],
    ["worldexplorer", "earthpix", "travelgram"]
]

# Build vocabulary.
# Sorting makes the account -> id mapping reproducible: iterating a bare set
# of strings yields a different order from run to run (hash randomization),
# which would silently reshuffle every embedding row between executions.
all_accounts = sorted({acc for session in sessions for acc in session})
word2idx = {w: idx for idx, w in enumerate(all_accounts)}
idx2word = {idx: w for w, idx in word2idx.items()}
vocab_size = len(all_accounts)

print(all_accounts)
print(word2idx)
print(idx2word)
print(vocab_size)


🧠 Skip-gram Dataset Generator

In [None]:
def generate_skipgram_data(sessions, window_size=1, vocab=None):
    """Build (center, context) id pairs for skip-gram training.

    Every account in a session is paired with each neighbor that lies at most
    ``window_size`` positions away in the same session.

    Args:
        sessions: Iterable of sessions, each a sequence of account names.
        window_size: Maximum distance between a center and its context.
        vocab: Optional mapping from account name to integer id. Defaults to
            the module-level ``word2idx`` so existing callers are unaffected.

    Returns:
        A list of ``(center_id, context_id)`` tuples, ordered by session, then
        center position, then context position.

    Raises:
        KeyError: If an account that takes part in a pair is not in ``vocab``.
    """
    if vocab is None:
        vocab = word2idx

    pairs = []
    if window_size < 1:
        # No neighbor can fall inside an empty window.
        return pairs

    for session in sessions:
        if len(session) < 2:
            # A lone account has no context to pair with.
            continue
        # Translate the session once instead of once per generated pair.
        ids = [vocab[account] for account in session]
        for center_pos, center in enumerate(ids):
            first = max(0, center_pos - window_size)
            last = min(len(ids), center_pos + window_size + 1)
            for context_pos in range(first, last):
                if context_pos != center_pos:
                    pairs.append((center, ids[context_pos]))
    return pairs

# Show the (center, context) id pairs generated from the sample sessions with a window of 1.
print(generate_skipgram_data(sessions,1))

🧠 Simple Embedding Model (Skip-gram with Negative Sampling)

In [None]:
class SkipGramModel(nn.Module):
    """Skip-gram model trained with negative sampling.

    Two embedding tables are kept: ``input_embeddings`` is looked up for the
    center account, ``output_embeddings`` for context and negative accounts.
    """

    def __init__(self, vocab_size, embed_dim):
        """Create both embedding tables.

        Args:
            vocab_size: Number of distinct account ids.
            embed_dim: Dimensionality of each embedding vector.
        """
        super().__init__()
        self.input_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.output_embeddings = nn.Embedding(vocab_size, embed_dim)

    def forward(self, center, context, negs):
        """Return the mean negative-sampling loss for a batch.

        Args:
            center: Long tensor of center ids, shape ``(batch,)``.
            context: Long tensor of positive context ids, shape ``(batch,)``.
            negs: Long tensor of negative ids, shape ``(batch, neg_samples)``.

        Returns:
            Scalar tensor: the loss averaged over the batch.
        """
        center_emb = self.input_embeddings(center)  # (batch, dim)
        context_emb = self.output_embeddings(context)  # (batch, dim)
        neg_emb = self.output_embeddings(negs)  # (batch, neg_samples, dim)

        pos_score = torch.sum(center_emb * context_emb, dim=1)  # dot product
        neg_score = torch.bmm(neg_emb, center_emb.unsqueeze(2)).squeeze(2)  # (batch, neg_samples)

        # logsigmoid is the numerically stable form of log(sigmoid(x)): the
        # naive composition underflows to log(0) = -inf for large-magnitude
        # scores and turns the loss (and its gradients) into inf/NaN.
        log_sigmoid = nn.functional.logsigmoid
        loss = -log_sigmoid(pos_score) - log_sigmoid(-neg_score).sum(dim=1)
        return loss.mean()


🔁 Training

In [None]:
embed_dim = 2
model = SkipGramModel(vocab_size, embed_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Generate training pairs
pairs = generate_skipgram_data(sessions)
num_epochs = 100
neg_samples = 3
all_ids = range(vocab_size)

for epoch in range(num_epochs):
    total_loss = 0
    random.shuffle(pairs)
    for center, context in pairs:
        # Draw negatives uniformly, but never the center itself or its true
        # context: using the positive as a negative pushes the same pair in
        # opposite directions within a single update.
        candidates = [i for i in all_ids if i != center and i != context]
        if not candidates:
            # Vocabulary too small to exclude anything; fall back to all ids.
            candidates = list(all_ids)
        negs = random.choices(candidates, k=neg_samples)  # with replacement

        # Each step is a batch of one: shapes (1,), (1,), (1, neg_samples).
        center_tensor = torch.tensor([center])
        context_tensor = torch.tensor([context])
        neg_tensor = torch.tensor([negs])

        loss = model(center_tensor, context_tensor, neg_tensor)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        # total_loss is the sum over all pairs in this epoch, not the mean.
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")


📤 View the Learned Embeddings

In [None]:
# Take the trained input-embedding matrix and print one row per account.
embeddings = model.input_embeddings.weight.data
for account_idx, row in enumerate(embeddings.numpy()):
    print(f"{idx2word[account_idx]} → {row}")


📊 Step 1: Visualize Embeddings in 2D

In [None]:
import matplotlib.pyplot as plt

# Trained input-embedding matrix (one 2-D row per account)
embeddings = model.input_embeddings.weight.data

# One scatter call per account, each labelled with the account name
plt.figure(figsize=(8, 6))
for account_idx, (x, y) in enumerate(embeddings.numpy()):
    plt.scatter(x, y)
    plt.text(x + 0.01, y + 0.01, idx2word[account_idx], fontsize=12)

plt.title("Account Embeddings (ig2vec-style)")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.grid(True)
plt.show()


✅ This will show accounts like @catsdaily, @meowworld clustered together if the model worked well — indicating topical similarity.

🧭 Step 2: Find Similar Accounts (Cosine Similarity)

In [None]:
import torch.nn.functional as F

def get_similar_accounts(account_name, top_k=3):
    """Print and return the accounts closest to ``account_name``.

    Similarity is the cosine similarity between rows of the module-level
    ``embeddings`` matrix.

    Args:
        account_name: Name of the query account; must be a key of ``word2idx``.
        top_k: Maximum number of neighbors to report. Clamped to the number of
            other accounts, so an oversized value no longer raises.

    Returns:
        A list of ``(account_name, similarity)`` tuples, best match first.

    Raises:
        KeyError: If ``account_name`` is not in the vocabulary.
    """
    account_id = word2idx[account_name]
    target_vec = embeddings[account_id].unsqueeze(0)  # shape (1, dim)
    print(f" account id {account_id}, target_vec {target_vec},\n embed \n {embeddings}")

    # Normalize all embeddings and compute cosine similarity
    norms = F.normalize(embeddings, dim=1)
    print(f"noms  : \n  {norms}")
    # squeeze(0) keeps a 1-D result even for a single-account vocabulary.
    sim = torch.matmul(F.normalize(target_vec, dim=1), norms.T).squeeze(0)

    # Mask the query account out instead of hoping it lands in the top-k:
    # with tied or duplicate vectors it may not, which used to print an extra
    # row and skewed the rank numbers.
    masked = sim.clone()
    masked[account_id] = float("-inf")
    k = max(0, min(top_k, masked.numel() - 1))
    topk = torch.topk(masked, k)

    results = []
    ranked = zip(topk.indices.tolist(), topk.values.tolist())
    for rank, (idx, score) in enumerate(ranked, start=1):
        print(f"{rank}. {idx2word[idx]} → similarity: {score:.4f}")
        results.append((idx2word[idx], score))
    return results


▶️ Example usage:

In [None]:
# Query the neighbors of "catsdaily", asking for the top 3 matches.
get_similar_accounts("catsdaily", top_k=3)


This function lists the accounts most similar to "catsdaily" based on the learned embeddings — the same idea Instagram uses to find similar content sources.

🔧 Part 1: Improve Negative Sampling

Instead of choosing random negatives (which may be too easy), we’ll sample harder negatives that are:

    Not in the same session as the current center

    Appear frequently in other sessions (more informative)

🔁 Step 1: Improve Negative Sampling

In [None]:
from collections import Counter

# Build frequency counter of all accounts (account id -> number of appearances)
account_freq = Counter(word2idx[acc] for session in sessions for acc in session)


def get_hard_negative_samples(center_id, session_ids, k=3):
    """Sample distinct negative ids, favoring frequently seen accounts.

    Candidates are all ids except ``center_id`` and those in ``session_ids``.
    Sampling is without replacement and weighted by ``account_freq``. The
    previous version sorted candidates by frequency and then drew uniformly
    with ``random.sample``, so frequency never influenced the result.

    Args:
        center_id: Id of the center account; never returned.
        session_ids: Ids to exclude, e.g. accounts sharing a session with the
            center.
        k: Number of negatives requested.

    Returns:
        A list of up to ``k`` distinct ids. Fewer are returned only when the
        vocabulary does not hold ``k`` ids other than ``center_id``.
    """
    excluded = set(session_ids) | {center_id}
    pool = [i for i in range(vocab_size) if i not in excluded]

    if len(pool) < k:
        # Fallback: too few out-of-session accounts, so allow everything
        # except the center itself.
        pool = [i for i in range(vocab_size) if i != center_id]
        k = min(len(pool), k)

    # Add-one smoothing keeps every weight positive; random.choices rejects
    # an all-zero weight vector.
    weights = [account_freq[i] + 1 for i in pool]

    chosen = []
    for _ in range(k):
        # Draw one position by weight, then remove it so ids stay distinct.
        pick = random.choices(range(len(pool)), weights=weights)[0]
        chosen.append(pool.pop(pick))
        weights.pop(pick)
    return chosen



🔹 Replace the old random.randint() sampling inside the training loop with get_hard_negative_samples():

In [None]:
embed_dim = 2
model = SkipGramModel(vocab_size, embed_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Generate training pairs
pairs = generate_skipgram_data(sessions)
num_epochs = 100
neg_samples = 3

# For every account, the ids of all accounts that share a session with it.
# Built once up front. The previous code passed "every other center id in
# pairs" as the exclusion list, which covers the whole vocabulary, so the
# sampler's candidate pool was always empty and it always took its fallback.
session_mates = defaultdict(set)
for session in sessions:
    session_ids = [word2idx[acc] for acc in session]
    for acc_id in session_ids:
        session_mates[acc_id].update(session_ids)

for epoch in range(num_epochs):
    total_loss = 0
    random.shuffle(pairs)
    for center, context in pairs:
        # Negatives come only from accounts never seen alongside the center.
        negs = get_hard_negative_samples(center, session_mates[center], k=neg_samples)

        # Each step is a batch of one: shapes (1,), (1,), (1, len(negs)).
        center_tensor = torch.tensor([center])
        context_tensor = torch.tensor([context])
        neg_tensor = torch.tensor([negs])

        loss = model(center_tensor, context_tensor, neg_tensor)
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if epoch % 10 == 0:
        # total_loss is the sum over all pairs in this epoch, not the mean.
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

✅ Now, negative examples are more meaningful, helping the model learn better embeddings.

In [None]:
# Refresh the embedding snapshot from the retrained model and list each row.
embeddings = model.input_embeddings.weight.data
for position, values in enumerate(embeddings.numpy()):
    print(f"{idx2word[position]} → {values}")

import matplotlib.pyplot as plt

# Input-embedding matrix of the retrained model
embeddings = model.input_embeddings.weight.data

# Draw and label every account as its own scatter point
plt.figure(figsize=(8, 6))
for position, (x, y) in enumerate(embeddings.numpy()):
    plt.scatter(x, y)
    plt.text(x + 0.01, y + 0.01, idx2word[position], fontsize=12)

plt.title("Account Embeddings (ig2vec-style)")
plt.xlabel("Dim 1")
plt.ylabel("Dim 2")
plt.grid(True)
plt.show()


import torch.nn.functional as F

def get_similar_accounts(account_name, top_k=3):
    """Print and return the accounts closest to ``account_name``.

    Similarity is the cosine similarity between rows of the module-level
    ``embeddings`` matrix.

    Args:
        account_name: Name of the query account; must be a key of ``word2idx``.
        top_k: Maximum number of neighbors to report. Clamped to the number of
            other accounts, so an oversized value no longer raises.

    Returns:
        A list of ``(account_name, similarity)`` tuples, best match first.

    Raises:
        KeyError: If ``account_name`` is not in the vocabulary.
    """
    account_id = word2idx[account_name]
    target_vec = embeddings[account_id].unsqueeze(0)  # shape (1, dim)
    print(f" account id {account_id}, target_vec {target_vec},\n embed \n {embeddings}")

    # Normalize all embeddings and compute cosine similarity
    norms = F.normalize(embeddings, dim=1)
    print(f"noms  : \n  {norms}")
    # squeeze(0) keeps a 1-D result even for a single-account vocabulary.
    sim = torch.matmul(F.normalize(target_vec, dim=1), norms.T).squeeze(0)

    # Mask the query account out instead of hoping it lands in the top-k:
    # with tied or duplicate vectors it may not, which used to print an extra
    # row and skewed the rank numbers.
    masked = sim.clone()
    masked[account_id] = float("-inf")
    k = max(0, min(top_k, masked.numel() - 1))
    topk = torch.topk(masked, k)

    results = []
    ranked = zip(topk.indices.tolist(), topk.values.tolist())
    for rank, (idx, score) in enumerate(ranked, start=1):
        print(f"{rank}. {idx2word[idx]} → similarity: {score:.4f}")
        results.append((idx2word[idx], score))
    return results


# Repeat the "catsdaily" neighbor query against the current embeddings snapshot.
get_similar_accounts("catsdaily", top_k=3)

🌱 Part 2: Use Seed Accounts for Explore Recommendations
🔎 Step 2: Find Seed Accounts

In [None]:
def get_seed_accounts(user_sessions, vocab=None):
    """Return the unique account ids from a user's most recent session.

    Args:
        user_sessions: List of sessions, oldest first; each session is a
            sequence of account names.
        vocab: Optional mapping from account name to id. Defaults to the
            module-level ``word2idx``.

    Returns:
        Ids of the accounts in the last session, in first-seen order with
        duplicates removed. Accounts missing from the vocabulary are skipped
        because they have no embedding to seed from. An empty history yields
        an empty list.
    """
    if vocab is None:
        vocab = word2idx
    if not user_sessions:
        return []

    seed_ids = []
    seen = set()
    for acc in user_sessions[-1]:  # most recent session
        acc_id = vocab.get(acc)
        if acc_id is None or acc_id in seen:
            continue
        seen.add(acc_id)
        seed_ids.append(acc_id)
    return seed_ids


🎯 Step 3: Recommend Similar Accounts via ig2vec

In [None]:
def recommend_accounts_from_seeds(seed_ids, top_k=5):
    """Print and return the accounts closest to the average seed embedding.

    Args:
        seed_ids: Ids of the seed accounts (rows of the module-level
            ``embeddings`` matrix).
        top_k: Maximum number of recommendations. Fewer are returned when the
            vocabulary does not hold that many non-seed accounts.

    Returns:
        A list of ``(account_name, similarity)`` tuples, best match first,
        never containing a seed account.
    """
    seed_ids = list(seed_ids)
    if not seed_ids or top_k <= 0:
        # Averaging zero seed vectors would produce NaN similarities.
        print("🔎 Recommended accounts based on seed:")
        return []

    seed_vecs = embeddings[seed_ids]
    avg_vec = torch.mean(seed_vecs, dim=0).unsqueeze(0)  # Average embedding

    norms = F.normalize(embeddings, dim=1)
    sim = torch.matmul(F.normalize(avg_vec, dim=1), norms.T).squeeze(0)

    # Over-fetch by the number of seeds so that dropping them still leaves
    # top_k results, but never ask topk for more rows than exist.
    k = min(top_k + len(seed_ids), sim.numel())
    seed_set = set(seed_ids)
    ranked_ids = torch.topk(sim, k).indices.tolist()
    recommended_ids = [i for i in ranked_ids if i not in seed_set][:top_k]

    sim_values = sim.tolist()
    results = []
    print("🔎 Recommended accounts based on seed:")
    for idx in recommended_ids:
        print(f"→ {idx2word[idx]} (similarity: {sim_values[idx]:.4f})")
        results.append((idx2word[idx], sim_values[idx]))
    return results


🧪 Example Usage:

In [None]:
# Example interaction history for one user, oldest session first.
user_sessions = [
    ["catsdaily", "funnycats", "meowworld"],  # past sessions
    ["catvideos", "meowworld"]                # most recent session
]

# 1. Find seed accounts (ids of the accounts in the last session above)
seed_ids = get_seed_accounts(user_sessions)

print(f"seed ids{seed_ids}")

# 2. Recommend similar accounts (up to 6, seed accounts excluded)
recommend_accounts_from_seeds(seed_ids, top_k=6)


✅ This mimics Instagram’s Explore sourcing step:

    Recent user interactions → seed accounts

    ig2vec embeddings → similar accounts

    Media from similar accounts → candidates for recommendation

✅ Part 1: Ranking Candidate Posts Using a Value Model

Just like Instagram’s Explore system, we’ll assign a score to each post based on predicted user actions.
🎯 Value Model Formula: $\text{score} = w_1 \cdot P(\text{like}) + w_2 \cdot P(\text{save}) - w_3 \cdot P(\text{hide})$

In [None]:
def value_model(P_like, P_save, P_hide, w1=1.0, w2=1.2, w3=2.0):
    """Combine predicted action probabilities into a single value score.

    Likes and saves add to the score with weights ``w1`` and ``w2``; hides
    subtract from it with weight ``w3``.
    """
    like_term = w1 * P_like
    save_term = w2 * P_save
    hide_term = w3 * P_hide
    return like_term + save_term - hide_term


# Example predicted probabilities for a post (from a model, or simulated)
P_like, P_save, P_hide = 0.7, 0.4, 0.1

# Weights: like, save, and the penalty applied to hide
w1, w2, w3 = 1.0, 1.2, 2.0

# Value score with the weights passed explicitly
score = value_model(P_like, P_save, P_hide, w1, w2, w3)
print(f"Final Value Score: {score:.4f}")

# Same score, relying on the function's default weights
score = value_model(P_like, P_save, P_hide)




We simulate probabilities using dummy prediction functions or random scores (in real systems, they’re outputs from neural nets).

🧪 Step 1: Simulate Candidate Posts

Let’s say the accounts recommended (from ig2vec) posted the following content:

In [None]:
# Stand-in post metadata: (post id, posting account) for each recommended account
posts = [
    {"post_id": post_id, "account": account}
    for post_id, account in (
        ("p101", "catvideos"),
        ("p102", "meowworld"),
        ("p103", "funnycats"),
    )
]


🎲 Step 2: Simulate Predicted User Actions

In [None]:
import numpy as np

def predict_user_actions(post):
    """Return simulated like/save/hide probabilities for ``post``.

    The post itself is ignored: each probability is an independent uniform
    draw from a fixed range.
    """
    # (key, low, high) in the order the values are drawn: like, save, hide.
    ranges = (
        ("P_like", 0.3, 0.9),
        ("P_save", 0.1, 0.8),
        ("P_hide", 0.0, 0.2),
    )
    return {key: np.random.uniform(low, high) for key, low, high in ranges}


📊 Step 3: Rank Candidates Using Value Model

In [None]:
def rank_posts(posts, w_like=1.0, w_save=1.2, w_hide=2.0):
    """Score each post with the value model and return them best-first.

    Probabilities come from ``predict_user_actions``. The ranking is printed
    and also returned as a list of ``(post_id, account, score)`` tuples.
    """
    def scored_entry(post):
        # Weighted likes and saves minus weighted hides.
        probs = predict_user_actions(post)
        value_score = (
            w_like * probs["P_like"] +
            w_save * probs["P_save"] -
            w_hide * probs["P_hide"]
        )
        return (post["post_id"], post["account"], value_score)

    entries = [scored_entry(post) for post in posts]
    # Highest score first; ties keep their input order.
    ranked_posts = sorted(entries, key=lambda entry: entry[2], reverse=True)

    print("📈 Ranked Recommended Posts:")
    for entry in ranked_posts:
        pid, acc, score = entry
        print(f"→ Post {pid} from @{acc}, Score: {score:.4f}")

    return ranked_posts


▶️ Example usage:

In [None]:
# Score and rank the sample posts; the ranking is printed by rank_posts itself.
rank_posts(posts)


🖼️ Part 2: Explore-style Grid Display (Console Simulation)

Let’s simulate a basic Explore feed grid (3x3) of top posts.

In [None]:
def show_explore_grid(ranked_posts, grid_size=3):
    """Print the top ranked posts as a ``grid_size`` x ``grid_size`` text grid.

    Each cell is rendered as ``post_id@account``. Rows that run past the end
    of ``ranked_posts`` are printed shorter, or as blank lines.
    """
    print("\n🧱 Instagram Explore Grid\n")
    for row_index in range(grid_size):
        start = row_index * grid_size
        cells = []
        for pid, acc, _ in ranked_posts[start:start + grid_size]:
            cells.append(f"{pid}@{acc}")
        print(" | ".join(cells))


🔁 Run it all together:

In [None]:
# Rank the sample posts, then render the result.
# With grid_size=1 the grid has a single cell, so only the top-ranked post is shown.
top_ranked = rank_posts(posts)
show_explore_grid(top_ranked, grid_size=1)


✅ Part 1: Add Diversity Logic

To avoid showing multiple posts from the same account, we'll add a penalty if the same account appears again in the ranking list.
🧠 Logic:

    For each post, check if its account already appeared.

    If yes, apply a penalty (e.g. subtract 0.2 from score).

    This mimics Instagram’s "don’t repeat author" heuristic.

🛠 Update to rank_posts() with diversity logic:

In [None]:
def rank_posts(posts, w_like=1.0, w_save=1.2, w_hide=2.0, diversity_penalty=0.2,
               predict_fn=None):
    """Rank posts by value score, penalizing repeat posts from one account.

    The penalty is applied in score order: an account's best post keeps its
    full score and every further post from that account loses
    ``diversity_penalty``. Previously the penalty followed input order, so an
    account's strongest post could be the one penalized.

    Args:
        posts: Iterable of dicts with ``"post_id"`` and ``"account"`` keys.
        w_like: Weight of the like probability.
        w_save: Weight of the save probability.
        w_hide: Penalty weight of the hide probability.
        diversity_penalty: Amount subtracted from each repeated-account post.
        predict_fn: Callable mapping a post to a dict with ``"P_like"``,
            ``"P_save"`` and ``"P_hide"``. Defaults to
            ``predict_user_actions``.

    Returns:
        A list of ``(post_id, account, score)`` tuples, highest score first.
    """
    if predict_fn is None:
        predict_fn = predict_user_actions

    scored = []
    for post in posts:
        probs = predict_fn(post)
        score = (
            w_like * probs["P_like"] +
            w_save * probs["P_save"] -
            w_hide * probs["P_hide"]
        )
        scored.append((post["post_id"], post["account"], score))

    # Sort by raw score first so the penalty hits the weaker duplicates.
    scored.sort(key=lambda item: item[2], reverse=True)

    ranked_posts = []
    seen_accounts = set()
    for pid, acc, score in scored:
        if acc in seen_accounts:
            score -= diversity_penalty
        else:
            seen_accounts.add(acc)
        ranked_posts.append((pid, acc, score))

    # Penalized posts may now rank below posts from other accounts.
    ranked_posts.sort(key=lambda item: item[2], reverse=True)

    print("📈 Ranked Recommended Posts (with diversity penalty):")
    for pid, acc, score in ranked_posts:
        print(f"→ Post {pid} from @{acc}, Score: {score:.4f}")

    return ranked_posts


In [None]:
# Re-run ranking with the diversity-aware rank_posts and show the top post
# (grid_size=1 renders a single cell).
top_ranked = rank_posts(posts)
show_explore_grid(top_ranked, grid_size=1)

Possible next steps: simulate more posts, apply filtering, or build this into a Streamlit or Flask demo.

✅ Part 2: Use Real Neural Network for Prediction

We'll build a tiny (untrained) neural network that predicts user actions (like/save/hide) from the posting account's embedding.

In [None]:
class PostScorer(nn.Module):
    """Small feed-forward network mapping an embedding to three probabilities.

    The three outputs are read by callers as like, save and hide, in that
    order; a final sigmoid keeps each one in (0, 1).
    """

    def __init__(self, embed_dim):
        super().__init__()
        layers = [
            nn.Linear(embed_dim, 32),  # embedding -> hidden
            nn.ReLU(),
            nn.Linear(32, 3),          # hidden -> like, save, hide
            nn.Sigmoid(),              # squash to probabilities
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the network to ``x`` and return its three outputs."""
        scores = self.model(x)
        return scores


🧠 Connect Account Embedding to Model:

In [None]:
# Initialize model.
# The input width is read from the trained embedding table rather than
# hard-coded, so changing embed_dim upstream cannot cause a shape mismatch.
scoring_model = PostScorer(embed_dim=model.input_embeddings.embedding_dim)

def predict_user_actions_nn(post):
    """Predict like/save/hide probabilities for ``post`` with the scorer NN.

    The posting account's input embedding is fed to ``scoring_model``.

    Args:
        post: Dict with an ``"account"`` key.

    Returns:
        Dict with ``"P_like"``, ``"P_save"`` and ``"P_hide"`` as Python
        floats. Accounts missing from the vocabulary get fixed defaults.
    """
    acc_id = word2idx.get(post["account"])
    if acc_id is None:
        return {"P_like": 0.5, "P_save": 0.3, "P_hide": 0.1}  # default

    # Inference only: no_grad covers both the lookup and the forward pass,
    # so a separate detach() is unnecessary.
    with torch.no_grad():
        embed = model.input_embeddings(torch.tensor([acc_id]))
        # tolist() yields plain floats, matching predict_user_actions.
        p_like, p_save, p_hide = scoring_model(embed).squeeze(0).tolist()

    return {
        "P_like": p_like,
        "P_save": p_save,
        "P_hide": p_hide
    }


🔄 Replace:

In rank_posts(), replace the call to predict_user_actions(post) with predict_user_actions_nn(post):

In [None]:
def rank_posts(posts, w_like=1.0, w_save=1.2, w_hide=2.0, diversity_penalty=0.2,
               predict_fn=None):
    """Rank posts using NN-predicted probabilities and a diversity penalty.

    Probabilities now come from ``predict_user_actions_nn`` by default; the
    earlier body still called the random simulator. The penalty is applied in
    score order, so an account's best post keeps its full score and every
    further post from that account loses ``diversity_penalty``.

    Args:
        posts: Iterable of dicts with ``"post_id"`` and ``"account"`` keys.
        w_like: Weight of the like probability.
        w_save: Weight of the save probability.
        w_hide: Penalty weight of the hide probability.
        diversity_penalty: Amount subtracted from each repeated-account post.
        predict_fn: Callable mapping a post to a dict with ``"P_like"``,
            ``"P_save"`` and ``"P_hide"``. Defaults to
            ``predict_user_actions_nn``.

    Returns:
        A list of ``(post_id, account, score)`` tuples, highest score first.
    """
    if predict_fn is None:
        predict_fn = predict_user_actions_nn

    scored = []
    for post in posts:
        probs = predict_fn(post)
        score = (
            w_like * probs["P_like"] +
            w_save * probs["P_save"] -
            w_hide * probs["P_hide"]
        )
        scored.append((post["post_id"], post["account"], score))

    # Sort by raw score first so the penalty hits the weaker duplicates.
    scored.sort(key=lambda item: item[2], reverse=True)

    ranked_posts = []
    seen_accounts = set()
    for pid, acc, score in scored:
        if acc in seen_accounts:
            score -= diversity_penalty
        else:
            seen_accounts.add(acc)
        ranked_posts.append((pid, acc, score))

    # Penalized posts may now rank below posts from other accounts.
    ranked_posts.sort(key=lambda item: item[2], reverse=True)

    print("📈 Ranked Recommended Posts (with diversity penalty):")
    for pid, acc, score in ranked_posts:
        print(f"→ Post {pid} from @{acc}, Score: {score:.4f}")

    return ranked_posts



In [None]:
# Rank the sample posts with the latest rank_posts definition and show the
# top post (grid_size=1 renders a single cell).
top_ranked = rank_posts(posts)
show_explore_grid(top_ranked, grid_size=1)

✅ This simulates Instagram’s setup:

    Embedding from ig2vec

    Feed into NN

    Get probabilities of user actions

    Rank using value model

    Add diversity penalty