In [2]:
# CONFIG - dataset scale knobs (tune these to fit the available memory/time)
NUM_USERS = 5_000            # recommended: 5k for medium experiments
NUM_POSTS = 20_000           # recommended: 20k
NUM_INTERACTIONS = 150_000   # recommended: 150k - 300k depending on memory/time

# Directory that receives every CSV written by this notebook
DATA_DIR = './data_synthetic_notebook'

In [5]:
# Imports and helper functions
import os, random, uuid, math
from datetime import datetime, timedelta
from collections import defaultdict, Counter
import numpy as np, pandas as pd
from tqdm import tqdm
from faker import Faker
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

# Shared Faker instance; the generate_* functions below call it for names,
# sentences and timestamps.
fake = Faker()
# Make sure the CSV output directory exists; exist_ok keeps cell re-runs from failing.
os.makedirs(DATA_DIR, exist_ok=True)

def precision_at_k(recommended, relevant_set, k):
    """Return precision@k: the share of the top-``k`` recommendations that are relevant.

    Args:
        recommended: Ranked sequence of recommended items (best first).
        relevant_set: Container of relevant items, tested with ``in``.
        k: Cut-off rank. It is also the denominator, so a list shorter than
            ``k`` is penalised for the missing slots.

    Returns:
        A float in [0, 1]; 0.0 when ``k`` is zero or negative.
    """
    # A negative k would slice from the end of the list and divide by a
    # negative number, yielding a negative "precision"; treat it like k == 0.
    if k <= 0:
        return 0.0
    hits = sum(1 for item in recommended[:k] if item in relevant_set)
    return hits / k

def recall_at_k(recommended, relevant_set, k):
    """Return recall@k: the fraction of relevant items found in the top ``k``.

    Args:
        recommended: Ranked sequence of recommended items (best first).
        relevant_set: Container of relevant items, tested with ``in``.
        k: Cut-off rank.

    Returns:
        hits / len(relevant_set), or 0.0 when ``relevant_set`` is empty.
    """
    if not relevant_set:
        return 0.0
    found = 0
    for item in recommended[:k]:
        if item in relevant_set:
            found += 1
    return found / len(relevant_set)

def average_precision(recommended, relevant_set, k):
    """Return average precision over the top ``k`` recommendations.

    Precision is accumulated at every rank that holds a relevant item and the
    total is divided by the number of relevant items (not by ``k``).

    Args:
        recommended: Ranked sequence of recommended items (best first).
        relevant_set: Container of relevant items, tested with ``in``.
        k: Cut-off rank.

    Returns:
        The average precision as a float; 0.0 when ``relevant_set`` is empty.
    """
    if not relevant_set:
        return 0.0
    hit_count = 0
    precision_total = 0.0
    rank = 0
    for item in recommended[:k]:
        rank += 1
        if item not in relevant_set:
            continue
        hit_count += 1
        # precision at this rank = relevant items seen so far / rank
        precision_total += hit_count / rank
    return precision_total / max(1, len(relevant_set))

def dcg_at_k(recommended, relevant_set, k):
    """Return the discounted cumulative gain of the top ``k`` recommendations.

    Relevance is binary (1 if the item is in ``relevant_set``, else 0), the
    gain is ``2**rel - 1`` and the discount at 1-based rank ``i`` is ``log2(i + 1)``.

    Args:
        recommended: Ranked sequence of recommended items (best first).
        relevant_set: Container of relevant items, tested with ``in``.
        k: Cut-off rank.

    Returns:
        The DCG as a float (0.0 for an empty list).
    """
    total = 0.0
    # position is 0-based, so the 1-based rank is position + 1 and the
    # discount argument (rank + 1) is position + 2.
    for position, item in enumerate(recommended[:k]):
        gain = 2 ** int(item in relevant_set) - 1
        total += gain / math.log2(position + 2)
    return total

def idcg_at_k(n_rel, k):
    """Return the ideal DCG@k for ``n_rel`` relevant items with binary relevance.

    The ideal ranking puts all relevant items first, so this sums the
    ``1 / log2(rank + 1)`` discount over ranks 1 .. min(n_rel, k).

    Args:
        n_rel: Number of relevant items.
        k: Cut-off rank.

    Returns:
        The ideal DCG as a float; 0.0 when min(n_rel, k) is below 1.
    """
    ideal_hits = min(n_rel, k)
    total = 0.0
    rank = 1
    while rank <= ideal_hits:
        total += 1 / math.log2(rank + 1)
        rank += 1
    return total

def ndcg_at_k(recommended, relevant_set, k):
    """Return NDCG@k: DCG@k divided by the ideal DCG@k.

    Args:
        recommended: Ranked sequence of recommended items (best first).
        relevant_set: Sized container of relevant items.
        k: Cut-off rank.

    Returns:
        The normalised DCG as a float; 0.0 when the ideal DCG is zero
        (no relevant items, or a cut-off below 1).
    """
    ideal = idcg_at_k(len(relevant_set), k)
    return dcg_at_k(recommended, relevant_set, k) / ideal if ideal != 0 else 0.0

In [None]:
# DATA GENERATION - realistic synthetic dataset
# Topic vocabulary: users draw their preferred topics from it, and each post
# takes one of its author's preferred topics.
TOPICS = [
    'travel',
    'food',
    'entertainment',
    'tech',
    'education',
    'meme',
    'sports',
    'lifestyle',
]

def generate_users(n_users):
    """Create ``n_users`` synthetic user records.

    Each user gets a random UUID, a Faker-generated name, an activity type
    ('power' for draws below 0.15, 'normal' below 0.80, otherwise 'lurker'),
    two or three preferred topics sampled from ``TOPICS``, and empty
    ``following`` / ``followers`` lists.

    Args:
        n_users: Number of users to create.

    Returns:
        A list of user dicts with keys id, name, user_type, preferred_topics,
        following and followers.
    """
    def _user_type_for(draw):
        # Cumulative thresholds: [0, 0.15) power, [0.15, 0.80) normal, rest lurker.
        if draw < 0.15:
            return 'power'
        if draw < 0.80:
            return 'normal'
        return 'lurker'

    # All ids are generated up front, before any per-user random draws.
    ids = [str(uuid.uuid4()) for _ in range(n_users)]
    users = []
    for uid in ids:
        user_type = _user_type_for(random.random())
        n_topics = random.randint(2, 3)
        preferred_topics = random.sample(TOPICS, n_topics)
        users.append({
            'id': uid,
            'name': fake.name(),
            'user_type': user_type,
            'preferred_topics': preferred_topics,
            'following': [],
            'followers': [],
        })
    return users

def build_follow_graph(users):
    """Populate every user's ``following`` and ``followers`` lists in place.

    The number of accounts a user follows depends on ``user_type``:
    'power' follows 200-1000, 'normal' 50-200 and any other type 5-30
    (all inclusive), capped at the number of other users. Followed accounts
    are drawn uniformly without replacement and never include the user itself.

    Both lists are rebuilt from scratch, so calling this function again
    replaces the graph instead of appending duplicate followers to the old one.

    Args:
        users: List of user dicts with at least 'id' and 'user_type' keys.
            Ids are assumed to be unique (self-exclusion is positional).

    Returns:
        None; ``users`` is mutated.
    """
    user_ids = [u['id'] for u in users]
    n_others = len(user_ids) - 1
    for pos, u in enumerate(users):
        if u['user_type'] == 'power':
            fcount = random.randint(200, 1000)
        elif u['user_type'] == 'normal':
            fcount = random.randint(50, 200)
        else:
            fcount = random.randint(5, 30)
        # Sample positions from the n-1 "other" slots and shift those at or
        # after our own position by one. This skips the user itself without
        # rebuilding an O(n) "everyone but me" list for every user.
        picks = random.sample(range(n_others), min(n_others, fcount))
        u['following'] = [user_ids[j if j < pos else j + 1] for j in picks]
        # Reset so that a second call does not stack onto stale followers.
        u['followers'] = []
    # populate followers lists as the inverse of the following lists
    id_to_user = {u['id']: u for u in users}
    for u in users:
        for fid in u['following']:
            id_to_user[fid]['followers'].append(u['id'])

def generate_posts(users, n_posts):
    """Create ``n_posts`` synthetic posts authored by random users.

    Every post gets a random UUID, a uniformly chosen author, one of that
    author's preferred topics, a Faker timestamp from the last 365 days and a
    Faker sentence followed by one random hashtag.

    Args:
        users: Non-empty list of user dicts with 'id' and 'preferred_topics'.
        n_posts: Number of posts to create.

    Returns:
        A list of post dicts with keys id, author, topic, content and
        createdAt (ISO-8601 string).
    """
    hashtags = ['#fun','#tips','#review','#howto','#news']
    posts = []
    # The id list is fully built before the loop starts drawing random values.
    for pid in [str(uuid.uuid4()) for _ in range(n_posts)]:
        author = random.choice(users)
        topic = random.choice(author['preferred_topics'])
        created = fake.date_time_between(start_date='-365d', end_date='now')
        sentence = fake.sentence(nb_words=12)
        hashtag = random.choice(hashtags)
        posts.append({
            'id': pid,
            'author': author['id'],
            'topic': topic,
            'content': sentence + ' ' + hashtag,
            'createdAt': created.isoformat(),
        })
    return posts

def generate_interactions(users, posts, n_interactions):
    """Simulate ``n_interactions`` user-to-post events.

    For every event a user is picked uniformly. With probability 0.7 (and only
    if the user follows anyone) the post is drawn from posts written by
    followed authors, otherwise (or if that pool is empty) from all posts.
    Posts are drawn with weight
    ``popularity + 2.0 * (topic is preferred) + 1 / (age_in_days + 1)``,
    where 1% of posts (at least one) are "viral" and get +3.0 popularity.
    The action is biased towards LIKE/SHARE for posts that match the user's
    topics, followed authors and viral status, and otherwise drawn from the
    ``ACTIONS`` distribution.

    The per-post recency term, each user's followed-author pool and the
    cumulative sampling weights are computed once and cached. Recomputing them
    for every interaction costs about n_interactions * n_posts operations.
    The sampling distribution is unchanged: ``random.choices`` receives the
    same weights, just pre-accumulated.

    Args:
        users: Non-empty list of user dicts with 'id', 'following' and
            'preferred_topics'. Ids are assumed unique and the lists are
            assumed not to change during the call.
        posts: Non-empty list of post dicts with 'id', 'author', 'topic' and
            an ISO-8601 'createdAt'.
        n_interactions: Number of events to generate.

    Returns:
        A list of dicts with keys id, userId, postId, action and createdAt.
    """
    ACTIONS = [('LIKE',0.8), ('POST_VIEW',0.12), ('CLICK',0.04), ('SHARE',0.03), ('COMMENT',0.01)]
    ACTIONS_list = [a for a, _ in ACTIONS]
    ACTIONS_weights = [w for _, w in ACTIONS]

    post_pop = {p['id']: 1.0 + random.random()*0.5 for p in posts}
    viral = set(random.sample([p['id'] for p in posts], max(1, int(len(posts)*0.01))))
    for vid in viral:
        post_pop[vid] += 3.0

    now = datetime.now()
    # Loop-invariant: parse each post timestamp once instead of once per interaction.
    recency = {
        p['id']: 1.0 / ((now - datetime.fromisoformat(p['createdAt'])).days + 1)
        for p in posts
    }

    def _cum_weights(candidate, topics):
        # Running total of the per-post weights, in candidate order. Passing
        # this as cum_weights makes random.choices skip its own accumulation
        # and select by bisection.
        running = 0.0
        cum = []
        for post in candidate:
            bonus = 2.0 if post['topic'] in topics else 0.0
            running += post_pop[post['id']] + bonus + recency[post['id']]
            cum.append(running)
        return cum

    following_sets = {}   # user id -> set of followed author ids
    followed_pools = {}   # user id -> (followed posts, cumulative weights)
    global_cum = {}       # frozenset(preferred topics) -> cumulative weights over all posts

    interactions = []
    for _ in range(n_interactions):
        u = random.choice(users)
        uid = u['id']
        topics = u['preferred_topics']

        following = following_sets.get(uid)
        if following is None:
            following = following_sets[uid] = set(u['following'])

        pool = None
        if u['following'] and random.random() < 0.7:
            pool = followed_pools.get(uid)
            if pool is None:
                followed_posts = [post for post in posts if post['author'] in following]
                pool = followed_pools[uid] = (followed_posts, _cum_weights(followed_posts, topics))
            if not pool[0]:
                # Nobody the user follows has posted: fall back to all posts.
                pool = None
        if pool is None:
            # Weights over all posts depend only on the topic set, which
            # many users share, so cache per topic set rather than per user.
            topic_key = frozenset(topics)
            cum = global_cum.get(topic_key)
            if cum is None:
                cum = global_cum[topic_key] = _cum_weights(posts, topic_key)
            pool = (posts, cum)

        candidate, cum = pool
        p = random.choices(candidate, cum_weights=cum, k=1)[0]

        score = 0.0
        if p['topic'] in topics: score += 1.2
        if p['author'] in following: score += 1.0
        if p['id'] in viral: score += 2.0
        r = random.random()
        if score > 3 and r < 0.7:
            action = 'LIKE' if random.random() < 0.85 else 'SHARE'
        elif score > 2 and r < 0.4:
            action = 'LIKE'
        else:
            action = random.choices(ACTIONS_list, ACTIONS_weights)[0]
        # NOTE(review): the interaction time is drawn independently of the
        # post's createdAt, so an event can predate the post it refers to.
        created = fake.date_time_between(start_date='-30d', end_date='now')
        interactions.append({'id': str(uuid.uuid4()), 'userId': uid, 'postId': p['id'], 'action': action, 'createdAt': created.isoformat()})
    return interactions

# Run generation
# Seed the random sources used by the generators so that a re-run reproduces
# the same user types, topics, follow graph, post text and action sequence.
# Ids come from uuid.uuid4() and timestamps are relative to "now", so those
# still differ between runs.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

print('Generating users...')
users = generate_users(NUM_USERS)
print('Building follow graph...')
build_follow_graph(users)
print('Generating posts...')
posts = generate_posts(users, NUM_POSTS)
print('Generating interactions... (this can take some time)')
interactions = generate_interactions(users, posts, NUM_INTERACTIONS)

# write CSVs
# preferred_topics is stored as a '|'-joined string; the remaining columns
# (including the following/followers lists) are written as pandas renders them.
users_df = pd.DataFrame([{k: v for k, v in u.items() if k != 'preferred_topics'} for u in users])
users_df['preferred_topics'] = ['|'.join(u['preferred_topics']) for u in users]
users_df.to_csv(os.path.join(DATA_DIR, 'users.csv'), index=False)

posts_df = pd.DataFrame(posts)
posts_df.to_csv(os.path.join(DATA_DIR, 'posts.csv'), index=False)

inter_df = pd.DataFrame(interactions)
inter_df.to_csv(os.path.join(DATA_DIR, 'interactions_all.csv'), index=False)

print('Saved CSVs to', DATA_DIR)

Generating users...
Building follow graph...
Generating posts...
Generating interactions... (this can take some time)


In [None]:
# TRAIN/TEST SPLIT - keep last high-intent (LIKE/SHARE/COMMENT) per user as test
HIGH_INTENT = set(['LIKE','SHARE','COMMENT'])

inter_df = pd.read_csv(os.path.join(DATA_DIR, 'interactions_all.csv'))
inter_df['createdAt'] = pd.to_datetime(inter_df['createdAt'])

# Chronological order within each user, so "last" means most recent.
inter_df = inter_df.sort_values(['userId','createdAt'])

# Most recent high-intent row per user. tail(1) keeps the frame's row order
# and its original index labels, which are then used to carve out the test rows.
high_intent_rows = inter_df[inter_df['action'].isin(HIGH_INTENT)]
test_index = high_intent_rows.groupby('userId').tail(1).index

# Selecting/dropping by label keeps the column schema even when no user has a
# high-intent interaction, so later cells can still read test_df['userId'].
test_df = inter_df.loc[test_index].reset_index(drop=True)
train_df = inter_df.drop(index=test_index).reset_index(drop=True)
users_with_test = set(test_df['userId'])

# Every interaction lands in exactly one split, with at most one test row per user.
assert len(train_df) + len(test_df) == len(inter_df)
assert not test_df['userId'].duplicated().any()

train_df.to_csv(os.path.join(DATA_DIR, 'interactions_train.csv'), index=False)
test_df.to_csv(os.path.join(DATA_DIR, 'interactions_test.csv'), index=False)

print('Train interactions:', len(train_df))
print('Test interactions (high-intent last per user):', len(test_df))
print('Users with test:', len(users_with_test))

In [None]:
# Build mappings and stats
# Reload users/posts from disk so this cell depends only on the saved CSVs
# (plus train_df from the split cell).
users_path = os.path.join(DATA_DIR, 'users.csv')
posts_path = os.path.join(DATA_DIR, 'posts.csv')
users_df = pd.read_csv(users_path)
posts_df = pd.read_csv(posts_path)

user_ids = users_df['id'].to_list()
post_ids = posts_df['id'].to_list()

# id -> row position lookups, used as matrix indices by the CF/CBF cells
user_map = dict(zip(user_ids, range(len(user_ids))))
post_map = dict(zip(post_ids, range(len(post_ids))))

# Popularity: number of training interactions per post id
post_interaction_counts = train_df['postId'].value_counts()
pop = post_interaction_counts.to_dict()

In [None]:
# CF: lightweight matrix factorization using TruncatedSVD on implicit-weighted matrix
# Implicit-feedback weight per action; actions missing from the table get 0.5.
ACTION_WEIGHT = {'LIKE':3.0, 'SHARE':4.0, 'COMMENT':2.5, 'CLICK':1.0, 'POST_VIEW':0.3}

from scipy.sparse import coo_matrix

# Vectorised id -> index lookup (replaces a row-by-row iterrows loop).
# Ids missing from the maps become NaN and those rows are dropped.
cf_uidx = train_df['userId'].map(user_map)
cf_pidx = train_df['postId'].map(post_map)
cf_known = cf_uidx.notna() & cf_pidx.notna()
rows = cf_pidx[cf_known].astype(int).to_numpy()
cols = cf_uidx[cf_known].astype(int).to_numpy()
vals = train_df.loc[cf_known, 'action'].map(ACTION_WEIGHT).fillna(0.5).to_numpy(dtype=float)

# Rows are posts, columns are users; repeated (post, user) pairs are summed
# when the COO matrix is converted for arithmetic.
mat = coo_matrix((vals, (rows, cols)), shape=(len(post_ids), len(user_ids)))
print('Interaction matrix shape:', mat.shape, 'nnz=', mat.nnz)

svd_dim = 64
# TruncatedSVD limits n_components by the number of features (columns, i.e.
# users), not by the number of rows, so bound it by the smaller dimension.
n_components = max(1, min(svd_dim, min(mat.shape) - 1))
svd = TruncatedSVD(n_components=n_components, random_state=42)
item_factors = svd.fit_transform(mat)  # (n_items, dim)
item_factors = normalize(item_factors)
# Project users into the same latent space through their interaction weights.
user_factors = (mat.T @ item_factors)
user_factors = normalize(user_factors)

print('CF factors shapes:', item_factors.shape, user_factors.shape)

In [None]:
# CBF: TF-IDF on post content
from scipy.sparse import coo_matrix

tfidf = TfidfVectorizer(max_features=20000, stop_words='english')
post_texts = posts_df['content'].fillna('').values.tolist()
post_tfidf = tfidf.fit_transform(post_texts)
post_tfidf = normalize(post_tfidf)

# A user profile is the mean TF-IDF vector of the posts the user LIKEd in
# training. Build a sparse (user x post) like-count matrix and multiply once,
# instead of densifying one TF-IDF row per LIKE inside an iterrows loop.
likes = train_df.loc[train_df['action'] == 'LIKE', ['userId', 'postId']]
like_uidx = likes['userId'].map(user_map)
like_pidx = likes['postId'].map(post_map)
like_known = like_uidx.notna() & like_pidx.notna()   # drop ids missing from the maps
like_uidx = like_uidx[like_known].astype(int).to_numpy()
like_pidx = like_pidx[like_known].astype(int).to_numpy()

like_mat = coo_matrix(
    (np.ones(len(like_uidx)), (like_uidx, like_pidx)),
    shape=(len(user_ids), post_tfidf.shape[0]),
).tocsr()   # repeated (user, post) likes are summed, matching repeated += in a loop

counts = np.bincount(like_uidx, minlength=len(user_ids)).astype(float)
user_profiles = (like_mat @ post_tfidf).toarray()
has_likes = counts > 0
# Users without any LIKE keep an all-zero profile.
user_profiles[has_likes] /= counts[has_likes][:, None]
user_profiles = normalize(user_profiles)
print('CBF shapes:', post_tfidf.shape, user_profiles.shape)

In [None]:
# HYBRID: candidate generation and simple logistic re-ranker
from heapq import nlargest
K_CF = 100; K_CBF = 100; TOP_POP = 200

# Dense score table for every (post, user) pair.
cf_scores = item_factors @ user_factors.T  # (n_items, n_users)

def get_cf_topk_for_user(uidx, k):
    """Return (post indices, CF scores) of the ``k`` best-scoring posts for user ``uidx``, best first."""
    scores = cf_scores[:, uidx]
    top_idx = np.argsort(scores)[-k:][::-1]
    return top_idx, scores[top_idx]

from sklearn.metrics.pairwise import cosine_similarity
cbf_scores = cosine_similarity(user_profiles, post_tfidf)

def get_cbf_topk_for_user(uidx, k):
    """Return (post indices, CBF scores) of the ``k`` best-scoring posts for user ``uidx``, best first."""
    scores = cbf_scores[uidx]
    top_idx = np.argsort(scores)[-k:][::-1]
    return top_idx, scores[top_idx]

# Globally most-interacted posts, used as a popularity candidate source.
pop_sorted = [pid for pid,_ in sorted(pop.items(), key=lambda x:-x[1])][:TOP_POP]
pop_idx = [post_map[pid] for pid in pop_sorted if pid in post_map]

LABEL_POS = set(['LIKE','SHARE','COMMENT'])

# Per-post features are loop-invariant: compute them once as arrays indexed by
# post position, instead of a .loc lookup and a timestamp parse per candidate.
rerank_post_ids = posts_df['id'].to_numpy()
rerank_pop = np.array([float(pop.get(pid, 0)) for pid in rerank_post_ids])
rerank_now = datetime.now()
rerank_age_days = np.array([(rerank_now - datetime.fromisoformat(ts)).days for ts in posts_df['createdAt']])

# Positive (high-intent) post indices per user, built in one pass over
# train_df rather than filtering the whole frame once per sampled user.
pos_rows = train_df[train_df['action'].isin(LABEL_POS)]
pos_idx_by_user = {}
for pos_uid, pos_pid in zip(pos_rows['userId'], pos_rows['postId']):
    pos_pidx = post_map.get(pos_pid)
    if pos_pidx is not None:
        pos_idx_by_user.setdefault(pos_uid, set()).add(pos_pidx)

X = []; y = []
sampled_users = list(set(train_df['userId'].unique()) & set(user_ids))
random.shuffle(sampled_users)
max_users = min(2000, len(sampled_users))
for uid in tqdm(sampled_users[:max_users]):
    uidx = user_map[uid]
    cf_idx, cf_sc = get_cf_topk_for_user(uidx, K_CF)
    cbf_idx, cbf_sc = get_cbf_topk_for_user(uidx, K_CBF)
    # Union of the three candidate sources, de-duplicated in first-seen order.
    candidates = list(dict.fromkeys(list(cf_idx[:50]) + list(cbf_idx[:50]) + pop_idx[:50]))
    user_pos_idx = pos_idx_by_user.get(uid, set())
    for pidx in candidates:
        features = [
            float(cf_scores[pidx, uidx]),
            float(cbf_scores[uidx, pidx]),
            float(rerank_pop[pidx]),
            int(rerank_age_days[pidx]),
            0   # placeholder feature: constant for every row
        ]
        X.append(features); y.append(1 if pidx in user_pos_idx else 0)

X = np.array(X); y = np.array(y)
print('Re-ranker training size:', X.shape, 'pos_ratio=', y.mean())
clf = LogisticRegression(max_iter=200)
clf.fit(X, y)
print('Re-ranker trained.')

In [None]:
# EVALUATION: get top-K per user and compute metrics
K = 10
users_eval = list(test_df['userId'].unique())
precisions = []; recalls = []; aps = []; ndcgs = []

# Per-post features are loop-invariant: compute them once as arrays indexed by
# post position, instead of a .loc lookup and a timestamp parse per candidate.
eval_post_ids = posts_df['id'].to_numpy()
eval_pop = np.array([float(pop.get(pid, 0)) for pid in eval_post_ids])
eval_now = datetime.now()
eval_age_days = np.array([(eval_now - datetime.fromisoformat(ts)).days for ts in posts_df['createdAt']])

# Held-out relevant post indices per user, built in one pass over test_df.
rel_idx_by_user = {}
for rel_uid, rel_pid in zip(test_df['userId'], test_df['postId']):
    rel_pidx = post_map.get(rel_pid)
    if rel_pidx is not None:
        rel_idx_by_user.setdefault(rel_uid, set()).add(rel_pidx)

for uid in tqdm(users_eval):
    uidx = user_map.get(uid)
    if uidx is None:
        # Unknown user: indexing the score tables with None would insert a new
        # axis instead of failing, so skip rather than score garbage.
        continue
    cf_idx, cf_sc = get_cf_topk_for_user(uidx, K_CF)
    cbf_idx, cbf_sc = get_cbf_topk_for_user(uidx, K_CBF)
    # NOTE(review): the re-ranker was trained on the top 50 from each source,
    # while up to 200 are requested here (cf/cbf are capped by K_CF / K_CBF).
    candidates = list(dict.fromkeys(list(cf_idx[:200]) + list(cbf_idx[:200]) + pop_idx[:200]))
    cand = np.asarray(candidates, dtype=int)
    # Same five features, in the same order, as used to train the re-ranker.
    feats = np.column_stack([
        cf_scores[cand, uidx],
        cbf_scores[uidx, cand],
        eval_pop[cand],
        eval_age_days[cand],
        np.zeros(len(cand)),
    ])
    scores = clf.predict_proba(feats)[:,1]
    ranked_idx = [candidates[i] for i in np.argsort(scores)[::-1][:K]]
    rel_idx = rel_idx_by_user.get(uid, set())
    precisions.append(precision_at_k(ranked_idx, rel_idx, K))
    recalls.append(recall_at_k(ranked_idx, rel_idx, K))
    aps.append(average_precision(ranked_idx, rel_idx, K))
    ndcgs.append(ndcg_at_k(ranked_idx, rel_idx, K))

# Labels follow K so they stay correct if the cut-off is changed.
print('Users evaluated:', len(precisions))
print(f'Mean Precision@{K}:', np.mean(precisions))
print(f'Mean Recall@{K}:', np.mean(recalls))
print(f'MAP@{K}:', np.mean(aps))
print(f'NDCG@{K}:', np.mean(ndcgs))