In [1]:
!pip install pandas numpy sentence-transformers scikit-learn torch

Collecting sentence-transformers
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting torch
  Downloading torch-2.9.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting scipy (from sentence-transformers)
  Using cached scipy-1.16.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached regex-2025.9.18-cp313-cp313-macosx_11_0_arm64

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import KMeans
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load data
users = pd.read_csv("Assessment data - users.csv")
posts = pd.read_csv("Assessment data - posts.csv")

users.head()

Unnamed: 0,user_id,interested_in
0,2,"Memes,Money,News,Personal Finance,Learn,Invest..."
1,3,"Memes,Money,News,Personal Finance,Learn,Invest..."
2,4,"Memes,Money,News,Personal Finance,Learn,Invest..."
3,5,"Memes,Money,News,Personal Finance,Learn,Invest..."
4,6,"Memes,Money,News,Personal Finance,Learn,Invest..."


In [5]:
posts.head()

Unnamed: 0,post_id,user_id,content,is_anonymous,created_at,updated_at,topics,like_user_ids,shares,reports,likes
0,4858,18.0,Indian Companies Exposure To US . The List Con...,False,"Oct. 6, 2025, 12:50 p.m.","Oct. 6, 2025, 12:54 p.m.",,4106691413262119712358,26,0,15
1,4857,17.0,Many Of IPO Are Oversubscribing To 100X 200X A...,False,"Oct. 6, 2025, 12:44 p.m.","Oct. 6, 2025, 12:54 p.m.",,426235,20,0,5
2,4830,,Where do you see the potential of this stock i...,True,"Oct. 6, 2025, 12:35 p.m.","Oct. 6, 2025, 12:39 p.m.",,69262,23,0,4
3,4829,18.0,Note printing cost,False,"Oct. 6, 2025, 12:31 p.m.","Oct. 6, 2025, 12:39 p.m.",,69262,33,0,3
4,4855,15.0,Look At Global Debt Market Chart. US Have High...,False,"Oct. 6, 2025, 12:15 p.m.","Oct. 6, 2025, 12:54 p.m.",,4626297358,25,0,9


In [6]:
posts.columns

Index(['post_id', 'user_id', 'content', 'is_anonymous', 'created_at',
       'updated_at', 'topics', 'like_user_ids', 'shares', 'reports', 'likes'],
      dtype='object')

In [9]:
posts.isna().sum()

post_id            0
user_id          215
content            0
is_anonymous       0
created_at         0
updated_at         0
topics           794
like_user_ids      3
shares             0
reports            0
likes              0
dtype: int64

In [12]:
users['interested_in'] = users['interested_in'].fillna("").str.lower().str.strip()
posts['content'] = posts['content'].fillna("").str.lower().str.strip()
posts.drop('topics', axis=1, inplace=True)

### Generating popularity score first

In [13]:
# Fill missing values with 0
posts['likes'] = posts['likes'].fillna(0)
posts['shares'] = posts['shares'].fillna(0)
posts['reports'] = posts['reports'].fillna(0)

# Weighted combination
posts['popularity_raw'] = (
    0.7 * posts['likes'] +         # likes contribute most
    0.2 * posts['shares'] -        # shares contribute positively
    0.1 * posts['reports']         # reports reduce popularity
)

In [14]:
posts['popularity_raw'] = posts['popularity_raw'].clip(lower=0)

In [15]:
if posts['popularity_raw'].max() > 0:
    posts['popularity'] = posts['popularity_raw'] / posts['popularity_raw'].max()
else:
    posts['popularity'] = 0

### Recency

In [17]:
posts['created_at'] = pd.to_datetime(posts['created_at'], errors='coerce')

In [18]:
earliest = posts['created_at'].min()
latest = posts['created_at'].max()

posts['recency'] = (posts['created_at'] - earliest) / (latest - earliest)

posts['recency'] = posts['recency'].fillna(0)

### Semantic similarity

In [21]:
from sentence_transformers import SentenceTransformer, util
import torch

# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [22]:
# Post embeddings
post_embeddings = model.encode(posts['content'].tolist(), convert_to_tensor=True)

# User embeddings (some may have empty interests)
user_embeddings = []
for interest in users['interested_in']:
    if interest.strip() == "":
        user_embeddings.append(None) 
    else:
        emb = model.encode(interest, convert_to_tensor=True)
        user_embeddings.append(emb)

users['embedding'] = user_embeddings

In [26]:
semantic_scores_list = []

for i, user in users.iterrows():
    user_emb = user['embedding']
    if user_emb is not None:
        # Cosine similarity with all posts
        cosine_scores = util.cos_sim(user_emb, post_embeddings)[0].cpu().numpy()
    else:
        # No interests → semantic score = 0
        cosine_scores = [0] * len(posts)
    
    semantic_scores_list.append(cosine_scores)

In [29]:
semantic_matrix = np.array(semantic_scores_list)

### Collabourative 

In [30]:
from sklearn.preprocessing import MultiLabelBinarizer

# Ensure like_user_ids is a list of integers
posts['like_user_ids'] = posts['like_user_ids'].apply(lambda x: str(x).split(',') if pd.notna(x) else [])
posts['like_user_ids'] = posts['like_user_ids'].apply(lambda x: [int(u) for u in x if u.strip().isdigit()])

# Create a binary interaction matrix: rows = posts, columns = users
mlb = MultiLabelBinarizer()
interaction_matrix = mlb.fit_transform(posts['like_user_ids'])
interaction_df = pd.DataFrame(interaction_matrix, columns=mlb.classes_, index=posts['post_id'])

In [32]:
from sklearn.metrics.pairwise import cosine_similarity

# User vectors (columns of interaction_df)
user_vectors = interaction_df.T.values

# Cosine similarity between all users
user_sim_matrix = cosine_similarity(user_vectors)
user_sim_df = pd.DataFrame(user_sim_matrix, index=interaction_df.columns, columns=interaction_df.columns)

In [33]:
def collaborative_score(target_user_id, top_k_sim=5):
    if target_user_id not in user_sim_df.index:
        # Fallback: no similar users
        return pd.Series(0, index=interaction_df.index)
    
    # Top-K similar users (excluding self)
    sim_scores = user_sim_df.loc[target_user_id].drop(target_user_id)
    top_sim_users = sim_scores.nlargest(top_k_sim)

    # Weighted sum of posts liked by similar users
    weighted_likes = interaction_df[top_sim_users.index].dot(top_sim_users.values)
    weighted_likes = weighted_likes / top_sim_users.values.sum()  # normalize
    return weighted_likes

In [34]:
# Example for one user
target_user_id = users.loc[0, 'user_id']
collab_scores = collaborative_score(target_user_id)

# Merge with posts
temp_df = posts.copy()
temp_df['collaborative_score'] = temp_df['post_id'].map(collab_scores)

In [35]:
temp_df

Unnamed: 0,post_id,user_id,content,is_anonymous,created_at,updated_at,like_user_ids,shares,reports,likes,popularity_raw,popularity,recency,collaborative_score
0,4858,18.0,indian companies exposure to us . the list con...,False,2025-10-06 12:50:00,"Oct. 6, 2025, 12:54 p.m.","[4, 10, 6, 69, 14, 13, 2, 62, 11, 9, 7, 12, 3,...",26,0,15,15.7,0.415344,1.000000,1.000000
1,4857,17.0,many of ipo are oversubscribing to 100x 200x a...,False,2025-10-06 12:44:00,"Oct. 6, 2025, 12:54 p.m.","[4, 2, 62, 3, 5]",20,0,5,7.5,0.198413,0.999937,0.634472
2,4830,,where do you see the potential of this stock i...,True,2025-10-06 12:35:00,"Oct. 6, 2025, 12:39 p.m.","[69, 2, 62]",23,0,4,7.4,0.195767,0.999843,0.000000
3,4829,18.0,note printing cost,False,2025-10-06 12:31:00,"Oct. 6, 2025, 12:39 p.m.","[69, 2, 62]",33,0,3,8.7,0.230159,0.999801,0.000000
4,4855,15.0,look at global debt market chart. us have high...,False,2025-10-06 12:15:00,"Oct. 6, 2025, 12:54 p.m.","[4, 6, 2, 62, 9, 7, 3, 5, 8]",25,0,9,11.3,0.298942,0.999633,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1479,11.0,trump says china violated its agreement with us,False,NaT,"June 1, 2025, 10:48 a.m.","[4, 6, 2, 3, 5]",6,0,6,5.4,0.142857,0.000000,0.823843
996,1477,76.0,help me,False,NaT,"May 31, 2025, 6:30 p.m.","[4, 6, 2, 7, 3, 5]",6,0,11,8.9,0.235450,0.000000,1.000000
997,1499,84.0,"hal isn’t falling, just resting. classic pre-b...",False,NaT,"May 31, 2025, 10:04 p.m.","[4, 10, 6, 2, 11, 9, 7, 12, 3, 5, 8]",5,0,6,5.2,0.137566,0.000000,1.000000
998,1476,88.0,anything i can do while filing itr to get refu...,False,NaT,"May 31, 2025, 6:30 p.m.","[4, 6, 2, 3, 5]",6,0,5,4.7,0.124339,0.000000,0.823843


### Recommendations part

In [46]:
semantic_scores_list

[array([ 2.98593163e-01,  6.53310716e-02,  2.57688373e-01,  2.19295286e-02,
         1.60744518e-01, -2.75596566e-02,  1.82426423e-01,  2.92812526e-01,
        -2.60378979e-02,  1.16945207e-01,  1.49954945e-01, -8.93407911e-02,
         2.82251477e-01,  4.51828092e-02,  9.87191573e-02,  1.21567242e-01,
         2.25093424e-01,  2.29381308e-01,  1.44962091e-02,  6.51541054e-02,
         1.27174810e-01,  1.68902859e-01,  1.86517030e-01,  5.96084371e-02,
         1.42639905e-01,  1.69064850e-01,  7.00986758e-02,  3.88134047e-02,
         3.94707508e-02,  2.02586398e-01,  7.18350932e-02,  2.63880968e-01,
         3.07201922e-01,  3.00662220e-01,  2.00135425e-01,  1.32873863e-01,
         6.24476001e-02,  9.69492570e-02,  1.44962091e-02,  1.74880728e-01,
         1.30847961e-01, -3.01551186e-02,  8.14318135e-02,  2.15661526e-02,
         2.10832387e-01,  1.45046264e-01,  3.24907675e-02,  3.24907675e-02,
         2.19960123e-01,  4.59169224e-02,  2.82133311e-01,  2.34810919e-01,
         1.2

In [48]:
# Pick a user
user_index = 0
user_id = users.loc[user_index, 'user_id']

# --- Semantic score ---
semantic_scores = semantic_scores_list[user_index]  # array of length = number of posts

# --- Collaborative score ---
# Make sure you already have `user_sim_df` and `interaction_df` as per collaborative filtering step
def collaborative_score(target_user_id, top_k_sim=5):
    if target_user_id not in user_sim_df.index:
        return pd.Series(0, index=interaction_df.index)
    sim_scores = user_sim_df.loc[target_user_id].drop(target_user_id)
    top_sim_users = sim_scores.nlargest(top_k_sim)
    weighted_likes = interaction_df[top_sim_users.index].dot(top_sim_users.values)
    weighted_likes = weighted_likes / top_sim_users.values.sum() if top_sim_users.values.sum() != 0 else weighted_likes*0
    return weighted_likes

collab_scores = collaborative_score(user_id)

# --- Build temp DataFrame for this user ---
temp_df = posts.copy()
temp_df['semantic_score'] = semantic_scores
temp_df['collaborative_score'] = temp_df['post_id'].map(collab_scores)  # map post_id to collab score

# --- Final score ---
temp_df['final_score'] = (
    0.5 * temp_df['semantic_score'] +
    0.25 * temp_df['popularity'] +
    0.15 * temp_df['recency'] +
    0.1 * temp_df['collaborative_score']
)

# --- Inspect first 10 rows ---
print(temp_df[['post_id', 'semantic_score', 'collaborative_score', 'popularity', 'recency', 'final_score']].head(10))

# --- Top 10 posts ---
top_posts = temp_df.sort_values('final_score', ascending=False).head(10)['post_id'].tolist()
print(f"Top 10 posts for user {user_id}: {top_posts}")

   post_id  semantic_score  collaborative_score  popularity   recency  \
0     4858        0.298593             1.000000    0.415344  1.000000   
1     4857        0.065331             0.634472    0.198413  0.999937   
2     4830        0.257688             0.000000    0.195767  0.999843   
3     4829        0.021930             0.000000    0.230159  0.999801   
4     4855        0.160745             1.000000    0.298942  0.999633   
5     4828       -0.027560             1.000000    0.314815  0.999580   
6     4853        0.182426             1.000000    0.275132  0.000000   
7     4852        0.292813             1.000000    0.328042  0.000000   
8     4827       -0.026038             1.000000    0.227513  0.000000   
9     4826        0.116945             1.000000    0.687831  0.000000   

   final_score  
0     0.503133  
1     0.295706  
2     0.327762  
3     0.218475  
4     0.405053  
5     0.314861  
6     0.259996  
7     0.328417  
8     0.143859  
9     0.330430  
Top 10 po

In [49]:
recommendations = []

for i, user in users.iterrows():
    user_id = user['user_id']
    
    # Semantic score for this user
    semantic_scores = semantic_scores_list[i] 
    
    temp_df = posts.copy()
    temp_df['semantic_score'] = semantic_scores
    
    if 'collaborative_score' in temp_df.columns:
        collab_weight = 0.1
    else:
        collab_weight = 0
    
    # Compute final score with weights
    if user['interested_in'].strip() != "":
        temp_df['final_score'] = (
            0.5 * temp_df['semantic_score'] +
            0.25 * temp_df['popularity'] +
            0.15 * temp_df['recency'] +
            collab_weight * temp_df.get('collaborative_score', 0)
        )
    else:
        # User has no interests
        temp_df['final_score'] = (
            0.2 * temp_df['semantic_score'] +
            0.5 * temp_df['popularity'] +
            0.3 * temp_df['recency'] +
            collab_weight * temp_df.get('collaborative_score', 0)
        )
    
    # Pick top 10 posts
    top_posts = temp_df.sort_values('final_score', ascending=False).head(10)['post_id'].tolist()
    
    # Store for final CSV
    recommendations.append({
        'user_id': user_id,
        'recommended_post_ids': ",".join(map(str, top_posts))
    })

In [50]:
recommendations_df = pd.DataFrame(recommendations)
recommendations_df.head()

Unnamed: 0,user_id,recommended_post_ids
0,2,3638303048194858473648184734473548232934
1,3,3638303048194858473648184734473548232934
2,4,3638303048194858473648184734473548232934
3,5,3638303048194858473648184734473548232934
4,6,3638303048194858473648184734473548232934


In [51]:
# CSV
recommendations_df.to_csv("boomm_recommendations.csv", index=False)