In [3]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate
from collections import defaultdict

## Data Loading and Preparation

In [4]:
try:
    engagements_df = pd.read_csv('../data/Engagements.csv')
    posts_df = pd.read_csv('../data/Posts.csv')
except FileNotFoundError:
    print("Make sure your CSV files are in a 'data' folder one level above your notebook directory.")


reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(engagements_df[['user_id', 'post_id', 'engagement']], reader)
print("Data loaded into Surprise's format successfully.")
display(engagements_df.head())

Data loaded into Surprise's format successfully.


Unnamed: 0,user_id,post_id,engagement
0,U1,P52,1
1,U1,P44,0
2,U1,P1,1
3,U1,P4,1
4,U1,P65,0


## Model Training (SVD)

In [5]:
trainset = data.build_full_trainset()
svd_model = SVD(n_factors=50, n_epochs=20, random_state=42)
print("Training the SVD model...")
svd_model.fit(trainset)
print("Model training complete.")

Training the SVD model...
Model training complete.


## Model Evaluation

In [6]:
print("\nRunning 5-fold cross-validation...")
cv_results = cross_validate(SVD(n_factors=50, n_epochs=20, random_state=42), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print("\n--- Cross-Validation Results ---")
print(f"Average RMSE: {cv_results['test_rmse'].mean():.4f}")
print(f"Average MAE:  {cv_results['test_mae'].mean():.4f}")


Running 5-fold cross-validation...
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5090  0.5367  0.5221  0.5338  0.5034  0.5210  0.0132  
MAE (testset)     0.4894  0.5131  0.5024  0.5075  0.4841  0.4993  0.0109  
Fit time          0.01    0.01    0.01    0.01    0.01    0.01    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    

--- Cross-Validation Results ---
Average RMSE: 0.5210
Average MAE:  0.4993


## Generating Recommendations

In [7]:
def get_collaborative_filtering_recommendations(user_id, top_n=3):
    all_post_ids = posts_df['post_id'].unique()
    engaged_posts = engagements_df[engagements_df['user_id'] == user_id]['post_id'].unique()
    
    predictions = []
    for post_id in all_post_ids:
        if post_id not in engaged_posts:
            predicted_rating = svd_model.predict(user_id, post_id).est
            predictions.append((post_id, predicted_rating))
            
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_post_ids = [post_id for post_id, rating in predictions[:top_n]]
    
    recommended_posts_details = posts_df[posts_df['post_id'].isin(top_post_ids)]
    scores_map = dict(predictions[:top_n])
    recommended_posts_details['predicted_engagement'] = recommended_posts_details['post_id'].map(scores_map)
    
    return recommended_posts_details.sort_values('predicted_engagement', ascending=False)

## Recommendations for User 'U14'

In [12]:
test_user = 'U14'
recommendations = get_collaborative_filtering_recommendations(test_user)
print(f"\n--- Recommendations for {test_user} (Based on Similar Users) ---")
display(recommendations[['post_id', 'tags', 'predicted_engagement']])


--- Recommendations for U14 (Based on Similar Users) ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_posts_details['predicted_engagement'] = recommended_posts_details['post_id'].map(scores_map)


Unnamed: 0,post_id,tags,predicted_engagement
87,P88,literature,0.692874
56,P57,"sports, literature",0.654068
8,P9,travel,0.650805


## Why Were These Recommended?

In [13]:
def get_similar_users(user_id, top_n=5):
    try:
        target_user_inner_id = trainset.to_inner_uid(user_id)
    except ValueError:
        return f"User {user_id} not in the training set."

    target_user_vector = svd_model.pu[target_user_inner_id]
    similarities = []
    for inner_id, vector in enumerate(svd_model.pu):
        if inner_id != target_user_inner_id:
            sim = np.dot(target_user_vector, vector) / (np.linalg.norm(target_user_vector) * np.linalg.norm(vector))
            similarities.append((trainset.to_raw_uid(inner_id), sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

In [14]:
similar_users = get_similar_users(test_user)
print(f"\nTop 5 users most similar to {test_user}:")
for user, score in similar_users:
    print(f"  - User: {user}, Similarity Score: {score:.4f}")


Top 5 users most similar to U14:
  - User: U30, Similarity Score: 0.3432
  - User: U10, Similarity Score: 0.2456
  - User: U33, Similarity Score: 0.2102
  - User: U18, Similarity Score: 0.2041
  - User: U4, Similarity Score: 0.1800


In [11]:
similar_user_id = similar_users[0][0]
print(f"\nPosts that a similar user ({similar_user_id}) engaged with (engagement=1):")
display(engagements_df[(engagements_df['user_id'] == similar_user_id) & (engagements_df['engagement'] == 1)].merge(posts_df, on='post_id'))


Posts that a similar user (U5) engaged with (engagement=1):


Unnamed: 0,user_id,post_id,engagement,creator_id,content_type,tags
0,U5,P92,1,U36,text,"tech, food"
1,U5,P3,1,U32,text,"sports, travel"
2,U5,P21,1,U16,image,"art, literature"
3,U5,P28,1,U3,text,sports
4,U5,P37,1,U18,text,"tech, music"
5,U5,P46,1,U30,video,"food, sports"
6,U5,P79,1,U35,image,literature
7,U5,P89,1,U35,image,music
