In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the two input tables from CSV files in the current working directory.
posts_df = pd.read_csv(filepath_or_buffer="Posts.csv")
users_df = pd.read_csv(filepath_or_buffer="Users (1).csv")

In [4]:
# Preview the first five rows of the freshly loaded posts table.
posts_df.head(n=5)

Unnamed: 0,post_id,creator_id,content_type,tags
0,P1,U44,video,"sports, food"
1,P2,U26,video,"music, travel"
2,P3,U32,text,"sports, travel"
3,P4,U6,image,"music, gaming"
4,P5,U32,image,"food, fashion"


In [5]:
# Preview the first five rows of the freshly loaded users table.
users_df.head(n=5)

Unnamed: 0,user_id,age,gender,top_3_interests,past_engagement_score
0,U1,24,F,"sports, art, gaming",0.61
1,U2,32,F,"travel, food, fashion",0.93
2,U3,28,Other,"sports, travel, fashion",0.4
3,U4,25,M,"fashion, music, tech",0.53
4,U5,24,M,"fashion, food, fitness",0.8


In [6]:
# Build one text document per post: the content type followed by its tags,
# with the commas between tags replaced by spaces so each tag becomes a
# separate whitespace-delimited word.
# Missing values are replaced with "" first: astype(str) on its own would turn
# NaN into the literal word "nan", which would then be learned as a genuine
# term by the TF-IDF vectorizer fitted later in this notebook.
posts_df['post_text'] = (
    posts_df['content_type'].fillna('').astype(str)
    + " "
    + posts_df['tags'].fillna('').astype(str).str.replace(',', ' ', regex=False)
)


In [7]:
# Check the newly added post_text column on the first five rows.
posts_df.head(n=5)

Unnamed: 0,post_id,creator_id,content_type,tags,post_text
0,P1,U44,video,"sports, food",video sports food
1,P2,U26,video,"music, travel",video music travel
2,P3,U32,text,"sports, travel",text sports travel
3,P4,U6,image,"music, gaming",image music gaming
4,P5,U32,image,"food, fashion",image food fashion


In [10]:
# Keep only what the recommender uses downstream (post_id and post_text).
# creator_id is dropped here as well: the saved output of the following cell
# lists only post_id and post_text, and the execution counts jump from 7 to 10,
# so it was presumably removed by a cell that is no longer in the notebook.
# Dropping it here makes a fresh Restart & Run All reproduce that output.
# Reassigning with errors="ignore" (instead of inplace=True) makes the cell
# safe to re-run: columns that are already gone no longer raise KeyError.
posts_df = posts_df.drop(
    columns=['content_type', 'tags', 'creator_id'],
    errors='ignore',
)

In [11]:
# Confirm which columns remain in the posts table after the drop.
posts_df.head(n=5)

Unnamed: 0,post_id,post_text
0,P1,video sports food
1,P2,video music travel
2,P3,text sports travel
3,P4,image music gaming
4,P5,image food fashion


In [12]:
# Remove the user attributes that the content-based recommender does not use.
# Reassigning with errors="ignore" (instead of inplace=True) makes the cell
# safe to re-run: columns that are already gone no longer raise KeyError.
users_df = users_df.drop(
    columns=['age', 'gender', 'past_engagement_score'],
    errors='ignore',
)

In [13]:
# Confirm which columns remain in the users table after the drop.
users_df.head(n=5)

Unnamed: 0,user_id,top_3_interests
0,U1,"sports, art, gaming"
1,U2,"travel, food, fashion"
2,U3,"sports, travel, fashion"
3,U4,"fashion, music, tech"
4,U5,"fashion, food, fitness"


In [14]:
# Post ids in row order; entry j is the id of row j of posts_df.
post_ids = posts_df.loc[:, 'post_id'].to_list()

In [15]:
# Turn each user's comma-separated top_3_interests into one text document in
# which every interest is a separate whitespace-delimited word.
# Missing values are replaced with "" first: astype(str) on its own would turn
# NaN into the literal word "nan", which would then be learned as a genuine
# term by the TF-IDF vectorizer fitted later in this notebook.
users_df['user_text'] = (
    users_df['top_3_interests']
    .fillna('')
    .astype(str)
    .str.replace(',', ' ', regex=False)
)

# User ids in row order; entry i is the id of row i of users_df.
user_ids = users_df['user_id'].tolist()

In [16]:
# Learn a single TF-IDF vocabulary from the post documents and the user
# documents together, so both are later encoded with the same set of terms.
tfidf_vectorizer = TfidfVectorizer()
corpus = pd.concat([posts_df['post_text'], users_df['user_text']], axis=0)
tfidf_vectorizer.fit(corpus)

In [17]:
# Encode users and posts independently with the already fitted vectorizer;
# each call yields one row per input document.
user_vectors = tfidf_vectorizer.transform(raw_documents=users_df['user_text'])
post_vectors = tfidf_vectorizer.transform(raw_documents=posts_df['post_text'])

In [18]:
# Cosine similarity between every user and every post.
# Rows follow the order of user_vectors, columns the order of post_vectors.
similarity_matrix = cosine_similarity(X=user_vectors, Y=post_vectors)

In [19]:
# Number of posts to recommend to each user.
top_k = 3

# Map every user id to the ids of its top_k most similar posts.
# zip pairs each user id with the matching row of similarity_matrix; argsort
# orders column indices by ascending score, so the reversed order puts the
# highest-scoring posts first before the slice keeps the first top_k.
recommendations = {
    uid: [post_ids[col] for col in scores.argsort()[::-1][:top_k]]
    for uid, scores in zip(user_ids, similarity_matrix)
}

In [20]:
# Tabulate the recommendations: one row per user, with the list of
# recommended post ids stored in a single "top_posts" cell.
recommendations_df = pd.DataFrame(
    [
        dict(user_id=uid, top_posts=posts)
        for uid, posts in recommendations.items()
    ]
)


In [21]:
# Spot check: print the recommendations computed for a single user id.
test_user = "U10"

if test_user not in recommendations:
    print(f"User {test_user} not found in the dataset.")
else:
    print(f"Top {top_k} recommended posts for {test_user}: {recommendations[test_user]}")


Top 3 recommended posts for U10: ['P96', 'P63', 'P54']


In [22]:
# Write the per-user recommendations to disk without the DataFrame index.
# NOTE(review): top_posts holds Python lists, so each cell is presumably
# written as the list's text form and must be parsed again when read back.
recommendations_df.to_csv(
    path_or_buf="content_based_recommendations.csv",
    index=False,
)

In [23]:
# Persist the similarity matrix together with the id lists that label its
# rows (user_ids) and columns (post_ids), so the scores can be reloaded and
# interpreted without recomputing them.
np.save("similarity_matrix.npy", similarity_matrix)
np.save("user_ids.npy", np.array(user_ids))
np.save("post_ids.npy", np.array(post_ids))
