In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Load the three source tables from CSV files in the sibling data directory.
users_df = pd.read_csv('../data/Users.csv')
posts_df = pd.read_csv('../data/Posts.csv')
engagements_df = pd.read_csv('../data/Engagements.csv')

# Print a schema summary (columns, non-null counts, dtypes) for each table.
# Every heading after the first starts with a newline so the reports are
# separated by a blank line.
for heading, frame in (
    ("Users DataFrame:", users_df),
    ("\nPosts DataFrame:", posts_df),
    ("\nEngagements DataFrame:", engagements_df),
):
    print(heading)
    frame.info()

# Last expression of the cell: rendered as the cell's rich output.
users_df.head()

Users DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                50 non-null     object 
 1   age                    50 non-null     int64  
 2   gender                 50 non-null     object 
 3   top_3_interests        50 non-null     object 
 4   past_engagement_score  50 non-null     float64
dtypes: float64(1), int64(1), object(3)
memory usage: 2.1+ KB

Posts DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_id       100 non-null    object
 1   creator_id    100 non-null    object
 2   content_type  100 non-null    object
 3   tags          100 non-null    object
dtypes: object(4)
memory usage: 3.2+ KB

Engagements DataFrame:
<class 'pandas.cor

Unnamed: 0,user_id,age,gender,top_3_interests,past_engagement_score
0,U1,24,F,"sports, art, gaming",0.61
1,U2,32,F,"travel, food, fashion",0.93
2,U3,28,Other,"sports, travel, fashion",0.4
3,U4,25,M,"fashion, music, tech",0.53
4,U5,24,M,"fashion, food, fitness",0.8


In [4]:
# Preview the first rows of the posts table (post id, creator, content type, tags).
posts_df.head()

Unnamed: 0,post_id,creator_id,content_type,tags
0,P1,U44,video,"sports, food"
1,P2,U26,video,"music, travel"
2,P3,U32,text,"sports, travel"
3,P4,U6,image,"music, gaming"
4,P5,U32,image,"food, fashion"


In [5]:
# Preview the first rows of the engagements table (one row per user/post pair).
engagements_df.head()

Unnamed: 0,user_id,post_id,engagement
0,U1,P52,1
1,U1,P44,0
2,U1,P1,1
3,U1,P4,1
4,U1,P65,0


In [11]:
# Vectorizer shared by users and posts so both end up in the same feature space.
# NOTE(review): min_df / max_df are passed as floats, which scikit-learn treats
# as document-frequency proportions -- confirm these thresholds are intended.
tfidf_vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.8)

# Drop the commas so each comma-separated list becomes plain space-separated text.
user_interest_text = users_df['top_3_interests'].str.replace(',', '')
post_tag_text = posts_df['tags'].str.replace(',', '')

# The vocabulary is learned from the user interests only; post tags are then
# projected onto that same vocabulary with transform().
user_interests_matrix = tfidf_vectorizer.fit_transform(user_interest_text)
post_tags_matrix = tfidf_vectorizer.transform(post_tag_text)

# Report the shape and the stored entries of each matrix.
for shape_label, matrix in (
    ("User-Interests Matrix Shape:", user_interests_matrix),
    ("\n\n\nPost-Tags Matrix Shape:", post_tags_matrix),
):
    print(shape_label, matrix.shape)
    print("\n\n", matrix)

User-Interests Matrix Shape: (50, 10)


 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 150 stored elements and shape (50, 10)>
  Coords	Values
  (0, 7)	0.55283233192069
  (0, 0)	0.6235004615605861
  (0, 4)	0.55283233192069
  (1, 9)	0.6012737541034223
  (1, 3)	0.5434709610516644
  (1, 1)	0.585755227991998
  (2, 7)	0.6032503674945283
  (2, 9)	0.5712783326325583
  (2, 1)	0.5565339709148764
  (3, 1)	0.474498826386959
  (3, 6)	0.599692251495972
  (3, 8)	0.6443757190126661
  (4, 3)	0.5265690729197029
  (4, 1)	0.5675383037297076
  (4, 2)	0.6329496703878521
  (5, 3)	0.5265690729197029
  (5, 1)	0.5675383037297076
  (5, 2)	0.6329496703878521
  (6, 0)	0.6673915190726118
  (6, 3)	0.5065141619536844
  (6, 1)	0.5459230385414308
  (7, 7)	0.6049317682554912
  (7, 4)	0.6049317682554912
  (7, 3)	0.5177983309267897
  (8, 7)	0.5617615400154977
  :	:
  (41, 5)	0.6120913861440629
  (42, 3)	0.46690231240074337
  (42, 8)	0.6833919823369773
  (42, 2)	0.5612286781271743
  (43, 9)	0.5559560573313

In [9]:
# Cosine similarity between every user's interest vector (rows) and every
# post's tag vector (columns).
interest_match_scores = cosine_similarity(user_interests_matrix, post_tags_matrix)

# Rows are later looked up by user position and columns are paired with
# posts_df['post_id'], so the matrix must stay in sync with both tables.
# This catches stale matrices left over from out-of-order cell execution.
assert interest_match_scores.shape == (len(users_df), len(posts_df)), (
    "interest_match_scores is out of sync with users_df/posts_df; "
    "re-run the TF-IDF cell first."
)

print("Interest Match Score Matrix Shape:", interest_match_scores.shape)
print("\n\n", interest_match_scores)

Interest Match Score Matrix Shape: (50, 100)


 [[0.41998665 0.         0.40140386 ... 0.         0.35990239 0.62350046]
 [0.35340497 0.37907434 0.41343812 ... 0.         0.         0.        ]
 [0.45828922 0.36016366 0.83082485 ... 0.         0.         0.        ]
 ...
 [0.         0.33539324 0.36579725 ... 0.         0.36571544 0.        ]
 [0.         0.48498127 0.         ... 0.         0.47425757 0.60435423]
 [0.         0.82501604 0.35764553 ... 0.         0.48610538 0.        ]]


In [12]:
def get_recommendations(user_id, alpha=0.7, beta=0.3, top_n=3, *,
                        users=None, posts=None, engagements=None,
                        match_scores=None):
    """
    Generates the top post recommendations for a given user.

    Parameters
    ----------
    user_id :
        Value looked up in the ``user_id`` column of the users table.
    alpha : float, default 0.7
        Weight applied to the per-post interest match score.
    beta : float, default 0.3
        Weight applied to the user's ``past_engagement_score``.
    top_n : int, default 3
        Maximum number of post ids to return.
    users, posts, engagements : pandas.DataFrame, optional
        Tables to use instead of the notebook-level ``users_df``,
        ``posts_df`` and ``engagements_df``.
    match_scores : 2-D array, optional
        Score matrix to use instead of the notebook-level
        ``interest_match_scores``. Row ``i`` must belong to the i-th row of
        the users table and column ``j`` to the j-th row of the posts table.

    Returns
    -------
    list or str
        Post ids ordered from highest to lowest hybrid score, or the string
        ``"User ID not found."`` when ``user_id`` is not in the users table.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative.")

    # Fall back to the notebook globals only for tables the caller did not pass.
    users = users_df if users is None else users
    posts = posts_df if posts is None else posts
    engagements = engagements_df if engagements is None else engagements
    match_scores = interest_match_scores if match_scores is None else match_scores

    # 1. Find the user's row *position*. The score matrix is positional, so an
    #    index label would pick the wrong row (or fail) whenever the users
    #    table does not carry a default 0..n-1 index.
    matching_rows = (users['user_id'] == user_id).to_numpy().nonzero()[0]
    if matching_rows.size == 0:
        return "User ID not found."
    user_pos = matching_rows[0]
    user_engagement_score = users['past_engagement_score'].iloc[user_pos]

    # 2. Get the interest match scores for this user against all posts
    user_interest_scores = match_scores[user_pos]

    # 3. Calculate the hybrid score.
    #    The engagement term is the same constant for every post of this user,
    #    so it shifts the scores but does not change the ranking of posts.
    hybrid_scores = (alpha * user_interest_scores) + (beta * user_engagement_score)

    # 4. Create a DataFrame for easy sorting (post ids taken positionally so
    #    they line up with the score columns regardless of the posts index).
    recommendations_df = pd.DataFrame({
        'post_id': posts['post_id'].to_numpy(),
        'hybrid_score': hybrid_scores
    })

    # 5. Filter out every post that appears in this user's engagement records
    engaged_posts = engagements.loc[engagements['user_id'] == user_id, 'post_id']
    recommendations_df = recommendations_df[~recommendations_df['post_id'].isin(engaged_posts)]

    # 6. Sort by score and keep the best top_n
    top_recommendations = recommendations_df.sort_values(
        by='hybrid_score', ascending=False
    ).head(top_n)

    return top_recommendations['post_id'].tolist()

In [13]:
# Demo run: fetch recommendations for a single user, 'U1'
test_user = 'U1'
recommendations = get_recommendations(test_user)

print(f"Top 3 recommendations for {test_user}: {recommendations}")

# Sanity check: show the user's stated interests next to the recommended posts
is_test_user = users_df['user_id'] == test_user
user_info = users_df.loc[is_test_user]
print(f"\nUser {test_user} is interested in: {user_info['top_3_interests'].values[0]}")

print("\nRecommended Post Details:")
# Rows come back in posts_df order, not in recommendation-rank order.
is_recommended = posts_df['post_id'].isin(recommendations)
print(posts_df[is_recommended])

Top 3 recommendations for U1: ['P78', 'P22', 'P10']

User U1 is interested in: sports, art, gaming

Recommended Post Details:
   post_id creator_id content_type         tags
9      P10        U24        video          art
21     P22        U29        audio  sports, art
77     P78        U20        video  sports, art
