In [381]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors

In [383]:
df = pd.read_csv('./data/synthetic_events_data.csv')

<h1>Preprocessing and cleaning</h1>

In [386]:
print(df.head())

        user_id      event_id        tier_id  price_paid event_category  \
0  usr_7de8c5ad  evt_4a3fd306     tier_basic      185.46           Food   
1  usr_7de8c5ad  evt_89a36d0f   tier_premium      197.02           Tech   
2  usr_7de8c5ad  evt_e76d385b     tier_basic      128.56           Food   
3  usr_7de8c5ad  evt_452082ea  tier_standard       19.54           Tech   
4  usr_7de8c5ad  evt_7623447b  tier_standard       77.85           Tech   

  event_owner_id  event_date user_gender user_birth_date  interaction  
0   org_56f351e2  2025-03-14        Male      2000-06-21            1  
1   org_78cedd62  2025-09-01  Non-binary      1977-02-25            1  
2   org_519e33bb  2024-10-21  Non-binary      2000-05-05            1  
3   org_db680007  2024-08-24      Female      1994-05-02            1  
4   org_97adfec6  2025-08-07      Female      1988-07-01            1  


In [388]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3210 entries, 0 to 3209
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   user_id          3210 non-null   object 
 1   event_id         3210 non-null   object 
 2   tier_id          3210 non-null   object 
 3   price_paid       3210 non-null   float64
 4   event_category   3210 non-null   object 
 5   event_owner_id   3210 non-null   object 
 6   event_date       3210 non-null   object 
 7   user_gender      3210 non-null   object 
 8   user_birth_date  3210 non-null   object 
 9   interaction      3210 non-null   int64  
dtypes: float64(1), int64(1), object(8)
memory usage: 250.9+ KB
None


In [390]:
print(df.describe()) 

        price_paid  interaction
count  3210.000000       3210.0
mean    103.743293          1.0
std      54.221180          0.0
min      10.100000          1.0
25%      56.862500          1.0
50%     102.730000          1.0
75%     149.617500          1.0
max     199.970000          1.0


In [392]:
print(df.columns)

Index(['user_id', 'event_id', 'tier_id', 'price_paid', 'event_category',
       'event_owner_id', 'event_date', 'user_gender', 'user_birth_date',
       'interaction'],
      dtype='object')


In [394]:
print(df.isnull().sum())

user_id            0
event_id           0
tier_id            0
price_paid         0
event_category     0
event_owner_id     0
event_date         0
user_gender        0
user_birth_date    0
interaction        0
dtype: int64


In [396]:
df.dropna(inplace=True)

In [398]:
df.drop_duplicates(inplace=True)

In [400]:
print(df.dtypes)

user_id             object
event_id            object
tier_id             object
price_paid         float64
event_category      object
event_owner_id      object
event_date          object
user_gender         object
user_birth_date     object
interaction          int64
dtype: object


<h1>Encoding</h1>

In [403]:
# these are strings but we need numerical data  there for we will use Label Encoding to converts each unique string ID into a unique integer number.
user_encoder = LabelEncoder()
event_encoder = LabelEncoder()

# Fit and transform user_id and event_id
df['user_id_encoded'] = user_encoder.fit_transform(df['user_id'])
df['event_id_encoded'] = event_encoder.fit_transform(df['event_id'])

# Check the result
print(df[['user_id', 'user_id_encoded', 'event_id', 'event_id_encoded']].head())

        user_id  user_id_encoded      event_id  event_id_encoded
0  usr_7de8c5ad              254  evt_4a3fd306                93
1  usr_7de8c5ad              254  evt_89a36d0f               154
2  usr_7de8c5ad              254  evt_e76d385b               271
3  usr_7de8c5ad              254  evt_452082ea                90
4  usr_7de8c5ad              254  evt_7623447b               128


<h1>Collaborative Filtering Model</h1>

In [406]:
#Collaborative filtering tries to recommend items to a user based on past interactions of many users.
# in simple terms : - You find users who behave similarly. - You recommend to a user what their “similar users” liked or bought.

# Example:
#User 1 bought events A and B.

#User 2 bought events B and C.

#User 3 bought events A and C.

#If you want to recommend to User 1, since User 2 also liked event B and bought C, you might recommend event C to User 1.

#Collaborative filtering only uses user-item interactions (like purchases or ratings). It doesn’t require extra info like user age or event category.

<h2>Train-Test Split</h2>

In [409]:
# train_list = []
# test_list = []

# # Group by user and split
# for user, group in df_sorted.groupby('user_id_encoded'):
#     if len(group) > 1:
#         # Take last interaction as test
#         test_list.append(group.tail(1))
#         train_list.append(group.head(len(group)-1))
#     else:
#         # If only one interaction, put in train (or test, your choice)
#         train_list.append(group)
        
# # Concatenate back
# train_df = pd.concat(train_list).reset_index(drop=True)
# test_df = pd.concat(test_list).reset_index(drop=True)

# print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

<h2>Train</h2>

In [412]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Train interactions: {len(train_df)}")
print(f"Test interactions: {len(test_df)}")

# Optional: show user overlap
train_users = set(train_df['user_id_encoded'])
test_users = set(test_df['user_id_encoded'])
shared_users = train_users & test_users
print(f"Users in both train and test: {len(shared_users)}")

Train interactions: 2568
Test interactions: 642
Users in both train and test: 362


<h2>User-Item Interaction Matrix from Training Data</h2>

In [417]:
interaction_matrix_train = train_df.pivot(
    index='user_id_encoded',
    columns='event_id_encoded',
    values='interaction'
).fillna(0)

In [419]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(interaction_matrix_train)

In [421]:
def get_user_recommendations(user_id, n_neighbors=5, n_recommendations=5):
    if user_id not in interaction_matrix_train.index:
        return []  # no recommendations for unknown users

    user_vector = interaction_matrix_train.loc[user_id].values.reshape(1, -1)
    distances, indices = model_knn.kneighbors(user_vector, n_neighbors=n_neighbors+1)
    
    neighbor_ids = indices.flatten()[1:]  # exclude the user itself

    user_items = set(np.where(user_vector.flatten() > 0)[0])
    neighbor_interactions = interaction_matrix_train.iloc[neighbor_ids].sum(axis=0)
    neighbor_interactions = neighbor_interactions.drop(user_items, errors='ignore')
    
    recommended_items = neighbor_interactions.sort_values(ascending=False).head(n_recommendations).index.tolist()
    return recommended_items

<h2>Evaluation</h2>

In [424]:
def precision_recall_at_k(recommended, actual, k):
    recommended_k = recommended[:k]
    hits = len(set(recommended_k) & set(actual))
    precision = hits / k if k else 0
    recall = hits / len(actual) if actual else 0
    return precision, recall

# Prepare test user interactions as a dict: user_id -> list of event_id_encoded they interacted with
test_user_events = test_df.groupby('user_id_encoded')['event_id_encoded'].apply(list).to_dict()

precisions, recalls = [], []

valid_test_users = [user for user in test_user_events.keys() if user in interaction_matrix_train.index]

for user in valid_test_users:
    true_events = test_user_events[user]
    recommended = get_user_recommendations(user, n_neighbors=5, n_recommendations=10)
    p, r = precision_recall_at_k(recommended, true_events, k=10)
    precisions.append(p)
    recalls.append(r)

print(f"Mean Precision@10: {np.mean(precisions):.4f}")
print(f"Mean Recall@10: {np.mean(recalls):.4f}")


Mean Precision@10: 0.0077
Mean Recall@10: 0.0371


In [426]:
print(f"Valid test users in train: {len(valid_test_users)} / {len(test_user_events)}")


Valid test users in train: 362 / 363


<h3>switching to matrix factorization technique</h3>

In [431]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(train_df[['user_id_encoded', 'event_id_encoded', 'interaction']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
model = SVD()
model.fit(trainset)
predictions = model.test(testset)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
event_features = encoder.fit_transform(df[['event_category', 'tier_id']])


from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
param_grid = {'n_neighbors': [5, 10, 20, 50], 'metric': ['cosine', 'jaccard']}
# Custom scoring based on precision/recall

popular_events = train_df['event_id_encoded'].value_counts().head(10).index.tolist()
def get_user_recommendations(user_id, n_neighbors=5, n_recommendations=5):
    if user_id not in interaction_matrix_train.index:
        return popular_events[:n_recommendations]
    # Existing kNN logic

def map_at_k(recommended, actual, k):
    score = 0
    hits = 0
    for i, item in enumerate(recommended[:k]):
        if item in actual:
            hits += 1
            score += hits / (i + 1)
    return score / min(len(actual), k) if actual else 0

print(train_df['user_id_encoded'].value_counts().describe())
print(train_df['event_id_encoded'].value_counts().describe())


df['event_date'] = pd.to_datetime(df['event_date'])
df['recency'] = (pd.to_datetime('2025-07-23') - df['event_date']).dt.days

sparsity = 1 - (train_df['interaction'].sum() / (len(train_df['user_id_encoded'].unique()) * len(train_df['event_id_encoded'].unique())))
print(f"Sparsity: {sparsity:.4f}")

ModuleNotFoundError: No module named 'surprise'

In [435]:
conda install -c conda-forge scikit-surprise

error: incomplete escape \U at position 28