In [1]:
import pandas as pd

In [2]:
# Parse ratings.dat
ratings = pd.read_csv("/ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"],
                      encoding='ISO-8859-1')

# Parse movies.dat
movies = pd.read_csv("/movies.dat", sep="::", engine="python",
                     names=["movieId", "title", "genres"],
                     encoding='ISO-8859-1')

# Parse users.dat (optional)
users = pd.read_csv("/users.dat", sep="::", engine="python",
                    names=["userId", "gender", "age", "occupation", "zip"],
                    encoding='ISO-8859-1')

In [3]:
# Save cleaned ratings and movies to CSV
ratings.to_csv("ml_ratings_clean.csv", index=False)
movies.to_csv("ml_movies_clean.csv", index=False)
users.to_csv("ml_users_clean.csv", index=False)

In [4]:
# Try loading with correct delimiter and column names
events = pd.read_csv("/events.csv")
events.columns = ["timestamp", "visitorid", "event_type", "itemid", "transactionid"]

# Load item properties and concatenate both parts
props1 = pd.read_csv("/item_properties_part1.csv")
props2 = pd.read_csv("/item_properties_part2.csv")
item_props = pd.concat([props1, props2])

# Load category hierarchy
categories = pd.read_csv("/category_tree.csv")

In [5]:
print(events.columns.tolist())

['timestamp', 'visitorid', 'event_type', 'itemid', 'transactionid']


In [6]:
# Convert timestamp
events["timestamp"] = pd.to_datetime(events["timestamp"], unit='ms')

# Filter valid events
valid_events = ["view", "addtocart", "transaction"]
events = events[events["event_type"].isin(valid_events)]

# Map weights
event_weights = {
    "view": 1,
    "addtocart": 2,
    "transaction": 3
}
events["event_strength"] = events["event_type"].map(event_weights)

In [7]:
# Aggregate user-item interactions
user_item_strength = events.groupby(["visitorid", "itemid"])["event_strength"].sum().reset_index()
user_item_strength.rename(columns={"visitorid": "userId", "itemid": "itemId"}, inplace=True)

# Save for modeling
user_item_strength.to_csv("rr_interactions_clean.csv", index=False)

In [8]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

In [9]:
# Load cleaned interactions
interactions = pd.read_csv("rr_interactions_clean.csv")

# Encode users and items
from sklearn.preprocessing import LabelEncoder

user_enc = LabelEncoder()
item_enc = LabelEncoder()

interactions['user_idx'] = user_enc.fit_transform(interactions['userId'])
interactions['item_idx'] = item_enc.fit_transform(interactions['itemId'])

# Save mappings (optional, if deploying later)
user_id_map = dict(zip(interactions['userId'], interactions['user_idx']))
item_id_map = dict(zip(interactions['itemId'], interactions['item_idx']))

In [10]:
# Build sparse matrix (user x item) with event_strength as value
n_users = interactions['user_idx'].nunique()
n_items = interactions['item_idx'].nunique()

sparse_matrix = coo_matrix(
    (interactions['event_strength'],
     (interactions['user_idx'], interactions['item_idx'])),
    shape=(n_users, n_items)
)

In [11]:
# Train a LightFM model on implicit feedback using WARP loss
model = LightFM(loss='warp')
model.fit(sparse_matrix, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x78be804fee10>

In [None]:
# Evaluate precision at K
precision = precision_at_k(model, sparse_matrix, k=5).mean()
print(f"Precision@5: {precision:.4f}")

Precision@5: 0.0086


In [12]:
def recommend_items(model, user_index, sparse_matrix, item_encoder, n=5):
    scores = model.predict(user_index, np.arange(sparse_matrix.shape[1]))
    top_items = np.argsort(-scores)[:n]
    return item_encoder.inverse_transform(top_items)

# Example: Recommend for user index 10
recommended = recommend_items(model, user_index=10, sparse_matrix=sparse_matrix, item_encoder=item_enc)
print("Top recommendations (item IDs):", recommended)

Top recommendations (item IDs): [461686 142466 394678 312728  12217]


In [13]:
# Re-load and combine item properties
props1 = pd.read_csv("/item_properties_part1.csv")
props2 = pd.read_csv("/item_properties_part2.csv")
item_props = pd.concat([props1, props2])

# Filter to only category values
item_categories = item_props[item_props["property"] == "categoryid"]

# Drop duplicates: one category per item (latest timestamp)
item_categories.sort_values("timestamp", ascending=False, inplace=True)
item_categories = item_categories.drop_duplicates(subset=["itemid"])
item_categories = item_categories[["itemid", "value"]].rename(columns={"value": "categoryid"})

# Ensure this matches the item_cat_ohe row order
itemid_encoded = item_categories["itemid"].values

# Build the index maps
itemid_to_index = {item_id: idx for idx, item_id in enumerate(itemid_encoded)}
index_to_itemid = {idx: item_id for item_id, idx in itemid_to_index.items()}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_categories.sort_values("timestamp", ascending=False, inplace=True)


In [14]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
item_cat_ohe = enc.fit_transform(item_categories[["categoryid"]])

# Map itemid to encoded category vector
itemid_encoded = item_categories["itemid"].values

In [15]:
from scipy.sparse import csr_matrix

user_profiles = {}

for user_id, group in interactions.groupby("userId"):
    item_ids = group["itemId"].values
    valid_idxs = [itemid_to_index[i] for i in item_ids if i in itemid_to_index]

    if not valid_idxs:
        continue

    # Initialize sparse sum
    sum_vec = csr_matrix((1, item_cat_ohe.shape[1]))

    for idx in valid_idxs:
        sum_vec += item_cat_ohe[idx]

    avg_vec = sum_vec / len(valid_idxs)
    user_profiles[user_id] = avg_vec

In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_content_based(user_id, top_n=5):
    if user_id not in user_profiles:
        return []

    user_vec = user_profiles[user_id]
    sims = cosine_similarity(user_vec, item_cat_ohe).flatten()
    top_indices = sims.argsort()[::-1]

    # Exclude items the user already interacted with
    seen = set(interactions[interactions["userId"] == user_id]["itemId"])
    recs = []
    for idx in top_indices:
        item_id = index_to_itemid[idx]
        if item_id not in seen:
            recs.append(item_id)
        if len(recs) >= top_n:
            break
    return recs

# Example: Recommend for userId = 12345
recommend_content_based(12345)


[]

In [17]:
def get_cf_scores(model, user_idx, num_items):
    return model.predict(user_idx, np.arange(num_items))

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_content_scores(user_id):
    if user_id not in user_profiles:
        return np.zeros(item_cat_ohe.shape[0])

    user_vec = user_profiles[user_id]
    sims = cosine_similarity(user_vec, item_cat_ohe).flatten()
    return sims


In [19]:
def hybrid_recommend(user_id, alpha=0.7, top_n=5):
    user_idx = user_id_map.get(user_id)
    if user_idx is None:
        return []

    # Get LightFM scores
    cf_scores = model.predict(user_idx, np.arange(sparse_matrix.shape[1]))

    # Get item index → itemId map
    item_indices = np.arange(len(cf_scores))
    item_ids_cf = item_enc.inverse_transform(item_indices)

    # Get content scores (aligned by itemId)
    content_scores = get_content_scores(user_id)

    matched_scores = np.zeros_like(cf_scores)
    for idx in range(len(cf_scores)):
        item_id = item_ids_cf[idx]
        content_idx = itemid_to_index.get(item_id)
        if content_idx is not None:
            matched_scores[idx] = content_scores[content_idx]

    final_scores = alpha * cf_scores + (1 - alpha) * matched_scores
    top_indices = final_scores.argsort()[::-1][:top_n]

    return item_enc.inverse_transform(top_indices)

In [21]:
interactions = pd.read_csv("rr_interactions_clean.csv")
interactions = interactions.sort_values("userId")  # Just to group nicely

# Convert timestamp again if needed
events = pd.read_csv("/events.csv")
events["timestamp"] = pd.to_datetime(events["timestamp"], unit='ms')
interactions = interactions.merge(events[["visitorid", "itemid", "timestamp"]],
                                  left_on=["userId", "itemId"],
                                  right_on=["visitorid", "itemid"],
                                  how="left")

# Sort by time and split: 80% train, 20% test (per user)
train_set = []
test_set = []

for uid, group in interactions.groupby("userId"):
    group = group.sort_values("timestamp")
    n = len(group)
    if n < 5:  # skip users with too few interactions
        continue
    cutoff = int(n * 0.8)
    train_set.append(group.iloc[:cutoff])
    test_set.append(group.iloc[cutoff:])

train_df = pd.concat(train_set)
test_df = pd.concat(test_set)

In [22]:
def precision_recall_at_k(user_id, k=5):
    true_items = test_df[test_df["userId"] == user_id]["itemId"].tolist()
    if not true_items:
        return None, None

    recs = hybrid_recommend(user_id, top_n=k)
    if recs is None or len(recs) == 0:
        return 0.0, 0.0

    hits = len(set(true_items) & set(recs))
    precision = hits / k
    recall = hits / len(true_items)
    return precision, recall

In [23]:
precisions = []
recalls = []

for uid in test_df["userId"].unique():
    p, r = precision_recall_at_k(uid, k=5)
    if p is not None:
        precisions.append(p)
        recalls.append(r)

print(f"Hybrid Precision@5: {np.mean(precisions):.4f}")
print(f"Hybrid Recall@5: {np.mean(recalls):.4f}")

KeyboardInterrupt: 

In [24]:
import pandas as pd

ratings = pd.read_csv("ml_ratings_clean.csv")
movies = pd.read_csv("ml_movies_clean.csv")

In [25]:
ratings = ratings[ratings["rating"] >= 4]

In [26]:
from sklearn.preprocessing import LabelEncoder

user_enc = LabelEncoder()
item_enc = LabelEncoder()

ratings['user_idx'] = user_enc.fit_transform(ratings['userId'])
ratings['item_idx'] = item_enc.fit_transform(ratings['movieId'])

user_id_map = dict(zip(ratings['userId'], ratings['user_idx']))
item_id_map = dict(zip(ratings['movieId'], ratings['item_idx']))

In [27]:
from scipy.sparse import coo_matrix

n_users = ratings['user_idx'].nunique()
n_items = ratings['item_idx'].nunique()

sparse_matrix = coo_matrix(
    (np.ones(len(ratings)), (ratings['user_idx'], ratings['item_idx'])),
    shape=(n_users, n_items)
)

In [28]:
from lightfm import LightFM

model = LightFM(loss='warp')
model.fit(sparse_matrix, epochs=10, num_threads=2)

<lightfm.lightfm.LightFM at 0x78be44b98b90>

In [31]:
# Convert genres to binary features
movies['genres'] = movies['genres'].str.replace('|', ' ')
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
item_genres_matrix = tfidf.fit_transform(movies['genres'])

# Keep only movies that appear in the ratings dataset
rated_movie_ids = ratings['movieId'].unique()
movies = movies[movies['movieId'].isin(rated_movie_ids)].reset_index(drop=True)

# Align to movieId order used in item_enc
movieid_to_index = dict(zip(movies['movieId'], movies.index))
itemid_encoded = item_enc.transform(movies['movieId'])
index_to_movieid = {v: k for k, v in item_id_map.items()}

In [32]:
from scipy.sparse import csr_matrix

user_profiles = {}

for user_id, group in ratings.groupby("userId"):
    item_ids = group["movieId"].values
    valid_idxs = [movieid_to_index[i] for i in item_ids if i in movieid_to_index]

    if not valid_idxs:
        continue

    sum_vec = csr_matrix((1, item_genres_matrix.shape[1]))
    for idx in valid_idxs:
        sum_vec += item_genres_matrix[idx]

    avg_vec = sum_vec / len(valid_idxs)
    user_profiles[user_id] = avg_vec

In [33]:
!pip install -q huggingface_hub

In [34]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
app_code = """
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix, csr_matrix
from lightfm import LightFM
import gradio as gr

# Load data
ratings = pd.read_csv("ml_ratings_clean.csv")
movies = pd.read_csv("ml_movies_clean.csv")

# Filter for positive implicit feedback
ratings = ratings[ratings["rating"] >= 4]

# Encode users/items
user_enc = LabelEncoder()
item_enc = LabelEncoder()
ratings["user_idx"] = user_enc.fit_transform(ratings["userId"])
ratings["item_idx"] = item_enc.fit_transform(ratings["movieId"])

user_id_map = dict(zip(ratings["userId"], ratings["user_idx"]))
item_id_map = dict(zip(ratings["movieId"], ratings["item_idx"]))
movie_id_lookup = dict(zip(movies["movieId"], movies["title"]))

# Sparse interaction matrix
n_users = ratings["user_idx"].nunique()
n_items = ratings["item_idx"].nunique()
sparse_matrix = coo_matrix(
    (np.ones(len(ratings)), (ratings["user_idx"], ratings["item_idx"])),
    shape=(n_users, n_items)
)

# Train LightFM model
model = LightFM(loss="warp")
model.fit(sparse_matrix, epochs=10, num_threads=2)

# Prepare genres
movies["genres"] = movies["genres"].str.replace("|", " ")
tfidf = TfidfVectorizer()
item_genres_matrix = tfidf.fit_transform(movies["genres"])
movieid_to_index = dict(zip(movies["movieId"], movies.index))

# Build user content profiles
user_profiles = {}
for user_id, group in ratings.groupby("userId"):
    item_ids = group["movieId"].values
    valid_idxs = [movieid_to_index[i] for i in item_ids if i in movieid_to_index]
    if not valid_idxs:
        continue
    sum_vec = csr_matrix((1, item_genres_matrix.shape[1]))
    for idx in valid_idxs:
        sum_vec += item_genres_matrix[idx]
    avg_vec = sum_vec / len(valid_idxs)
    user_profiles[user_id] = avg_vec

# Recommender function
def recommend_movies(user_id_str, method="Hybrid", alpha=0.7, k=5):
    user_id = int(user_id_str)
    if user_id not in user_id_map:
        return "❌ User ID not found."

    user_idx = user_id_map[user_id]
    item_indices = np.arange(sparse_matrix.shape[1])
    movie_ids_cf = item_enc.inverse_transform(item_indices)
    cf_scores = model.predict(user_idx, item_indices)

    if user_id in user_profiles:
        user_vec = user_profiles[user_id]
        content_scores = cosine_similarity(user_vec, item_genres_matrix).flatten()
    else:
        content_scores = np.zeros_like(cf_scores)

    if method == "Hybrid":
        final_scores = alpha * cf_scores + (1 - alpha) * content_scores
    elif method == "Collaborative":
        final_scores = cf_scores
    else:
        final_scores = content_scores

    top_indices = final_scores.argsort()[::-1]
    seen = set(ratings[ratings["userId"] == user_id]["movieId"])
    recs = []
    for idx in top_indices:
        mid = movie_ids_cf[idx]
        if mid not in seen:
            title = movie_id_lookup.get(mid, f"Movie ID {mid}")
            recs.append(f"🎬 {title}")
        if len(recs) == k:
            break

    return "\\n".join(recs) if recs else "No recommendations found."

# Gradio interface
user_choices = list(map(str, np.random.choice(ratings["userId"].unique(), 25, replace=False)))

interface = gr.Interface(
    fn=recommend_movies,
    inputs=[
        gr.Dropdown(choices=user_choices, label="Select User ID"),
        gr.Radio(choices=["Hybrid", "Collaborative", "Content-Based"], label="Model Type"),
        gr.Slider(minimum=0.0, maximum=1.0, value=0.7, label="Hybrid Weight (α)"),
        gr.Slider(minimum=1, maximum=10, value=5, label="Number of Recommendations")
    ],
    outputs=gr.Textbox(label="Recommended Movies"),
    title="🎥 MovieLens Hybrid Recommender",
    description="Personalized movie recommendations using hybrid, collaborative, or content-based filtering."
)

interface.launch()
"""

In [36]:
with open("app.py", "w") as f:
    f.write(app_code)

In [40]:
with open("requirements.txt", "w") as f:
    f.write("pandas\nnumpy\nscikit-learn\nscipy\nlightfm\ngradio")

In [42]:
from huggingface_hub import upload_file

# using username and space name to upload app
username = "zda23"
space_name = "hybrid-movielens-recommender"
repo_id = f"{username}/{space_name}"

# Upload the main app file
upload_file(
    path_or_fileobj="app.py",
    path_in_repo="app.py",
    repo_id=repo_id,
    repo_type="space"
)

# Upload the fixed requirements.txt
upload_file(
    path_or_fileobj="requirements.txt",
    path_in_repo="requirements.txt",
    repo_id=repo_id,
    repo_type="space"
)

# Upload your dataset files
upload_file(
    path_or_fileobj="ml_ratings_clean.csv",
    path_in_repo="ml_ratings_clean.csv",
    repo_id=repo_id,
    repo_type="space"
)

upload_file(
    path_or_fileobj="ml_movies_clean.csv",
    path_in_repo="ml_movies_clean.csv",
    repo_id=repo_id,
    repo_type="space"
)

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/spaces/zda23/hybrid-movielens-recommender/commit/4ccf0b8c35a674beede343fc8e310af1da24df51', commit_message='Upload ml_movies_clean.csv with huggingface_hub', commit_description='', oid='4ccf0b8c35a674beede343fc8e310af1da24df51', pr_url=None, repo_url=RepoUrl('https://huggingface.co/spaces/zda23/hybrid-movielens-recommender', endpoint='https://huggingface.co', repo_type='space', repo_id='zda23/hybrid-movielens-recommender'), pr_revision=None, pr_num=None)

In [47]:
!ls *.ipynb

ls: cannot access '*.ipynb': No such file or directory


In [48]:
!nbstripout Hybrid_Recommendation_System_with_RetailRocket_and_MovieLens.ipynb