In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from collections import defaultdict
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## USER CONFIG / PATHS

In [10]:
MOVIE_INFO_CSV = r"/content/drive/MyDrive/Project /Intrenship/DataSets/movie/movie_info.csv"
RATINGS_CSV   = r"/content/drive/MyDrive/Project /Intrenship/DataSets/movie/movie_ratings.csv"
USERS_CSV     = r"/content/drive/MyDrive/Project /Intrenship/DataSets/movie/movie_user_demographics.csv"

##  1) Load / Inspect

In [11]:
def load_data(info_path=MOVIE_INFO_CSV, ratings_path=RATINGS_CSV, users_path=USERS_CSV):
    df_info = pd.read_csv(info_path)
    df_ratings = pd.read_csv(ratings_path)
    df_users = pd.read_csv(users_path)
    print("Loaded:\n", df_info.shape, "movie info |", df_ratings.shape, "ratings |", df_users.shape, "users")
    return df_info, df_ratings, df_users

## 2) Preprocess

In [12]:
def build_rating_matrix(df_ratings):
    """Pivot ratings into a user-item matrix with NaNs for missing ratings."""
    rating_matrix = df_ratings.pivot(index='user_id', columns='movie_id', values='rating')
    return rating_matrix

##  3) Similarities

In [13]:
def compute_user_similarity(rating_matrix):
    """Mean-center each user's ratings then compute cosine similarity.

    Returns DataFrame where index/columns are user_id.
    """
    user_ids = rating_matrix.index
    # Mean-center by user (subtract user mean); leave NaNs -> fill with 0 after centering
    user_mean = rating_matrix.mean(axis=1)
    rating_centered = rating_matrix.sub(user_mean, axis=0).fillna(0)
    sim = cosine_similarity(rating_centered)
    sim_df = pd.DataFrame(sim, index=user_ids, columns=user_ids)
    return sim_df


In [14]:
def compute_item_similarity(rating_matrix):
    """Compute item-item cosine similarity. Mean-center by item then fill NaNs with 0 for similarity calc."""
    movie_ids = rating_matrix.columns
    item_mean = rating_matrix.mean(axis=0)
    item_centered = rating_matrix.sub(item_mean, axis=1).T.fillna(0)  # items x users
    sim = cosine_similarity(item_centered)
    sim_df = pd.DataFrame(sim, index=movie_ids, columns=movie_ids)
    return sim_df

##  4) Recommendation

In [16]:
def user_based_recommend(user_id, rating_matrix, user_sim_df, df_info, k=10, n_recommendations=10):
    """Recommend movies for a user using weighted sum of other users' ratings.

    - rating_matrix: user x item (with NaNs)
    - user_sim_df: user x user similarity
    Returns a DataFrame of top-n recommended movie_id + predicted score + title
    """
    if user_id not in rating_matrix.index:
        return pd.DataFrame(columns=['movie_id', 'pred_score', 'movie_title'])

    # mappings and arrays
    user_index = list(rating_matrix.index)
    movie_index = list(rating_matrix.columns)
    user_pos = user_index.index(user_id)

    sim_row = user_sim_df.loc[user_id].values  # similarity to all users
    ratings_filled = rating_matrix.fillna(0).values  # users x movies

    # For each movie j: compute weighted sum over other users who rated that movie
    # numerator = sum(sim_i * rating_i_j)
    # denom = sum(abs(sim_i) for i who rated j)
    numerators = sim_row.dot(ratings_filled)  # shape (movies,)

    # Build mask of which users rated each movie
    rated_mask = (~rating_matrix.isna()).values.T  # movies x users
    # denom: for each movie j, sum abs(sim) for users who rated j
    denom = np.abs(sim_row).dot(rated_mask.T)

    # Avoid division by zero
    denom = np.where(denom == 0, 1e-9, denom)
    pred_scores = numerators / denom

    # Turn into Series aligned with movie ids
    preds = pd.Series(pred_scores, index=movie_index)

    # Exclude movies user has already rated
    user_rated = rating_matrix.loc[user_id].dropna().index
    preds = preds.drop(index=user_rated, errors='ignore')

    topn = preds.sort_values(ascending=False).head(n_recommendations).reset_index()
    topn.columns = ['movie_id', 'pred_score']
    topn = topn.merge(df_info[['movie_id', 'movie_title']], on='movie_id', how='left')
    return topn


In [24]:
def item_based_recommend(user_id, rating_matrix, item_sim_df, df_info, n_recommendations=10):
    """Item-based CF: predicted_score for item j = sum_over_items_i (sim(j,i) * rating(user,i)) / sum_abs_sim
    """
    if user_id not in rating_matrix.index:
        return pd.DataFrame(columns=['movie_id', 'pred_score', 'movie_title'])

    movie_index = list(rating_matrix.columns)
    user_row = rating_matrix.loc[user_id]
    user_ratings = user_row.dropna()

    if user_ratings.empty:
        return pd.DataFrame(columns=['movie_id', 'pred_score', 'movie_title'])

  # Weighted sum: for each candidate movie j we compute weight from rated items
    scores = {}
    for candidate in movie_index:
        if pd.notna(user_row.get(candidate)):
            continue  # skip already rated
        sims = item_sim_df.loc[candidate, user_ratings.index].values  # sim between candidate and each rated
        numer = (sims * user_ratings.values).sum()
        denom = np.abs(sims).sum()
        score = numer / (denom if denom != 0 else 1e-9)
        scores[candidate] = score

    preds = pd.Series(scores).sort_values(ascending=False).head(n_recommendations).reset_index()
    preds.columns = ['movie_id', 'pred_score']
    preds = preds.merge(df_info[['movie_id', 'movie_title']], on='movie_id', how='left')
    return preds

In [25]:
def svd_recommend(user_id, rating_matrix, df_info, n_components=50, n_recommendations=10):
    """SVD-based recommender (TruncatedSVD on user-item matrix with missing filled by user mean)
    Note: This is a simple demonstration. Better MF algorithms (e.g., surprise/PyTorch) often perform better.
    """
    if user_id not in rating_matrix.index:
        return pd.DataFrame(columns=['movie_id', 'pred_score', 'movie_title'])

    user_ids = rating_matrix.index
    movie_ids = rating_matrix.columns

    # Fill missing with user mean
    user_mean = rating_matrix.mean(axis=1)
    filled = rating_matrix.copy()
    for u in user_ids:
        filled.loc[u] = filled.loc[u].fillna(user_mean.loc[u])

    X = filled.values  # users x movies
    # TruncatedSVD on items (approx factorization)
    svd = TruncatedSVD(n_components=min(n_components, X.shape[1]-1), random_state=RANDOM_STATE)
    user_factors = svd.fit_transform(X)  # users x k
    item_factors = svd.components_.T      # movies x k

    reconstructed = user_factors.dot(item_factors.T)  # users x movies
    preds = pd.Series(reconstructed[user_ids.get_loc(user_id)], index=movie_ids)

    # Exclude already rated
    already_rated = rating_matrix.loc[user_id].dropna().index
    preds = preds.drop(index=already_rated, errors='ignore')

    topn = preds.sort_values(ascending=False).head(n_recommendations).reset_index()
    topn.columns = ['movie_id', 'pred_score']
    topn = topn.merge(df_info[['movie_id', 'movie_title']], on='movie_id', how='left')
    return topn


## 5) Evaluation

In [26]:
def train_test_split_per_user(df_ratings, test_size=0.2, min_ratings=5):
    """For each user with >= min_ratings, hold out a fraction of their ratings for test.
    Returns train_df, test_df
    """
    train_list = []
    test_list = []
    for user, group in df_ratings.groupby('user_id'):
        if len(group) < min_ratings:
            train_list.append(group)
            continue
        train_g, test_g = train_test_split(group, test_size=test_size, random_state=RANDOM_STATE)
        train_list.append(train_g)
        test_list.append(test_g)
    train_df = pd.concat(train_list).reset_index(drop=True)
    test_df = pd.concat(test_list).reset_index(drop=True) if test_list else pd.DataFrame(columns=df_ratings.columns)
    return train_df, test_df

In [27]:
def precision_at_k(recommended_list, ground_truth, k):
    """recommended_list: list of movie ids (ordered), ground_truth: set of relevant movie ids
       We consider relevant if rating >= 4 in test set (configurable outside)
    """
    if len(recommended_list) == 0:
        return 0.0
    recommended_at_k = recommended_list[:k]
    hits = sum([1 for m in recommended_at_k if m in ground_truth])
    return hits / k

In [28]:
def evaluate_model(recommender_func, rating_matrix_train, train_df, test_df, df_info, K_list=[5,10], method_name="model", **kwargs):
    """Evaluate recommender function which must accept (user_id, rating_matrix_train, ...) and return DataFrame with 'movie_id'.
    ground-truth: items in test_df with rating >= 4 are considered relevant.
    Returns average Precision@K over all users that have relevant items in test set.
    """
    # Build dict of ground truth per user (only keep items with rating >=4)
    gt = defaultdict(set)
    for _, row in test_df.iterrows():
        if row['rating'] >= 4:
            gt[row['user_id']].add(row['movie_id'])

    users_to_eval = [u for u in gt.keys() if u in rating_matrix_train.index]
    print(f"Evaluating {method_name} on {len(users_to_eval)} users (users with relevant test items)")

    results = {k: [] for k in K_list}
    for user in users_to_eval:
        recs = recommender_func(user, rating_matrix_train, df_info=df_info, **kwargs)
        rec_ids = recs['movie_id'].tolist() if not recs.empty else []
        for k in K_list:
            p = precision_at_k(rec_ids, gt[user], k)
            results[k].append(p)

    avg_results = {f'Precision@{k}': np.mean(results[k]) if results[k] else 0.0 for k in K_list}
    print(f"{method_name} results:", avg_results)
    return avg_results

## 6) Example workflow

In [30]:
def main():
    df_info, df_ratings, df_users = load_data()

    # Normalize column names after loading
    df_info = normalize_cols(df_info)
    df_ratings = normalize_cols(df_ratings)
    df_users = normalize_cols(df_users)


    # Quick filtering / sanity check: ensure ids are ints
    df_ratings['user_id'] = df_ratings['user_id'].astype(int)
    df_ratings['movie_id'] = df_ratings['movie_id'].astype(int)

    # Train/test split per user
    train_df, test_df = train_test_split_per_user(df_ratings, test_size=0.2, min_ratings=5)
    print('Train ratings:', train_df.shape, 'Test ratings:', test_df.shape)

    # Build rating matrix on train data
    rating_matrix_train = build_rating_matrix(train_df)

    # ----- User-based CF -----
    print('\nComputing user similarity...')
    user_sim_df = compute_user_similarity(rating_matrix_train)

    # Wrap user-based recommender to match evaluate_model signature
    def user_recommender_wrapper(user_id, rating_matrix_train, df_info=None, n_recommendations=10):
        return user_based_recommend(user_id, rating_matrix_train, user_sim_df, df_info, n_recommendations=n_recommendations)

    # Evaluate user-based CF
    user_results = evaluate_model(user_recommender_wrapper, rating_matrix_train, train_df, test_df, df_info, K_list=[5,10], method_name='User-CF', n_recommendations=10)

    # ----- Item-based CF -----
    print('\nComputing item similarity...')
    item_sim_df = compute_item_similarity(rating_matrix_train)

    def item_recommender_wrapper(user_id, rating_matrix_train, df_info=None, n_recommendations=10):
        return item_based_recommend(user_id, rating_matrix_train, item_sim_df, df_info, n_recommendations=n_recommendations)

    item_results = evaluate_model(item_recommender_wrapper, rating_matrix_train, train_df, test_df, df_info, K_list=[5,10], method_name='Item-CF', n_recommendations=10)

    # ----- SVD-based -----
    print('\nRunning SVD-based recommender...')
    def svd_recommender_wrapper(user_id, rating_matrix_train, df_info=None, n_recommendations=10):
        return svd_recommend(user_id, rating_matrix_train, df_info, n_components=50, n_recommendations=n_recommendations)

    svd_results = evaluate_model(svd_recommender_wrapper, rating_matrix_train, train_df, test_df, df_info, K_list=[5,10], method_name='SVD', n_recommendations=10)

    print('\nSummary:')
    print('User-CF:', user_results)
    print('Item-CF:', item_results)
    print('SVD   :', svd_results)

    # Example: show top 10 recommendations for a specific user (change user_id as needed)
    example_user = rating_matrix_train.index[0]
    print(f"\nTop 10 User-CF recommendations for user {example_user}:")
    print(user_recommender_wrapper(example_user, rating_matrix_train, df_info=df_info, n_recommendations=10))

    print(f"\nTop 10 Item-CF recommendations for user {example_user}:")
    print(item_recommender_wrapper(example_user, rating_matrix_train, df_info=df_info, n_recommendations=10))

    print(f"\nTop 10 SVD recommendations for user {example_user}:")
    print(svd_recommender_wrapper(example_user, rating_matrix_train, df_info=df_info, n_recommendations=10))


if __name__ == '__main__':
    main()

Loaded:
 (1682, 22) movie info | (100000, 4) ratings | (943, 5) users
Train ratings: (79619, 4) Test ratings: (20381, 4)

Computing user similarity...
Evaluating User-CF on 929 users (users with relevant test items)
User-CF results: {'Precision@5': np.float64(0.000861141011840689), 'Precision@10': np.float64(0.0005382131324004305)}

Computing item similarity...
Evaluating Item-CF on 929 users (users with relevant test items)
Item-CF results: {'Precision@5': np.float64(0.08245425188374596), 'Precision@10': np.float64(0.07793326157158235)}

Running SVD-based recommender...
Evaluating SVD on 929 users (users with relevant test items)
SVD results: {'Precision@5': np.float64(0.1618945102260495), 'Precision@10': np.float64(0.13078579117330463)}

Summary:
User-CF: {'Precision@5': np.float64(0.000861141011840689), 'Precision@10': np.float64(0.0005382131324004305)}
Item-CF: {'Precision@5': np.float64(0.08245425188374596), 'Precision@10': np.float64(0.07793326157158235)}
SVD   : {'Precision@5': 

#