In [1]:
# Movie Recommender System (MRS) + Examples

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Movie Recommender Class

class MRS:
    def __init__(self, data, user_column='user_id', movie_column='item_id', rating_column='rating',
                 recommend_count=10, k_value=5, mode='user-based'):
        self.user_col = user_column
        self.item_col = movie_column
        self.rating_col = rating_column
        self.recommend_count = recommend_count
        self.k_value = k_value
        self.mode = mode

        self.user_item_matrix = data.pivot(index=self.user_col, columns=self.item_col, values=self.rating_col)

        # Compute similarity
        if self.mode == "user-based":
            matrix = self.user_item_matrix.fillna(0).values
            self.similarity = cosine_similarity(matrix)
            self.similarity = pd.DataFrame(self.similarity,
                                           index=self.user_item_matrix.index,
                                           columns=self.user_item_matrix.index)
        elif self.mode == "item-based":
            matrix = self.user_item_matrix.fillna(0).T.values
            self.similarity = cosine_similarity(matrix)
            self.similarity = pd.DataFrame(self.similarity,
                                           index=self.user_item_matrix.columns,
                                           columns=self.user_item_matrix.columns)

    def __knn(self, idx):
        sims = self.similarity.loc[idx].sort_values(ascending=False)
        return sims.iloc[1:self.k_value + 1]

    def __predict(self, user_id, item_id):
        if self.mode == "user-based":
            if pd.isna(self.user_item_matrix.loc[user_id, item_id]):
                neighbors = self.__knn(user_id)
                num, den = 0, 0
                for other, sim in neighbors.items():
                    if not pd.isna(self.user_item_matrix.loc[other, item_id]):
                        num += sim * self.user_item_matrix.loc[other, item_id]
                        den += sim
                return num / den if den > 0 else 0
            return self.user_item_matrix.loc[user_id, item_id]
        elif self.mode == "item-based":
            if pd.isna(self.user_item_matrix.loc[user_id, item_id]):
                neighbors = self.__knn(item_id)
                num, den = 0, 0
                for other, sim in neighbors.items():
                    if not pd.isna(self.user_item_matrix.loc[user_id, other]):
                        num += sim * self.user_item_matrix.loc[user_id, other]
                        den += sim
                return num / den if den > 0 else 0
            return self.user_item_matrix.loc[user_id, item_id]

    def recommend(self, user_id, n_recs=None):
        n_recs = n_recs or self.recommend_count
        unrated = self.user_item_matrix.loc[user_id][self.user_item_matrix.loc[user_id].isna()].index
        preds = {item: self.__predict(user_id, item) for item in unrated}
        ranked = sorted(preds.items(), key=lambda x: x[1], reverse=True)
        return pd.Index([i[0] for i in ranked[:n_recs]]), np.array([i[1] for i in ranked[:n_recs]])

    def fit_svd(self, n_components=20):
        matrix = self.user_item_matrix.fillna(0).values
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        U = svd.fit_transform(matrix)
        VT = svd.components_
        self.__predicted_svd = pd.DataFrame(np.dot(U, VT),
                                            index=self.user_item_matrix.index,
                                            columns=self.user_item_matrix.columns)
        print(f"SVD fitted with {n_components} latent factors.")

    def recommend_svd(self, user_id, top_n=10):
        preds = self.__predicted_svd.loc[user_id]
        ranked = preds.sort_values(ascending=False).head(top_n)
        return pd.Index(ranked.index), np.array(ranked.values)

    def evaluate(self, test_df, k=10):
        precisions = []
        for user_id in test_df['user_id'].unique():
            recs, _ = self.recommend(user_id, n_recs=k)
            relevant = test_df[(test_df['user_id'] == user_id) & (test_df['rating'] >= 4)]['item_id'].values
            if len(relevant) == 0:
                continue
            hits = len(set(recs).intersection(set(relevant)))
            precisions.append(hits / k)
        return np.mean(precisions) if precisions else 0

    def evaluate_svd(self, test_df, n_components=20, k=10):
        self.fit_svd(n_components)
        precisions = []
        for user_id, group in test_df.groupby('user_id'):
            recs, _ = self.recommend_svd(user_id, top_n=k)
            relevant = group[group['rating'] >= 4]['item_id'].values
            hits = len(set(recs).intersection(set(relevant)))
            precisions.append(hits / k if k > 0 else 0)
        return np.mean(precisions) if precisions else 0


# Example Usage

columns = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv('u.data', sep='\t', names=columns)
train = pd.read_csv('u1.base', sep='\t', names=columns)
test = pd.read_csv('u1.test', sep='\t', names=columns)

# Basic recommendation

mrs = MRS(data, user_column='user_id', movie_column='item_id', rating_column='rating')
print(mrs.recommend(1))

# Item-based evaluation

mrs = MRS(data=train, user_column='user_id', movie_column='item_id', rating_column='rating', mode="item-based", recommend_count=10, k_value=1)
print(mrs.evaluate(test))

# User-based evaluation

mrs = MRS(data=train, user_column='user_id', movie_column='item_id', rating_column='rating', mode="user-based", recommend_count=10, k_value=1)
print(mrs.evaluate(test))

# SVD recommendations

mrs.fit_svd(n_components=40)
print(mrs.recommend_svd(1, top_n=5))
print(mrs.evaluate_svd(test, k=5))


(Index([343, 919, 283, 302, 313, 331, 430, 483, 508, 512], dtype='int64'), array([5., 5., 5., 5., 5., 5., 5., 5., 5., 5.]))
0.19495614035087722
0.2199561403508772
SVD fitted with 40 latent factors.
(Index([50, 127, 172, 181, 1], dtype='int64', name='item_id'), array([6.25413965, 5.07909597, 4.51530785, 4.2789447 , 4.20063098]))
SVD fitted with 20 latent factors.
0.1411764705882353
