# Content-based Recommendation System

This notebook implements a **Content-Based Recommender System** on the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/), using genre metadata as item features.


## Objective

Build a personalized recommender that suggests movies to users based on their preferences for movie **genres**.

In [220]:
import pandas as pd
import numpy as np

## Load the dataset

- Get the number of users and items:

In [221]:
# u.info is read as a space-separated, header-less table; the first column of
# its first two rows is taken as the user count and the item count.
data_info = pd.read_csv(
    '../data/ml-100k/u.info',
    sep=' ',
    header=None,
)
n_users, n_items = data_info.iloc[0, 0], data_info.iloc[1, 0]
print('Number of users:', n_users)
print('Number of items:', n_items)

Number of users: 943
Number of items: 1682


- Get genres of items:

In [222]:
# Keep only the first '|'-separated column of u.genre and expose it as a
# single 'genre' column; the row order defines the genre column order below.
movie_genres = pd.read_csv(
    '../data/ml-100k/u.genre',
    sep='|',
    encoding='latin-1',
    names=['genre'],
    usecols=[0],
)
movie_genres

Unnamed: 0,genre
0,unknown
1,Action
2,Adventure
3,Animation
4,Children's
5,Comedy
6,Crime
7,Documentary
8,Drama
9,Fantasy


## Build the item profiles

In [223]:
# Item profiles: one row per movie, indexed by item_id (file column 0), with
# file columns 5-23 read as the genre columns and named after movie_genres.
movies = pd.read_csv(
    '../data/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, *range(5, 24)],
    names=['item_id', *movie_genres['genre'].to_list()],
    index_col=0,
)
movies

Unnamed: 0_level_0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


## Data Preprocessing

Convert the binary genre indicators into TF-IDF-weighted item features.

In [224]:
from sklearn.feature_extraction.text import TfidfTransformer
# Turn the genre indicator table into TF-IDF weights, then wrap the densified
# result in a DataFrame that keeps the movie index and the genre column names.
tfidf_transformer = TfidfTransformer()
tfidf_item_matrix = tfidf_transformer.fit_transform(movies)
item_features = pd.DataFrame(
    data=tfidf_item_matrix.toarray(),
    index=movies.index,
    columns=tfidf_transformer.get_feature_names_out(),
)

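## Fit a model for each user

Each user gets their own ridge regression that maps an item's TF-IDF genre features to that user's rating.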

In [225]:
from sklearn.linear_model import Ridge

class ContentBasedRecommender:
    """Content-based recommender: one ridge regression per user.

    For every user a separate ``Ridge`` model is fitted that maps the feature
    vector of an item to the rating that user gave it. The learned
    coefficients and intercepts are stacked into ``W`` and ``b`` so that a
    prediction is ``W[user] . features[item] + b[user]``.

    The item features passed to ``fit`` are stored on the instance, so
    ``predict`` and ``evaluate`` depend only on fitted state and not on
    notebook-level globals.
    """

    def __init__(self, ):
        # (n_users, n_features) ridge coefficients; row ``u - 1`` is user ``u``.
        self.W = None
        # (n_users, 1) ridge intercepts, indexed like ``W``.
        self.b = None
        # Item feature frame captured by ``fit``; looked up by item id.
        self.item_features = None

    def fit(self, training_ratings, item_features):
        """Fit one ridge model per user.

        Parameters
        ----------
        training_ratings : pd.DataFrame
            Must contain ``user_id``, ``item_id`` and ``rating`` columns.
            User ids are treated as 1-based; ids below 1 are ignored.
        item_features : pd.DataFrame
            Feature rows indexed by item id. Every rated item must be present.

        Raises
        ------
        ValueError
            If ``training_ratings`` is empty.
        """
        if len(training_ratings) == 0:
            raise ValueError("Cannot fit on an empty ratings table.")

        n_users = int(training_ratings['user_id'].max())
        n_features = item_features.shape[1]

        self.item_features = item_features
        self.W = np.zeros((n_users, n_features))
        # Users without any training rating keep zero weights and fall back to
        # the global mean rating instead of making the whole fit fail.
        self.b = np.full((n_users, 1), float(training_ratings['rating'].mean()))

        # Group once instead of re-filtering the full table for every user.
        for user_id, user_ratings in training_ratings.groupby('user_id'):
            if user_id < 1:
                continue

            model = Ridge(alpha=0.01, fit_intercept=True)
            model.fit(item_features.loc[user_ratings['item_id']], user_ratings['rating'])

            self.W[user_id - 1] = model.coef_
            self.b[user_id - 1] = model.intercept_

    def _check_fitted(self):
        """Raise ``ValueError`` unless weights and item features are available."""
        if self.W is None or self.b is None or self.item_features is None:
            raise ValueError("Model has not been fitted yet.")

    def predict(self, user_id, item_id):
        """Predict the rating of ``item_id`` by ``user_id`` as a Python float.

        Raises
        ------
        ValueError
            If the model is not fitted, the user id is outside
            ``1..n_fitted_users``, or the item has no feature row.
        """
        self._check_fitted()
        if user_id < 1 or user_id > self.W.shape[0]:
            raise ValueError("User ID out of range.")
        if item_id not in self.item_features.index:
            raise ValueError("Item ID out of range.")

        user_idx = int(user_id) - 1
        score = np.dot(self.W[user_idx], self.item_features.loc[item_id]) + self.b[user_idx, 0]
        # ``b`` is stored as a column vector; return a scalar, not a 1-element array.
        return float(score)

    def evaluate(self, test_ratings: pd.DataFrame):
        """Return the mean squared error over ``test_ratings`` as a float.

        ``test_ratings`` needs ``user_id``, ``item_id`` and ``rating`` columns.

        Raises
        ------
        ValueError
            If the model is not fitted, ``test_ratings`` is empty, or it
            contains a user or item the model has no parameters/features for.
        """
        self._check_fitted()
        if len(test_ratings) == 0:
            raise ValueError("Cannot evaluate on an empty ratings table.")

        user_ids = test_ratings['user_id'].to_numpy()
        item_ids = test_ratings['item_id'].to_numpy()
        if (user_ids < 1).any() or (user_ids > self.W.shape[0]).any():
            raise ValueError("User ID out of range.")
        if not np.isin(item_ids, self.item_features.index.to_numpy()).all():
            raise ValueError("Item ID out of range.")

        # Score all rows at once: row-wise dot product of each rating's user
        # weights with its item features, plus that user's intercept.
        user_idx = user_ids.astype(int) - 1
        features = self.item_features.loc[item_ids].to_numpy()
        predictions = (self.W[user_idx] * features).sum(axis=1) + self.b[user_idx, 0]

        errors = test_ratings['rating'].to_numpy() - predictions
        return float(np.mean(errors ** 2))

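## Evaluate the model

Fit and score the recommender on each of the five `u{i}.base` / `u{i}.test` splits, reporting the mean squared error on the training and test ratings.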

In [227]:
# Fit and score on each of the five u{i}.base / u{i}.test file pairs.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
for i in range(1, 6):
    training_ratings = pd.read_csv(f'../data/ml-100k/u{i}.base', sep='\t',
                                   names=rating_columns)
    test_ratings = pd.read_csv(f'../data/ml-100k/u{i}.test', sep='\t',
                               names=rating_columns)

    # A fresh recommender per split so no weights carry over between splits.
    recommender = ContentBasedRecommender()
    recommender.fit(training_ratings, item_features)

    mse_training = recommender.evaluate(training_ratings)
    mse_test = recommender.evaluate(test_ratings)
    print(f"Iteration {i}: MSE Training: {mse_training}, MSE Test: {mse_test}")

Iteration 1: MSE Training: [0.79551039], MSE Test: [1.36234281]
Iteration 2: MSE Training: [0.7973019], MSE Test: [1.29353095]
Iteration 3: MSE Training: [0.8046709], MSE Test: [1.27644746]
Iteration 4: MSE Training: [0.80495259], MSE Test: [1.31225258]
Iteration 5: MSE Training: [0.80473013], MSE Test: [1.31995933]
