# Anime Recommender Collaborative Filtering

In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


from scipy.sparse import csr_matrix
from math import sqrt
from tqdm import tqdm

- Memory based (KNN)
  - User based
  - Item based
- Modal based (Matrix Factorization)

## User-Item Matrix

In [4]:
anime_df = pd.read_csv('data/anime.csv')
anime_df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
rating_df = pd.read_csv('data/rating.csv')
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [6]:
rating_df.drop_duplicates(subset=['user_id', 'anime_id'], keep='first', inplace=True)

In [7]:
rating_df['rating'] = rating_df['rating'].replace(-1, np.nan)

# Create user-item matrix
user_anime_matrix = rating_df.pivot(index='user_id', columns='anime_id', values='rating')

In [8]:
# Normalize the ratings
user_anime_matrix_normalized = user_anime_matrix.apply(lambda x: (x - np.nanmean(x)) / np.nanstd(x), axis=1)

# Fill missing values with 0
user_anime_matrix_filled = user_anime_matrix_normalized.fillna(0)

  user_anime_matrix_normalized = user_anime_matrix.apply(lambda x: (x - np.nanmean(x)) / np.nanstd(x), axis=1)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


In [9]:
user_anime_matrix_filled.head()

anime_id,1,5,6,7,8,15,16,17,18,19,...,34283,34324,34325,34349,34358,34367,34412,34475,34476,34519
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.532301,0.0,0.0,0.691504,0.0,0.691504,0.691504,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
user_anime_matrix_filled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73515 entries, 1 to 73516
Columns: 11200 entries, 1 to 34519
dtypes: float64(11200)
memory usage: 6.1 GB


## Util Functions

In [42]:
def train_test_split_sparse(matrix, test_size=0.2, seed=42):
    """Split a sparse matrix into a training and a test set."""
    np.random.seed(seed)

    # Make a copy of the original matrix
    train_matrix = matrix.copy()
    test_matrix = matrix.copy()

    # Mask the test set entries in the training matrix
    non_zero_indices = train_matrix.nonzero()
    non_zero_pairs = list(zip(non_zero_indices[0], non_zero_indices[1]))

    # Sample test set entries
    test_sample_size = int(len(non_zero_pairs) * test_size)
    test_pairs = np.random.choice(len(non_zero_pairs), size=test_sample_size, replace=False)

    for idx in test_pairs:
        train_matrix[non_zero_pairs[idx]] = 0  # Set to zero in the training set
        test_matrix[non_zero_pairs[idx]] = matrix[non_zero_pairs[idx]]  # Preserve the value in the test set

    # Eliminate zero entries in test set
    train_matrix.eliminate_zeros()
    test_matrix.eliminate_zeros()

    return train_matrix, test_matrix

In [67]:
def predict_ratings(user_index, item_index, train_matrix, user_similarity_matrix, top_n=20):
    """Predict ratings for a given user and item based on user-based collaborative filtering."""
    # Get the similarity scores for the given user
    if isinstance(user_similarity_matrix, np.ndarray):
        user_similarities = user_similarity_matrix[user_index]
    else:
        user_similarities = user_similarity_matrix[user_index].toarray().flatten()    
    # Get the ratings of the item by all users
    item_ratings = train_matrix[:, item_index].toarray().flatten()
    
    # Select only users who have rated the item
    rated_users = item_ratings.nonzero()[0]
    
    # If no users have rated the item, return NaN
    if len(rated_users) == 0:
        return np.nan

    # Select top-N similar users who have rated the item
    top_similar_users = np.argsort(user_similarities[rated_users])[-top_n:]
    rated_users = rated_users[top_similar_users]

    # Compute weighted average rating
    weighted_sum = np.dot(user_similarities[rated_users], item_ratings[rated_users])
    sum_of_weights = np.sum(user_similarities[rated_users])
    
    return weighted_sum / sum_of_weights if sum_of_weights != 0 else np.nan


In [61]:
def evaluate_model(train_matrix, test_matrix, user_similarity_matrix, similarity_metric_name):
    """Evaluate model performance on the test set."""
    true_ratings = []
    predicted_ratings = []

    # Get the non-zero entries in the test set
    test_non_zero_indices = test_matrix.nonzero()
    test_non_zero_pairs = list(zip(test_non_zero_indices[0], test_non_zero_indices[1]))

    # Predict ratings for all non-zero entries in the test set
    for user_index, item_index in tqdm(test_non_zero_pairs, desc=f"Evaluating model ({similarity_metric_name})"):
        # Get the true rating from the test matrix
        true_rating = test_matrix[user_index, item_index]
        
        # Predict the rating
        predicted_rating = predict_ratings(user_index, item_index, train_matrix, user_similarity_matrix)

        # Store the true and predicted ratings
        if not np.isnan(predicted_rating):
            true_ratings.append(true_rating)
            predicted_ratings.append(predicted_rating)

    # Calculate Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R2 Score
    mae = mean_absolute_error(true_ratings, predicted_ratings)
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    r2 = r2_score(true_ratings, predicted_ratings)

    print(f"[{similarity_metric_name}] Mean Absolute Error (MAE): {mae:.4f}")
    print(f"[{similarity_metric_name}] Root Mean Squared Error (RMSE): {rmse:.4f}")
    print(f"[{similarity_metric_name}] R2 Score: {r2:.4f}")

    return mae, rmse, r2

## User Based Filtering

In [48]:
user_item_sparse = csr_matrix(user_anime_matrix_filled[:1000].values)

In [49]:
user_item_sparse

<1000x11200 sparse matrix of type '<class 'numpy.float64'>'
	with 77282 stored elements in Compressed Sparse Row format>

In [53]:
train_sparse, test_sparse = train_test_split_sparse(user_item_sparse, test_size=0.2)

In [56]:
# Calculate user-user similarity matrix using cosine similarity on the training set
user_similarity_cosine = cosine_similarity(train_sparse, dense_output=False)
print("User similarity matrix (cosine) calculated.")

# Calculate user-user similarity matrix using Pearson correlation
user_similarity_pearson = 1 - pairwise_distances(train_sparse.toarray(), metric='correlation')
user_similarity_pearson = np.nan_to_num(user_similarity_pearson)  # Replace NaNs with 0
print("User similarity matrix (Pearson) calculated.")

User similarity matrix (cosine) calculated.
User similarity matrix (Pearson) calculated.


In [68]:
print("\nEvaluating with Cosine Similarity:")
mae_cosine, rmse_cosine, r2_cosine = evaluate_model(train_sparse, test_sparse, user_similarity_cosine, "Cosine Similarity")

print("\nEvaluating with Pearson Correlation:")
mae_pearson, rmse_pearson, r2_pearson = evaluate_model(train_sparse, test_sparse, user_similarity_pearson, "Pearson Correlation")


Evaluating with Cosine Similarity:


Evaluating model (Cosine Similarity): 100%|████████████████████████████████████████████| 77282/77282 [00:08<00:00, 9036.86it/s]


[Cosine Similarity] Mean Absolute Error (MAE): 0.3922
[Cosine Similarity] Root Mean Squared Error (RMSE): 1.0462
[Cosine Similarity] R2 Score: -0.0938

Evaluating with Pearson Correlation:


Evaluating model (Pearson Correlation): 100%|█████████████████████████████████████████| 77282/77282 [00:06<00:00, 11574.36it/s]

[Pearson Correlation] Mean Absolute Error (MAE): 0.3922
[Pearson Correlation] Root Mean Squared Error (RMSE): 1.0429
[Pearson Correlation] R2 Score: -0.0870





## Item Based Filtering

## Matrix Factorization