In [1]:
import numpy as np 
import scipy
import pandas as pd 
import sklearn
import matplotlib.pyplot as plt
from skimage import io
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import random
import math
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [4]:
anime_df = pd.read_csv('anime.csv')
rating_df = pd.read_csv('rating2.csv')

In [5]:
#replace -1 rating to null value
rating_df['rating'].replace(-1, np.nan, inplace=True)

In [6]:
# Display the first few rows to understand what each dataset looks like
print(anime_df.head())
print(rating_df.head())

# Get summary statistics and info about the datasets
print(anime_df.describe())
print(rating_df.describe())

print(anime_df.info())
print(rating_df.info())

# Check for missing values
print(anime_df.isnull().sum())
print(rating_df.isnull().sum())

   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262  
3   673572  
4   151266  
   user_id  anime_id  rating
0        1        20     NaN
1        1        24     NaN
2        1        79     NaN
3        1       226     NaN
4

In [7]:
# Merge rating_df and anime_df on 'anime_id', handle column name conflicts by adding specified suffixes
anime_rating_df = rating_df.merge(anime_df, on='anime_id', how='inner', suffixes=('_user', ''))

# Display the first few rows of the merged dataset to check the merge
print(anime_rating_df.head(10))

   user_id  anime_id  rating_user    name  \
0        1        20          NaN  Naruto   
1        3        20          8.0  Naruto   
2        5        20          6.0  Naruto   
3        6        20          NaN  Naruto   
4       10        20          NaN  Naruto   
5       21        20          8.0  Naruto   
6       28        20          9.0  Naruto   
7       34        20          9.0  Naruto   
8       38        20          6.0  Naruto   
9       39        20         10.0  Naruto   

                                               genre type episodes  rating  \
0  Action, Comedy, Martial Arts, Shounen, Super P...   TV      220    7.81   
1  Action, Comedy, Martial Arts, Shounen, Super P...   TV      220    7.81   
2  Action, Comedy, Martial Arts, Shounen, Super P...   TV      220    7.81   
3  Action, Comedy, Martial Arts, Shounen, Super P...   TV      220    7.81   
4  Action, Comedy, Martial Arts, Shounen, Super P...   TV      220    7.81   
5  Action, Comedy, Martial Arts, Sho

# Drop Columns Not Used

In [8]:
anime_rating_df.drop(columns=['genre', 'type', 'episodes','rating','members'], inplace=True)
anime_rating_df.head(10)

Unnamed: 0,user_id,anime_id,rating_user,name
0,1,20,,Naruto
1,3,20,8.0,Naruto
2,5,20,6.0,Naruto
3,6,20,,Naruto
4,10,20,,Naruto
5,21,20,8.0,Naruto
6,28,20,9.0,Naruto
7,34,20,9.0,Naruto
8,38,20,6.0,Naruto
9,39,20,10.0,Naruto


In [9]:
#check missing value for the merged df
anime_rating_df.isnull().sum()

user_id           0
anime_id          0
rating_user    2169
name              0
dtype: int64

In [10]:
#drop all the null value
anime_rating_df.dropna(inplace=True)
anime_rating_df.shape

(6109, 4)

# Data Pre-processing

# 1) Keep only data with more than 10 Interactions

In [11]:
users_interactions_count_df = anime_rating_df.groupby(['user_id', 'anime_id']).size().groupby('user_id').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 10].reset_index()[['user_id']]
print('# users with at least 10 interactions: %d' % len(users_with_enough_interactions_df))

# users: 94
# users with at least 10 interactions: 67


In [12]:
print('# of interactions: %d' % len(anime_rating_df))
interactions_from_selected_users_df = anime_rating_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'user_id',
               right_on = 'user_id')
print('# of interactions from users with at least 10 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 6109
# of interactions from users with at least 10 interactions: 6005


# 2) Duplicate Checks

In [13]:
# count number of reviews done by a user on a anime for multiple times
rating_by_users_anime = interactions_from_selected_users_df.groupby(['user_id', 'anime_id'])["rating_user"].count().reset_index()
print(f'Number of duplicated ratings done on an anime by the same user: {rating_by_users_anime[rating_by_users_anime["rating_user"]>1]["rating_user"].count()}')

Number of duplicated ratings done on an anime by the same user: 0


# Split Data Set 80% Training 20% Test

In [18]:
from sklearn.model_selection import train_test_split
anime_rating_train_df, anime_rating_test_df = train_test_split(
    anime_rating_df, test_size=0.15, random_state=42
)

print('# No of Ratings on Train set:', len(anime_rating_train_df))
print('# No of Ratings on Test set:', len(anime_rating_test_df))

# No of Ratings on Train set: 5192
# No of Ratings on Test set: 917


In [19]:
#Indexing by user_id to speed up the searches during evaluation
anime_rating_train_indexed_df = anime_rating_train_df.set_index('user_id')
anime_rating_train_df_indexed_df = anime_rating_train_df.set_index('user_id')
anime_rating_test_df_indexed_df = anime_rating_test_df.set_index('user_id')

In [20]:
def get_items_rated(user_id, ratings_df):
    # Get the user's data and merge in the anime information.
    rated_items = ratings_df.loc[user_id]['anime_id']
    return set(rated_items if type(rated_items) == pd.Series else [rated_items])

In [21]:
class ModelEvaluator:
    EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

    def get_not_rated_items_sample(self, user_id, sample_size, seed=42):
        rated_items = get_items_rated(user_id, ratings_full_indexed_df)
        all_items = set(anime_df['anime_id'])  # Assume anime_df contains all anime IDs
        non_rated_items = all_items - rated_items

        random.seed(seed)
        non_rated_items_sample = random.sample(non_rated_items, sample_size)
        return set(non_rated_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        index = next((i for i, c in enumerate(recommended_items) if c == item_id), -1)
        return int(index < topn), index
    
    def _count_hit_at_top_n(self, rated_items, recommended_items, topn):
        topn_items = set(recommended_items[:topn])
        return len(topn_items & rated_items)

    def _average_precision_top_n(self, rated_items, recommended_items, topn):
        precision = sum(len(set(recommended_items[:n]) & rated_items) / n for n in range(1, topn + 1))
        return precision / topn

    def evaluate_model_for_user(self, model, user_id):
        rated_values_testset = ratings_test_indexed_df.loc[user_id]
        person_rated_items_testset = set(rated_values_testset['anime_id']) if type(rated_values_testset['anime_id']) == pd.Series else {rated_values_testset['anime_id']}
        rated_items_count_testset = len(person_rated_items_testset) 
        person_recs_df = model.recommend_items(user_id, items_to_ignore=get_items_rated(user_id, ratings_train_indexed_df), topn=10000000000)
        
        person_recs_ids = person_recs_df['anime_id'].tolist()
        p_hits_at_5_count = self._count_hit_at_top_n(person_rated_items_testset, person_recs_ids, 5)
        p_hits_at_10_count = self._count_hit_at_top_n(person_rated_items_testset, person_recs_ids, 10)

        hits_at_5_count = hits_at_10_count = 0
        for item_id in person_rated_items_testset:
            non_rated_items_sample = self.get_not_rated_items_sample(user_id, self.EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, item_id % (2**32))
            valid_recs = person_recs_df[person_recs_df['anime_id'].isin(non_rated_items_sample.union({item_id}))]['anime_id'].values
            hit_at_5, _ = self._verify_hit_top_n(item_id, valid_recs, 5)
            hit_at_10, _ = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_5_count += hit_at_5
            hits_at_10_count += hit_at_10

        person_metrics = {
            'hits@5_count': hits_at_5_count, 
            'hits@10_count': hits_at_10_count,
            'rated_count': rated_items_count_testset,
            'recall@5': hits_at_5_count / float(rated_items_count_testset),
            'recall@10': hits_at_10_count / float(rated_items_count_testset),
            'p_hits@5_count': p_hits_at_5_count,
            'p_hits@10_count': p_hits_at_10_count,
            'precision@5': p_hits_at_5_count / 5.0,
            'precision@10': p_hits_at_10_count / 10.0,
            'average_p@5': self._average_precision_top_n(person_rated_items_testset, person_recs_ids, 5),
            'average_p@10': self._average_precision_top_n(person_rated_items_testset, person_recs_ids, 10)
        }
        
        return person_metrics

    def evaluate_model(self, model):
        people_metrics = []
        probs = []
        for idx, user_id in enumerate(ratings_test_indexed_df.index.unique()):
            person_metrics, prob = self.evaluate_model_for_user(model, user_id)
            person_metrics['_user_id'] = user_id
            people_metrics.append(person_metrics)
            probs += prob

        detailed_results_df = pd.DataFrame(people_metrics).sort_values('rated_count', ascending=False)
        i = len(people_metrics)
        global_metrics = {
            'modelName': model.get_model_name(),
            'recall@5': detailed_results_df['hits@5_count'].sum() / detailed_results_df['rated_count'].sum(),
            'recall@10': detailed_results_df['hits@10_count'].sum() / detailed_results_df['rated_count'].sum(),
            'precision@5': detailed_results_df['p_hits@5_count'].sum() / (5 * i),
            'precision@10': detailed_results_df['p_hits@10_count'].sum() / (10 * i),
            'map@5': detailed_results_df['average_p@5'].mean(),
            'map@10': detailed_results_df['average_p@10'].mean()
        }

        probs_df = pd.DataFrame(probs)
        return global_metrics, detailed_results_df, probs_df

model_evaluator = ModelEvaluator()
