### How can we effectively recommend high-quality, highly rated anime to users by integrating both anime and user features? (RQ5)


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# load the datasets: anime table, users' scores table and users' details table
df_anime, df_score, df_user = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/anime-dataset-2023.csv',
        'data/users-score-2023.csv',
        'data/users-details-2023.csv',
    )
)

In [None]:
# preprocess for anime dataset
# Drop rows whose Score or Scored By holds the 'UNKNOWN' placeholder. The explicit
# .copy() makes the column assignments below write to an independent frame rather
# than to a filtered slice of the loaded data (avoids SettingWithCopyWarning).
df_anime = df_anime[(df_anime['Score'] != 'UNKNOWN') & (df_anime['Scored By'] != 'UNKNOWN')].copy()
# 'Scored By' may be rendered like "1234.0": keep only the text before the decimal point
df_anime['Scored By'] = df_anime['Scored By'].astype(str).str.split('.').str[0]
df_anime['Score'] = df_anime['Score'].astype(np.float64)
df_anime['Scored By'] = df_anime['Scored By'].astype(np.int64)
# keep anime that were scored by at least 1000 users
df_anime = df_anime[df_anime['Scored By'] >= 1000]
df_filtered = df_anime[df_anime['Type'].isin(['TV', 'Movie', 'OVA'])]  # keep only TV, Movie and OVA
# keep only entries whose Duration text does not mention hours ('hr'); casting to
# str first means a missing Duration is kept instead of raising TypeError
df_filtered = df_filtered[~df_filtered['Duration'].astype(str).str.contains('hr', regex=False)]
# parse a year from the Aired text (rows whose year cannot be parsed are dropped) and keep only anime from 2000 onward
def extract_year(aired_date):
    """Return the year held in the last space-separated token of *aired_date*.

    Only the final token is inspected, so for a text containing several dates
    the result comes from the last one. Returns ``np.nan`` when that token is
    not an integer (for example ``'?'``) or when *aired_date* is not a string,
    such as a missing value.
    """
    try:
        return int(aired_date.split(' ')[-1])
    except (AttributeError, ValueError):
        # AttributeError: non-string input (e.g. NaN) has no .split()
        # ValueError: the last token is not a number
        return np.nan
# Attach the parsed year with .assign(), which returns a new frame; writing a
# column directly into a filtered slice triggers pandas' SettingWithCopyWarning.
df_filtered = df_filtered.assign(Year=df_filtered['Aired'].apply(extract_year))
# drop rows whose year could not be parsed, then keep years from 2000 onward
df_filtered = df_filtered.dropna(subset=['Year'])
df_filtered = df_filtered[df_filtered['Year'] >= 2000]
# rank by Score (highest first) and keep the first 500 rows
df_sorted = df_filtered.sort_values(by='Score', ascending=False)
df_anime_500 = df_sorted.head(500)

In [None]:
# preprocess for users' scores dataset
df_score = df_score.dropna()
user_counts = df_score['user_id'].value_counts()
valid_users = user_counts[user_counts >= 3].index  # keep users who rated at least 3 anime
df_score_filtered = df_score[df_score['user_id'].isin(valid_users)]
# A single multi-key sort already orders each user's ratings from highest to
# lowest (ties broken by anime_id), so no per-user groupby/apply re-sort is
# needed: that extra pass was slow, could scramble the tie order, and relied on
# groupby.apply keeping the grouping column.
# NOTE: despite its name, at this point the frame still holds every kept rating;
# it is narrowed to three rows per user further down in the notebook.
df_top3_per_user = df_score_filtered.sort_values(
    by=['user_id', 'rating', 'anime_id'], ascending=[True, False, True]
).reset_index(drop=True)

In [None]:
df_user_watched = df_top3_per_user # keep a reference (not a copy) to every kept rating, taken before the merge with the top-500 anime; used for filtering the anime watched by users

In [None]:
# attach Popularity from the top-500 anime table; ratings of anime that are not
# in that table get NaN and are dropped
df_top3_per_user = (
    df_top3_per_user
    .merge(df_anime_500[['anime_id', 'Popularity']], on='anime_id', how='left')
    .dropna(subset=['Popularity'])
    # per user: highest rating first, ties broken by the smaller Popularity value
    .sort_values(by=['user_id', 'rating', 'Popularity'], ascending=[True, False, True])
    # keep at most the first three rows of every user
    .groupby('user_id').head(3)
    .reset_index(drop=True)
)

In [None]:
# display the per-user table (at most three rated anime per user)
df_top3_per_user

In [None]:
# preprocess for user dataset
current_year = 2023  # reference year used for the age calculation
df_user = (
    df_user
    # users without a Birthday value are removed
    .dropna(subset=['Birthday'])
    # parse Birthday as datetime; values that cannot be parsed become NaT
    .assign(Birthday=lambda frame: pd.to_datetime(frame['Birthday'], errors='coerce'))
    # age is the difference of calendar years only (NaN where Birthday is NaT)
    .assign(age=lambda frame: current_year - frame['Birthday'].dt.year)
)

In [None]:
# treat a missing gender as an empty string
df_user['Gender'] = df_user['Gender'].fillna('')
# show how many users there are per gender value
print(df_user['Gender'].value_counts())
# attach the rater's gender to every kept rating (inner join on the user id)
gender_preferences = pd.merge(
    df_score_filtered,
    df_user[['Mal ID', 'Gender']],
    left_on='user_id',
    right_on='Mal ID',
)
# keep only ratings of top-500 anime and attach each anime's Genres value
gender_anime_ratings = pd.merge(
    gender_preferences,
    df_anime_500[['anime_id', 'Genres']],
    on='anime_id',
)
# mean and median rating for every (Gender, Genres) pair
gender_rating_stats = (
    gender_anime_ratings
    .groupby(['Gender', 'Genres'])['rating']
    .agg(['mean', 'median'])
    .reset_index()
)
# show the pairs ordered from the highest mean rating down
print(gender_rating_stats.sort_values(by='mean', ascending=False))

In [None]:
# load the pretrained BERT encoder and its tokenizer from the 'bert-base-uncased'
# checkpoint (authors' note: initialised locally, because the model could not be
# initialised online due to internet issues)
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# run the model on the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)

# convert the text into a fixed-size vector
def get_embedding(text):
    """Encode *text* with the module-level BERT model and return a pooled vector.

    The text is tokenized (truncated to at most 512 tokens), moved to ``device``
    and passed through ``model`` without gradient tracking. The last hidden
    state is averaged over dim 1 (the token axis of BERT's output), squeezed,
    and returned as a CPU tensor.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().cpu()

In [None]:
# embed the Synopsis of every anime in df_anime_500; assign() returns a new
# frame, so the result is stored in an 'embedding' column of an independent copy
df_anime_500 = df_anime_500.assign(embedding=df_anime_500['Synopsis'].apply(get_embedding))

In [None]:
# display the per-user table again before defining the recommender
df_top3_per_user

In [None]:
# the recommendation function
def _weighted_score(frame):
    """Blend Score, Favorites and Popularity of *frame* into one value per row."""
    # 50% Score, 30% Favorites (per thousand), 20% of 50 / Popularity
    # (a smaller Popularity value therefore yields a larger term)
    return (frame['Score'] * 0.5 +
            frame['Favorites'] / 1000 * 0.3 +
            (50 / frame['Popularity']) * 0.2)


def recommend_anime(user_id, num_recommendations=5):
    """Print and return anime recommendations for one user.

    Reads the module-level frames ``df_user``, ``df_top3_per_user``,
    ``df_anime_500``, ``df_user_watched`` and ``gender_rating_stats``.

    For each of the user's top-rated anime (the "seeds"), the five candidates
    with the highest gender-weighted synopsis similarity are collected. The
    pooled candidates are de-duplicated and ranked by the weighted score of
    Score, Favorites and Popularity.

    Args:
        user_id: value looked up in ``df_user['Mal ID']``.
        num_recommendations: maximum number of rows to return.

    Returns:
        A DataFrame with at most ``num_recommendations`` rows (including the
        ``similarity`` and ``score_weighted`` columns), or ``None`` when the
        user is unknown or has no top-rated anime on record.
    """
    user = df_user[df_user['Mal ID'] == user_id]
    # check if the user exists
    if user.empty:
        print(f"User ID {user_id} doesn't exist。")
        return None

    user_age = user['age'].values[0]
    user_gender = user['Gender'].values[0]

    # the user's top-rated anime, as prepared in df_top3_per_user
    user_top_rows = df_top3_per_user[df_top3_per_user['user_id'] == user_id]
    user_top_anime_ids = user_top_rows['anime_id'].unique()
    if len(user_top_anime_ids) == 0:
        print(f"User ID {user_id} doesn't have enough anime。")
        return None

    seed_anime = df_anime_500[df_anime_500['anime_id'].isin(user_top_anime_ids)]

    # print the 3 highest-rated anime by users
    print("The 3 highest-rated anime by users:")
    print(user_top_rows[['anime_id', 'Anime Title', 'rating']])

    # Remove the anime this user has already rated. isin() must receive the
    # user's anime ids: handing it the whole df_user_watched frame compares
    # against the frame's column labels and filters nothing.
    watched_ids = df_user_watched.loc[df_user_watched['user_id'] == user_id, 'anime_id']
    candidates = df_anime_500[~df_anime_500['anime_id'].isin(watched_ids)]

    # age filter (a NaN age compares False and therefore applies no filter)
    # NOTE(review): only this exact rating label is excluded; confirm whether
    # other mature rating labels should be excluded for minors as well.
    if user_age < 17:
        candidates = candidates[candidates['Rating'] != 'R - 17+ (violence & profanity)']

    # gender preference: scale similarity by (mean rating of this gender for
    # the anime's Genres value) / 10; rows without a matching statistic keep 1.0
    gender_weight = pd.Series(1.0, index=candidates.index)
    if user_gender:
        gender_rows = gender_rating_stats[gender_rating_stats['Gender'] == user_gender]
        mean_by_genres = gender_rows.set_index('Genres')['mean']
        gender_weight = (candidates['Genres'].map(mean_by_genres) / 10).fillna(1.0)

    if seed_anime.empty or candidates.empty:
        # nothing to compare: keep the column layout but no rows
        similarity_results = candidates.iloc[:0].assign(similarity=pd.Series(dtype=float))
    else:
        # one matrix call gives the similarity of every seed to every candidate
        seed_matrix = np.stack([np.asarray(vector) for vector in seed_anime['embedding']])
        candidate_matrix = np.stack([np.asarray(vector) for vector in candidates['embedding']])
        similarity_matrix = cosine_similarity(seed_matrix, candidate_matrix)
        weight_values = gender_weight.to_numpy()

        # recommend based on the seed anime one by one: top 5 most similar each
        per_seed_top = []
        for seed_similarity in similarity_matrix:
            scored = candidates.assign(similarity=seed_similarity * weight_values)
            per_seed_top.append(scored.sort_values(by='similarity', ascending=False).head(5))
        similarity_results = pd.concat(per_seed_top)

    # calculate the total weighted score
    similarity_results = similarity_results.assign(score_weighted=_weighted_score(similarity_results))

    # remove duplicates and select the top 'num_recommendations' animations with the highest weighted scores
    final_recommendations = (
        similarity_results
        .drop_duplicates(subset='anime_id')
        .sort_values(by='score_weighted', ascending=False)
        .head(num_recommendations)
    )

    # print the results
    print("\nRecommended anime:")
    print(final_recommendations[['Name', 'Score', 'Popularity', 'Favorites', 'similarity', 'score_weighted']])

    return final_recommendations

In [None]:
df_user # display the user table to look up valid 'Mal ID' values for recommend_anime

In [None]:
recommend_anime(20, 5) # smoke test: request 5 recommendations for user id 20