# **The Genres of animes**

In [1]:
%cd ..

/mnt/c/Users/Yoshi/source/repos/Sideprojects/Anime_Recommender_system


# Load the library

In [2]:
import numpy as np

# load the anime data with its genre.

In [3]:
import pandas as pd

INPUT_DIR = './data'

animes_genres_df = pd.read_csv(
    INPUT_DIR + '/anime_with_synopsis.csv', 
    low_memory=False, 
    usecols=["MAL_ID", "Name", "Genres"]
)
animes_genres_df.rename({"MAL_ID": "anime_id"}, inplace=True, axis=1)

anime_num = len(animes_genres_df["anime_id"].unique())
print(f"Number of anime's types: {anime_num}")

genre_seperate = animes_genres_df["Genres"].str.split(pat=",", expand=True)
genre_series = pd.concat([genre_seperate[col] for col in genre_seperate.columns])
genre_set = set([item.strip() for item in genre_series.unique().tolist() if item is not None])
print(f"Set of genres: {genre_set}")
genre_num = len(genre_set)
print(f"Number of genre: {genre_num}")

animes_genres_df.head(5)

Number of anime's types: 16214
Set of genres: {'Shoujo', 'Kids', 'Romance', 'Fantasy', 'Thriller', 'Slice of Life', 'Seinen', 'Unknown', 'Shounen', 'Supernatural', 'Sports', 'Demons', 'Yaoi', 'Martial Arts', 'Psychological', 'Horror', 'Ecchi', 'Historical', 'Dementia', 'Drama', 'Adventure', 'Game', 'Parody', 'Military', 'Police', 'Josei', 'Action', 'Samurai', 'Mecha', 'Harem', 'Shounen Ai', 'Magic', 'Sci-Fi', 'Cars', 'Shoujo Ai', 'Comedy', 'Space', 'Super Power', 'Mystery', 'Vampire', 'Music', 'School'}
Number of genre: 42


Unnamed: 0,anime_id,Name,Genres
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space"
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural"


In [4]:
genre2genre_idx = {x: i for i, x in enumerate(genre_set)}
genre_idx2genre = {i: x for i, x in enumerate(genre_set)}

# load the preprocessed user data with anime rating.

In [5]:
rating_df = pd.read_csv(INPUT_DIR + '/preprocessed_animelist.csv', 
                        low_memory=False, 
                        usecols=["user_id", "anime_id", "rating"]
                        )

rating_df.head(5)

Unnamed: 0,user_id,anime_id,rating
0,279596,2695,0.0
1,121782,15505,0.0
2,160479,39085,0.0
3,53950,32245,0.0
4,86402,35860,0.8


# Join two different table

In [6]:
rating_with_genres_df = rating_df.merge(animes_genres_df, on="anime_id", how="inner")
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,279596,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
1,297666,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
2,251620,2695,0.6,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
3,152637,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
4,74911,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"


In [7]:
rating_with_genres_df = rating_with_genres_df[rating_with_genres_df['rating'] > 0]
# rating_with_genres_df.set_index(['user_id'], inplace=True)
rating_with_genres_df['Genres'] = rating_with_genres_df['Genres'].str.split(pat=",", expand=False)
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres
2,251620,2695,0.6,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
13,55748,2695,0.7,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
16,90350,2695,0.5,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
17,196906,2695,0.5,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
18,310955,2695,0.6,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"


# Convert the table to the user_id with corresponded feature representation.

In [8]:
rating_with_genres_dict = rating_with_genres_df.to_dict(orient='list')
genres_list = rating_with_genres_dict.pop('Genres')
rating_list = rating_with_genres_dict.pop('rating')

In [9]:
from tqdm import tqdm

genre_feature_np = []

for i, genres in enumerate(tqdm(genres_list)):
    feature_np = np.zeros((genre_num), dtype=float)
    for genre in genres:
        genre = genre.strip()
        feature_np[genre2genre_idx[genre]] = rating_list[i]
    genre_feature_np.append(feature_np)

rating_with_genres_dict['genre_vector'] = genre_feature_np

100%|██████████| 3405589/3405589 [00:09<00:00, 355544.61it/s]


In [10]:
user_with_genre_vector = pd.DataFrame(rating_with_genres_dict)
user_with_vector = user_with_genre_vector.groupby('user_id')[['genre_vector']].agg(lambda x: x.sum(axis=0))
user_with_vector['genre_vector'] = user_with_vector['genre_vector'].transform(lambda x: x/np.linalg.norm(x))
user_with_vector.reset_index(inplace=True)
user_with_vector.head(5)

Unnamed: 0,user_id,genre_vector
0,146,"[0.1917374853523082, 0.0025949434107831173, 0...."
1,240,"[0.009951391847854183, 0.021259791674961215, 0..."
2,436,"[0.03491061930498524, 0.0014290896791514423, 0..."
3,446,"[0.039486823995895814, 0.006513290556024052, 0..."
4,781,"[0.0393107249838712, 0.05666015109407131, 0.17..."


user_id with its corresponding index

In [11]:
user_with_vector_dict = user_with_vector.to_dict(orient='list')
user_id2user_id_idx = {x: i for i, x in enumerate(user_with_vector_dict['user_id'])}
user_id_idx2user_id = {i: x for i, x in enumerate(user_with_vector_dict['user_id'])}

user feature matrix

In [12]:
user_feature_matrix = np.vstack(user_with_vector_dict['genre_vector'])

# User feature mapping

generate input feature

In [13]:
input_feature = {"Super Power": 1}
input_feature_np = np.zeros((genre_num), dtype=float)
for genre, score in input_feature.items():
    input_feature_np[genre2genre_idx[genre]] = score

In [14]:
most_similar_user_id = user_id_idx2user_id[np.argmax(np.inner(user_feature_matrix, input_feature_np))]
print(f"Most similar user_id: {most_similar_user_id}")

Most similar user_id: 206731


In [15]:
a = np.argmax(np.inner(user_feature_matrix, input_feature_np))
print(user_feature_matrix[a])
print(genre2genre_idx['Super Power'])
#print(genre2genre_idx['Romance'])

[0.         0.         0.46932325 0.         0.         0.
 0.14079698 0.         0.         0.         0.         0.
 0.         0.         0.         0.14079698 0.         0.
 0.         0.14079698 0.         0.         0.         0.
 0.         0.         0.14079698 0.         0.         0.
 0.         0.         0.14079698 0.         0.         0.46932325
 0.14079698 0.46932325 0.         0.         0.         0.46932325]
37


# Model predict

load encoder config

In [16]:
import json

INPUT_DIR = "./model"

with open(INPUT_DIR + "/encode_config.json") as f:
    encoder_conig = json.load(f)

print(list(encoder_conig.keys()))

['user2user_encoded', 'user_encoded2user', 'anime2anime_encoded', 'anime_encoded2anime']


In [17]:
user2user_encoded = encoder_conig['user2user_encoded']
user_encoded2user = encoder_conig['user_encoded2user']
anime2anime_encoded = encoder_conig['anime2anime_encoded']
anime_encoded2anime = encoder_conig['anime_encoded2anime']

n_users = len(user2user_encoded)
n_animes = len(anime2anime_encoded)
print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))

Num of users: 2463, Num of animes: 17558


Import the model

In [47]:
import torch
from recommender.ranking_base_filter.model import FactorizationMachineModel


model = torch.load(INPUT_DIR + "/model_mse.pt")
model.eval()

FactorizationMachineModel(
  (embedding): FeaturesEmbedding(
    (embedding): Embedding(20021, 16)
  )
  (linear): FeaturesLinear(
    (fc): Embedding(20021, 1)
  )
  (fm): FactorizationMachine()
)

load the data

In [48]:
most_similar_user_id_encode = user2user_encoded[str(most_similar_user_id)]
user_id_with_all_anime = [[most_similar_user_id_encode, anime_id_encode] for anime_id_encode in range(n_animes)]
user_id_with_all_anime_t = torch.tensor(user_id_with_all_anime)

In [49]:
top_n_recommend = 5

output = model(user_id_with_all_anime_t)
output = output.squeeze()
output_sort_asc = output.argsort(descending=True)[:top_n_recommend].tolist()

In [50]:
print(output_sort_asc)

[17134, 17543, 17550, 17534, 17518]


In [51]:
print(output[17134])

tensor(1.0000, grad_fn=<SelectBackward0>)


In [36]:
top_n_anime_id = []
for idx in output_sort_asc:
    top_n_anime_id.append(anime_encoded2anime[str(idx)])

In [37]:
print(top_n_anime_id)

[48483, 48422, 47623, 48481, 48492]
