# **The Genres of animes**

In [1]:
%cd ..

/mnt/c/Users/Yoshi/source/repos/Sideprojects/Anime_Recommender_system


# Load the library

In [2]:
import numpy as np
import pandas as pd

# load the anime data with its genre.

In [3]:
INPUT_DIR = './data'

animes_genres_df = pd.read_csv(
    INPUT_DIR + '/anime_with_synopsis.csv', 
    low_memory=False, 
    usecols=["MAL_ID", "Name", "Genres", "synopsis"]
)
animes_genres_df.rename({"MAL_ID": "anime_id"}, inplace=True, axis=1)

anime_num = len(animes_genres_df["anime_id"].unique())
print(f"Number of anime's types: {anime_num}")

genre_seperate = animes_genres_df["Genres"].str.split(pat=",", expand=True)
genre_series = pd.concat([genre_seperate[col] for col in genre_seperate.columns])
genre_unique = list(set([item.strip() for item in genre_series.unique().tolist() if item is not None]))
genre_unique.sort()
print(f"Set of genres: {genre_unique}")
genre_num = len(genre_unique)
print(f"Number of genre: {genre_num}")

animes_genres_df.head(5)

Number of anime's types: 16214
Set of genres: ['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Unknown', 'Vampire', 'Yaoi']
Number of genre: 42


Unnamed: 0,anime_id,Name,Genres,synopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
2,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
3,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
4,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...


In [4]:
genre2genre_idx = {x: i for i, x in enumerate(genre_unique)}
genre_idx2genre = {i: x for i, x in enumerate(genre_unique)}

# load the preprocessed user data with anime rating.

In [5]:
rating_df = pd.read_csv(INPUT_DIR + '/preprocessed_animelist.csv', 
                        low_memory=False, 
                        usecols=["user_id", "anime_id", "rating"]
                        )

rating_df.head(5)

Unnamed: 0,user_id,anime_id,rating
0,278536,31845,0.9
1,56255,24919,0.0
2,54171,2386,0.6
3,292408,5028,0.7
4,334396,30243,0.4


# Join two different table

In [6]:
rating_with_genres_df = rating_df.merge(animes_genres_df, on="anime_id", how="inner")
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres,synopsis
0,278536,31845,0.9,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
1,240056,31845,0.3,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
2,232230,31845,0.0,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
3,436,31845,0.0,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
4,39660,31845,0.7,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...


In [7]:
# rating_with_genres_df = rating_with_genres_df[rating_with_genres_df['rating'] > 0]
rating_with_genres_df['Genres'] = rating_with_genres_df['Genres'].str.split(pat=",", expand=False)
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres,synopsis
0,278536,31845,0.9,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
1,240056,31845,0.3,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
2,232230,31845,0.0,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
3,436,31845,0.0,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...
4,39660,31845,0.7,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ...",Hida Kizuna possesses the HHG (Heart Hybrid Ge...


# Convert the table to the user_id with corresponded feature representation.

In [8]:
rating_with_genres_dict = rating_with_genres_df.to_dict(orient='list')
genres_list = rating_with_genres_dict.pop('Genres')
rating_list = rating_with_genres_dict.pop('rating')

In [9]:
from tqdm import tqdm

genre_feature_np = []

for i, genres in enumerate(tqdm(genres_list)):
    feature_np = np.zeros((genre_num), dtype=float)
    for genre in genres:
        genre = genre.strip()
        feature_np[genre2genre_idx[genre]] = 1.
    genre_feature_np.append(feature_np)

rating_with_genres_dict['genre_vector'] = genre_feature_np

100%|██████████| 7167828/7167828 [01:52<00:00, 63519.83it/s] 


In [10]:
def np_choose_highest_10(x):
    x[np.argsort(x)[:-10]] = 0
    return x

user_with_genre_vector = pd.DataFrame(rating_with_genres_dict)
user_with_vector = user_with_genre_vector.groupby('user_id')[['genre_vector']].agg(lambda x: x.sum(axis=0))
# user_with_vector['genre_vector'] = user_with_vector['genre_vector'].apply(np_choose_highest_10)
user_with_vector['genre_vector'] = user_with_vector['genre_vector'].apply(lambda x: x/np.linalg.norm(x))

user_with_vector.reset_index(inplace=True)
user_with_vector.head(5)

Unnamed: 0,user_id,genre_vector
0,146,"[0.3364680271937393, 0.12344443854835241, 0.0,..."
1,240,"[0.4034134564441187, 0.21088539177045493, 0.00..."
2,436,"[0.38909533850579003, 0.18608907493755178, 0.0..."
3,446,"[0.37972615753639605, 0.14338207608220763, 0.0..."
4,781,"[0.32928673606661446, 0.1617068037996573, 0.01..."


In [11]:
user_with_vector_dict = user_with_vector.to_dict(orient='list')

Save the data

In [12]:
import pickle

INPUT_DIR = './data'

with open(INPUT_DIR + "/user_with_vector_dict.pickle", 'wb') as f:
    pickle.dump(user_with_vector_dict, f)

In [None]:
import pickle

INPUT_DIR = './data'

genre_index_mapping = {}
genre_index_mapping["genre2genre_idx"] = genre2genre_idx
genre_index_mapping["genre_idx2genre"] = genre_idx2genre

with open(INPUT_DIR + "/genre_index_mapping.pickle", 'wb') as f:
    pickle.dump(genre_index_mapping, f)

# **Load user with vector dict (Restart the jupyter to clean up the used memory space)**

In [1]:
%cd ..
import numpy as np
import pandas as pd

/mnt/c/Users/Yoshi/source/repos/Sideprojects/Anime_Recommender_system


load the anime data with its genre.

In [3]:
INPUT_DIR = './data'

animes_genres_df = pd.read_csv(
    INPUT_DIR + '/anime_with_synopsis.csv', 
    low_memory=False, 
    usecols=["MAL_ID", "Name", "Genres", "synopsis"]
)
animes_genres_df.rename({"MAL_ID": "anime_id"}, inplace=True, axis=1)

anime_num = len(animes_genres_df["anime_id"].unique())
print(f"Number of anime's types: {anime_num}")

genre_seperate = animes_genres_df["Genres"].str.split(pat=",", expand=True)
genre_series = pd.concat([genre_seperate[col] for col in genre_seperate.columns])
genre_unique = list(set([item.strip() for item in genre_series.unique().tolist() if item is not None]))
genre_unique.sort()
print(f"Set of genres: {genre_unique}")
genre_num = len(genre_unique)
print(f"Number of genre: {genre_num}")

genre2genre_idx = {x: i for i, x in enumerate(genre_unique)}
genre_idx2genre = {i: x for i, x in enumerate(genre_unique)}

animes_genres_df.head(5)

Number of anime's types: 16214
Set of genres: ['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Unknown', 'Vampire', 'Yaoi']
Number of genre: 42


Unnamed: 0,anime_id,Name,Genres,synopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
2,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
3,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
4,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...


user_id with its corresponding index

In [4]:
import pickle

INPUT_DIR = './data'

with open(INPUT_DIR + "/user_with_vector_dict.pickle", 'rb') as f:
    user_with_vector_dict = pickle.load(f)

In [5]:
user_id2user_id_idx = {x: i for i, x in enumerate(user_with_vector_dict['user_id'])}
user_id_idx2user_id = {i: x for i, x in enumerate(user_with_vector_dict['user_id'])}

user feature matrix

In [6]:
user_feature_matrix = np.vstack(user_with_vector_dict['genre_vector'])

# User feature mapping

generate input feature

In [7]:
input_feature = {"Romance": 4, "School": 4, "Super Power": 2}
input_feature_np = np.zeros((genre_num), dtype=float)
for genre, score in input_feature.items():
    print(genre2genre_idx[genre])
    input_feature_np[genre2genre_idx[genre]] = score

24
26
36


In [8]:
most_similar_user_id = user_id_idx2user_id[np.argmax(np.inner(user_feature_matrix, input_feature_np))]
print(f"Most similar user_id: {most_similar_user_id}")

Most similar user_id: 154328


# Model predict

load encoder config

In [9]:
import json

INPUT_DIR = './model/ranking_base'

with open(INPUT_DIR + "/encode_config.json") as f:
    encoder_conig = json.load(f)

print(list(encoder_conig.keys()))

['user2user_encoded', 'user_encoded2user', 'anime2anime_encoded', 'anime_encoded2anime']


In [10]:
user2user_encoded = encoder_conig['user2user_encoded']
user_encoded2user = encoder_conig['user_encoded2user']
anime2anime_encoded = encoder_conig['anime2anime_encoded']
anime_encoded2anime = encoder_conig['anime_encoded2anime']

n_users = len(user2user_encoded)
n_animes = len(anime2anime_encoded)

field_dims = [n_users, n_animes]

print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))

Num of users: 2463, Num of animes: 15749


Import the model

In [11]:
import torch
from recanime.recommender.ranking_base_filter.model import FactorizationMachineModel

model = FactorizationMachineModel(
    field_dims=field_dims,
    embed_dim=32
)

model.load_state_dict(torch.load(INPUT_DIR + "/model_state_dict.pt"))
model.eval()

  from .autonotebook import tqdm as notebook_tqdm


FactorizationMachineModel(
  (embedding): FeaturesEmbedding(
    (embedding): Embedding(18212, 32)
  )
  (linear): FeaturesLinear(
    (fc): Embedding(18212, 1)
  )
  (fm): FactorizationMachine()
)

Prefilter the animes by select the anime has the highest score feature.

In [14]:
# hightest_genre = sorted(input_feature.items(), key=lambda x: x[1], reverse=True)[0][0]
# prefilter_animes_df = animes_genres_df[['anime_id', 'Genres']].copy(deep=True)
# prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].str.split(pat=",", expand=False)
# prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].apply(lambda x: np.nan if hightest_genre not in x else x)
# prefilter_animes_df.dropna(inplace=True)

# prefilter_animes_df.head(5)

Select the animes with certain genre

In [12]:
select_genre = set(input_feature.keys())
prefilter_animes_df = animes_genres_df[['anime_id', 'Genres']].copy(deep=True)
prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].str.split(pat=",", expand=False)
prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].apply(lambda genres: [genre.strip() for genre in genres])
prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].apply(lambda x: np.nan if len(select_genre&set(x)) == 0 else x)
prefilter_animes_df.dropna(inplace=True)

prefilter_animes_df.head(5)

Unnamed: 0,anime_id,Genres
7,16,"[Comedy, Drama, Josei, Romance, Slice of Life]"
11,20,"[Action, Adventure, Comedy, Super Power, Marti..."
12,21,"[Action, Adventure, Comedy, Super Power, Drama..."
13,22,"[Action, Comedy, Sports, School, Shounen]"
15,24,"[Comedy, Romance, School, Shounen]"


In [13]:
prefilter_animes_dict = prefilter_animes_df.to_dict(orient='list')
prefilter_anime_id_list = prefilter_animes_dict['anime_id']

prefilter_anime_id_encode_list = []
anime_arg_id2_anime_id =  []
for anime_id in prefilter_anime_id_list:
    try:
        prefilter_anime_id_encode_list.append(anime2anime_encoded[str(anime_id)])
        anime_arg_id2_anime_id.append(anime_id)
    except:
        pass

load the data

In [14]:
most_similar_user_id_encode = user2user_encoded[str(most_similar_user_id)]
user_id_with_all_anime = [[most_similar_user_id_encode, anime_id_encode] for anime_id_encode in prefilter_anime_id_encode_list]
user_id_with_all_anime_t = torch.tensor(user_id_with_all_anime)

In [15]:
top_n_recommend = 10

output = model(user_id_with_all_anime_t)
output = output.squeeze()
sort_asc_score, sort_asc_score_idx = output.sort(descending=True)
sort_asc_score = sort_asc_score[:top_n_recommend].tolist()
sort_asc_score_idx = sort_asc_score_idx[:top_n_recommend].tolist()

In [16]:
top_n_anime_id = []
for idx in sort_asc_score_idx:
    top_n_anime_id.append(anime_arg_id2_anime_id[idx])
    
anime_id_with_score = pd.DataFrame({"anime_id": top_n_anime_id, "predict_score": sort_asc_score})

# Recommend Result (For input_feature = {"Romance": 4, "School": 4, "Super Power": 2})

In [17]:
show_result_df = pd.merge(anime_id_with_score, animes_genres_df, how="inner", on="anime_id").sort_values(by=["predict_score"], ascending=False)
show_result_df

Unnamed: 0,anime_id,predict_score,Name,Genres,synopsis
0,44221,1.0,Xie Wang Zhui Qi Zhi Yishi Qingcheng,"Action, Comedy, Drama, Romance, Martial Arts, ...","The well-known special agent, Su Luo, was betr..."
1,31530,1.0,Classroom☆Crisis: Tabi no Haji wa Uwanuri,"Sci-Fi, School",Unaired episode included with the third Blu-ra...
2,9790,1.0,Sora no Otoshimono: Tokeijikake no Angeloid,"Comedy, Drama, Ecchi, Harem, Romance, Sci-Fi, ...",ovie adaptation of the Sora no Otoshimono mang...
3,9795,1.0,Atashin'chi Movie,"Comedy, Super Power","In a stormy night, Mikan and Mom were stricken..."
4,9796,1.0,Atashin'chi 3D Movie: Jounetsu no Chou Chounou...,"Comedy, Super Power",3D theatrical anime adaptation. Mother is havi...
5,9799,1.0,Shin-Men,"Action, Comedy, Super Power, Seinen",The spinoff is set in a parallel world known a...
6,31521,1.0,Ushinawareta Mirai wo Motomete: Ushinawareta N...,Romance,Thirteenth episode delivered as a bonus Blu-ra...
7,9922,1.0,Oretachi ni Tsubasa wa Nai: Under the Innocent...,"Comedy, Drama, Ecchi, Harem, Romance",Haneda Takashi has a secret he cannot speak of...
8,9925,1.0,Amagami SS: Tachibana Miya-hen - Imouto,School,"a Tachibana, assuming her annoying brother Jun..."
9,37938,1.0,Hua Jianghu Zhi Huan Shi Men Sheng,"Action, Drama, Fantasy, Martial Arts, Mystery,...","Nian Yangxiao, came from a parallel world to s..."
