# **The Genres of animes**

In [1]:
%cd ..

/mnt/c/Users/Yoshi/source/repos/Sideprojects/Anime_Recommender_system


# Load the library

In [2]:
import numpy as np
import pandas as pd

# load the anime data with its genre.

In [3]:
INPUT_DIR = './data'

animes_genres_df = pd.read_csv(
    INPUT_DIR + '/anime_with_synopsis.csv', 
    low_memory=False, 
    usecols=["MAL_ID", "Name", "Genres", "sypnopsis"]
)
animes_genres_df.rename({"MAL_ID": "anime_id"}, inplace=True, axis=1)

anime_num = len(animes_genres_df["anime_id"].unique())
print(f"Number of anime's types: {anime_num}")

genre_seperate = animes_genres_df["Genres"].str.split(pat=",", expand=True)
genre_series = pd.concat([genre_seperate[col] for col in genre_seperate.columns])
genre_set = set([item.strip() for item in genre_series.unique().tolist() if item is not None])
print(f"Set of genres: {genre_set}")
genre_num = len(genre_set)
print(f"Number of genre: {genre_num}")

animes_genres_df.head(5)

Number of anime's types: 16214
Set of genres: {'Sports', 'Military', 'Adventure', 'Martial Arts', 'Kids', 'Supernatural', 'Magic', 'Romance', 'Action', 'Cars', 'Shounen', 'Mecha', 'Josei', 'Super Power', 'Dementia', 'Parody', 'Yaoi', 'Fantasy', 'Police', 'Ecchi', 'Seinen', 'Music', 'Mystery', 'Shounen Ai', 'School', 'Harem', 'Thriller', 'Shoujo', 'Samurai', 'Space', 'Vampire', 'Shoujo Ai', 'Game', 'Psychological', 'Drama', 'Horror', 'Historical', 'Demons', 'Slice of Life', 'Unknown', 'Comedy', 'Sci-Fi'}
Number of genre: 42


Unnamed: 0,anime_id,Name,Genres,sypnopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


In [4]:
genre2genre_idx = {x: i for i, x in enumerate(genre_set)}
genre_idx2genre = {i: x for i, x in enumerate(genre_set)}

# load the preprocessed user data with anime rating.

In [163]:
rating_df = pd.read_csv(INPUT_DIR + '/preprocessed_animelist.csv', 
                        low_memory=False, 
                        usecols=["user_id", "anime_id", "rating"]
                        )

rating_df.head(5)

Unnamed: 0,user_id,anime_id,rating
0,278536,31845,0.9
1,56255,24919,0.0
2,54171,2386,0.6
3,292408,5028,0.7
4,334396,30243,0.4


# Join two different table

In [6]:
rating_with_genres_df = rating_df.merge(animes_genres_df, on="anime_id", how="inner")
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,278536,31845,0.9,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,..."
1,240056,31845,0.3,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,..."
2,232230,31845,0.0,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,..."
3,436,31845,0.0,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,..."
4,39660,31845,0.7,Masou Gakuen HxH,"Action, Sci-Fi, Harem, Comedy, Romance, Ecchi,..."


In [7]:
# rating_with_genres_df = rating_with_genres_df[rating_with_genres_df['rating'] > 0]
rating_with_genres_df['Genres'] = rating_with_genres_df['Genres'].str.split(pat=",", expand=False)
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,278536,31845,0.9,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ..."
1,240056,31845,0.3,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ..."
2,232230,31845,0.0,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ..."
3,436,31845,0.0,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ..."
4,39660,31845,0.7,Masou Gakuen HxH,"[Action, Sci-Fi, Harem, Comedy, Romance, ..."


# Convert the table to the user_id with corresponded feature representation.

In [8]:
rating_with_genres_dict = rating_with_genres_df.to_dict(orient='list')
genres_list = rating_with_genres_dict.pop('Genres')
rating_list = rating_with_genres_dict.pop('rating')

In [9]:
from tqdm import tqdm

genre_feature_np = []

for i, genres in enumerate(tqdm(genres_list)):
    feature_np = np.zeros((genre_num), dtype=float)
    for genre in genres:
        genre = genre.strip()
        feature_np[genre2genre_idx[genre]] = 1.
    genre_feature_np.append(feature_np)

rating_with_genres_dict['genre_vector'] = genre_feature_np

100%|██████████| 7165619/7165619 [00:18<00:00, 391350.67it/s]


In [13]:
def np_choose_highest_10(x):
    x[np.argsort(x)[:-10]] = 0
    return x

user_with_genre_vector = pd.DataFrame(rating_with_genres_dict)
user_with_vector = user_with_genre_vector.groupby('user_id')[['genre_vector']].agg(lambda x: x.sum(axis=0))
# user_with_vector['genre_vector'] = user_with_vector['genre_vector'].apply(np_choose_highest_10)
user_with_vector['genre_vector'] = user_with_vector['genre_vector'].apply(lambda x: x/np.linalg.norm(x))

user_with_vector.reset_index(inplace=True)
user_with_vector.head(5)

Unnamed: 0,user_id,genre_vector
0,146,"[0.16210053075893657, 0.0, 0.0, 0.287466049151..."
1,240,"[0.0, 0.0, 0.0, 0.29798188070925263, 0.0, 0.21..."
2,436,"[0.0, 0.0, 0.0, 0.20908885241998634, 0.0, 0.0,..."
3,446,"[0.0, 0.0, 0.0, 0.24833529774036153, 0.0, 0.20..."
4,781,"[0.0, 0.0, 0.0, 0.23456700510100628, 0.3665634..."


In [11]:
user_with_vector_dict = user_with_vector.to_dict(orient='list')

Save the data

In [12]:
import pickle

INPUT_DIR = './data'

with open(INPUT_DIR + "/user_with_vector_dict.pickle", 'wb') as f:
    pickle.dump(user_with_vector_dict, f)

In [None]:
import pickle

INPUT_DIR = './data'

genre_index_mapping = {}
genre_index_mapping["genre2genre_idx"] = genre2genre_idx
genre_index_mapping["genre_idx2genre"] = genre_idx2genre

with open(INPUT_DIR + "/genre_index_mapping.pickle", 'wb') as f:
    pickle.dump(genre_index_mapping, f)

# **Load user with vector dict (Restart the jupyter to clean up the used memory space)**

In [1]:
%cd ..
import numpy as np
import pandas as pd

/mnt/c/Users/Yoshi/source/repos/Sideprojects/Anime_Recommender_system


load the anime data with its genre.

In [21]:
INPUT_DIR = './data'

animes_genres_df = pd.read_csv(
    INPUT_DIR + '/anime_with_synopsis.csv', 
    low_memory=False, 
    usecols=["MAL_ID", "Name", "Genres", "sypnopsis"]
)
animes_genres_df.rename({"MAL_ID": "anime_id"}, inplace=True, axis=1)

anime_num = len(animes_genres_df["anime_id"].unique())
print(f"Number of anime's types: {anime_num}")

genre_seperate = animes_genres_df["Genres"].str.split(pat=",", expand=True)
genre_series = pd.concat([genre_seperate[col] for col in genre_seperate.columns])
genre_set = set([item.strip() for item in genre_series.unique().tolist() if item is not None])
print(f"Set of genres: {genre_set}")
genre_num = len(genre_set)
print(f"Number of genre: {genre_num}")

genre2genre_idx = {x: i for i, x in enumerate(genre_set)}
genre_idx2genre = {i: x for i, x in enumerate(genre_set)}

animes_genres_df.head(5)

Number of anime's types: 16214
Set of genres: {'Comedy', 'Fantasy', 'Martial Arts', 'Action', 'Shounen', 'Super Power', 'Drama', 'Horror', 'Seinen', 'School', 'Dementia', 'Mystery', 'Cars', 'Game', 'Military', 'Ecchi', 'Music', 'Vampire', 'Romance', 'Psychological', 'Mecha', 'Slice of Life', 'Yaoi', 'Samurai', 'Harem', 'Space', 'Sports', 'Historical', 'Unknown', 'Parody', 'Supernatural', 'Kids', 'Demons', 'Sci-Fi', 'Magic', 'Shoujo Ai', 'Police', 'Shounen Ai', 'Shoujo', 'Josei', 'Adventure', 'Thriller'}
Number of genre: 42


Unnamed: 0,anime_id,Name,Genres,sypnopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...


user_id with its corresponding index

In [22]:
import pickle

INPUT_DIR = './data'

with open(INPUT_DIR + "/user_with_vector_dict.pickle", 'rb') as f:
    user_with_vector_dict = pickle.load(f)

In [23]:
user_id2user_id_idx = {x: i for i, x in enumerate(user_with_vector_dict['user_id'])}
user_id_idx2user_id = {i: x for i, x in enumerate(user_with_vector_dict['user_id'])}

user feature matrix

In [24]:
user_feature_matrix = np.vstack(user_with_vector_dict['genre_vector'])

# User feature mapping

generate input feature

In [25]:
input_feature = {"Romance": 4, "School": 4, "Super Power": 2}
input_feature_np = np.zeros((genre_num), dtype=float)
for genre, score in input_feature.items():
    print(genre2genre_idx[genre])
    input_feature_np[genre2genre_idx[genre]] = score

18
9
5


In [26]:
most_similar_user_id = user_id_idx2user_id[np.argmax(np.inner(user_feature_matrix, input_feature_np))]
print(f"Most similar user_id: {most_similar_user_id}")

Most similar user_id: 27413


# Model predict

load encoder config

In [27]:
import json

INPUT_DIR = './model'

with open(INPUT_DIR + "/encode_config.json") as f:
    encoder_conig = json.load(f)

print(list(encoder_conig.keys()))

['user2user_encoded', 'user_encoded2user', 'anime2anime_encoded', 'anime_encoded2anime']


In [28]:
user2user_encoded = encoder_conig['user2user_encoded']
user_encoded2user = encoder_conig['user_encoded2user']
anime2anime_encoded = encoder_conig['anime2anime_encoded']
anime_encoded2anime = encoder_conig['anime_encoded2anime']

n_users = len(user2user_encoded)
n_animes = len(anime2anime_encoded)

field_dims = [n_users, n_animes]

print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))

Num of users: 2463, Num of animes: 15749


Import the model

In [29]:
import torch
from recanime.recommender.ranking_base_filter.model import FactorizationMachineModel

model = FactorizationMachineModel(
    field_dims=field_dims,
    embed_dim=32
)

model.load_state_dict(torch.load(INPUT_DIR + "/model_state_dict.pt"))
model.eval()

FactorizationMachineModel(
  (embedding): FeaturesEmbedding(
    (embedding): Embedding(18212, 32)
  )
  (linear): FeaturesLinear(
    (fc): Embedding(18212, 1)
  )
  (fm): FactorizationMachine()
)

Prefilter the animes by select the anime has the highest score feature.

In [14]:
# hightest_genre = sorted(input_feature.items(), key=lambda x: x[1], reverse=True)[0][0]
# prefilter_animes_df = animes_genres_df[['anime_id', 'Genres']].copy(deep=True)
# prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].str.split(pat=",", expand=False)
# prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].apply(lambda x: np.nan if hightest_genre not in x else x)
# prefilter_animes_df.dropna(inplace=True)

# prefilter_animes_df.head(5)

Select the animes with certain genre

In [30]:
select_genre = set(input_feature.keys())
prefilter_animes_df = animes_genres_df[['anime_id', 'Genres']].copy(deep=True)
prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].str.split(pat=",", expand=False)
prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].apply(lambda genres: [genre.strip() for genre in genres])
prefilter_animes_df['Genres'] = prefilter_animes_df['Genres'].apply(lambda x: np.nan if len(select_genre&set(x)) == 0 else x)
prefilter_animes_df.dropna(inplace=True)

prefilter_animes_df.head(5)

Unnamed: 0,anime_id,Genres
6,16,"[Comedy, Drama, Josei, Romance, Slice of Life]"
10,20,"[Action, Adventure, Comedy, Super Power, Marti..."
11,21,"[Action, Adventure, Comedy, Super Power, Drama..."
12,22,"[Action, Comedy, Sports, School, Shounen]"
14,24,"[Comedy, Romance, School, Shounen]"


In [31]:
prefilter_animes_dict = prefilter_animes_df.to_dict(orient='list')
prefilter_anime_id_list = prefilter_animes_dict['anime_id']

prefilter_anime_id_encode_list = []
anime_arg_id2_anime_id =  []
for anime_id in prefilter_anime_id_list:
    try:
        prefilter_anime_id_encode_list.append(anime2anime_encoded[str(anime_id)])
        anime_arg_id2_anime_id.append(anime_id)
    except:
        pass

load the data

In [32]:
most_similar_user_id_encode = user2user_encoded[str(most_similar_user_id)]
user_id_with_all_anime = [[most_similar_user_id_encode, anime_id_encode] for anime_id_encode in prefilter_anime_id_encode_list]
user_id_with_all_anime_t = torch.tensor(user_id_with_all_anime)

In [33]:
top_n_recommend = 10

output = model(user_id_with_all_anime_t)
output = output.squeeze()
sort_asc_score, sort_asc_score_idx = output.sort(descending=True)
sort_asc_score = sort_asc_score[:top_n_recommend].tolist()
sort_asc_score_idx = sort_asc_score_idx[:top_n_recommend].tolist()

In [34]:
top_n_anime_id = []
for idx in sort_asc_score_idx:
    top_n_anime_id.append(anime_arg_id2_anime_id[idx])
    
anime_id_with_score = pd.DataFrame({"anime_id": top_n_anime_id, "predict_score": sort_asc_score})

# Recommend Result (For input_feature = {"Romance": 4, "School": 4, "Super Power": 2})

In [35]:
show_result_df = pd.merge(anime_id_with_score, animes_genres_df, how="inner", on="anime_id").sort_values(by=["predict_score"], ascending=False)
show_result_df

Unnamed: 0,anime_id,predict_score,Name,Genres,sypnopsis
0,30276,0.910624,One Punch Man,"Action, Sci-Fi, Comedy, Parody, Super Power, S...",The seemingly ordinary and unimpressive Saitam...
1,4224,0.907217,Toradora!,"Slice of Life, Comedy, Romance, School",uuji Takasu is a gentle high school student wi...
2,5081,0.903895,Bakemonogatari,"Romance, Supernatural, Mystery, Vampire","Koyomi Araragi, a third-year high school stude..."
3,2904,0.897971,Code Geass: Hangyaku no Lelouch R2,"Action, Military, Sci-Fi, Super Power, Drama, ...","One year has passed since the Black Rebellion,..."
4,16498,0.892058,Shingeki no Kyojin,"Action, Military, Mystery, Super Power, Drama,...","Centuries ago, mankind was slaughtered to near..."
5,6547,0.889648,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",Otonashi awakens only to learn he is dead. A r...
6,15809,0.886228,Hataraku Maou-sama!,"Comedy, Demons, Supernatural, Romance, Fantasy","Striking fear into the hearts of mortals, the ..."
7,1575,0.88284,Code Geass: Hangyaku no Lelouch,"Action, Military, Sci-Fi, Super Power, Drama, ...","In the year 2010, the Holy Empire of Britannia..."
8,4282,0.873331,Kara no Kyoukai 5: Mujun Rasen,"Action, Mystery, Supernatural, Drama, Romance,...","In November 1998, a double homicide occurs at ..."
9,28171,0.871553,Shokugeki no Souma,"Ecchi, School, Shounen","Ever since he was a child, fifteen-year-old So..."
