# **The Genres of animes**

In [1]:
%cd ..

/mnt/c/Users/Yoshi/source/repos/Sideprojects/Anime_Recommender_system


# Load the library

In [2]:
import numpy as np

# load the anime data with its genre.

In [3]:
import pandas as pd

INPUT_DIR = './data'

animes_genres_df = pd.read_csv(
    INPUT_DIR + '/anime_with_synopsis.csv', 
    low_memory=False, 
    usecols=["MAL_ID", "Name", "Genres"]
)
animes_genres_df.rename({"MAL_ID": "anime_id"}, inplace=True, axis=1)

anime_num = len(animes_genres_df["anime_id"].unique())
print(f"Number of anime's types: {anime_num}")

genre_seperate = animes_genres_df["Genres"].str.split(pat=",", expand=True)
genre_series = pd.concat([genre_seperate[col] for col in genre_seperate.columns])
genre_set = set([item.strip() for item in genre_series.unique().tolist() if item is not None])
print(f"Set of genres: {genre_set}")
genre_num = len(genre_set)
print(f"Number of genre: {genre_num}")

animes_genres_df.head(5)

Number of anime's types: 16214
Set of genres: {'Vampire', 'Seinen', 'School', 'Magic', 'Police', 'Parody', 'Mecha', 'Space', 'Comedy', 'Ecchi', 'Shounen', 'Romance', 'Horror', 'Game', 'Drama', 'Slice of Life', 'Demons', 'Historical', 'Adventure', 'Martial Arts', 'Supernatural', 'Josei', 'Kids', 'Mystery', 'Super Power', 'Samurai', 'Unknown', 'Shoujo', 'Cars', 'Sci-Fi', 'Military', 'Music', 'Thriller', 'Fantasy', 'Yaoi', 'Psychological', 'Harem', 'Shoujo Ai', 'Action', 'Sports', 'Shounen Ai', 'Dementia'}
Number of genre: 42


Unnamed: 0,anime_id,Name,Genres
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space"
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen"
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ..."
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural"


In [4]:
genre2genre_idx = {x: i for i, x in enumerate(genre_set)}
genre_idx2genre = {i: x for i, x in enumerate(genre_set)}

# load the preprocessed user data with anime rating.

In [5]:
rating_df = pd.read_csv(INPUT_DIR + '/preprocessed_animelist.csv', 
                        low_memory=False, 
                        usecols=["user_id", "anime_id", "rating"]
                        )

rating_df.head(5)

Unnamed: 0,user_id,anime_id,rating
0,279596,2695,0.0
1,121782,15505,0.0
2,160479,39085,0.0
3,53950,32245,0.0
4,86402,35860,0.8


# Join two different table

In [6]:
rating_with_genres_df = rating_df.merge(animes_genres_df, on="anime_id", how="inner")
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres
0,279596,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
1,297666,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
2,251620,2695,0.6,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
3,152637,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"
4,74911,2695,0.0,Mobile Suit Gundam 0083: The Fading Light of Zeon,"Action, Sci-Fi, Adventure, Space, Mecha"


In [7]:
rating_with_genres_df = rating_with_genres_df[rating_with_genres_df['rating'] > 0]
# rating_with_genres_df.set_index(['user_id'], inplace=True)
rating_with_genres_df['Genres'] = rating_with_genres_df['Genres'].str.split(pat=",", expand=False)
rating_with_genres_df.head(5)

Unnamed: 0,user_id,anime_id,rating,Name,Genres
2,251620,2695,0.6,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
13,55748,2695,0.7,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
16,90350,2695,0.5,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
17,196906,2695,0.5,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"
18,310955,2695,0.6,Mobile Suit Gundam 0083: The Fading Light of Zeon,"[Action, Sci-Fi, Adventure, Space, Mecha]"


# Convert the table to the user_id with corresponded feature representation.

In [8]:
rating_with_genres_dict = rating_with_genres_df.to_dict(orient='list')
genres_list = rating_with_genres_dict.pop('Genres')
rating_list = rating_with_genres_dict.pop('rating')

In [9]:
from tqdm import tqdm

genre_feature_np = []

for i, genres in enumerate(tqdm(genres_list)):
    feature_np = np.zeros((genre_num), dtype=float)
    for genre in genres:
        genre = genre.strip()
        feature_np[genre2genre_idx[genre]] = rating_list[i]
    genre_feature_np.append(feature_np)

rating_with_genres_dict['genre_vector'] = genre_feature_np

100%|██████████| 3405589/3405589 [00:10<00:00, 335556.31it/s]


In [22]:
user_with_genre_vector = pd.DataFrame(rating_with_genres_dict)
user_with_vector = user_with_genre_vector.groupby('user_id')[['genre_vector']].agg(lambda x: x.sum(axis=0))
user_with_vector['genre_vector'] = user_with_vector['genre_vector'].transform(lambda x: x/np.linalg.norm(x))
user_with_vector.head(5)

Unnamed: 0_level_0,genre_vector
user_id,Unnamed: 1_level_1
146,"[0.02133620137755008, 0.07410005073014013, 0.3..."
240,"[0.029401839550478273, 0.06558871899722075, 0...."
436,"[0.0236820575402239, 0.08513291374373591, 0.32..."
446,"[0.02361067826558718, 0.11031885879265745, 0.2..."
781,"[0.01581213518904316, 0.12810757676308096, 0.2..."
