In [86]:
import pandas as pd
import os
import numpy as np
import re

DATASET_PATH = os.path.join("datasets/real_set")
def load_dataset(FILE_NAME,CSV_PATH = DATASET_PATH):
    csv_path = os.path.join(CSV_PATH, FILE_NAME)
    return pd.read_csv(csv_path)
anime_infos = load_dataset("anime.csv")
my_ratings = load_dataset("my_rating.csv")

In [87]:
anime_infos.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [88]:
anime_infos.loc[(anime_infos["genre"]=="Hentai") & (anime_infos["episodes"]=="Unknown"),"episodes"] = "1"
anime_infos.loc[(anime_infos["type"]=="OVA") & (anime_infos["episodes"]=="Unknown"),"episodes"] = "1"

anime_infos.loc[(anime_infos["type"] == "Movie") & (anime_infos["episodes"] == "Unknown")] = "1"

### Episodes
Many animes have unknown episodes. Just make them the median of all animes

In [89]:
anime_infos["episodes"] = anime_infos["episodes"].map(lambda x:np.nan if x=="Unknown" else x)
anime_infos["episodes"].fillna(anime_infos["episodes"].median(),inplace = True)

### Rating 

Many animes have unknown ratings. These were filled with the median of the ratings.

In [90]:
anime_infos["rating"] = anime_infos["rating"].astype(float)
anime_infos["rating"].fillna(anime_infos["rating"].median(),inplace = True)

### Type 

Type category differentiates between movies, music, TV shows(regular anime episodes), OVA/ONA etc. These are categorical variables so I used ```pd.get_dummies``` to convert them to dummy variables.

In [91]:
pd.get_dummies(anime_infos[["type"]]).head()

Unnamed: 0,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV
0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,1
3,0,0,0,0,0,0,1
4,0,0,0,0,0,0,1


### Members

Just converted the strings to float.

In [92]:
anime_infos["members"] = anime_infos["members"].astype(float)

# Feature Selection and Preprocessing



Episode numbers, members and rating are different from categorical variables and very different in values. Rating ranges from 0-10 in the dataset while the episode number can be even 800+ episodes long when it comes to long running popular animes such as One Piece, Naruto etc. So I ended up using ```sklearn.preprocessing.MaxAbsScaler``` as it preserves the sparsity while scaing the values from 0-1.

In [93]:
anime_features = pd.concat([anime_infos["genre"].str.get_dummies(sep=","),pd.get_dummies(anime_infos[["type"]]),anime_infos[["rating"]],anime_infos[["members"]],anime_infos["episodes"]],axis=1)

In [94]:
anime_features.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,type_1,type_Movie,type_Music,type_ONA,type_OVA,type_Special,type_TV,rating,members,episodes
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,9.37,200630.0,1
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,9.26,793665.0,64
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.25,114262.0,51
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.17,673572.0,24
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,9.16,151266.0,51


In [95]:
anime_features.columns

Index([' Adventure', ' Cars', ' Comedy', ' Dementia', ' Demons', ' Drama',
       ' Ecchi', ' Fantasy', ' Game', ' Harem', ' Hentai', ' Historical',
       ' Horror', ' Josei', ' Kids', ' Magic', ' Martial Arts', ' Mecha',
       ' Military', ' Music', ' Mystery', ' Parody', ' Police',
       ' Psychological', ' Romance', ' Samurai', ' School', ' Sci-Fi',
       ' Seinen', ' Shoujo', ' Shoujo Ai', ' Shounen', ' Shounen Ai',
       ' Slice of Life', ' Space', ' Sports', ' Super Power', ' Supernatural',
       ' Thriller', ' Vampire', ' Yaoi', ' Yuri', '1', 'Action', 'Adventure',
       'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy',
       'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids',
       'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery',
       'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School',
       'Sci-Fi', 'Seinen', 'Shoujo', 'Shounen', 'Slice of Life', 'Space',
       'Sports', 'Super Power', 'Supernat

In [96]:
from sklearn.preprocessing import MaxAbsScaler

In [97]:
max_abs_scaler = MaxAbsScaler()
anime_features = max_abs_scaler.fit_transform(anime_features)

# KNN for finding similar animes

In [98]:
from sklearn.neighbors import NearestNeighbors

In [99]:
nbrs = NearestNeighbors(n_neighbors=8, algorithm='ball_tree').fit(anime_features)

In [100]:
distances, indices = nbrs.kneighbors(anime_features)

# Query examples and helper functions 

In [101]:
def get_index_from_name(name):
    return anime_infos[anime_infos["name"]==name].index.tolist()[0]
def get_index_from_id(anime_id):
    return anime_infos[anime_infos["anime_id"]==anime_id].index.tolist()[0]

In [102]:
get_index_from_name("Naruto")

841

In [129]:
indices[841]

array([ 841,  615,  175,  582,  206,  178, 1584,  260], dtype=int64)

Many anime names have not been documented properly and in many cases the names are in Japanese instead of English and the spelling is often different. For that reason I've also created another helper function ```get_id_from_partial_name``` to find out ids of the animes from part of names.

In [103]:
all_anime_names = list(anime_infos.name.values)

In [104]:
def get_id_from_partial_name(partial):
    for name in all_anime_names:
        if partial in name:
            print(name,all_anime_names.index(name))

In [148]:
""" print_similar_query can search for similar animes both by id and by name. """

def print_similar_animes(name = None, anime_id = None):
    if anime_id:
        id = anime_infos.index[anime_infos['anime_id'] == anime_id].tolist()
        id = id[0]
        for idx in indices[id][1:]:
            print(idx)
            print(anime_infos.loc[idx]["name"])
    elif name:
        found_id = get_index_from_name(name)
        for idx in indices[found_id][1:]:
            print(anime_infos.loc[idx]["name"])

In [115]:
def get_anime_name(anime_id):
    anime_name = anime_infos[anime_infos["anime_id"] == anime_id]["name"]
    return "" if len(anime_name) == 0 else anime_name.values[0]

In [163]:
def get_anime_rating(anime_id):
    rating = my_ratings_copy[my_ratings_copy["anime_id"] == anime_id]["rating"]
    return 0 if len(rating) == 0 or rating.values[0] == -1 else rating.values[0]

In [130]:
def get_all_anime_ids():
    return set(anime_infos.anime_id.values)

In [156]:
def get_anime_idx_by_id(anime_id):
    return anime_infos.index[anime_infos["anime_id"] == anime_id][0]

0

# Query Examples 

Kimi No Na Wa is a super popular movie that won many awards. All the recommendations for this movie is really good. I've seen Harmonie and Hotarubi no Mori e and can absolutely say both are amazing. The 1st recommendation also seems highly rated in IMDB and a fantasy film. Air + Clannad is also highly rated. 

See the wallpaper from Your name  vs the 1st choice "Kokoro ga Sakebitagatterunda". Both are school life based fantasy films focusing on romance.


![](https://images-na.ssl-images-amazon.com/images/M/MV5BODRmZDVmNzUtZDA4ZC00NjhkLWI2M2UtN2M0ZDIzNDcxYThjL2ltYWdlXkEyXkFqcGdeQXVyNTk0MzMzODA@._V1_UY1200_CR90,0,630,1200_AL_.jpg)


![](https://i.ytimg.com/vi/fnD8ABI4JrQ/maxresdefault.jpg)

In [149]:
print_similar_animes(anime_id = 32281)

0
Kimi no Na wa.
208
Kokoro ga Sakebitagatterunda.
1494
Harmonie
1959
Air Movie
60
Hotarubi no Mori e
2103
Clannad Movie
894
Momo e no Tegami
1199
&quot;Bungaku Shoujo&quot; Movie


In [117]:
print_similar_animes("Noragami")

Noragami Aragoto
JoJo no Kimyou na Bouken (TV)
JoJo no Kimyou na Bouken: Stardust Crusaders
JoJo no Kimyou na Bouken: Stardust Crusaders 2nd Season
Yumekui Merry
Allison to Lillia
Dororo to Hyakkimaru


In [118]:
print_similar_animes("Mushishi")

Mushishi Zoku Shou
Mushishi Zoku Shou 2nd Season
Mushishi Special: Hihamukage
Mushishi Zoku Shou: Odoro no Michi
Mushishi Zoku Shou: Suzu no Shizuku
Pandora Hearts
Mononoke


In [119]:
print_similar_animes("Gintama")

Gintama&#039;
Gintama°
Gintama&#039;: Enchousen
Gintama (2017)
Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare
Gintama Movie: Shinyaku Benizakura-hen
Gintama: Yorinuki Gintama-san on Theater 2D


In [120]:
print_similar_animes("Fairy Tail")

Fairy Tail (2014)
Magi: The Labyrinth of Magic
Magi: The Kingdom of Magic
Densetsu no Yuusha no Densetsu
Magi: Sinbad no Bouken (TV)
Toriko
MÄR


In [121]:
print_similar_animes(anime_id=23847)

Yahari Ore no Seishun Love Comedy wa Machigatteiru.
Kotoura-san
Otome wa Boku ni Koishiteru
Gekkan Shoujo Nozaki-kun
Chuunibyou demo Koi ga Shitai! Ren
Ore no Nounai Sentakushi ga, Gakuen Love Comedy wo Zenryoku de Jama Shiteiru
Onegai☆Teacher


<b>3.</b> Kimi no Na wa (Your Name)



# Algorithm goes vroom time

In [172]:
my_ratings["name"] = my_ratings.apply (lambda row: get_anime_name(row["anime_id"]), axis=1)
my_ratings_copy = my_ratings.copy()
my_ratings

Unnamed: 0,anime_id,rating,name
0,23847,10,Yahari Ore no Seishun Love Comedy wa Machigatt...
1,14813,10,Yahari Ore no Seishun Love Comedy wa Machigatt...
2,12403,10,Yuru Yuri♪♪
3,30902,10,Yuru Yuri Nachuyachumi!+
4,30279,10,Yuru Yuri San☆Hai!
5,10495,10,Yuru Yuri
6,5909,8,Seitokai no Ichizon
7,10464,8,Seitokai no Ichizon Lv.2
8,32603,8,Okusama ga Seitokaichou!+!
9,20847,10,Seitokai Yakuindomo*


In [173]:
def original_set():
    return set(my_ratings["anime_id"].values)
original_set = original_set()
current_set = original_set
my_ratings_copy.at[0,"rating"] = 10
my_ratings_copy

Unnamed: 0,anime_id,rating,name
0,23847,10,Yahari Ore no Seishun Love Comedy wa Machigatt...
1,14813,10,Yahari Ore no Seishun Love Comedy wa Machigatt...
2,12403,10,Yuru Yuri♪♪
3,30902,10,Yuru Yuri Nachuyachumi!+
4,30279,10,Yuru Yuri San☆Hai!
5,10495,10,Yuru Yuri
6,5909,8,Seitokai no Ichizon
7,10464,8,Seitokai no Ichizon Lv.2
8,32603,8,Okusama ga Seitokaichou!+!
9,20847,10,Seitokai Yakuindomo*


In [174]:
id_to_my_ratings_idx = dict()
id_to_anime_infos_idx = dict()

In [None]:
import time

def add_anime_iteration(all_anime_ids):
    for anime_id in all_anime_ids:
        if anime_id in original_set:
            continue #we don't wanna modify actual user's rating
            
        #since idx won't change, we put it in a dict for faster search time
        if anime_id in id_to_anime_infos_idx:
            anime_idx = id_to_anime_infos_idx[anime_id]
        else:
            anime_idx = get_anime_idx_by_id(anime_id)
            id_to_anime_infos_idx[anime_id] = anime_idx
        
        #get similar animes. note that these are the anime's idx, not anime_id
        similar_animes = indices[anime_idx].tolist()
        anime_distances = distances[anime_idx].tolist()
        #remove yourself
        if anime_idx in similar_animes:
            id_anime = similar_animes.index(anime_idx)
            similar_animes.pop(id_anime)
            anime_distances.pop(id_anime)
        else:
            similar_animes = similar_animes[:number_neighbors-1]
            anime_distances = anime_distances[:number_neighbors-1]
            
        #get the similar anime's id to obtain their rating in my_ratings
        similar_anime_id = [anime_infos.at[idx,"anime_id"] for idx in similar_animes]
        numerator = [anime_distances[i] * get_anime_rating(similar_anime_id[i]) for i in range(0,len(anime_distances))]
        denominator = [anime_distances[i] * (numerator[i] != 0) for i in range(0,len(numerator))]
        if sum(denominator) != 0:
            predicted_rating = sum(numerator)/sum(denominator)
            
            #if we haven't predict the rating of this anime before
            if anime_id not in current_set:
                id_to_my_ratings_idx[anime_id] = len(my_ratings_copy.index)
                #same thing, put the id-idx for my_ratings_copy into a dict for faster search time
                my_ratings_copy.loc[len(my_ratings_copy.index)] = [anime_id, predicted_rating, get_anime_name(anime_id)]
                current_set.add(anime_id)
            else:
                rating_idx = id_to_my_ratings_idx[anime_id]
                my_ratings_copy.at[rating_idx,"rating"] = predicted_rating
#timer
start = time.time()

cnt = 1
all_anime_ids = get_all_anime_ids()
length = len(current_set)
while len(current_set) < len(all_anime_ids):
    add_anime_iteration(all_anime_ids)
    if length == len(current_set):
        cnt += 1
    else:
        cnt = 1
    length = len(current_set)
    print(length)
    if cnt == 2:
        print("Try adding more of your own reviews")
        break

end = time.time()
print("Algorithms run in",end - start,"seconds")

736
2262
3942
5089
5607
5789
5861
5888


In [165]:
my_ratings_copy.sort_values(by="rating",ascending=False, inplace = True)
my_ratings_copy.nunique()

anime_id    5922
rating      1825
name        5922
dtype: int64

In [166]:
def get_top_n_animes(n = 20):
    cnt = 1
    i = 1
    while cnt <= n and i < len(my_ratings_copy) - 1:
        row = my_ratings_copy.loc[my_ratings_copy.index[i-1]]
        anime_id = row["anime_id"]
        if anime_id not in my_ratings.anime_id.values:
            print(str(cnt)+':',anime_id,row["name"], round(row["rating"]))
            cnt += 1
        i += 1
get_top_n_animes()

1: 570 Jin-Rou 10
2: 27789 Yodaka no Hoshi (Music) 10
3: 10036 Boku no Chikyuu wo Mamotte: Kiniro no Toki Nasarete 10
4: 8557 Shinryaku! Ika Musume 10
5: 2029 Sirius no Densetsu 10
6: 32804 Ainone 10
7: 1718 Winter Garden 10
8: 3558 Doukyuusei (OVA): Natsu no Owari ni 10
9: 25139 Oh! My Konbu 10
10: 28673 Duan Nao 10
11: 11101 Honoo no Alpenrose 10
12: 8713 Akai Ito 10
13: 1723 Clannad Movie 10
14: 30652 Long Riders! 10
15: 33911 Gakuen Handsome: Legend of Sexy 10
16: 7598 Loups=Garous 10
17: 32760 Bungaku Shounen no Yuuutsu 10
18: 32759 Kokuen 10
19: 3469 Maria-sama ga Miteru: Haru Specials 10
20: 6783 ef: A Tale of Memories. - Recollections 10


In [167]:
OUTPUT_PATH = os.path.join("datasets","output")
OUTPUT_FILE = os.path.join(OUTPUT_PATH,"anime_prediction_by_infos.csv")
my_ratings_copy.to_csv(OUTPUT_FILE,index = False, header = True)