In [1]:
import collections
from tqdm import tqdm
import statistics
import os
import copy
import math
import numpy as np
import csv

In [2]:
rows = []

anime_id_to_name = {}
anime_name_to_id = {}
anime_id_to_genres = {}

with open('myanimelist/anime.csv', newline='') as f:
    file_rows = csv.reader(f)
    first_row = True
    for row in file_rows:
        if first_row:
            first_row = False
        else:
            anime_id = int(row[0])
            anime_name = row[1]
            anime_genres = set()
            anime_genres_spaces = row[2].split(',')
            for genre in anime_genres_spaces:
                if not genre:
                    continue
                if genre[0] == ' ':
                    anime_genres.add(genre[1:])
                else:
                    anime_genres.add(genre)
                
            
            
            anime_id_to_name[anime_id] = anime_name
            anime_name_to_id[anime_name] = anime_id
            anime_id_to_genres[anime_id] = anime_genres   
            
anime_names = anime_name_to_id.keys()
anime_ids = anime_id_to_name.keys()

In [11]:
print(anime_id_to_name[list(anime_id_to_genres.keys())[618]],list(anime_id_to_genres.items())[618])

Zan Sayonara Zetsubou Sensei Bangaichi (7044, {'Comedy', 'Parody', 'School'})


In [12]:
all_genres_counter = collections.Counter([genre for anime_genres in anime_id_to_genres.values() for genre in anime_genres])

all_genres = set()
for s in anime_id_to_genres.values():
    all_genres |= s
    
anime_count = len(list(anime_ids))

In [13]:
print(all_genres_counter)

print()

print(all_genres)

print()

print(anime_count)

Counter({'Comedy': 4645, 'Action': 2845, 'Adventure': 2348, 'Fantasy': 2309, 'Sci-Fi': 2070, 'Drama': 2016, 'Shounen': 1711, 'Kids': 1609, 'Romance': 1464, 'School': 1220, 'Slice of Life': 1220, 'Hentai': 1141, 'Supernatural': 1037, 'Mecha': 944, 'Music': 860, 'Historical': 806, 'Magic': 778, 'Ecchi': 637, 'Shoujo': 603, 'Seinen': 547, 'Sports': 543, 'Mystery': 495, 'Super Power': 465, 'Military': 426, 'Parody': 408, 'Space': 381, 'Horror': 369, 'Harem': 317, 'Demons': 294, 'Martial Arts': 265, 'Dementia': 240, 'Psychological': 229, 'Police': 197, 'Game': 181, 'Samurai': 148, 'Vampire': 102, 'Thriller': 87, 'Cars': 72, 'Shounen Ai': 65, 'Shoujo Ai': 55, 'Josei': 54, 'Yuri': 42, 'Yaoi': 39})

{'Mecha', 'Music', 'Samurai', 'Horror', 'Shoujo', 'Game', 'Police', 'Hentai', 'Thriller', 'Space', 'Cars', 'Drama', 'Shounen Ai', 'Magic', 'Historical', 'Sports', 'Shounen', 'Yaoi', 'Harem', 'Fantasy', 'Supernatural', 'Romance', 'Adventure', 'Vampire', 'Comedy', 'Kids', 'Martial Arts', 'Dementia', 

In [16]:
def calculate_tfidf(anime):
    terms = list(anime_id_to_genres[anime])
    terms_len = len(terms)
    
    tfidfd = collections.defaultdict(lambda: 0.0)

    for genre in anime_id_to_genres[anime]:
        tfidfd[genre] = 1.0/terms_len * np.log10(anime_count/all_genres_counter[genre])

    ksum = sum(tfidfd[k] for k in tfidfd.keys())
    for k in tfidfd.keys():
        tfidfd[k] = tfidfd[k]/ksum
        
    return tfidfd

calculate_tfidf(7044)

defaultdict(<function __main__.calculate_tfidf.<locals>.<lambda>()>,
            {'Comedy': 0.145506621494283,
             'Parody': 0.5091206245854134,
             'School': 0.3453727539203035})

In [7]:
def calculate_cosine(anime1, anime2):
        
    # this is done so that a title is similar to itself even if it doesnt have any tags
    if anime1 == anime2:
        return 1
    
    anime1tfidf = calculate_tfidf(anime1)
    anime2tfidf = calculate_tfidf(anime2)
    
    licznik = 0.0
    for k in set(list(anime1tfidf.keys()) + list(anime2tfidf.keys())):
        licznik += anime1tfidf[k] * anime2tfidf[k]
    
    m1 = sum(anime1tfidf[k]**2.0 for k in anime1tfidf.keys())
    m2 = sum(anime2tfidf[k]**2.0 for k in anime2tfidf.keys())
    
    try:
        res = licznik/(math.sqrt(m1) * math.sqrt(m2))
        if not math.isnan(res):
            return res
        else:
            return -1
    except:
        return -1

In [19]:
atmp = []
for anime in anime_ids:
    atmp.append(( 
        calculate_cosine(
            anime, 
            6675
        ), anime,anime_id_to_name[anime],anime_id_to_genres[anime])
    )
    
for e in sorted(atmp, reverse=True):
    print(e)

  res = licznik/(math.sqrt(m1) * math.sqrt(m2))


(1, 6675, 'Redline', {'Action', 'Sci-Fi', 'Sports', 'Cars'})
(0.9337097588024856, 11483, 'Redline Pilot', {'Sports', 'Cars'})
(0.9156056881079427, 10562, 'Machine Hayabusa', {'Action', 'Shounen', 'Sports', 'Cars'})
(0.8941622712299346, 3800, 'Arrow Emblem Grand Prix no Taka', {'Sports', 'Cars', 'Drama'})
(0.8872122881978289, 20115, 'Gekisou! Rubenkaiser', {'Sports', 'Shounen', 'Cars'})
(0.8872122881978289, 7048, 'Crush Gear Nitro', {'Sports', 'Shounen', 'Cars'})
(0.8872122881978289, 6055, 'F', {'Sports', 'Shounen', 'Cars'})
(0.8872122881978289, 3808, 'Futari Daka', {'Sports', 'Shounen', 'Cars'})
(0.8872122881978289, 2709, 'Bakusou Kyoudai Let&#039;s &amp; Go MAX', {'Sports', 'Shounen', 'Cars'})
(0.8872122881978289, 2706, 'Bakusou Kyoudai Let&#039;s &amp; Go WGP', {'Sports', 'Shounen', 'Cars'})
(0.8872122881978289, 388, 'Capeta', {'Sports', 'Shounen', 'Cars'})
(0.8748175390243395, 16331, 'Next A-Class', {'Action', 'Sci-Fi', 'Cars'})
(0.8748175390243395, 9881, 'Chou Supercar Gattiger', {

(0.08560227232727982, 9837, 'Emblem Take 2', {'Action', 'Seinen', 'Drama'})
(0.08558566044597075, 28045, 'Kogane no Hana', {'Action', 'Adventure', 'Fantasy', 'Historical'})
(0.08558566044597075, 1827, 'Seirei no Moribito', {'Action', 'Adventure', 'Fantasy', 'Historical'})
(0.08557335236101915, 1086, 'Kenran Butou Sai: The Mars Daybreak', {'Mecha', 'Shounen', 'Military', 'Romance', 'Adventure', 'Sci-Fi', 'Comedy'})
(0.08556217765340532, 4211, 'Hareluya II Boy', {'Action', 'School', 'Shounen', 'Fantasy', 'Comedy'})
(0.08530959941561721, 3673, 'Nijuu Mensou no Musume', {'Action', 'Adventure', 'Mystery'})
(0.08527270666712136, 4154, 'Time Bokan Series: Yattodetaman', {'Action', 'Mecha', 'Fantasy', 'Adventure', 'Comedy'})
(0.08524493014413967, 9170, 'Kimi ga Nozomu Eien: Gundam Parody', {'Sci-Fi', 'Mecha', 'Parody', 'Space'})
(0.0851715347881324, 1780, 'Choukou Tenshi Escalayer', {'Super Power', 'Sci-Fi', 'Hentai', 'Demons'})
(0.085165682284968, 2044, 'Mahou no Tenshi Creamy Mami', {'School

(0.0, 25979, 'Sheep in the Island', {'Comedy'})
(0.0, 25975, 'I Love Sky', {'Kids', 'Comedy'})
(0.0, 25973, 'I Love Picnic', {'Kids', 'Comedy'})
(0.0, 25971, 'Angel (Special)', {'Dementia'})
(0.0, 25969, 'Clock', {'Dementia'})
(0.0, 25965, 'Backkom 2', {'Kids', 'Comedy'})
(0.0, 25963, 'Backkom Meogeujan Yeohaeng', {'Adventure', 'Kids', 'Magic', 'Comedy'})
(0.0, 25943, 'Hacka Doll', {'Fantasy'})
(0.0, 25941, 'Peeping Life: The Perfect Explosion Specials', {'Slice of Life', 'Comedy'})
(0.0, 25923, 'Oppai Gakuen Marching Band-bu!', {'Hentai'})
(0.0, 25921, 'Koori no Kuni no Misuke', {'Adventure', 'Kids'})
(0.0, 25915, 'Sakura Capusule', {'Kids', 'Slice of Life'})
(0.0, 25897, 'Love Live! School Idol Project: μ&#039;s →NEXT LoveLive! 2014 - Endless Parade Makuai Drama', {'Comedy'})
(0.0, 25883, 'Taisei Kensetsu: Vietnam Noi Bai Kuukou', {'Drama'})
(0.0, 25879, 'Working!!!', {'Slice of Life', 'Comedy', 'Romance'})
(0.0, 25877, 'Demonion: Gaiden', {'Hentai'})
(0.0, 25875, 'Wake Up, Girl Zoo!

(0.0, 6231, 'Detective Conan Magic File 3: Shinichi and Ran - Memories of Mahjong Tiles and Tanabata', {'Mystery', 'Shounen'})
(0.0, 6227, 'Lovedol: Lovely Idol OVA', {'Ecchi', 'Drama', 'Music', 'Harem', 'Comedy'})
(0.0, 6220, 'Musuko no Tomodachi ni Okasarete', {'Hentai'})
(0.0, 6219, 'Nijiiro Hotaru: Eien no Natsuyasumi', {'Fantasy', 'Slice of Life', 'Romance'})
(0.0, 6217, 'Crayon Shin-chan Movie 03: Unkokusai no Yabou', {'Slice of Life', 'Ecchi', 'School', 'Kids', 'Shounen', 'Comedy'})
(0.0, 6211, 'Tokyo Magnitude 8.0', {'Drama'})
(0.0, 6209, 'Macross 7 Plus', {'Mecha', 'Music', 'Slice of Life'})
(0.0, 6206, 'La Vilaine LuLu', {'Comedy'})
(0.0, 6203, 'Sasameki Koto', {'Shoujo Ai', 'School', 'Comedy', 'Romance'})
(0.0, 6202, 'Daisetsusan no Yuusha Kibaou', {'Adventure', 'Drama'})
(0.0, 6201, 'Princess Lover!', {'School', 'Comedy', 'Ecchi', 'Harem'})
(0.0, 6199, 'Mai-Otome Zwei Special', {'Magic'})
(0.0, 6198, 'Detective Conan OVA 08: High School Girl Detective Sonoko Suzuki&#039;s C

In [93]:
# idk if it makes sens to have this function, 
# we could maybe merge functions i.e calculate_cosine from here and danbooru notebook
def calculate_series_similarity(series1, series2):
    return calculate_cosine(series1, series2)

In [94]:
# example
series1 = anime_name_to_id['Kimi no Na wa.']
atmp = []
for series2 in tqdm(anime_ids):
    a = calculate_series_similarity(series1, series2)
    atmp.append((a, series2))
    
for w in sorted(atmp, reverse=True):
    print(w)

  res = licznik/(math.sqrt(m1) * math.sqrt(m2))
100%|██████████| 12294/12294 [00:00<00:00, 39124.66it/s]


(1.0000000000000002, 547)
(1.0000000000000002, 546)
(1, 32281)
(0.9762646819216453, 14669)
(0.8920885169212823, 6572)
(0.8920885169212823, 2787)
(0.8920885169212823, 355)
(0.8852788284868801, 32262)
(0.8852788284868801, 26019)
(0.8745052988110293, 20903)
(0.8745052988110293, 10067)
(0.8682551245974519, 20517)
(0.8682551245974519, 18195)
(0.8682551245974519, 16001)
(0.8682551245974519, 11887)
(0.8682551245974519, 2167)
(0.8501304920850693, 2105)
(0.8501304920850693, 1607)
(0.8501304920850693, 1039)
(0.8501304920850693, 713)
(0.8340786765995449, 31716)
(0.8260378704157191, 28725)
(0.8260378704157191, 18053)
(0.8260378704157191, 18045)
(0.8260378704157191, 17585)
(0.8260378704157191, 12175)
(0.8260378704157191, 9988)
(0.8260378704157191, 8481)
(0.8260378704157191, 6351)
(0.8260378704157191, 2927)
(0.8260378704157191, 2926)
(0.8260378704157191, 2179)
(0.8260378704157191, 2129)
(0.8260378704157191, 1624)
(0.8260378704157191, 756)
(0.7983132125787217, 34106)
(0.7983132125787217, 31610)
(0.79

(0.20043794024625256, 2130)
(0.2003074694403923, 3992)
(0.20028853030432536, 3630)
(0.19992196447784671, 10740)
(0.19992196447784671, 9917)
(0.19992196447784671, 2813)
(0.19992196447784671, 1121)
(0.19992196447784671, 1120)
(0.19992196447784671, 1119)
(0.19992196447784671, 1117)
(0.1998959273266102, 32016)
(0.19985756689904913, 32574)
(0.19985756689904913, 30915)
(0.19985756689904913, 24439)
(0.19984601820988726, 18039)
(0.19981686146233996, 18631)
(0.19981686146233996, 3077)
(0.19981686146233996, 1608)
(0.19967010284612188, 1702)
(0.19966127852734533, 10218)
(0.19961210828175058, 3813)
(0.1994889792084234, 11917)
(0.1994889792084234, 9890)
(0.1994889792084234, 7655)
(0.1994889792084234, 6076)
(0.19941594881102773, 1805)
(0.1993971666686472, 30485)
(0.1993971666686472, 4975)
(0.19929743362342778, 916)
(0.19928024691087673, 2000)
(0.19928024691087673, 766)
(0.19887852880828547, 34151)
(0.1988682262271723, 3178)
(0.19884909470208867, 1557)
(0.19862931889801272, 25437)
(0.1986293188980127

(0.0, 23733)
(0.0, 23731)
(0.0, 23729)
(0.0, 23727)
(0.0, 23725)
(0.0, 23723)
(0.0, 23721)
(0.0, 23719)
(0.0, 23713)
(0.0, 23709)
(0.0, 23707)
(0.0, 23703)
(0.0, 23699)
(0.0, 23697)
(0.0, 23679)
(0.0, 23677)
(0.0, 23675)
(0.0, 23665)
(0.0, 23661)
(0.0, 23651)
(0.0, 23647)
(0.0, 23645)
(0.0, 23643)
(0.0, 23641)
(0.0, 23637)
(0.0, 23635)
(0.0, 23633)
(0.0, 23627)
(0.0, 23619)
(0.0, 23617)
(0.0, 23613)
(0.0, 23611)
(0.0, 23609)
(0.0, 23607)
(0.0, 23605)
(0.0, 23597)
(0.0, 23595)
(0.0, 23575)
(0.0, 23569)
(0.0, 23555)
(0.0, 23551)
(0.0, 23539)
(0.0, 23537)
(0.0, 23523)
(0.0, 23519)
(0.0, 23517)
(0.0, 23515)
(0.0, 23511)
(0.0, 23487)
(0.0, 23483)
(0.0, 23479)
(0.0, 23477)
(0.0, 23475)
(0.0, 23459)
(0.0, 23439)
(0.0, 23433)
(0.0, 23427)
(0.0, 23425)
(0.0, 23421)
(0.0, 23409)
(0.0, 23399)
(0.0, 23393)
(0.0, 23387)
(0.0, 23383)
(0.0, 23375)
(0.0, 23369)
(0.0, 23365)
(0.0, 23361)
(0.0, 23359)
(0.0, 23349)
(0.0, 23347)
(0.0, 23345)
(0.0, 23343)
(0.0, 23341)
(0.0, 23333)
(0.0, 23327)
(0.0, 23325)

(0.0, 6841)
(0.0, 6840)
(0.0, 6839)
(0.0, 6838)
(0.0, 6837)
(0.0, 6836)
(0.0, 6835)
(0.0, 6834)
(0.0, 6833)
(0.0, 6830)
(0.0, 6829)
(0.0, 6828)
(0.0, 6827)
(0.0, 6823)
(0.0, 6822)
(0.0, 6809)
(0.0, 6802)
(0.0, 6800)
(0.0, 6798)
(0.0, 6797)
(0.0, 6796)
(0.0, 6795)
(0.0, 6794)
(0.0, 6793)
(0.0, 6792)
(0.0, 6779)
(0.0, 6777)
(0.0, 6772)
(0.0, 6771)
(0.0, 6769)
(0.0, 6768)
(0.0, 6762)
(0.0, 6761)
(0.0, 6760)
(0.0, 6759)
(0.0, 6758)
(0.0, 6749)
(0.0, 6748)
(0.0, 6743)
(0.0, 6741)
(0.0, 6735)
(0.0, 6734)
(0.0, 6731)
(0.0, 6730)
(0.0, 6727)
(0.0, 6721)
(0.0, 6718)
(0.0, 6714)
(0.0, 6713)
(0.0, 6705)
(0.0, 6704)
(0.0, 6702)
(0.0, 6701)
(0.0, 6695)
(0.0, 6694)
(0.0, 6693)
(0.0, 6692)
(0.0, 6691)
(0.0, 6690)
(0.0, 6689)
(0.0, 6688)
(0.0, 6687)
(0.0, 6686)
(0.0, 6685)
(0.0, 6684)
(0.0, 6675)
(0.0, 6674)
(0.0, 6672)
(0.0, 6671)
(0.0, 6670)
(0.0, 6667)
(0.0, 6666)
(0.0, 6658)
(0.0, 6657)
(0.0, 6654)
(0.0, 6641)
(0.0, 6636)
(0.0, 6635)
(0.0, 6634)
(0.0, 6633)
(0.0, 6630)
(0.0, 6629)
(0.0, 6628)
(0.0

In [96]:
# this is being calculated suspiciously fast but okay, we only have at max a few genres per series
for series1 in tqdm(anime_ids):
    for series2 in anime_ids:
        a = calculate_series_similarity(series1, series2)

  res = licznik/(math.sqrt(m1) * math.sqrt(m2))
  0%|          | 8/12294 [00:02<1:14:58,  2.73it/s]


KeyboardInterrupt: 