In [1]:
import requests
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.metrics import DistanceMetric
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# AniList GraphQL endpoint; find_data() POSTs its query to this URL.
url = 'https://graphql.anilist.co'

In [3]:
def find_data(year):
    """Fetch the most popular anime of one season year from AniList.

    Sends a single GraphQL request (page 1, 50 entries, sorted by popularity
    descending) and returns the result as a cleaned DataFrame.

    Parameters
    ----------
    year : int
        Value passed to the query's ``seasonYear`` filter.

    Returns
    -------
    pandas.DataFrame
        Columns: ``title.romaji``, ``title.english``, ``genres``, ``season``,
        ``seasonYear``, ``popularity``, ``averageScore``, ``title`` and
        ``score_norm``. ``title`` is the English title, falling back to the
        romaji one. ``score_norm`` is ``averageScore`` min-max scaled to
        [0, 1] *within this call's rows only*. The frame is empty (but has
        all columns) when the API returns no media.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status (e.g. rate limiting).
    RuntimeError
        If the response body carries GraphQL ``errors`` or no ``data``.
    """
    query = """
    query ($year: Int, $page: Int, $perPage: Int) {
      Page(page: $page, perPage: $perPage) {
        media(seasonYear: $year, type: ANIME, sort: POPULARITY_DESC) {
          id
          title {
            romaji
            english
          }
          genres
          season
          seasonYear
          averageScore
          episodes
          popularity
        }
      }
    }
    """

    variables = {
        'year': year,
        'page': 1,
        'perPage': 50
    }

    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.post(
        url,
        json={'query': query, 'variables': variables},
        timeout=30,
    )
    # Surface HTTP failures here instead of as an opaque KeyError further down.
    response.raise_for_status()

    payload = response.json()
    if payload.get('errors') or not payload.get('data'):
        raise RuntimeError(
            f"AniList returned no usable data for {year}: {payload.get('errors')}"
        )
    anime_data = (payload['data'].get('Page') or {}).get('media') or []

    # Keep useful columns. reindex (rather than df[[...]]) tolerates an empty
    # result or a column that json_normalize did not produce.
    keep = ['title.romaji', 'title.english', 'genres', 'season',
            'seasonYear', 'popularity', 'averageScore']
    df = pd.json_normalize(anime_data).reindex(columns=keep)

    # Remove entries without genre data; copy so the column assignments below
    # act on an independent frame.
    df = df.dropna(subset=['genres']).copy()

    # Create a simplified title column (English first, romaji as fallback).
    df['title'] = df['title.english'].fillna(df['title.romaji'])
    df = df.dropna(subset=['title']).reset_index(drop=True)

    # Fill missing scores with the mean of this batch.
    df['averageScore'] = pd.to_numeric(df['averageScore'], errors='coerce')
    df['averageScore'] = df['averageScore'].fillna(df['averageScore'].mean())

    # Min-max normalize. When every score is identical (or all are missing)
    # the span is 0/NaN; fall back to 0.0 instead of emitting NaN, which would
    # otherwise propagate into every similarity value.
    # NOTE(review): scaling is per call, i.e. per year, so score_norm values
    # from different years are not on a common scale once concatenated.
    lowest = df['averageScore'].min()
    span = df['averageScore'].max() - lowest
    if pd.notna(span) and span > 0:
        df['score_norm'] = (df['averageScore'] - lowest) / span
    else:
        df['score_norm'] = 0.0

    return df


In [4]:
def get_anime_data_range(start_year, end_year):
    """Fetch and stack the per-year anime tables for an inclusive year range.

    Calls ``find_data`` once per year. A year whose fetch raises is reported
    on stdout and skipped (best effort), so the result may cover fewer years
    than requested.

    Parameters
    ----------
    start_year, end_year : int
        First and last season year to fetch, both inclusive.

    Returns
    -------
    pandas.DataFrame
        All successfully fetched years concatenated with a fresh 0..n-1 index.
        If nothing could be fetched (every year failed, or
        ``start_year > end_year``) an empty frame with the expected columns is
        returned instead of raising.
    """
    yearly_frames = []
    for y in range(start_year, end_year + 1):
        try:
            yearly_frames.append(find_data(y))
        except Exception as e:
            print(f"Error fetching {y}: {e}")

    if not yearly_frames:
        # pd.concat([]) raises ValueError; hand back an empty table that
        # mirrors the columns find_data produces so downstream cells still run.
        return pd.DataFrame(columns=[
            'title.romaji', 'title.english', 'genres', 'season', 'seasonYear',
            'popularity', 'averageScore', 'title', 'score_norm',
        ])

    return pd.concat(yearly_frames, ignore_index=True)

In [5]:
# Manual smoke tests -- uncomment a line to try an individual call.
# df = find_data(2025)
# df.head(15)
# anime_df = get_anime_data_range(2018, 2024)
# anime_df = get_anime_data_range(2005, 2025)
# anime_df.sort_values(by='popularity', ascending=False).head(50)

In [6]:
# jaccard similarity
def jaccard_similarity(genres1, genres2):
    """Return the Jaccard index of two genre collections.

    Both arguments are converted to sets, so duplicates and ordering are
    ignored. The result is ``|A & B| / |A | B|``; when both collections are
    empty the function returns 0 rather than dividing by zero.
    """
    first = set(genres1)
    second = set(genres2)
    combined = first.union(second)
    if not combined:
        return 0
    shared = first.intersection(second)
    return len(shared) / len(combined)

In [7]:
# build the pairwise similarity matrix (genre overlap + score closeness)
def compute_similarity(df, weight_genre=0.8, weight_score=0.2):
    """Build the pairwise similarity matrix for every row of ``df``.

    Entry ``[i, j]`` is
    ``weight_genre * jaccard(genres_i, genres_j)
      + weight_score * (1 - |score_norm_i - score_norm_j|)``,
    where ``i`` and ``j`` are *positional* row numbers (0..n-1), independent
    of the frame's index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``genres`` column (an iterable of hashable genre names per
        row) and a numeric ``score_norm`` column.
    weight_genre, weight_score : float
        Weights of the genre-overlap and score-closeness terms.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` float matrix; symmetric by construction.
    """
    # Pull the columns out positionally. The previous df['genres'][i] lookup
    # was label-based and failed (or silently misaligned) on any frame whose
    # index is not exactly 0..n-1, e.g. after filtering or sorting.
    genre_sets = [set(genres) for genres in df['genres'].tolist()]
    scores = df['score_norm'].to_numpy(dtype=float)
    n = len(genre_sets)

    # Multi-hot encode the genres so all pairwise set sizes come from one
    # matrix product instead of n*n Python-level set operations.
    vocabulary = {genre: col for col, genre in enumerate(set().union(*genre_sets))}
    membership = np.zeros((n, len(vocabulary)), dtype=np.int64)
    for row, genres in enumerate(genre_sets):
        for genre in genres:
            membership[row, vocabulary[genre]] = 1

    intersection = membership @ membership.T
    sizes = membership.sum(axis=1)
    union = sizes[:, None] + sizes[None, :] - intersection

    # Jaccard index; pairs with an empty union (both rows have no genres)
    # keep the 0 preset in `out`.
    genre_sim = np.divide(
        intersection, union, out=np.zeros((n, n)), where=union > 0
    )

    # Score closeness: 1 when scores are equal, 0 when they are one unit apart.
    score_sim = 1.0 - np.abs(scores[:, None] - scores[None, :])

    return (weight_genre * genre_sim) + (weight_score * score_sim)

In [8]:
# look up the titles most similar to a given one
def top_similar(df, sim_matrix, title, n=10):
    """Return the ``n`` rows of ``df`` most similar to ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``title``, ``genres`` and ``score_norm`` columns; row
        order must match the rows/columns of ``sim_matrix``.
    sim_matrix : numpy.ndarray
        Square similarity matrix addressed by row *position* in ``df``.
    title : str
        Exact title to look up. If several rows share it, the first is used.
    n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    pandas.DataFrame or None
        Up to ``n`` rows (``title``, ``genres``, ``score_norm``) ordered from
        most to least similar, never including the query row itself. ``None``
        (after printing a message) when ``title`` is not in ``df``.
    """
    # Work with positions, not index labels: sim_matrix and .iloc are both
    # positional, so a label would point at the wrong row on any frame whose
    # index is not 0..n-1.
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        print(f"'{title}' not found in dataset.")
        return

    pos = matches[0]

    # Rank all rows by descending similarity (stable, so ties keep row order).
    ranked = np.argsort(-np.asarray(sim_matrix[pos]), kind='stable')

    # Drop the query row explicitly. Slicing off "the first hit" is wrong when
    # another row ties with it (identical genres and score), because the query
    # is then not guaranteed to be ranked first.
    similar_positions = ranked[ranked != pos][:n]

    result = df.iloc[similar_positions][['title', 'genres', 'score_norm']]
    return result

In [9]:
# Fetch every season year from 2005 through 2025 (inclusive), one API request
# per year; years whose fetch fails are reported and skipped.
anime_df = get_anime_data_range(2005, 2025)
# `df` is a second name for the same frame (no copy is made).
df = anime_df
# Pairwise similarity with the default weights (0.8 genre, 0.2 score).
sim_matrix = compute_similarity(df)

In [10]:
# Titles to look up below; each must match a value in df['title'] exactly.
queries = ['Demon Slayer: Kimetsu no Yaiba', 'Golden Time', 'Free! -Iwatobi Swim Club-']

In [11]:
# Neighbours of the first query title. top_similar picks the 10 most similar
# rows; they are then re-ordered by normalized score for display.
# NOTE: top_similar returns None for an unknown title, in which case the
# .sort_values call below raises AttributeError.
anime = queries[0]
print(anime)
top_similar(df, sim_matrix, anime).sort_values(by='score_norm', ascending=False)

Demon Slayer: Kimetsu no Yaiba


Unnamed: 0,title,genres,score_norm
1007,Demon Slayer: Kimetsu no Yaiba Infinity Castle,"[Action, Adventure, Drama, Fantasy, Supernatural]",0.966667
100,Naruto: Shippuden,"[Action, Adventure, Comedy, Drama, Fantasy, Su...",0.916667
751,Demon Slayer -Kimetsu no Yaiba- The Movie: Mug...,"[Action, Adventure, Drama, Fantasy, Mystery, S...",0.88
806,Demon Slayer: Kimetsu no Yaiba Mugen Train Arc,"[Action, Adventure, Drama, Fantasy, Mystery, S...",0.815789
21,Pokémon: Lucario and the Mystery of Mew,"[Action, Adventure, Drama, Fantasy]",0.787879
953,Demon Slayer: Kimetsu no Yaiba Hashira Trainin...,"[Action, Adventure, Drama, Fantasy, Supernatural]",0.78125
824,JoJo's Bizarre Adventure: STONE OCEAN,"[Action, Adventure, Drama, Supernatural]",0.763158
711,Dororo,"[Action, Adventure, Drama, Supernatural]",0.75
902,Demon Slayer: Kimetsu no Yaiba Swordsmith Vill...,"[Action, Adventure, Drama, Fantasy, Supernatural]",0.642857
95,BLEACH: The Sealed Sword Frenzy,"[Action, Adventure, Drama, Fantasy, Supernatural]",0.615385


In [12]:
# Neighbours of the second query title, re-ordered by normalized score.
# NOTE: top_similar returns None for an unknown title, in which case the
# .sort_values call below raises AttributeError.
anime = queries[1]
print(anime)
top_similar(df, sim_matrix, anime).sort_values(by='score_norm', ascending=False)

Golden Time


Unnamed: 0,title,genres,score_norm
427,The Wind Rises,"[Drama, Romance]",0.813953
32,Peach Girl: Super Pop Love Hurricane,"[Drama, Romance, Slice of Life]",0.69697
649,Welcome to the Ballroom,"[Drama, Romance, Sports]",0.69697
80,Strawberry Panic,"[Drama, Romance]",0.653846
43,School Days,"[Drama, Romance]",0.484848
137,Myself; Yourself,"[Drama, Romance]",0.472222
184,True Tears,"[Drama, Romance]",0.387097
370,"Say ""I love you"".","[Drama, Romance]",0.37931
638,LOVE and LIES,"[Drama, Romance]",0.151515
683,Citrus,"[Drama, Romance]",0.0


In [13]:
# Neighbours of the third query title, re-ordered by normalized score.
# NOTE: top_similar returns None for an unknown title, in which case the
# .sort_values call below raises AttributeError.
anime = queries[2]
print(anime)
top_similar(df, sim_matrix, anime).sort_values(by='score_norm', ascending=False)

Free! -Iwatobi Swim Club-


Unnamed: 0,title,genres,score_norm
668,Grand Blue Dreaming,"[Comedy, Slice of Life, Sports]",0.88
218,Hajime no Ippo: The Fighting! New Challenger,"[Comedy, Drama, Sports]",0.864865
482,SHIROBAKO,"[Comedy, Drama, Slice of Life]",0.85
757,HAIKYU!! TO THE TOP,"[Comedy, Drama, Sports]",0.84
1040,Grand Blue Dreaming Season 2,"[Comedy, Slice of Life, Sports]",0.833333
506,HAIKYU!! 2nd Season,"[Comedy, Drama, Sports]",0.806452
786,HAIKYU!! LAND VS. AIR,"[Comedy, Drama, Sports]",0.64
326,Hanasaku Iroha ~Blossoms for Tomorrow~,"[Comedy, Drama, Slice of Life]",0.612903
565,Yuri!!! on ICE,"[Comedy, Drama, Sports]",0.576923
491,Free! -Eternal Summer-,"[Comedy, Drama, Slice of Life, Sports]",0.45
