In [None]:
# default_exp datasets.internals.MangaDex

# MangaDexClient

This class defines some wrappers for the MangaDex API (https://api.mangadex.org/docs.html), which we use to get manga labels and cover image URLs from their database. We can use those to create our datasets for training.

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
import requests
import json


class MangaDexClient:
    """Client for the MangaDex API.

    Logs in on construction with the credentials stored in a JSON file and
    keeps the returned tokens. The session token is sent as a bearer token
    on every subsequent request.
    """

    # Root URL shared by every endpoint this client calls.
    BASE_URL = "https://api.mangadex.org"
    # Seconds to wait for the server before giving up; without a timeout
    # `requests` can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30

    def __init__(self, credentials_file):
        """Logs in to MangaDex and stores the session and refresh tokens.

        Args:
            credentials_file: Path to a JSON file whose parsed content is
                posted as the body of the `/auth/login` request.

        Raises:
            requests.HTTPError: If the login request returns an error status.
        """
        with open(credentials_file) as config_file:
            credentials = json.load(config_file)

        response = requests.post(
            f"{self.BASE_URL}/auth/login",
            json=credentials,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail with the HTTP error instead of a KeyError on the missing "token".
        response.raise_for_status()
        tokens = response.json()["token"]
        self.refresh_token = tokens["refresh"]
        self.session_token = tokens["session"]

    def _auth_headers(self):
        """Returns the headers that authenticate a request with the session token"""
        return {"Authorization": f"Bearer {self.session_token}"}

    def get_manga_tags(self):
        """Returns a dict from human readable tag names to tag_ids for each tag in the mangadex database

        Raises:
            requests.HTTPError: If the request returns an error status.
        """
        response = requests.get(
            f"{self.BASE_URL}/manga/tag",
            headers=self._auth_headers(),
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return {
            item["data"]["attributes"]["name"]["en"]: item["data"]["id"]
            for item in response.json()
        }

    def search_manga_tags_covers(
        self, total=None, limit=100, offset=0, includedTags=None, excludedTags=None
    ):
        """Gets a list of manga with id, tags and cover image filenames

        Results are fetched page by page until `offset` reaches `total` or the
        server returns an empty page.

        Args:
            total: Offset at which to stop fetching; with the default
                `offset` of 0 this is the maximum number of mangas returned.
                When None, the total reported by the first response is used.
            limit: Maximum number of mangas requested per page.
            offset: Offset of the first result to fetch.
            includedTags: Tag id(s) sent as the `includedTags[]` query parameter.
            excludedTags: Tag id(s) sent as the `excludedTags[]` query parameter.

        Returns:
            A list of dicts with the keys "mangaId", "tags" (English tag
            names) and "cover_art_filenames".

        Raises:
            requests.HTTPError: If a request returns an error status.
        """
        results = []
        while total is None or offset < total:
            response = requests.get(
                f"{self.BASE_URL}/manga",
                params={
                    # Shrink the last page so we never fetch past `total`.
                    "limit": limit if total is None else min(limit, total - offset),
                    "offset": offset,
                    "includedTags[]": includedTags,
                    "excludedTags[]": excludedTags,
                    "includes[]": "cover_art",
                },
                headers=self._auth_headers(),
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            content = response.json()
            if total is None:
                total = content["total"]

            page = content["results"]
            if not page:
                # The server has nothing more to give (e.g. the requested
                # `total` exceeds what exists); further requests are futile.
                break
            results.extend(page)
            offset += limit

        return [
            {
                "mangaId": result["data"]["id"],
                "tags": [
                    tag["attributes"]["name"]["en"]
                    for tag in result["data"]["attributes"]["tags"]
                ],
                "cover_art_filenames": [
                    relationship["attributes"]["fileName"]
                    for relationship in result["relationships"]
                    if relationship["type"] == "cover_art"
                ],
            }
            for result in results
        ]

In [None]:
# Constructing the client logs in to MangaDex, so this cell needs a
# credentials.json file in the working directory.
client = MangaDexClient("credentials.json")

In [None]:
# Render the documentation entry for MangaDexClient.get_manga_tags.
show_doc(MangaDexClient.get_manga_tags)

<h4 id="MangaDexClient.get_manga_tags" class="doc_header"><code>MangaDexClient.get_manga_tags</code><a href="__main__.py#L20" class="source_link" style="float:right">[source]</a></h4>

> <code>MangaDexClient.get_manga_tags</code>()

Returns a dict from human readable tag names to tag_ids for each tag in the mangadex database

Here's what it looks like:

In [None]:
# Fetch the tag name -> tag id mapping once, then look up a single tag's id.
tags = client.get_manga_tags()
tags["Fantasy"]

'cdc58593-87dd-415e-bbc0-2ec27bf404cc'

In [None]:
# Render the documentation entry for MangaDexClient.search_manga_tags_covers.
show_doc(MangaDexClient.search_manga_tags_covers)

<h4 id="MangaDexClient.search_manga_tags_covers" class="doc_header"><code>MangaDexClient.search_manga_tags_covers</code><a href="__main__.py#L34" class="source_link" style="float:right">[source]</a></h4>

> <code>MangaDexClient.search_manga_tags_covers</code>(**`total`**=*`None`*, **`limit`**=*`100`*, **`offset`**=*`0`*, **`includedTags`**=*`None`*, **`excludedTags`**=*`None`*)

Gets a list of manga with id, tags and cover image filenames

We can use this method to get covers from mangas with the tags we want.

In [None]:
# includedTags is sent as the array query parameter `includedTags[]`, so pass
# a list of tag ids even when filtering on a single tag.
mangas = client.search_manga_tags_covers(total=1, includedTags=[tags["Fantasy"]])
mangas

[{'mangaId': 'e78a489b-6632-4d61-b00b-5206f5b8b22b',
  'tags': ['Reincarnation',
   'Monsters',
   'Action',
   'Demons',
   'Comedy',
   'Samurai',
   'Adventure',
   'Magic',
   'Isekai',
   'Drama',
   'Fantasy',
   'Monster Girls',
   'Adaptation'],
  'cover_art_filenames': ['0bf34fa4-876a-4139-9685-001fa18ef094.jpg']}]

# Mangadex helper methods

Methods for handling some specific use cases.

In [None]:
# export
import pandas as pd
from tqdm.auto import tqdm

def get_covers_for_all_tags(num_mangas=20, credentials_file="credentials.json"):
    """Returns a pandas DataFrame with cover image urls for each tag in the MangaDex database.

    It may be possible for a manga to show up in the query for multiple different tags, so we
    deduplicate those cases.

    TODO: There seems to be an issue with the API where only one cover image is returned for each
    manga. We need to investigate this further, so we do not run into the issue of having too much
    data to handle unexpectedly if this behavior changes suddenly.

    Args:
        num_mangas: Value passed as `total` to each per-tag search, i.e. the
            maximum number of mangas fetched for each tag.
        credentials_file: Path to the JSON credentials file used to log in.

    Returns:
        A DataFrame with one row per cover image and the columns "mangaId",
        "url", "filename" and "tags" (the manga's tag names joined by "|").
        The columns are present even when no cover was found.
    """
    client = MangaDexClient(credentials_file)
    tag_ids = client.get_manga_tags().values()

    # Keyed by mangaId so a manga returned for several tags is kept only once:
    # setdefault keeps the first occurrence and dicts preserve insertion order.
    unique_mangas = {}
    for tag_id in tqdm(tag_ids):
        for manga in client.search_manga_tags_covers(
            total=num_mangas, includedTags=[tag_id]
        ):
            unique_mangas.setdefault(manga["mangaId"], manga)

    rows = [
        {
            "mangaId": manga["mangaId"],
            "url": f'https://uploads.mangadex.org/covers/{manga["mangaId"]}/{filename}',
            # Prefix with the manga id so filenames stay tied to their manga.
            "filename": f'{manga["mangaId"]}_{filename}',
            "tags": "|".join(manga["tags"]),
        }
        for manga in unique_mangas.values()
        for filename in manga["cover_art_filenames"]
    ]

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=["mangaId", "url", "filename", "tags"])

In [None]:
# Render the documentation entry for get_covers_for_all_tags.
show_doc(get_covers_for_all_tags)

<h4 id="get_covers_for_all_tags" class="doc_header"><code>get_covers_for_all_tags</code><a href="__main__.py#L5" class="source_link" style="float:right">[source]</a></h4>

> <code>get_covers_for_all_tags</code>(**`num_mangas`**=*`20`*)

Returns a pandas DataFrame with covers image urls for each tag in the MangaDex database.

It may be possible for a manga to show up in the query for multiple different tags, so we
deduplicate those cases.

TODO: There seems to be an issue with the API where only one cover image is returned for each
manga. We need to investigate this further, so we do not run into the issue of having too much
data to handle unexpectedly if this behavior changes suddenly.

Here's what the DataFrame looks like:

In [None]:
# Runs one search per tag in the database (the progress bar counts tags);
# only the first rows of the resulting DataFrame are displayed.
get_covers_for_all_tags().head()

  0%|          | 0/76 [00:00<?, ?it/s]

Unnamed: 0,mangaId,url,filename,tags
0,2661ccb2-9b4e-42bb-9697-bed499b9b363,https://uploads.mangadex.org/covers/2661ccb2-9...,2661ccb2-9b4e-42bb-9697-bed499b9b363_be17ccee-...,Oneshot|Reincarnation|Historical|Romance|Antho...
1,cb34b520-fd48-474e-8d06-a7a25473342e,https://uploads.mangadex.org/covers/cb34b520-f...,cb34b520-fd48-474e-8d06-a7a25473342e_5035057d-...,Oneshot|Romance|Comedy|Slice of Life
2,ab468776-27a5-456d-8f58-e058059531c9,https://uploads.mangadex.org/covers/ab468776-2...,ab468776-27a5-456d-8f58-e058059531c9_4a18ccf6-...,Oneshot|Romance|Comedy|Anthology|Office Worker...
3,11afa5c2-41dc-4cf3-8451-f306a3caf1ab,https://uploads.mangadex.org/covers/11afa5c2-4...,11afa5c2-41dc-4cf3-8451-f306a3caf1ab_31616f14-...,Oneshot|Romance|Comedy|Anthology|Office Worker...
4,cd8197af-e771-46de-82a0-5e70f977e6a1,https://uploads.mangadex.org/covers/cd8197af-e...,cd8197af-e771-46de-82a0-5e70f977e6a1_a5a3b64c-...,Oneshot|Romance|Comedy|Anthology|School Life
