In [None]:
# default_exp datasets.loaders

# Dataset loaders

Methods for downloading the manga covers from the MangaDex database.

In [None]:
# hide
from nbdev.showdoc import *

In [None]:
# export
from pathlib import Path

import requests
from PIL import Image, UnidentifiedImageError
from tqdm.auto import tqdm

from mangacover.datasets.internals.MangaDex import get_covers_for_all_tags

def create_multiclass_tag_manga_dataset(path, num_mangas=20):
    """Saves a dataset for the multiclassification problem of deriving the tags associated with
    a manga by its cover, in `path`.

    Args:
        path: Destination folder (string or path-like). It is created, together with any
            missing parents, when it does not exist yet.
        num_mangas: Forwarded unchanged to `get_covers_for_all_tags`.

    Side effects:
        Writes one image file per successfully fetched cover into `path`, plus a
        `dataset.csv` containing the metadata rows and a `failed` column. `failed` is
        `True` for covers that could not be downloaded or decoded, `False` otherwise.
        Covers whose file already exists in `path` are not downloaded again.
    """
    # Upper bound, in seconds, on waiting for the server; without it a stalled
    # connection would block the whole run indefinitely.
    request_timeout = 30

    print('Fetching manga metadata from the MangaDex API...')
    manga_df = get_covers_for_all_tags(num_mangas)
    folder = Path(path)
    folder.mkdir(parents=True, exist_ok=True)
    print(f'Saving data to {folder}...')
    for manga in tqdm(list(manga_df.itertuples())):
        my_file = folder/manga.filename
        if my_file.exists():
            # Already on disk from a previous run: keep it and skip the download.
            manga_df.at[manga.Index, 'failed'] = False
            continue
        try:
            # The context manager releases the streamed connection even if decoding fails.
            with requests.get(manga.url, stream=True, timeout=request_timeout) as response:
                # Surface HTTP errors (404, 5xx, ...) instead of trying to decode an error page.
                response.raise_for_status()
                im = Image.open(response.raw)
                im.save(my_file)
            manga_df.at[manga.Index, 'failed'] = False
        except UnidentifiedImageError:
            print("Warning: Couldn't identify image file " + manga.filename + ". Skipping.")
            manga_df.at[manga.Index, 'failed'] = True
        except requests.RequestException as error:
            # One unreachable cover must not abort the run before the metadata csv is written.
            print(f"Warning: Couldn't download {manga.url} ({error}). Skipping.")
            manga_df.at[manga.Index, 'failed'] = True
    metadata_csv_path = folder/'dataset.csv'
    print(f'Writing metadata csv file at {metadata_csv_path}')
    manga_df.to_csv(metadata_csv_path)
    print('Done.')

In [None]:
# Render the loader's signature and docstring.
show_doc(create_multiclass_tag_manga_dataset)

<h4 id="create_multiclass_tag_manga_dataset" class="doc_header"><code>create_multiclass_tag_manga_dataset</code><a href="__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>create_multiclass_tag_manga_dataset</code>(**`path`**, **`num_mangas`**=*`20`*)

Saves a dataset for the multiclassification problem of deriving the tags associated with
a manga by its cover, in `path`.

Let's create the dataset in the `data/` folder:

In [None]:
# Uses the default `num_mangas=20`; the images and `dataset.csv` are written under `data/`.
create_multiclass_tag_manga_dataset('data/')

Fetching manga metadata from the MangaDex API...


  0%|          | 0/76 [00:00<?, ?it/s]

Saving data to data...


  0%|          | 0/833 [00:00<?, ?it/s]

Writing metadata csv file at data/dataset.csv
Done.


Here's what the metadata CSV file looks like. The `filename` column lets us load each image, and the `tags` column holds pipe-separated (`|`) tag names, which we need to split before passing them to our model.

In [None]:
import pandas as pd

# Read back the metadata file written by `create_multiclass_tag_manga_dataset`.
# No `index_col` is given, so the index saved by `to_csv` shows up as an `Unnamed: ...` column.
mangas_df = pd.read_csv('data/dataset.csv')
mangas_df.head()

Unnamed: 0.1,Unnamed: 0,mangaId,url,filename,tags,failed
0,0,2661ccb2-9b4e-42bb-9697-bed499b9b363,https://uploads.mangadex.org/covers/2661ccb2-9...,2661ccb2-9b4e-42bb-9697-bed499b9b363_be17ccee-...,Oneshot|Reincarnation|Historical|Romance|Antho...,False
1,1,cb34b520-fd48-474e-8d06-a7a25473342e,https://uploads.mangadex.org/covers/cb34b520-f...,cb34b520-fd48-474e-8d06-a7a25473342e_5035057d-...,Oneshot|Romance|Comedy|Slice of Life,False
2,2,ab468776-27a5-456d-8f58-e058059531c9,https://uploads.mangadex.org/covers/ab468776-2...,ab468776-27a5-456d-8f58-e058059531c9_4a18ccf6-...,Oneshot|Romance|Comedy|Anthology|Office Worker...,False
3,3,11afa5c2-41dc-4cf3-8451-f306a3caf1ab,https://uploads.mangadex.org/covers/11afa5c2-4...,11afa5c2-41dc-4cf3-8451-f306a3caf1ab_31616f14-...,Oneshot|Romance|Comedy|Anthology|Office Worker...,False
4,4,cd8197af-e771-46de-82a0-5e70f977e6a1,https://uploads.mangadex.org/covers/cd8197af-e...,cd8197af-e771-46de-82a0-5e70f977e6a1_a5a3b64c-...,Oneshot|Romance|Comedy|Anthology|School Life,False
