In [None]:
!pip install icrawler

Collecting icrawler
  Downloading icrawler-0.6.10-py3-none-any.whl.metadata (6.2 kB)
Collecting bs4 (from icrawler)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading icrawler-0.6.10-py3-none-any.whl (36 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4, icrawler
Successfully installed bs4-0.0.2 icrawler-0.6.10


In [None]:
import ast
import os
import re
import shutil
import tempfile
import time
import warnings

import kagglehub
from icrawler.builtin import BingImageCrawler
from kagglehub import KaggleDatasetAdapter


# Suppress every Python warning from this point on.
warnings.filterwarnings("ignore")

# Dataset layout: one root folder with a "train" and a "test" sub-folder.
BASE_ROOT = "anime_dataset"
TRAIN_ROOT, TEST_ROOT = (
    os.path.join(BASE_ROOT, split_name) for split_name in ("train", "test")
)
os.makedirs(TRAIN_ROOT, exist_ok=True)
os.makedirs(TEST_ROOT, exist_ok=True)

# Load the character table from Kaggle as a pandas DataFrame.
df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "forgetabhi/top-1000-anime-and-manga-characters-dataset",
    "top_anime_characters_cleaned.csv",
)


Downloading from https://www.kaggle.com/api/v1/datasets/download/forgetabhi/top-1000-anime-and-manga-characters-dataset?dataset_version_number=1&file_name=top_anime_characters_cleaned.csv...


100%|██████████| 195k/195k [00:00<00:00, 2.09MB/s]


In [None]:
def get_first_anime(anime_col):
    """Return the first title stored in one ``anime_manga_titles`` cell.

    Args:
        anime_col: Either a string containing a Python list literal
            (e.g. ``"['Naruto', 'Boruto']"``) or an already-parsed
            list/tuple of titles.

    Returns:
        The first title, or ``None`` when the cell is missing (``None`` or
        NaN), is empty, is not a list/tuple, or is a string that cannot be
        parsed as a Python literal.
    """
    if isinstance(anime_col, str):
        try:
            anime_col = ast.literal_eval(anime_col)
        except (ValueError, TypeError, SyntaxError):
            # Malformed cell: treat it as "no title" instead of aborting
            # the whole DataFrame.apply call.
            return None

    # A missing cell is a float NaN, which is truthy but not subscriptable,
    # so check the container type explicitly rather than relying on truthiness.
    if not isinstance(anime_col, (list, tuple)) or not anime_col:
        return None
    return anime_col[0]

# Add a 'main_anime' column holding the first title from each row's
# 'anime_manga_titles' value (as returned by get_first_anime).
df['main_anime'] = df['anime_manga_titles'].apply(get_first_anime)

In [None]:
def clean_anime_name(anime_name):
    """Turn an anime title into a short name usable as a directory name.

    Only the part before the first ``:`` is kept (the title is NOT split at
    ``-`` or ``(``), punctuation is removed, and whitespace is trimmed and
    collapsed so the result never has leading, trailing or doubled spaces.

    Args:
        anime_name: The full anime title, e.g. ``"Code Geass: Hangyaku no Lelouch"``.

    Returns:
        The cleaned title (``"Code Geass"`` for the example above), or
        ``"unknown_anime"`` when nothing is left after cleaning, so callers
        never join an empty path component.
    """
    main_title = re.split(':', anime_name, maxsplit=1)[0]
    # Keep word characters and whitespace only; everything else is punctuation.
    without_punctuation = re.sub(r'[^\w\s]', '', main_title)
    # Removing punctuation can leave stray or doubled spaces ("Foo - Bar").
    normalized = ' '.join(without_punctuation.split())
    return normalized or 'unknown_anime'


In [None]:
def _safe_path_component(text, fallback="unknown"):
    """Reduce ``text`` to a string that is safe to use as one file-name component."""
    # Keep word characters, whitespace, dots and hyphens only, so that path
    # separators and characters such as ':' or '?' cannot end up in a path.
    kept = re.sub(r'[^\w\s.-]', '', str(text)).strip().strip('.').strip()
    return re.sub(r'\s+', '_', kept) or fallback


def _move_with_unique_names(filenames, src_dir, dest_dir, stem, start_index):
    """Move files to ``dest_dir`` as ``{stem}_{index:04d}{ext}`` without overwriting."""
    index = start_index
    for filename in filenames:
        ext = os.path.splitext(filename)[1]
        dest_path = os.path.join(dest_dir, f"{stem}_{index:04d}{ext}")
        # The starting index is only a file count, so skip ahead if the name
        # is already taken (e.g. after files were deleted or on a re-run).
        while os.path.exists(dest_path):
            index += 1
            dest_path = os.path.join(dest_dir, f"{stem}_{index:04d}{ext}")
        shutil.move(os.path.join(src_dir, filename), dest_path)
        index += 1


def download_and_split_by_anime(anime_name, character_name, max_images=10,
                                min_images=3, train_ratio=0.8):
    """Crawl Bing images for one character and file them under its anime.

    Images are downloaded into a fresh temporary directory, then moved to
    ``TRAIN_ROOT/<anime>`` and ``TEST_ROOT/<anime>``. The temporary directory
    is always removed, including when a step fails.

    Args:
        anime_name: Anime title; cleaned with ``clean_anime_name`` to get the
            class folder name.
        character_name: Character name, used verbatim as the search keyword
            and (sanitised) as part of the destination file names.
        max_images: Maximum number of images to request from the crawler.
        min_images: Characters yielding fewer images than this are skipped.
        train_ratio: Fraction of the downloaded images that goes to the train
            folder; the rest goes to the test folder.

    Returns:
        None. Progress and skip/error messages are printed.
    """
    anime_clean = clean_anime_name(anime_name) or "unknown_anime"
    char_clean = _safe_path_component(character_name)

    # A unique temp dir avoids picking up stale files from an earlier aborted
    # run and does not depend on a hard-coded /tmp location.
    temp_folder = tempfile.mkdtemp(prefix="anime_crawl_")
    try:
        crawler = BingImageCrawler(storage={"root_dir": temp_folder})
        try:
            crawler.crawl(keyword=f"{character_name}", max_num=max_images)
        except Exception as e:
            print(f"Error downloading {character_name}: {e}")
            return

        # Sorted file names give a deterministic (not random) split.
        images = sorted(
            f for f in os.listdir(temp_folder)
            if f.lower().endswith(('.jpg', '.png', '.jpeg'))
        )
        if len(images) < min_images:
            print(f"Skipping {character_name} ({anime_name}): only {len(images)} images")
            return

        # Create folders for this anime
        train_folder = os.path.join(TRAIN_ROOT, anime_clean)
        test_folder = os.path.join(TEST_ROOT, anime_clean)
        os.makedirs(train_folder, exist_ok=True)
        os.makedirs(test_folder, exist_ok=True)

        # Current file counts are used as the numbering offset.
        existing_train = len(os.listdir(train_folder))
        existing_test = len(os.listdir(test_folder))

        # First `train_ratio` share of the sorted images -> train, rest -> test.
        split_index = int(len(images) * train_ratio)
        train_images, test_images = images[:split_index], images[split_index:]

        _move_with_unique_names(
            train_images, temp_folder, train_folder,
            f"{anime_clean}_{char_clean}_train", existing_train + 1,
        )
        _move_with_unique_names(
            test_images, temp_folder, test_folder,
            f"{anime_clean}_{char_clean}_test", existing_test + 1,
        )
    finally:
        shutil.rmtree(temp_folder, ignore_errors=True)

    print(f"{anime_name}: {len(train_images)} new train, {len(test_images)} new test images added")

# Example loop: crawl each unique (anime, character) pair once, pausing
# one second between characters.
downloaded = set()
total_rows = len(df)
for idx, row in enumerate(df.itertuples(index=False), start=1):
    anime = getattr(row, 'main_anime', None)
    char = getattr(row, 'name_english', None)

    # pandas represents missing cells as float NaN, which is truthy; a plain
    # `not value` test would let it through and crawl the keyword "nan".
    if not isinstance(anime, str) or not isinstance(char, str):
        continue
    if not anime.strip() or not char.strip() or (anime, char) in downloaded:
        continue

    print(f"[{idx}/{total_rows}] {anime} - {char}")
    download_and_split_by_anime(anime, char)
    downloaded.add((anime, char))
    time.sleep(1)

[1/1050] Code Geass: Hangyaku no Lelouch - Lamperouge, Lelouch


ERROR:downloader:Response status code 403, file https://rare-gallery.com/uploads/posts/188348-lelouch-lamperouge-1920x1200.jpg


Code Geass: Hangyaku no Lelouch: 8 new train, 2 new test images added
[2/1050] Shingeki no Kyojin: Kuinaki Sentaku - Levi
Shingeki no Kyojin: Kuinaki Sentaku: 8 new train, 2 new test images added
[3/1050] One Piece - Monkey D., Luffy


ERROR:downloader:Response status code 404, file https://vignette.wikia.nocookie.net/heros/images/e/e4/Monkey_D_Luffy_Infobox.jpg


In [None]:
import shutil

# Zip the folder the dataset was actually written to (BASE_ROOT) instead of a
# hard-coded Colab path; the archive is created next to it as "<BASE_ROOT>.zip".
dataset_dir = os.path.abspath(BASE_ROOT)
shutil.make_archive(dataset_dir, 'zip', dataset_dir)


In [None]:
from google.colab import files

# Download the dataset archive "<BASE_ROOT>.zip"; the path is derived from
# BASE_ROOT so it follows the dataset folder instead of a hard-coded location.
files.download(os.path.abspath(BASE_ROOT) + ".zip")