# Collect resources

## From GitHub

In [7]:
# Clone the mrok273/Qiita repository into ../data/raw/mrok273/Qiita (-q runs git quietly).
!git clone -q https://github.com/mrok273/Qiita ../data/raw/mrok273/Qiita

## From Kaggle

In [1]:
# Download the Kaggle dataset mikoajkolman/pokemon-images-first-generation17000-files
# into ../data/raw/ (-p sets the download path, -q suppresses progress output).
!kaggle datasets download -d mikoajkolman/pokemon-images-first-generation17000-files -p "../data/raw/" -q

## From Web

- Install Firefox.
- Open the web page that contains the images.
- Bulk-save the images (see [How to Save All the Images on a Web Page in Firefox Browser](https://www.journeybytes.com/bulk-save-images-using-firefox/)).

## From YouTube

### Pal

In [None]:
# Official paldeck
# Download every video of the playlist into ../data/video/pocketpair as "<title>-<id>-5s.<ext>".
# NOTE(review): the "-ss 00:00:00 -t 00:00:05" post-processor arguments are presumably meant to
# keep only the first 5 seconds of each video (hence the "-5s" suffix) - confirm the clips are trimmed.
# The URL is quoted because `?` is a shell glob character.
!yt-dlp --postprocessor-args "-ss 00:00:00 -t 00:00:05" -o "../data/video/pocketpair/%(title)s-%(id)s-5s.%(ext)s" -q "https://www.youtube.com/playlist?list=PLptNv_Fxn9idzsTRulWNmLYKWgKhqKI5s"

In [10]:
import os
import re

directory = "../data/video/pocketpair"

# Matches "<...>Paldeck<...>No<sep><digits><...>.webm".
# - The separator after "No" is optional and must not be a digit, so an already
#   renamed file such as "paldeck_no111.webm" keeps its full number (re-running
#   the cell is a no-op instead of turning it into "paldeck_no011.webm").
# - ".*?" is non-greedy so the first "No<digits>" after "Paldeck" (the title)
#   wins over a look-alike sequence later in the name (e.g. inside the video id).
# - "\.webm$" only accepts finished .webm files, not e.g. "*.webm.part".
PALDECK_PATTERN = re.compile(r'[Pp]aldeck.*?[Nn]o\D?(\d+).*\.webm$')


def paldeck_filename(filename):
    """Return the normalized name for a Paldeck video file.

    Args:
        filename: Base name of a file in the download directory.

    Returns:
        ``"paldeck_noNNN.webm"`` with the Paldeck number zero-padded to at
        least three digits, or ``None`` when ``filename`` is not a Paldeck
        ``.webm`` video.
    """
    match = PALDECK_PATTERN.search(filename)
    if match is None:
        return None
    return f"paldeck_no{match.group(1).zfill(3)}.webm"


if os.path.isdir(directory):
    # Sorted so that the outcome does not depend on the directory listing order.
    for filename in sorted(os.listdir(directory)):
        new_filename = paldeck_filename(filename)
        if new_filename is None or new_filename == filename:
            continue
        target_path = os.path.join(directory, new_filename)
        if os.path.exists(target_path):
            # os.rename may silently replace an existing file; keep both instead.
            print(f"Skipping {filename}: {new_filename} already exists")
            continue
        os.rename(os.path.join(directory, filename), target_path)
else:
    print(f"Nothing to rename: {directory} does not exist (run the download cell first)")

In [12]:
# Print width, height and frame rate of the first video stream (v:0) of one clip,
# formatted as "<width>x<height>x<r_frame_rate>"; only errors are logged (-v error).
!ffprobe -v error -select_streams v:0 -show_entries stream=width,height,r_frame_rate -of csv=s=x:p=0 "../data/video/pocketpair/paldeck_no001.webm"

1920x1080x60/1


In [21]:
import sys
sys.path.append('..')

import os
import subprocess

from datetime import datetime
from src.pipeline import *

input_dir = "../data/video/pocketpair"
output_dir = "../data/raw/pocketpair"

# For every file found under input_dir: extract frames with ffmpeg, then pass one
# Metadata record (step=raw, label=pal) per extracted file to create_metadata.
for video_root, _subdirs, video_names in os.walk(input_dir):
    for video_name in video_names:
        video_path = os.path.join(video_root, video_name)

        # Frames of one video go into a folder named after the video file (extension stripped).
        frames_dir = os.path.join(output_dir, os.path.splitext(video_name)[0])
        os.makedirs(frames_dir, exist_ok=True)

        # Sample 12 frames per second into numbered PNG files; a non-zero ffmpeg exit raises.
        subprocess.run(
            [
                'ffmpeg', '-hwaccel', 'cuda',
                '-i', video_path,
                '-vf', 'fps=12',
                os.path.join(frames_dir, "frame_%05d.png"),
            ],
            check=True,
        )

        # Every file now present in the frame folder gets a metadata record whose
        # path is relative to the raw data directory.
        for frames_root, _, frame_names in os.walk(frames_dir):
            for frame_name in frame_names:
                frame_path = os.path.join(frames_root, frame_name)
                raw_dir = data_dir(Step.raw.value)
                create_metadata(
                    Metadata(
                        bucket=raw_dir,
                        path=os.path.relpath(frame_path, raw_dir),
                        step=Step.raw,
                        label=Label.pal,
                        created_at=datetime.utcnow(),
                    )
                )
        

In [None]:
# Fan video
# Download every video of the playlist into ../data/video/palworld-fan as "<id>.<ext>".
# The URL is quoted because `?` is a shell glob character.
!yt-dlp -o "../data/video/palworld-fan/%(id)s.%(ext)s" -q "https://www.youtube.com/playlist?list=PLitsLuiXBQxtd0ThPaYMqsbxUMfmdxVHc"

In [24]:
import sys
sys.path.append('..')

import os
import subprocess

from datetime import datetime
from src.pipeline import *

input_dir = "../data/video/palworld-fan"
output_dir = "../data/raw/palworld-fan"

# For every file found under input_dir: extract frames with ffmpeg, then pass one
# Metadata record (step=raw, label=pal) per extracted file to create_metadata.
for video_root, _subdirs, video_names in os.walk(input_dir):
    for video_name in video_names:
        video_path = os.path.join(video_root, video_name)

        # Frames of one video go into a folder named after the video file (extension stripped).
        frames_dir = os.path.join(output_dir, os.path.splitext(video_name)[0])
        os.makedirs(frames_dir, exist_ok=True)

        # Sample 12 frames per second into numbered PNG files; a non-zero ffmpeg exit raises.
        subprocess.run(
            [
                'ffmpeg', '-hwaccel', 'cuda',
                '-i', video_path,
                '-vf', 'fps=12',
                os.path.join(frames_dir, "frame_%05d.png"),
            ],
            check=True,
        )

        # Every file now present in the frame folder gets a metadata record whose
        # path is relative to the raw data directory.
        for frames_root, _, frame_names in os.walk(frames_dir):
            for frame_name in frame_names:
                frame_path = os.path.join(frames_root, frame_name)
                raw_dir = data_dir(Step.raw.value)
                create_metadata(
                    Metadata(
                        bucket=raw_dir,
                        path=os.path.relpath(frame_path, raw_dir),
                        step=Step.raw,
                        label=Label.pal,
                        created_at=datetime.utcnow(),
                    )
                )
        

### Pokemon

In [None]:
# Download every video of the playlist into ../data/video/pokemon-games as "<id>.<ext>".
# The URL must be quoted: an unquoted `&` makes the shell run yt-dlp in the background
# and treat "si=..." as a separate statement instead of part of the URL.
!yt-dlp -o "../data/video/pokemon-games/%(id)s.%(ext)s" -q "https://youtube.com/playlist?list=PLitsLuiXBQxvqH5Hv1R5ioFnCpIBMNvX3&si=nzehh3dDiU3k2Q7F"

In [None]:
import os
import subprocess

input_dir = "../data/video/pokemon-games"
output_dir = "../data/raw/pokemon-games"
FPS = 6  # There are many different videos, so fewer images per video are enough.

# Extract frames from every file found under input_dir, one output folder per video.
for video_root, _subdirs, video_names in os.walk(input_dir):
    for video_name in video_names:
        video_path = os.path.join(video_root, video_name)

        # The output folder is named after the video file with its extension stripped.
        frames_dir = os.path.join(output_dir, os.path.splitext(video_name)[0])
        os.makedirs(frames_dir, exist_ok=True)

        # Sample FPS frames per second into numbered PNG files; a non-zero ffmpeg exit raises.
        frame_pattern = os.path.join(frames_dir, "frame_%05d.png")
        subprocess.run(
            ['ffmpeg', '-hwaccel', 'cuda', '-i', video_path, '-vf', f"fps={FPS}", frame_pattern],
            check=True,
        )


In [13]:
# torchvision.dataset can only be used when the folder structure is `split`/`label`, so Hugging Face `datasets` is used for the preprocessing instead.
import os
from datasets import load_dataset
from torchvision import transforms
from typing import Tuple


def center_crop_and_save(input_dir:str, output_dir:str, crop_size: Tuple[int, int]):
    """Center-crop every image under ``input_dir`` and save it under ``output_dir``.

    The images are loaded with the ``imagefolder`` builder of ``load_dataset``.
    Each cropped image is written to the same relative location inside
    ``output_dir`` that its source has inside ``input_dir``. File and folder
    names keep their original case, so this also works on case-sensitive
    filesystems.

    Args:
        input_dir: Folder containing the source images.
        output_dir: Folder that receives the cropped images; created if missing,
            together with any sub-folders needed to mirror ``input_dir``.
        crop_size: Size passed to ``transforms.CenterCrop``.

    Raises:
        ValueError: If a loaded image does not live inside ``input_dir``; saving
            it would otherwise write outside ``output_dir``.
    """
    dataset_dict = load_dataset("imagefolder", data_dir=input_dir)
    cropper = transforms.CenterCrop(crop_size)
    input_root = os.path.abspath(input_dir)
    output_root = os.path.abspath(output_dir)
    os.makedirs(output_root, exist_ok=True)

    def _center_crop_and_save(example):
        # Relies on the loaded image exposing the path it was read from as `.filename`.
        source_path = os.path.abspath(example["image"].filename)
        # relpath keeps the original case of every path component, and on Windows
        # it still compares the common prefix case-insensitively.
        relative_path = os.path.relpath(source_path, input_root)
        if relative_path == os.pardir or relative_path.startswith(os.pardir + os.sep):
            raise ValueError(f"{source_path} is not located inside {input_root}")
        target_path = os.path.join(output_root, relative_path)
        # Images may sit in sub-folders of input_dir; mirror them on the output side.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        cropper(example["image"]).save(target_path)
        # Nothing is returned: the dataset is only iterated for the save side effect.

    dataset_dict.map(_center_crop_and_save)

In [15]:
# (frame folder, output folder) pairs: each video's frames are center-cropped
# into a sibling "<video id>_cropped" folder.
crop_jobs = [
    ("../data/raw/pokemon-games/0Loz61U6CuE", "../data/raw/pokemon-games/0Loz61U6CuE_cropped"),
    ("../data/raw/pokemon-games/AObd6oPnlyg", "../data/raw/pokemon-games/AObd6oPnlyg_cropped"),
    ("../data/raw/pokemon-games/LG-LZKUUVZI", "../data/raw/pokemon-games/LG-LZKUUVZI_cropped"),
    ("../data/raw/pokemon-games/Q3-fCEL-JjE", "../data/raw/pokemon-games/Q3-fCEL-JjE_cropped"),
]
for frames_dir, cropped_dir in crop_jobs:
    center_crop_and_save(frames_dir, cropped_dir, (1028, 1028))

Map: 100%|██████████| 11112/11112 [11:55<00:00, 15.52 examples/s]
Generating train split: 7405 examples [00:00, 10062.70 examples/s]
Map: 100%|██████████| 7405/7405 [18:05<00:00,  6.82 examples/s]
Generating train split: 9862 examples [00:00, 10031.40 examples/s]
Map: 100%|██████████| 9862/9862 [30:25<00:00,  5.40 examples/s]
Generating train split: 12420 examples [00:01, 10197.81 examples/s]
Map: 100%|██████████| 12420/12420 [30:12<00:00,  6.85 examples/s]
