In [58]:
import scipy.io
from pathlib import Path
from typing import Optional
# Root data directory; every other path in this cell is derived from it.
DATA_PATH = Path('..') / 'data_dir'
MAX_PATH = DATA_PATH  # second name bound to the same directory
LOG_PATH = DATA_PATH / 'logs'

# Locations under the damageNet directory.
DN_PATH = DATA_PATH / 'damageNet'
DN_LABEL_PATH = DN_PATH / 'val_damagenet.txt'
DN_IMAGES_PATH = DN_PATH / 'images'

# Locations under the 'ori' directory.
ORI_PATH = DATA_PATH / 'ori'
ORI_LABEL_PATH = ORI_PATH / 'labels.csv'
ORI_IMAGES_PATH = ORI_PATH / 'train' / 'images'

In [50]:
import warnings
from contextlib import contextmanager
import os
import pandas as pd
import shutil
import tempfile
from typing import Any, Dict, List, Iterator, Optional, Tuple
import torch
from torchvision.datasets.utils import check_integrity, extract_archive, verify_str_arg
# For each archive key: (archive file name, MD5 digest the archive is checked against).
ARCHIVE_META = dict(
    train=('ILSVRC2012_img_train.tar', '1d675b47d978889d74fa0da5fadfb00e'),
    val=('ILSVRC2012_img_val.tar', '29b22e2961454d5413ddabcf34fc5622'),
    devkit=('ILSVRC2012_devkit_t12.tar.gz', 'fa75699e90414af021442c21a62c3abf'),
)
# File name (inside the root directory) under which the parsed devkit metadata is saved.
META_FILE = 'meta.bin'


In [21]:

def _verify_archive(root: str, file: str, md5: str) -> None:
    """Ensure that ``file`` inside ``root`` passes ``check_integrity`` for ``md5``.

    Args:
        root (str): Directory expected to contain the archive.
        file (str): Archive file name, joined onto ``root``.
        md5 (str): Digest handed to ``check_integrity``.

    Raises:
        RuntimeError: If ``check_integrity`` rejects the archive path.
    """
    archive_path = os.path.join(root, file)
    if check_integrity(archive_path, md5):
        return

    template = ("The archive {} is not present in the root directory or is corrupted. "
                "You need to download it externally and place it in {}.")
    raise RuntimeError(template.format(file, root))
def parse_devkit_archive(root: str, file: Optional[str] = None) -> None:
    """Parse the devkit archive of the ImageNet2012 classification dataset and save
    the meta information in a binary file.

    The archive is verified, extracted into a temporary directory, and the tuple
    ``(wnid_to_classes, val_wnids)`` is written with ``torch.save`` to
    ``os.path.join(root, META_FILE)``. The temporary directory is removed when
    the function finishes, whether or not parsing succeeded.

    Args:
        root (str): Root directory containing the devkit archive
        file (str, optional): Name of devkit archive. Defaults to
            'ILSVRC2012_devkit_t12.tar.gz'

    Raises:
        RuntimeError: If ``_verify_archive`` rejects the archive (missing or
            failing the integrity check).
    """
    # Imported here rather than at module level, as in the original code.
    import scipy.io as sio

    def parse_meta_mat(devkit_root: str) -> Tuple[Dict[int, str], Dict[str, str]]:
        """Read ``data/meta.mat`` and return ``(idx_to_wnid, wnid_to_classes)``.

        Only 'synsets' records whose field at position 4 (the child count)
        equals 0 are kept.
        """
        metafile = os.path.join(devkit_root, "data", "meta.mat")
        synsets = sio.loadmat(metafile, squeeze_me=True)['synsets']
        # Transpose the records into per-field columns; column 4 holds the
        # number of children of each synset.
        nums_children = list(zip(*synsets))[4]
        leaves = [synsets[pos] for pos, num_children in enumerate(nums_children)
                  if num_children == 0]
        # The first three fields of each record are: index, wnid, class names.
        idcs, wnids, names = list(zip(*leaves))[:3]
        # Class names arrive as one ', '-separated string per synset.
        classes = [tuple(entry.split(', ')) for entry in names]
        return dict(zip(idcs, wnids)), dict(zip(wnids, classes))

    def parse_val_groundtruth_txt(devkit_root: str) -> List[int]:
        """Return the integer on each line of the validation ground-truth file."""
        gt_path = os.path.join(devkit_root, "data",
                               "ILSVRC2012_validation_ground_truth.txt")
        with open(gt_path, 'r') as txtfh:
            return [int(line) for line in txtfh]

    default_name, md5 = ARCHIVE_META["devkit"]
    if file is None:
        file = default_name

    # Note: the devkit MD5 is used even when a custom file name is supplied.
    _verify_archive(root, file, md5)

    # TemporaryDirectory deletes the extraction directory on exit, including
    # when an exception is raised inside the block.
    with tempfile.TemporaryDirectory() as tmp_dir:
        extract_archive(os.path.join(root, file), tmp_dir)

        devkit_root = os.path.join(tmp_dir, "ILSVRC2012_devkit_t12")
        idx_to_wnid, wnid_to_classes = parse_meta_mat(devkit_root)
        val_idcs = parse_val_groundtruth_txt(devkit_root)
        # Translate every validation index into its wnid.
        val_wnids = [idx_to_wnid[idx] for idx in val_idcs]

        torch.save((wnid_to_classes, val_wnids), os.path.join(root, META_FILE))

In [30]:
import scipy.io as sio
def parse_meta_mat(devkit_root: str) -> Tuple[Dict[int, str], Dict[str, str]]:
    """Read ``<devkit_root>/data/meta.mat`` and build two lookup tables.

    Only 'synsets' records whose field at position 4 (the child count) equals
    0 are kept.

    Args:
        devkit_root: Directory containing ``data/meta.mat``. It is passed to
            ``os.path.join``, so a ``pathlib.Path`` works as well as a string.

    Returns:
        A pair ``(idx_to_wnid, wnid_to_classes)``: the first maps each kept
        record's index field to its wnid, the second maps each wnid to a tuple
        of class names obtained by splitting the names field on ``', '``.
    """
    mat_path = os.path.join(devkit_root, "data", "meta.mat")
    synsets = sio.loadmat(mat_path, squeeze_me=True)['synsets']

    # Transposing the records gives one column per field; column 4 is the
    # child count used to select the records to keep.
    child_counts = tuple(zip(*synsets))[4]
    leaves = [record for record, n_children in zip(synsets, child_counts)
              if n_children == 0]

    # First three fields of every record: index, wnid, class-name string.
    indices, wnids, name_strings = tuple(zip(*leaves))[:3]
    class_tuples = [tuple(names.split(', ')) for names in name_strings]

    idx_to_wnid = dict(zip(indices, wnids))
    wnid_to_classes = dict(zip(wnids, class_tuples))
    return idx_to_wnid, wnid_to_classes


In [52]:
# Load the synset lookup tables from <ORI_PATH>/info/data/meta.mat.
idx_to_wnid, wnid_to_classes = parse_meta_mat(ORI_PATH / 'info')

In [38]:
# (idx, wnid) pairs of the mapping; this is a dict view, not a list. It is fed to
# pd.DataFrame in later cells to build the label/wnid lookup table.
idx_to_wnid_map = idx_to_wnid.items()

In [42]:
# Two-column table pairing each index ('label') with its wnid.
# NOTE(review): the cell that builds `train` constructs this same frame again,
# so one of the two definitions is redundant.
label_wnid_map = pd.DataFrame(data=idx_to_wnid_map, columns=['label', 'wnid'])

In [125]:
# File names (without directory) of every *.JPEG directly inside ORI_IMAGES_PATH.
# Sorted so that the row order of everything derived from this list does not
# depend on the order in which the filesystem happens to list the directory.
files = sorted(f.name for f in ORI_IMAGES_PATH.glob('*.JPEG'))

In [128]:
# Lookup table of ('label', 'wnid') rows with the label cast to int.
label_wnid_map = (
    pd.DataFrame(idx_to_wnid_map, columns=['label', 'wnid'])
    .astype({'label': int})
)

# One row per image file; 'wnid' is the part of the file name before the first '_'.
train = (
    pd.DataFrame(files, columns=['files'])
    .assign(wnid=lambda frame: frame['files'].str.split('_').str[0])
)

In [130]:
# Attach the label to every file by matching on the 'wnid' column.
# NOTE(review): with how='left', a file whose wnid is absent from label_wnid_map
# keeps its row but has no label — consider asserting tt['label'].notna().all().
tt = train.merge(label_wnid_map, on='wnid', how='left')

In [134]:
# Remove the join key now that the label is attached.
# errors='ignore' keeps this cell re-runnable: it overwrites its own input, so a
# second execution would otherwise raise KeyError because 'wnid' is already gone.
tt = tt.drop(columns=['wnid'], errors='ignore')

In [140]:
# Write tt as space-separated text with neither a header row nor the index column.
# NOTE(review): this goes to <ORI_PATH>/train/labels.csv, whereas ORI_LABEL_PATH
# points at <ORI_PATH>/labels.csv — confirm which location readers expect.
tt.to_csv(ORI_PATH / 'train' / 'labels.csv', sep=' ', index=False, header=False)