In [1]:
from os import listdir, makedirs
from os.path import isfile, join, exists
import shutil
import json

# Goal: turn a flat directory of shell images into a structured dataset plus
# JSON indices. The first step collects the names of the regular files that
# sit directly inside the source directory; sub-directories are left out.
sourcepath = 'shells'
onlyfiles = list(
    filter(lambda name: isfile(join(sourcepath, name)), listdir(sourcepath))
)

In [3]:
# Root of the structured copy: every image ends up at
# <target>/<genus>/<species>[/<more>...]/<index>_<variant>.
target = 'structured/files'

# Index of all image files: one dict per image with the keys
# 'filename', 'original', 'genus' and 'species'.
image_index = []

# Sortable index of all genus/species:
# {genus: {species: [structured file names]}}.
taxa_index = {}

for src_file in onlyfiles:
    # Hidden files (names starting with '.') are not part of the dataset.
    if src_file.startswith('.'):
        continue

    # Source names are split on '_' only; the layout relied on below is
    # <genus>_<species>[_<more>...]_<index>_<variant>, where the last field
    # keeps whatever extension the name carries.
    fields = src_file.split('_')
    if len(fields) < 4:
        # Without genus, species, index and variant the entry cannot be
        # placed; skip it instead of aborting the run with an IndexError
        # after part of the dataset has already been copied.
        print('Skipping {!r}: expected <genus>_<species>_<index>_<variant>'.format(src_file))
        continue

    parts = fields[:-2]
    index = fields[-2].lower()
    variant = fields[-1].lower()
    filename = index + '_' + variant

    # One lower-cased directory level per remaining name field.
    path = '/'.join([target] + [pathpart.lower() for pathpart in parts])
    # exist_ok avoids the separate exists() check.
    makedirs(path, exist_ok=True)

    dst_file = path + '/' + filename

    genus = parts[0].lower()
    species = parts[1].lower()

    image_meta = {
        # Path relative to the 'structured' root; count=1 strips only the
        # leading prefix, never a later occurrence inside the path.
        'filename': dst_file.replace('structured/', '', 1),
        'original': src_file,
        # NOTE(review): genus keeps the casing of the source file name here,
        # while species and the taxa index are lower-cased -- confirm that
        # this difference is intended.
        'genus': parts[0],
        'species': species,
    }
    image_index.append(image_meta)

    taxa_index.setdefault(genus, {}).setdefault(species, []).append(filename)

    # Finally, copy the file (the source stays in place) unless a previous
    # run already produced the destination.
    if not exists(dst_file):
        shutil.copyfile(join(sourcepath, src_file), dst_file)

# Write both indices as JSON, each wrapped in a top-level 'data' key.
# The output directory is created first: when no source file was processed
# nothing has created 'structured/' yet, and open() would otherwise fail
# with FileNotFoundError.
makedirs('structured', exist_ok=True)

for json_path, payload in (
    ('structured/naming.json', taxa_index),    # genus -> species -> file names
    ('structured/images.json', image_index),   # one entry per image
):
    with open(json_path, 'w') as outfile:
        data = {
            'data': payload
        }
        json.dump(data, outfile, indent=4)

# Dataset-level metadata (citation, description, title, publication date) is
# written to its own JSON file next to the indices, so that it can later be
# displayed for all data viewers (over IPFS).
metadata = dict(
    citation='Zhang, Qi; Zhou, Jianhang; He, Jing; Cun, Xiaodong; Zeng, Shaoning; Zhang, Bob (2019): A shell dataset, for shell features extraction and recognition. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.4428335.v1',
    description='We initially present a shell dataset, containing 7894 shell species with 29622 samples, where totally 59244 shell images for shell features extraction and recognition are used. 134 shell species’ images are analysed by three (colour, shape and texture) feature extraction methods, which is further evaluated by different classifiers.',
    title='A shell dataset, for shell features extraction and recognition',
    published='Published on 15 Oct 2019 - 03:35',
)

# Serialise first, then write the finished text in one go.
with open('structured/metadata.json', 'w') as outfile:
    outfile.write(json.dumps(metadata, indent=4))