## Dataset split using `os.symlink()`

When working with large datasets consisting of separate files, we can manage dataset splits with symbolic links, essentially creating shortcuts to original files according to a desired split ratio.

The advantage is we don't need to copy or move files around, as these operations require extra computation and memory usage.  

In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd

print(os.getcwd())

datapath = "images/"

c:\Users\tomas\Desktop\ip102_v1.1


In [2]:
# read category from filename
filename = "0_00213.jpg"
category = filename.split("_")[0]
print(category)

0


### Get labels

In [3]:
# read in `classes.txt.txt` with proper class names into a dict.
# these could do with a bit of sanitation.
labels = {}

with open('classes.txt.txt') as f:
    for line in f:
        # remove lead/trailing whitespace 
        stripped = line.strip()
        # collect the parts of the names together with *
        head, *rest = stripped.split()

        # we can do -1 here since our images are classed 0-101
        key = int(head) -1
        
        val = rest
        # join the names with an underscore
        labels[key] = '_'.join(val)

labels

{0: 'rice_leaf_roller',
 1: 'rice_leaf_caterpillar',
 2: 'paddy_stem_maggot',
 3: 'asiatic_rice_borer',
 4: 'yellow_rice_borer',
 5: 'rice_gall_midge',
 6: 'Rice_Stemfly',
 7: 'brown_plant_hopper',
 8: 'white_backed_plant_hopper',
 9: 'small_brown_plant_hopper',
 10: 'rice_water_weevil',
 11: 'rice_leafhopper',
 12: 'grain_spreader_thrips',
 13: 'rice_shell_pest',
 14: 'grub',
 15: 'mole_cricket',
 16: 'wireworm',
 17: 'white_margined_moth',
 18: 'black_cutworm',
 19: 'large_cutworm',
 20: 'yellow_cutworm',
 21: 'red_spider',
 22: 'corn_borer',
 23: 'army_worm',
 24: 'aphids',
 25: 'Potosiabre_vitarsis',
 26: 'peach_borer',
 27: 'english_grain_aphid',
 28: 'green_bug',
 29: 'bird_cherry-oataphid',
 30: 'wheat_blossom_midge',
 31: 'penthaleus_major',
 32: 'longlegged_spider_mite',
 33: 'wheat_phloeothrips',
 34: 'wheat_sawfly',
 35: 'cerodonta_denticornis',
 36: 'beet_fly',
 37: 'flea_beetle',
 38: 'cabbage_army_worm',
 39: 'beet_army_worm',
 40: 'Beet_spot_flies',
 41: 'meadow_moth',
 

In [6]:
# alternatively the dictionary comprehension way
with open("classes.txt.txt") as f:
    a = {int(k)-1: '_'.join(v.split()) for line in f for (k,v) in [line.strip().split(None, 1)]}
a

{0: 'rice_leaf_roller',
 1: 'rice_leaf_caterpillar',
 2: 'paddy_stem_maggot',
 3: 'asiatic_rice_borer',
 4: 'yellow_rice_borer',
 5: 'rice_gall_midge',
 6: 'Rice_Stemfly',
 7: 'brown_plant_hopper',
 8: 'white_backed_plant_hopper',
 9: 'small_brown_plant_hopper',
 10: 'rice_water_weevil',
 11: 'rice_leafhopper',
 12: 'grain_spreader_thrips',
 13: 'rice_shell_pest',
 14: 'grub',
 15: 'mole_cricket',
 16: 'wireworm',
 17: 'white_margined_moth',
 18: 'black_cutworm',
 19: 'large_cutworm',
 20: 'yellow_cutworm',
 21: 'red_spider',
 22: 'corn_borer',
 23: 'army_worm',
 24: 'aphids',
 25: 'Potosiabre_vitarsis',
 26: 'peach_borer',
 27: 'english_grain_aphid',
 28: 'green_bug',
 29: 'bird_cherry-oataphid',
 30: 'wheat_blossom_midge',
 31: 'penthaleus_major',
 32: 'longlegged_spider_mite',
 33: 'wheat_phloeothrips',
 34: 'wheat_sawfly',
 35: 'cerodonta_denticornis',
 36: 'beet_fly',
 37: 'flea_beetle',
 38: 'cabbage_army_worm',
 39: 'beet_army_worm',
 40: 'Beet_spot_flies',
 41: 'meadow_moth',
 

### Data exploration & split 

- read original splits from .csvs and select matching filenames
- custom split:
    * get lengths of each class subset (# of samples per label)
    * 

In [29]:
# https://github.com/MLWhiz/data_science_blogs/blob/master/compvisblog/Boats_DataExploration.ipynb

# print(labels)

file_list = []
for (dirpath, dirnames, filenames) in os.walk(datapath):
    file_list.extend(filenames)


# or
images = []
for image in os.listdir(datapath):
    images.append(image)

print(len(images))
print(images[0], images[-1])

75222
0_00000.jpg 9_06574.jpg


In [35]:
images = {}
for image in os.listdir(datapath):
    category = image.split("_")[0]
    images[image] = category

In [None]:
# make dataframe with filenames, class id, and label 


In [13]:
# Load the paths to the images in a directory

def load_images_from_folder(folder,only_path = False, label = ""):
    if only_path == False:
        images = []
        file_name=[]
        for filename in os.listdir(folder):
            img = plt.imread(os.path.join(folder,filename))
            
            if img is not None:
                end=filename.find(".")
                file_name.append(file[0:end])
                images.append(img)
                
        return images, file_name
    else:
        path = []
        for filename in os.listdir(folder):
            img_path = os.path.join(folder,filename)
            if img_path is not None:
                path.append([label,img_path])
        return path

In [15]:
# Load the paths on the images
images = []
path = datapath
for f in os.listdir(path):
    images += load_images_from_folder(path,False,label = f)      
                        
# Create a dataframe with the paths and the label for each insect
df1 = pd.DataFrame(images, columns = ["insect_gbif", "path_img"])

file_name=[]
for i in range(len(df1["path_img"])):
    temp=df1.path_img[i].split('/')[-1].split('.')[0]
    file_name.append(temp)
file_name

df1['file_name'] = file_name
display(df1.describe())

display(df1)


NameError: name 'file' is not defined

### creating symlinks

In [4]:
# create folder structure to hold split sets
dataset = "split/"
#os.mkdir(dataset)

dirs = ["train", "test", "val"]

train_path = os.path.join(dataset, "train")
val_path = os.path.join(dataset, "val")
test_path = os.path.join(dataset, "test")

##os.mkdir(train_path)
#os.mkdir(val_path)
#os.mkdir(test_path)

- Make folders for each class in each dataset folder
- read in the original train/val/test split csvs
- using `os.symlink()`, add links to original files in corresponding class folder in each dataset folder

functionalised making of symlinks

- (locally tested) Now properly creates split/phase/class folders in the right place and symlink for provided source file.

- `create_link(src, phase)`
-  `src` being an absolute path, `phase` being a string of 'train', 'val' or 'test'

In [27]:
dataset = "split/"
os.mkdir(dataset)
src_abs = os.path.abspath(src)


def create_link(src, phase):

    # check src
    if os.path.isfile(src):
        print('source exists')
        base_path = os.path.relpath(dataset)

        print("base_path: ", base_path)
        
        
        # determine class index from filename
        head_tail = os.path.split(src)

        label_index = int(head_tail[1].split('_')[0])
        print(label_index)

        img_filename_only = head_tail[1]

        labelname = labels[label_index]
        print('class index of source: {}, labelname: {}'.format(label_index, labelname))
        phase_path = os.path.join(base_path, phase)
        print("phase path: '%s'" % phase_path)

        # if phase path doesn't exist, it needs to be made
        if not os.path.isdir(phase_path):
            print("making phase directory...")
            os.mkdir(phase_path)

        dest_folder_path = os.path.join(phase_path, labelname)
        print("dest folder path: {}, is abs: {}".format(dest_folder_path, os.path.isabs(dest_folder_path)))

        # if destination folder doesn't exist, make it
        if not os.path.exists(dest_folder_path):
             print("making class directory...")
             os.mkdir(dest_folder_path)
        
        # create destination path
        link_path = os.path.join(dest_folder_path, img_filename_only)
        link_path = os.path.relpath(link_path)
        print("link path:", link_path)
        
        # make link
        print("making symlink at", link_path)
        os.symlink(src, link_path)
        

    else:
        print('src file not found')


create_link(src_abs, "train")

source exists
base_path:  split
0
class index of source: 0, labelname: rice_leaf_roller
phase path: 'split\train'
making phase directory...
dest folder path: split\train\rice_leaf_roller, is abs: False
making class directory...
link path: split\train\rice_leaf_roller\0_00005.jpg
making symlink at split\train\rice_leaf_roller\0_00005.jpg
