## To split data into correct hierarchy, run this notebook from top to bottom

In [2]:
import os, sys
import glob
import shutil
import json
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import warnings
from IPython.display import display
FOLD='5'

In [90]:
def validate_split(df):
    """
    Checks-
    1. Each row should be either unknown (mos/non-mos), genus_spp or a known species.
    2. SPECIES_TO_PATH contains all valid folders with images. Check that the name in dataframe 
        exists in the actual directory.
    3. Warn if number of images in the directory does not match what csv says.
    """
    for i in range(len(df)):
        unk = df.loc[i, 'unknown']
        gknown = df.loc[i, 'genus']
        sknown = df.loc[i, 'species']
        assert pd.isnull(unk) + pd.isnull(gknown) + pd.isnull(sknown) == 2, "Error at Row index {}".format(i)
        assert df.loc[i, 'folder'] in SPECIES_TO_PATH, "Error at Row index {}, '{}' not found.".format(i, df.loc[i, 'folder'])
        nfiles = len(glob.glob(SPECIES_TO_PATH[df.loc[i, 'folder']] + '*m.*'))
        if nfiles != df.loc[i, 'm']:
            print("Index {}, Files found in {}: {}, Files according to split: {}".format(
                i, SPECIES_TO_PATH[df.loc[i, 'folder']], nfiles, df.loc[i, 'm']))
    print("K.")
    
def class_rename_util(cname):
    """
    'aedes aegypti' -> 'aedes aedes_aegypti'
    """
    if pd.isnull(cname): return cname
    assert len(cname.split(' ')) > 1
    species = cname.split(' ')[-1]
    return cname.replace(species, cname.replace(' ', '_'))

In [91]:
split_df = pd.read_excel(f'./unk training - 1.29.2020 - reduced.xlsx', sheet_name='fold{}'.format(FOLD))
split_df = split_df[split_df['ignore'] != 'YES'].reset_index(drop=True)
split_df['genus'] = split_df['genus'].apply(class_rename_util)
split_df['species'] = split_df['species'].apply(class_rename_util)
with pd.option_context("display.max_rows", 1000):
    display(split_df)

Unnamed: 0,ignore,unknown,genus,species,folder,m,specimens,fold
0,,,,aedes aedes_aegypti,aedes aegypti,896,188,tr/v/t
1,,,,aedes aedes_albopictus,aedes albopictus,725,155,tr/v/t
2,,,aedes aedes_spp,,aedes atlanticus,80,16,tr/v/t
3,,,aedes aedes_spp,,aedes canadensis,46,9,tr/v/t
4,,,aedes aedes_spp,,aedes cantator,8,2,test
5,,,aedes aedes_spp,,aedes cinereus,0,0,test
6,,,aedes aedes_spp,,aedes condolescens,10,2,test
7,,,,aedes aedes_dorsalis,aedes dorsalis,288,55,tr/v/t
8,,,aedes aedes_spp,,aedes fairfax-1,6,1,test
9,,,aedes aedes_spp,,aedes flavescens,72,16,tr/v/t


In [92]:
"""
SPECIES_TO_PATH maps folder names to paths on the hard drive
"""
SPECIES_TO_PATH = {}
genus_folders = glob.glob(f'/opt/ImageBase/AWS_sync/cropped/*/')
for genus_folder in genus_folders:
    species_folders = glob.glob(genus_folder + '*/')
    for species_folder in species_folders:
        gen_sp = ' '.join(species_folder.split('/')[-3:-1])
        SPECIES_TO_PATH[gen_sp] = species_folder
        
        sub_species_folders = glob.glob(species_folder + '*/')
        for sub_species_folder in sub_species_folders:
            gen_sp_subsp = ' '.join(sub_species_folder.split('/')[-4:-1])
            SPECIES_TO_PATH[gen_sp_subsp] = sub_species_folder

SPECIES_TO_PATH['non_mosquito'] = '/opt/ImageBase/AWS_sync/cropped/non_mosquito/'

In [93]:
SPECIES_TO_PATH

{'psorophora pygmaea': '/opt/ImageBase/AWS_sync/cropped/psorophora/pygmaea/',
 'psorophora howardii': '/opt/ImageBase/AWS_sync/cropped/psorophora/howardii/',
 'psorophora discolor': '/opt/ImageBase/AWS_sync/cropped/psorophora/discolor/',
 'psorophora cyanescens': '/opt/ImageBase/AWS_sync/cropped/psorophora/cyanescens/',
 'psorophora ciliata': '/opt/ImageBase/AWS_sync/cropped/psorophora/ciliata/',
 'psorophora spp': '/opt/ImageBase/AWS_sync/cropped/psorophora/spp/',
 'psorophora columbiae': '/opt/ImageBase/AWS_sync/cropped/psorophora/columbiae/',
 'psorophora ferox': '/opt/ImageBase/AWS_sync/cropped/psorophora/ferox/',
 'psorophora horrida': '/opt/ImageBase/AWS_sync/cropped/psorophora/horrida/',
 'psorophora signipennis': '/opt/ImageBase/AWS_sync/cropped/psorophora/signipennis/',
 'anopheles pharoensis': '/opt/ImageBase/AWS_sync/cropped/anopheles/pharoensis/',
 'anopheles cf-coustani': '/opt/ImageBase/AWS_sync/cropped/anopheles/cf-coustani/',
 'anopheles maculipennis': '/opt/ImageBase/A

In [94]:
validate_split(split_df)

Index 0, Files found in /opt/ImageBase/AWS_sync/cropped/aedes/aegypti/: 921, Files according to split: 896
Index 1, Files found in /opt/ImageBase/AWS_sync/cropped/aedes/albopictus/: 692, Files according to split: 725
Index 3, Files found in /opt/ImageBase/AWS_sync/cropped/aedes/canadensis/: 128, Files according to split: 46
Index 12, Files found in /opt/ImageBase/AWS_sync/cropped/aedes/japonicus/: 1093, Files according to split: 1103
Index 13, Files found in /opt/ImageBase/AWS_sync/cropped/aedes/mediovittatus/: 84, Files according to split: 13
Index 24, Files found in /opt/ImageBase/AWS_sync/cropped/aedes/vexans/: 364, Files according to split: 295
Index 26, Files found in /opt/ImageBase/AWS_sync/cropped/anopheles/coustani/: 142, Files according to split: 148
Index 28, Files found in /opt/ImageBase/AWS_sync/cropped/anopheles/crucians_sl/b/: 74, Files according to split: 39
Index 29, Files found in /opt/ImageBase/AWS_sync/cropped/anopheles/crucians_sl/c/: 61, Files according to split: 1

In [95]:
"""
UNIQUE_CLASSES contains list of unique classes, in order
"""
UNIQUE_CLASSES = [i for i in split_df['unknown'].unique() if not pd.isnull(i)]
UNIQUE_CLASSES.extend([i for i in split_df['genus'].unique() if not pd.isnull(i)])
UNIQUE_CLASSES.extend([i for i in split_df['species'].unique() if not pd.isnull(i)])
UNIQUE_CLASSES.sort()
print(len(UNIQUE_CLASSES))
UNIQUE_CLASSES

21


['aedes aedes_aegypti',
 'aedes aedes_albopictus',
 'aedes aedes_dorsalis',
 'aedes aedes_sollicitans',
 'aedes aedes_spp',
 'aedes aedes_taeniorhynchus',
 'aedes aedes_vexans',
 'anopheles anopheles_coustani',
 'anopheles anopheles_freeborni',
 'anopheles anopheles_funestus',
 'anopheles anopheles_gambiae',
 'anopheles anopheles_punctipennis',
 'anopheles anopheles_quadrimaculatus',
 'anopheles anopheles_spp',
 'culex culex_pipiens_sl',
 'culex culex_salinarius',
 'culex culex_spp',
 'mosquito',
 'psorophora psorophora_columbiae',
 'psorophora psorophora_cyanescens',
 'psorophora psorophora_spp']

In [96]:
# order with spp classes at the bottom
if FOLD==1:
    UNIQUE_CLASSES = ['aedes aedes_aegypti',
    'aedes aedes_albopictus',
    'aedes aedes_dorsalis',
    'aedes aedes_japonicus',
    'aedes aedes_sollicitans',
    'aedes aedes_vexans',
    'anopheles anopheles_coustani',
    'anopheles anopheles_crucians',
    'anopheles anopheles_freeborni',
    'anopheles anopheles_funestus',
    'anopheles anopheles_gambiae',
    'culex culex_erraticus',
    'culex culex_pipiens_sl',
    'culex culex_salinarius',
    'psorophora psorophora_columbiae',
    'psorophora psorophora_ferox',
    'aedes aedes_spp',               
    'anopheles anopheles_spp',
    'culex culex_spp',
    'psorophora psorophora_spp',
    'mosquito'] #fold1
elif FOLD ==2:

    UNIQUE_CLASSES = ['aedes aedes_albopictus',
    'aedes aedes_dorsalis',
    'aedes aedes_japonicus',
    'aedes aedes_taeniorhynchus',
    'aedes aedes_vexans',
    'anopheles anopheles_coustani',
    'anopheles anopheles_crucians',
    'anopheles anopheles_funestus',
    'anopheles anopheles_gambiae',
    'anopheles anopheles_punctipennis',
    'anopheles anopheles_quadrimaculatus',
    'culex culex_erraticus',
    'culex culex_salinarius',
    'psorophora psorophora_columbiae',
    'psorophora psorophora_cyanescens',
    'psorophora psorophora_ferox',
    'aedes aedes_spp',               
    'anopheles anopheles_spp',
    'culex culex_spp',
    'psorophora psorophora_spp',
    'mosquito'] #fold2
elif FOLD ==3:

    UNIQUE_CLASSES=['aedes aedes_aegypti',
    'aedes aedes_dorsalis',
    'aedes aedes_japonicus',
    'aedes aedes_sollicitans',
    'aedes aedes_taeniorhynchus',
    'anopheles anopheles_coustani',
    'anopheles anopheles_crucians',
    'anopheles anopheles_freeborni',
    'anopheles anopheles_gambiae',
    'anopheles anopheles_punctipennis',
    'anopheles anopheles_quadrimaculatus',
    'culex culex_erraticus',
    'culex culex_pipiens_sl',
    'psorophora psorophora_columbiae',
    'psorophora psorophora_cyanescens',
    'psorophora psorophora_ferox',
    'aedes aedes_spp',               
    'anopheles anopheles_spp',
    'culex culex_spp',
    'psorophora psorophora_spp',
    'mosquito'] #fold3
elif FOLD == 4:

    UNIQUE_CLASSES = ['aedes aedes_aegypti',
    'aedes aedes_albopictus',
    'aedes aedes_japonicus',
    'aedes aedes_sollicitans',
    'aedes aedes_taeniorhynchus',
    'aedes aedes_vexans',
    'anopheles anopheles_crucians',
    'anopheles anopheles_freeborni',
    'anopheles anopheles_funestus',
    'anopheles anopheles_punctipennis',
    'anopheles anopheles_quadrimaculatus',
    'culex culex_erraticus',
    'culex culex_pipiens_sl',
    'culex culex_salinarius',
    'psorophora psorophora_cyanescens',
    'psorophora psorophora_ferox',
    'aedes aedes_spp',               
    'anopheles anopheles_spp',
    'culex culex_spp',
    'psorophora psorophora_spp',
    'mosquito'] #fold4
elif FOLD==5:

    UNIQUE_CLASSES= ['aedes aedes_aegypti',
    'aedes aedes_albopictus',
    'aedes aedes_dorsalis',
    'aedes aedes_sollicitans',
    'aedes aedes_taeniorhynchus',
    'aedes aedes_vexans',
    'anopheles anopheles_coustani',
    'anopheles anopheles_freeborni',
    'anopheles anopheles_funestus',
    'anopheles anopheles_gambiae',
    'anopheles anopheles_punctipennis',
    'anopheles anopheles_quadrimaculatus',
    'culex culex_pipiens_sl',
    'culex culex_salinarius',
    'psorophora psorophora_columbiae',
    'psorophora psorophora_cyanescens',
    'aedes aedes_spp',               
    'anopheles anopheles_spp',
    'culex culex_spp',
    'psorophora psorophora_spp',
    'mosquito'] #fold5


UNIQUE_GENUSES = [
    'aedes',
    'anopheles',
    'culex',
    'psorophora',
    'mosquito'
]

In [97]:
"""
ROW_TO_CLASS maps each row in the split to a class
"""
ROW_TO_CLASS = {}
for i in tqdm(range(len(split_df))):
    setindx = False
    if not pd.isnull(split_df.loc[i, 'unknown']):
        ROW_TO_CLASS[i] = UNIQUE_CLASSES.index(split_df.loc[i, 'unknown'])
        setindx = True
    if not pd.isnull(split_df.loc[i, 'genus']):
        if setindx:
            raise ValueError('Resetting folder to class map')
        ROW_TO_CLASS[i] = UNIQUE_CLASSES.index(split_df.loc[i, 'genus'])
        setindx = True
    if not pd.isnull(split_df.loc[i, 'species']):
        if setindx:
            raise ValueError('Resetting folder to class map')
        ROW_TO_CLASS[i] = UNIQUE_CLASSES.index(split_df.loc[i, 'species'])

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))




In [98]:
def split_helper(id_to_sample, ratio):
    """
    Split mosquito samples (not images, to prevent leakage)
    """
    df = pd.DataFrame(data=list(id_to_sample.keys()))
    trainlist, vallist, testlist = np.split(df.sample(frac=1), [int(ratio[0]*len(df)), int((ratio[0]+ratio[1])*len(df))])
    return trainlist, vallist, testlist

def join_lists(id_to_sample, mos_ids):
    """
    Takes mosquito sample IDs and returns list of images of those samples
    Eg: [000429] -> ['JHU-000429_05m.jpg', 'JHU-000429_04m.jpg', 'JHU-000429_02m.jpg']
    """
    jointlist = []
    for mos in mos_ids:
        jointlist.extend(id_to_sample[mos])
    return jointlist

def split_imlist(imlist, ratio):
    """
    Takes a list of images, and splits them into training, validation and test sets in the provided ratio.
    The ratio will not be followed strictly. The samples are split according to the ratio, and since
    the samples may have different number of images, the ratio of actual number of images might vary.
    
    Just repeat the split process multiple times if you don't like the number of images in splits.
    """
    id_to_sample = {}
    for im in imlist:
        mosid = im.split('/')[-1].split('_')[0].split('-')[-1]
        if mosid in id_to_sample:
            id_to_sample[mosid].append(im)
        else:
            id_to_sample[mosid] = [im]
    
    trainlist, vallist, testlist = split_helper(id_to_sample, ratio)
    try:
#         print(type(trainlist[0]), len(trainlist[0]))
        trainlist = join_lists(id_to_sample, trainlist[0])
    except:
        sys.exit(trainlist[0])
    vallist = join_lists(id_to_sample, vallist[0])
    testlist = join_lists(id_to_sample, testlist[0])
    return trainlist, vallist, testlist

In [99]:
def extend_df(output_df, imlist, targetgenus, targetclass, split, genus_name, species_name):
    """
    Add samples to the dataframe with their class and split
    """
    temp_df = pd.DataFrame(data=imlist, columns=['Id'])
    temp_df['Genus'] = targetgenus
    temp_df['Species'] = targetclass
    temp_df['Split'] = split
    temp_df['Genus_Name'] = genus_name
    temp_df['Species_Name'] = species_name
    temp_df = temp_df[output_df.columns]
    return output_df.append(temp_df).reset_index(drop=True)
    
def prepare_data(split_df, ratio=[0.7, 0.15, 0.15], fold='fold'):
    """
    Takes in split_df, ratio and fold column name, and outputs splits
    """
    output_df = pd.DataFrame(columns=['Id','Genus','Species','Split','Genus_Name','Species_Name'])
    for i in tqdm(range(len(split_df))):
        sourcepath = SPECIES_TO_PATH[split_df.loc[i, 'folder']]
        imlist = glob.glob(sourcepath + '*m.jpg')
        targetclass = ROW_TO_CLASS[i]
        targetgenus = UNIQUE_GENUSES.index(UNIQUE_CLASSES[targetclass].split(' ')[0])
        genus_name = UNIQUE_GENUSES[targetgenus]
        species_name = 'mosquito' if genus_name == 'mosquito' else UNIQUE_CLASSES[targetclass].split(' ')[1]
        
        if split_df.loc[i, fold] == 'tr/v/t':
            trainlist, vallist, testlist = split_imlist(imlist, ratio)
            output_df = extend_df(output_df, trainlist, targetgenus, targetclass, 'Train', genus_name, species_name)
            output_df = extend_df(output_df, vallist, targetgenus, targetclass, 'Valid', genus_name, species_name)
            output_df = extend_df(output_df, testlist, targetgenus, targetclass, 'Test', genus_name, species_name)
        
        else:
            output_df = extend_df(output_df, imlist, targetgenus, targetclass, 'ExtendedTest', genus_name, species_name)

    return output_df

In [100]:
output_df = prepare_data(split_df, ratio=[0.71, 0.15, 0.15])
print("Total number of images: ", len(output_df))
traincount = (output_df['Split'] == 'Train').sum()
valcount = (output_df['Split'] == 'Valid').sum()
testcount = (output_df['Split'] == 'Test').sum()
extendedtestcount = (output_df['Split'] == 'ExtendedTest').sum()

print("Train images: {}, Valid iamges: {}, Test images: {}, Extended Test images: {}".format(traincount, 
        valcount, testcount, extendedtestcount))
totalcount = traincount + valcount + testcount
print("Train/Valid/Test Ratio: {:.4f}, {:.4f}, {:.4f}".format(traincount/totalcount, 
                                                              valcount/totalcount, testcount/totalcount))

HBox(children=(FloatProgress(value=0.0, max=79.0), HTML(value='')))


Total number of images:  12977
Train images: 7322, Valid iamges: 1575, Test images: 1558, Extended Test images: 2522
Train/Valid/Test Ratio: 0.7003, 0.1506, 0.1490


In [101]:
pd.set_option('display.max_colwidth', -1)
output_df

  """Entry point for launching an IPython kernel.


Unnamed: 0,Id,Genus,Species,Split,Genus_Name,Species_Name
0,/opt/ImageBase/AWS_sync/cropped/aedes/aegypti/JHU-005975_02m.jpg,0,0,Train,aedes,aedes_aegypti
1,/opt/ImageBase/AWS_sync/cropped/aedes/aegypti/JHU-005975_01m.jpg,0,0,Train,aedes,aedes_aegypti
2,/opt/ImageBase/AWS_sync/cropped/aedes/aegypti/JHU-005975_04m.jpg,0,0,Train,aedes,aedes_aegypti
3,/opt/ImageBase/AWS_sync/cropped/aedes/aegypti/JHU-005975_06m.jpg,0,0,Train,aedes,aedes_aegypti
4,/opt/ImageBase/AWS_sync/cropped/aedes/aegypti/JHU-005975_09m.jpg,0,0,Train,aedes,aedes_aegypti
...,...,...,...,...,...,...
12972,/opt/ImageBase/AWS_sync/cropped/uranotaenia/sapphirina/JHU-000430_05m.jpg,4,17,ExtendedTest,mosquito,mosquito
12973,/opt/ImageBase/AWS_sync/cropped/uranotaenia/sapphirina/JHU-000429_05m.jpg,4,17,ExtendedTest,mosquito,mosquito
12974,/opt/ImageBase/AWS_sync/cropped/uranotaenia/sapphirina/JHU-000429_03m.jpg,4,17,ExtendedTest,mosquito,mosquito
12975,/opt/ImageBase/AWS_sync/cropped/uranotaenia/sapphirina/JHU-000429_01m.jpg,4,17,ExtendedTest,mosquito,mosquito


## Validating Data

### Regenerate the class and genus mapping from Image paths

In [102]:
re_class_map = {i: [] for i in range(output_df['Species'].nunique())}
re_genus_map = {i: [] for i in range(output_df['Genus'].nunique())}
for i, row in output_df.iterrows():
    cls_name = ' '.join(row['Id'].split('/')[-3:-1])
    if cls_name not in re_class_map[row['Species']]:
        re_class_map[row['Species']].append(cls_name)

In [103]:
comp_df = []
for i in range(len(UNIQUE_CLASSES)):
    comp_df.append({'Class': i, 'Original': UNIQUE_CLASSES[i], 'Inferred': re_class_map[i]})
comp_df = pd.DataFrame(comp_df)
comp_df

Unnamed: 0,Class,Original,Inferred
0,0,aedes aedes_aegypti,[aedes aegypti]
1,1,aedes aedes_albopictus,[aedes albopictus]
2,2,aedes aedes_dorsalis,[aedes dorsalis]
3,3,aedes aedes_sollicitans,[aedes sollicitans]
4,4,aedes aedes_spp,"[aedes atlanticus, aedes canadensis, aedes cantator, aedes condolescens, aedes fairfax-1, aedes flavescens, aedes hendersoni, aedes infirmatus, aedes japonicus, aedes mediovittatus, aedes melanimon, aedes nigromaculis, aedes sierrensis, aedes spilotus, aedes sticticus, aedes tortilis, triseriatus_sl sl, aedes trivittatus]"
5,5,aedes aedes_taeniorhynchus,[aedes taeniorhynchus]
6,6,aedes aedes_vexans,[aedes vexans]
7,7,anopheles anopheles_coustani,[anopheles coustani]
8,8,anopheles anopheles_freeborni,[anopheles freeborni]
9,9,anopheles anopheles_funestus,"[funestus_sl sl, funestus_sl ss]"


### Validate that no two images of the same sample are split in different sets

In [104]:
sample_to_set = {}
for i, row in output_df.iterrows():
    sample = row['Id'].split('/')[-1].split('_')[0]
    if sample in sample_to_set:
        assert row['Split'] == sample_to_set[sample]
    else:
        sample_to_set[sample] = row['Split']
print("Congratulations. No specimen leak.")
print('Total number of specimens: ', len(sample_to_set))

Congratulations. No specimen leak.
Total number of specimens:  2696


## Save DF

In [105]:
output_df.to_csv('datasplit_fold{}.csv'.format(FOLD), index=False)
print('FOLD: ',FOLD)

FOLD:  5


## Load DF and report images in each species

In [12]:
split_df = pd.read_csv(f'./datasplit_fold1.csv')
unique_sp = {}
for i in range(len(split_df)):
    tmp_id = split_df['Id'][i].split('/')
    tmp_sp = tmp_id[-3].capitalize()+' '+tmp_id[-2]
    if tmp_sp not in unique_sp:
        unique_sp[tmp_sp]=1
    else:
        unique_sp[tmp_sp]+=1
for sp in unique_sp:
    print(sp,unique_sp[sp])

Aedes aegypti 921
Aedes albopictus 692
Aedes atlanticus 80
Aedes canadensis 128
Aedes cantator 8
Aedes condolescens 10
Aedes dorsalis 288
Aedes fairfax-1 6
Aedes flavescens 72
Aedes hendersoni 16
Aedes infirmatus 106
Aedes japonicus 1093
Aedes mediovittatus 84
Aedes melanimon 20
Aedes nigromaculis 42
Aedes sierrensis 8
Aedes sollicitans 414
Aedes spilotus 4
Aedes sticticus 4
Aedes taeniorhynchus 184
Aedes tortilis 21
Triseriatus_sl sl 87
Aedes trivittatus 143
Aedes vexans 364
Anopheles cf-coustani 21
Anopheles coustani 142
Crucians_sl a 19
Crucians_sl b 74
Crucians_sl c 61
Crucians_sl sl 114
Crucians_sl d 5
Crucians_sl e 5
Anopheles freeborni 570
Funestus_sl sl 238
Funestus_sl ss 972
Gambiae_sl arabiensis 800
Gambiae_sl coluzzii 86
Gambiae_sl sl 48
Gambiae_sl ss 226
Anopheles maculipalpis 30
Anopheles pharoensis 6
Anopheles pretoriensis 12
Anopheles pseudopunctipennis 53
Anopheles punctipennis 212
Anopheles quadrimaculatus 195
Anopheles rufipes 12
Anopheles squamosus 13
Anopheles teneb