# Algorithm for downloading images from iDigBio
###  (all images are verified by experts)

All Coleoptera occurrences from iDigBio are obtained and cleaned to preserve only those that have an associated image of the dorsal view (head) of the specimens.

### NOTE: to download the same images as in our study use this csv file 

    D2_list_of_filtered_images.csv
    
    
**Input**: multimedia.csv - a list of records from iDigBio obtained with query keywords ("hasImage":"true" and "order":"Coleoptera")

**Outputs**: images of frontal habitus sorted by family names

Procedure: 

	Step 1.
		collect:
			- images with keywords
				- 'dorsal'
				- 'habitus_dor', 'Habitus_dor'
				- '_D.', "_had"
			- images from institutions that provide mainly dorsal view 
				- 'Denver Museum of Nature & Science'
	            - 'University of Tennessee at Chattanooga (UTC-UTCI)'
	            - 'United States National Museum, Entomology Collections (USNM-USNMENT)'
		skip: 
			- images with keywords: 
				- "lateral", "frontal", "ventral", 'anterior'
				- "head", 'antenna', "labels", 
				- 'mesosoma', "genitalia"
				- "_L", "_F", "_V", 
				- 'web', 'habitus_lat', 'Habitus_lat' 
				- "hed", "hef", "hal", "hed" (head images) 
            - images from institutions that provided fossil images
		check:
			- records that are neither skipped nor collected, looking for images of poorly represented families
	Step 2.
    	- download images from families with N+ records
		- manually check all the images (to avoid drawings, images of labels, etc.)

In [1]:
import csv
import os
import urllib

In [2]:
with open('metadata/multimedia_raw.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    total_media = 0
    remained = []
    cleaned_list = []
    first_row = []
    #iterate over each row and count how many they are
    for row in reader:
        if first_row == []:
            first_row = row
        
        
        total_media +=1
        
        # and clean row 100 - some institutions provided only photos of labels with insects barelly visible or fossils
        # Arizona, Hawaii, Yale, Michigan, Texas
        if row[100]  == 'University of Arizona Insect Collection' or\
            row[100] == 'University of Arizona' or\
            row[100] == 'University of Hawaii Insect Museum' or\
            row[100] == 'Invertebrate Paleontology Division, Yale Peabody Museum' or\
            row[100] == 'Department of Bioagricultural Sciences and Pest Management' or\
            row[100] == 'Michigan State University Digitized Collection':
            pass
       
        
        # Arizona Hasbrouck OK    
        elif row[100]== 'Arizona State University Hasbrouck Insect Collection (ASU-ASUHIC)':
            if 'habitus_dor' in row[15] or 'Habitus_dor' in row[15]:
                cleaned_list.append(row)
            else:
                pass
            
        
        
        # Texax OK
        elif row[100]== 'Museum of Texas Tech University' and 'dorsal' in row[15]:
            cleaned_list.append(row)
        
        elif row[100]== 'Museum of Texas Tech University' and 'lateral' in row[15]:
            pass
        elif row[100]== 'Museum of Texas Tech University' and 'labels' in row[15]:
            pass
        
        
        # Colorado
        elif row[100]== 'Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB)' and '_D.' in row[15]:
            cleaned_list.append(row)
            
        elif row[100]== 'Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB)' and '_V.' in row[15]:
            pass
        elif row[100]== 'Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB)' and '_F.' in row[15]:
            pass
        elif row[100]== 'Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB)' and '_L.' in row[15]:
            pass
        elif row[100]== 'Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB)' and '_LV.' in row[15]:
            pass
        elif row[100]== 'Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB)' and '_L_0' in row[15]:
            pass
        
        
        # HARVARD
        elif 'Harvard University' in row[100] and '_had' in row[15]:
            cleaned_list.append(row)
            
        elif row[100]== 'Museum of Comparative Zoology, Harvard University' and '_hed' in row[15]:
            pass
        elif row[100]== 'Museum of Comparative Zoology, Harvard University' and '_hef' in row[15]:
            pass
        elif row[100]== 'Museum of Comparative Zoology, Harvard University' and '_hal' in row[15]:
            pass
        elif row[100]== 'Museum of Comparative Zoology, Harvard University' and '_hed' in row[15]:
            pass
        
        # DENVER, TENNESSEE, USNM - these are for mostly dorsal view or lateral in a few cases
        elif row[100]== 'Denver Museum of Nature & Science' or\
            row[100]== 'University of Tennessee at Chattanooga (UTC-UTCI)' or\
            row[100]== 'United States National Museum, Entomology Collections (USNM-USNMENT)':
            cleaned_list.append(row)
        
        
        # NMSU
        elif row[100]== 'NMSU' and '_D' in row[15]:
            cleaned_list.append(row)
        elif row[100]== 'NMSU' and '_L' in row[15]:
            pass
        
        # also to exclude
        elif row[15]=='' or \
            '_F_lg' in row[15] or\
            '_L_lg' in row[15] or\
            '_V_lg' in row[15] or\
            'PALE' in row[15] or \
            'ventral' in row[15] or\
            'head' in row[15] or\
            'lateral' in row[15] or\
            'web' in row[15] or\
            'habitus_lat' in row[15] or\
            'Habitus_lat' in row[15] or\
            'genitalia' in row[15]:
            pass
        
        # get rid the images which are reported as not dorsal view    
        elif 'lateral' in row[10] or\
            'ventral' in row[10] or\
            'anterior' in row[10] or\
            'genitalia' in row[10] or\
            'antenna' in row[10] or\
            'mesosoma' in row[10] or\
            'head' in row[10]:
            pass   
        
        # how many images are not treated
        else:   
            remained.append(row)
            

print 'true\t\t' , len(cleaned_list)
print 'cleaned\t\t', total_media- len(cleaned_list)- len(remained)
print
print 'remained to treat\t', len(remained)
print 'total data\t\t', total_media
print

true		3506
cleaned		147224

remained to treat	25
total data		150755



#### next, we checked institutions which we still have to treat

In [3]:
to_treat={}
for row in remained:
    if row[100] in to_treat.keys():
        to_treat[row[100]]+=1
    else:
        to_treat[row[100]]=1
for key in to_treat:
    print to_treat[key], key, '\n'

1 xmpRights:Owner 

14 Museum of Texas Tech University 

9 Colorado Plateau Museum of Arthropod Biodiversity (NAUF-CPMAB) 

1 NMSU 



### save the list as csv file and add the titles in the first row

In [5]:
# Put the column titles in front of the kept rows, then dump the whole list,
# every field quoted, to metadata/cleaned_head.csv.
cleaned_list.insert(0, first_row)

with open('metadata/cleaned_head.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(cleaned_list)

In [6]:
# Same export for the undecided rows: column titles first, then every row,
# all fields quoted, written to metadata/remained.csv.
remained.insert(0, first_row)
with open('metadata/remained.csv', 'wb') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    for undecided in remained:
        wr.writerow(undecided)

# download the images 
from **cleaned_head.csv** and, if you wish, manually add more examples from **remained.csv** for families with fewer examples

In [7]:
def download_all(records, dictionary, save_to, tax_level = 'family', num=25):
    """Download the images of every sufficiently represented taxon.

    For each taxon in `dictionary` that lists more than `num` ids, a
    sub-directory `save_to/<taxon>` is created (if missing) and every record
    whose id is listed for that taxon is downloaded into it as
    `<file name stem>.jpg`.

    Parameters:
        records    -- list of records, each indexed as
                      [0] id, [1] image URL, [2] file name stem,
                      [3] value printed for the family level,
                      [4] value printed for the genus level.
        dictionary -- dict mapping a taxon name to the list of ids belonging
                      to it.
        save_to    -- root output directory.
        tax_level  -- 'genus' prints field 4 of each record as progress
                      output; any other value prints field 3. It does not
                      change what is downloaded.
        num        -- only taxa with strictly more than `num` ids are
                      processed.

    Returns None. A download that fails is reported on standard output and
    skipped, so a single bad URL does not abort the run; Ctrl-C still
    interrupts it.
    """
    # urlretrieve moved between Python 2 (urllib) and Python 3 (urllib.request).
    try:
        from urllib import urlretrieve
    except ImportError:
        from urllib.request import urlretrieve

    level = 4 if tax_level == 'genus' else 3

    # Index the records by id once instead of rescanning the full list for
    # every id of every taxon. Several records may share an id.
    by_id = {}
    for rec in records:
        if rec:
            by_id.setdefault(rec[0], []).append(rec)

    for key in dictionary:
        ids = dictionary[key]
        if len(ids) <= num:
            continue

        print('')
        print('%s %s' % (len(ids), key))
        print('')

        directory = save_to + '/' + key
        if not os.path.exists(directory):
            os.makedirs(directory)

        # An id listed several times would fetch the very same files again,
        # so each id is handled once per taxon.
        done = set()
        for value in ids:
            if value in done:
                continue
            done.add(value)

            for rec in by_id.get(value, ()):
                # Progress output; fall back to the taxon name when the
                # record carries no field for the requested level.
                print(rec[level] if len(rec) > level else key)
                try:
                    urlretrieve(rec[1], directory+'/'+rec[2]+".jpg")
                except Exception as exc:
                    # Best effort: report the failure and go on with the
                    # next image (KeyboardInterrupt is not caught here).
                    print('download failed for %s: %s' % (rec[1:2], exc))

In [8]:
# Load the kept multimedia rows as [id, image URL, file name stem] records.
# The stem is column 85 with blanks turned into underscores; the first row of
# the file (the column titles) is dropped after reading.
with open('metadata/cleaned_head.csv', 'rb') as csv1:
    records = [
        [entry[0], entry[5], entry[85].replace(' ', '_')]
        for entry in csv.reader(csv1)
    ][1:]

# ids of all loaded records, in file order
coreIDs = [rec[0] for rec in records]

In [9]:
# Attach family (column 56) and genus (column 61) of the matching occurrence
# to every image record, and group the occurrence ids by family and by genus.

# Index the image records by id once, so each occurrence row is matched with
# a dictionary lookup instead of a scan over all records. Several records may
# share an id; they are kept in their original order.
records_by_id = {}
for rec in records:
    records_by_id.setdefault(rec[0], []).append(rec)

family = {}
genus = {}
with open('metadata/occurrence_raw.csv', 'rb') as csv2:
    reader = csv.reader(csv2)
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line
            continue
        for rec in records_by_id.get(row[0], ()):
            # append to records family and genus
            rec.append(row[56])
            rec.append(row[61])

            # Both groupings are updated for every match. Previously the
            # genus was only recorded when the family had been seen before,
            # so the first record of each family was missing from `genus`.
            family.setdefault(row[56], []).append(row[0])
            genus.setdefault(row[61], []).append(row[0])

In [None]:
# Fetch the images of every family with more than 18 records into family/<family name>/
download_all(records, family, save_to='family', num=18)


95 Anthribidae

Anthribidae
Anthribidae
Anthribidae
Anthribidae
Anthribidae
Anthribidae
Anthribidae
Anthribidae
Anthribidae
Anthribidae


# make sure you examine and clean your dataset manually

### or you can simply download the filtered images from the list we provided 

    D2_list_of_filtered_images.csv