In [1]:
import sys, os

In [2]:
# Directory that holds the scan groups. Defaults to the original local path
# but can be redirected with the SCANS_ROOT environment variable so the
# notebook runs on machines where the data is mounted elsewhere.
root = os.environ.get('SCANS_ROOT', '/media/ul1994/ssd1tb/scans')

def cleandir(path=''):
    """Return the entries of ``<root>/<path>`` minus known junk names.

    ``path`` is interpreted relative to the module-level ``root``; the
    default lists ``root`` itself. Entries are returned in the order
    ``os.listdir`` yields them.
    """
    skipped = ('.DS_Store', '^')
    target = '%s/%s' % (root, path)
    kept = []
    for entry in os.listdir(target):
        if entry in skipped:
            continue
        kept.append(entry)
    return kept

# Entries found directly under the scans root; displayed as the cell output.
groups = cleandir(path='')
groups

['benign_without_callbacks', 'benigns', 'cancers']

In [26]:
# Accumulator that process_case appends one record to per parsed case.
metadata = list()
import time

# Canonical digitizer labels. Order matters: process_case selects entries by
# position (devices[0] .. devices[3]).
devices = [
    'DBA-MGH',      # filename prefix 'A' with a DBA digitizer
    'HOWTEK-MGH',   # filename prefix 'A' with a HOWTEK digitizer
    'LUMISYS-WFU',  # filename prefix 'B' or 'C' with a LUMISYS digitizer
    'HOWTEK-ISMD',  # filename prefix 'D' with a HOWTEK digitizer
]

def _select_digitizer(header, digitizer_raw):
    """Map a filename prefix and raw DIGITIZER text to an entry of ``devices``.

    Returns None when the combination is not one of the known ones.
    """
    if header == 'A' and 'DBA' in digitizer_raw:
        return devices[0]
    if header == 'A' and 'HOWTEK' in digitizer_raw:
        return devices[1]
    # Membership test on purpose: `and` binds tighter than `or`, so writing
    # `header == 'B' or header == 'C' and LUMISYS` would accept every 'B'
    # case regardless of its digitizer.
    if header in ('B', 'C') and 'LUMISYS' in digitizer_raw:
        return devices[2]
    if header == 'D' and 'HOWTEK' in digitizer_raw:
        return devices[3]
    return None


def _parse_scans(icslines, rawname):
    """Build one ``{'name', 'overlay'}`` dict per line of the SEQUENCE section.

    Reading starts at the line equal to 'SEQUENCE' and stops at the first
    empty line after it.
    """
    scans = []
    in_sequence = False
    for line in icslines:
        if line == 'SEQUENCE':
            in_sequence = True
        if not in_sequence:
            continue
        if line == '':
            break
        # NOTE(review): the 'SEQUENCE' marker line itself is recorded as the
        # first entry (with overlay False). Kept as-is because consumers of
        # the metadata may rely on it -- confirm before dropping it.
        fields = line.split(' ')
        scans.append({
            'name': rawname + '.' + fields[0],
            'overlay': fields[-1] == 'OVERLAY',
        })
    return scans


def process_case(ii, jj, kk, groupName, volume, case):
    """Parse one case's ``.ics`` header and append a record to ``metadata``.

    Parameters
    ----------
    ii, jj, kk : int
        Group / volume / case indices. Unused; kept so existing callers
        keep working.
    groupName, volume, case : str
        Path components of the case directory under ``root``.

    Returns
    -------
    None. On success a dict with the keys diagnosis, date, density,
    digitizer_raw, digitizer, name, age, root and scans is appended to the
    module-level ``metadata`` list and a progress line is written to stdout.
    Cases without an ``.ics`` file or without DATE_OF_STUDY are reported as
    BROKEN and skipped; any other parsing problem is reported together with
    the raw header text and the case is skipped.
    """
    fullpath = '%s/%s/%s/%s' % (root, groupName, volume, case)
    files = cleandir('%s/%s/%s' % (groupName, volume, case))
    icsfiles = [fl for fl in files if '.ics' in fl]
    if not icsfiles:
        # Without this guard a single case lacking a header raises IndexError
        # outside the try block below and aborts the whole sweep.
        print('\n\nBROKEN? %s %s %s\n\n' % (groupName, volume, case))
        return

    with open('%s/%s' % (fullpath, icsfiles[0])) as fl:
        icsdata = fl.read()

    # Each header line is "<KEY> <value words...>"; index the values by key.
    icslines = icsdata.split('\n')
    icslookup = {}
    for line in icslines:
        parts = line.split(' ')
        icslookup[parts[0]] = ' '.join(parts[1:])
    if 'DATE_OF_STUDY' not in icslookup:
        print('\n\nBROKEN? %s %s %s\n\n' % (groupName, volume, case))
        return

    try:
        digitizer_raw = icslookup['DIGITIZER']
        name = icslookup['filename']
        digitizer = _select_digitizer(name.split('-')[0], digitizer_raw)
        if digitizer is None:
            # Explicit raise instead of assert so the check survives `-O`.
            raise ValueError('unrecognised prefix/digitizer: %r / %r'
                             % (name, digitizer_raw))
        age_raw = icslookup['PATIENT_AGE']
        meta = {
            'diagnosis': groupName,
            'date': icslookup['DATE_OF_STUDY'],
            'density': int(icslookup['DENSITY']),
            'digitizer_raw': digitizer_raw,
            'digitizer': digitizer,
            'name': name,
            'age': None if age_raw == '' else int(age_raw),
            'root': fullpath,
            # Scan names use '_' where the case name uses '-'.
            'scans': _parse_scans(icslines, name.replace('-', '_')),
        }

        # '\r' keeps the progress report on a single, continuously
        # overwritten line.
        sys.stdout.write('%d %s %s %s: %s\r' % (len(icsdata), groupName, volume, case, icslookup['DATE_OF_STUDY']))
        sys.stdout.flush()
        metadata.append(meta)
    except Exception as err:
        # Best effort: report which case failed and why, dump its header,
        # and carry on. KeyboardInterrupt/SystemExit are deliberately not
        # caught so the sweep can still be interrupted.
        print('\n\nFAILED %s %s %s: %r\n\n' % (groupName, volume, case, err))
        print(icsdata)
        
# Walk group -> volume -> case, parsing every case's header, and time the
# whole sweep.
t0 = time.time()
counter = 0
for ii, gname in enumerate(groups):
    volumes = cleandir(gname)
    for jj, volume in enumerate(volumes):
        volume_path = '%s/%s' % (gname, volume)
        for kk, case in enumerate(cleandir(volume_path)):
            counter = counter + 1
            process_case(ii, jj, kk, gname, volume, case)

# Blank lines first so the summary does not land on the '\r'-terminated
# progress line written by process_case.
print('\n')
elapsed = time.time() - t0
print('%s %s' % (counter, elapsed))

499 benigns benign_01 case3185: 5 6 199886t_callback_02 case3685: 20 12 9498

BROKEN? benigns benign_01 case3186


498 cancers cancer_15 case3517: 30 4 98923

1910 2.84178113937


In [27]:
# Number of cases that produced a metadata record.
print(len(metadata))

1909


In [28]:
# Show the first record as a sample of the structure.
print(metadata[0])

{'name': 'B-3159-1', 'density': 2, 'age': 58, 'digitizer_raw': 'LUMISYS LASER', 'digitizer': 'LUMISYS-WFU', 'date': '29 5 1998', 'diagnosis': 'benign_without_callbacks', 'root': '/media/ul1994/ssd1tb/scans/benign_without_callbacks/benign_without_callback_01/case3159', 'scans': [{'name': 'B_3159_1.SEQUENCE', 'overlay': False}, {'name': 'B_3159_1.LEFT_CC', 'overlay': True}, {'name': 'B_3159_1.LEFT_MLO', 'overlay': True}, {'name': 'B_3159_1.RIGHT_CC', 'overlay': False}, {'name': 'B_3159_1.RIGHT_MLO', 'overlay': False}]}


In [29]:
import json
# json.dump emits text (str), so the file is opened in text mode; binary
# mode ('wb') raises TypeError under Python 3.
with open('metadata.json', 'w') as fl:
    json.dump(metadata, fl, indent=4)