In [2]:
from __future__ import print_function
import sys, os

In [3]:
# Root folder that cleandir() lists from. Set the SCANS_ROOT environment
# variable to point the notebook at another location; when it is unset the
# original path is used unchanged.
root = os.environ.get('SCANS_ROOT', '/media/ul1994/ssd1tb/scans')

def cleandir(path=''):
    """List the entries of ``root/path`` in sorted order, minus ignored names.

    Args:
        path: Folder relative to the module-level ``root``; the default ''
            lists ``root`` itself.

    Returns:
        Sorted list of entry names, excluding '.DS_Store' and '^'.

    Raises:
        OSError: propagated from ``os.listdir`` when the folder cannot be listed.
    """
    ignore = ('.DS_Store', '^')
    # os.listdir() returns names in arbitrary order; sorting makes repeated
    # runs visit folders in the same sequence.
    entries = sorted(os.listdir('%s/%s' % (root, path)))
    return [folder for folder in entries if folder not in ignore]

# Entries found directly under root (ignored names removed); the driver loop
# iterates over these as the top-level groups.
groups = cleandir()
# Bare expression so the notebook displays the list.
groups

['benigns', 'callbacks', 'cancers', 'normals']

In [20]:
import time

# Case records appended by process_case(), one dict per case that parses
# without error. Re-created here so re-running the cell starts from empty.
metadata = []

# Digitizer labels; process_case() selects one by list position.
# Each trailing note gives the filename prefix / DIGITIZER pairing for a label.
devices = [
    'DBA-MGH',      # ('A' and DBA)
    'HOWTEK-MGH',   # ('A' and HOWTEK)
    'LUMISYS-WFU',  # ('B' or 'C' and LUMISYS)
    'HOWTEK-ISMD',  # ('D' and HOWTEK)
]

def _select_digitizer(header, digitizer_raw):
    """Return the ``devices`` label for a filename prefix and raw DIGITIZER value, or None."""
    if header == 'A' and 'DBA' in digitizer_raw:
        return devices[0]
    if header == 'A' and 'HOWTEK' in digitizer_raw:
        return devices[1]
    # NOTE(review): B/C prefixes are mapped without inspecting DIGITIZER
    # (unchanged behavior) -- confirm that is intended.
    if header in ('B', 'C'):
        return devices[2]
    if header == 'D' and 'HOWTEK' in digitizer_raw:
        return devices[3]
    return None


def _collect_scans(icslines, rawname):
    """Build one {'name', 'overlay'} record per ics line containing RIGHT_ or LEFT_."""
    scans = []
    for line in icslines:
        if 'RIGHT_' in line or 'LEFT_' in line:
            fields = line.split(' ')
            scans.append({
                # First token is the view name, e.g. LEFT_CC.
                'name': rawname + '.' + fields[0],
                # Overlay flag is carried by the last token on the line.
                'overlay': fields[-1] == 'OVERLAY',
            })
    return scans


def process_case(ii, jj, kk, groupName, volume, case):
    """Parse one case folder's .ics file and append its record to ``metadata``.

    The .ics file is read as space-separated ``KEY value...`` lines. A record
    with the keys diagnosis, date, density, digitizer_raw, digitizer, name,
    age, root and scans is appended to the module-level ``metadata`` list.
    Cases that cannot be read or parsed are reported on stdout and skipped,
    so one bad folder does not stop the traversal.

    Args:
        ii, jj, kk: Loop indices supplied by the caller; not used here.
        groupName: Top-level group folder; stored as the record's diagnosis.
        volume: Volume folder inside the group.
        case: Case folder inside the volume.

    Returns:
        None. The result is the side effect on ``metadata``.

    Raises:
        OSError: propagated from ``cleandir`` if the case folder cannot be listed.
    """
    fullpath = '%s/%s/%s/%s' % (root, groupName, volume, case)
    files = cleandir('%s/%s/%s' % (groupName, volume, case))

    # A folder without any .ics file used to crash with IndexError; report
    # it and skip, like the other unreadable-case paths.
    icsfiles = [fl for fl in files if '.ics' in fl]
    if not icsfiles:
        print('NO ICS FILE? %s' % fullpath)
        return
    ics = icsfiles[0]

    try:
        with open('%s/%s' % (fullpath, ics)) as fl:
            icsdata = fl.read()
    except (IOError, OSError, UnicodeError) as err:
        # Best effort: name the unreadable case and why, then move on.
        print(fullpath)
        print('    could not read %s: %s' % (ics, err))
        return

    # First token of each line is the key; the remainder is the value.
    icslines = icsdata.split('\n')
    icslookup = {}
    for line in icslines:
        key, sep, value = line.partition(' ')
        icslookup[key] = value
    if 'DATE_OF_STUDY' not in icslookup:
        print('\n\nBROKEN? %s %s %s\n\n' % (groupName, volume, case))
        return

    try:
        name = icslookup['filename']
        digitizer_raw = icslookup['DIGITIZER']
        digitizer = _select_digitizer(name.split('-')[0], digitizer_raw)
        if digitizer is None:
            raise ValueError('Unrecognised digitizer: filename %r, DIGITIZER %r'
                             % (name, digitizer_raw))

        # Blank (or whitespace-only) age is recorded as None.
        age_raw = icslookup['PATIENT_AGE'].strip()

        meta = {
            'diagnosis': groupName,
            'date': icslookup['DATE_OF_STUDY'],
            'density': int(icslookup['DENSITY']),
            'digitizer_raw': digitizer_raw,
            'digitizer': digitizer,
            'name': name,
            'age': int(age_raw) if age_raw else None,
            'root': fullpath,
        }

        # Scan names use underscores where the case name has dashes.
        sequences = _collect_scans(icslines, name.replace('-', '_'))
        if len(sequences) != 4:
            print(len(sequences), fullpath)
            raise ValueError('Expected 4 scans, found %d' % len(sequences))
        meta['scans'] = sequences
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / SystemExit still stop
        # the run; the raw file is dumped to help diagnose the bad case.
        print('SKIPPED %s: %s: %s' % (fullpath, type(err).__name__, err))
        print(icsdata)
        return

    # '\r' keeps the progress report on a single, overwritten line.
    sys.stdout.write('%d %s %s %s: %s\r' % (len(icsdata), groupName, volume, case, icslookup['DATE_OF_STUDY']))
    sys.stdout.flush()
    metadata.append(meta)
        
# Walk root/<group>/<volume>/<case> and hand every case folder to
# process_case(), counting the folders visited and timing the whole pass.
t0 = time.time()
counter = 0
for ii, gname in enumerate(groups):
    volumes = cleandir(gname)
    for jj, volume in enumerate(volumes):
        cases = cleandir('%s/%s' % (gname, volume))
        for kk, case in enumerate(cases):
            counter += 1
            process_case(ii, jj, kk, gname, volume, case)

# Move past the '\r'-terminated progress line before printing the summary.
print('\n')
elapsed = time.time() - t0
print(counter, elapsed)

/media/ul1994/ssd1tb/scans/benigns/benign_01/case3186
513 normals normal_12 case2043: 7 7 199494ase3685: 20 12 9498

2605 2.3415205478668213


In [21]:
# Number of case records that process_case() appended to metadata.
print(len(metadata))

2604


In [22]:
# Show the first collected record to spot-check the parsed fields.
print(metadata[0])

{'digitizer_raw': 'LUMISYS LASER', 'age': 66, 'root': '/media/ul1994/ssd1tb/scans/benigns/benign_01/case0029', 'scans': [{'overlay': True, 'name': 'C_0029_1.LEFT_CC'}, {'overlay': True, 'name': 'C_0029_1.LEFT_MLO'}, {'overlay': False, 'name': 'C_0029_1.RIGHT_CC'}, {'overlay': False, 'name': 'C_0029_1.RIGHT_MLO'}], 'digitizer': 'LUMISYS-WFU', 'density': 3, 'diagnosis': 'benigns', 'name': 'C-0029-1', 'date': '2 3 1993'}


In [23]:
import json

# Persist every collected case record as 4-space-indented JSON; the relative
# path means the file lands in the current working directory.
with open('metadata.json', 'w') as fl:
    fl.write(json.dumps(metadata, indent=4))