In [1]:
import sys, os
from __future__ import print_function
from pprint import pprint
import json
import numpy as np

In [2]:
# Root folder that holds one sub-folder per diagnosis group.
# The location is machine-specific, so it can be overridden with the
# SCANS_ROOT environment variable; the default is the original path.
# root = '/media/ul1994/ssd1tb/scans'
root = os.environ.get('SCANS_ROOT', '/media/ul1994/ssd1tb1/scans')

def cleandir(path=''):
    """List the entries of ``root/path``, skipping known junk names.

    Args:
        path: folder path relative to the module-level ``root``; the
            default ``''`` lists ``root`` itself.

    Returns:
        A sorted list of entry names (files and folders). ``os.listdir``
        returns names in arbitrary order, and the case-collision handling
        later in this notebook depends on traversal order, so the result
        is sorted to keep every run reproducible.

    Raises:
        OSError: if the folder does not exist or cannot be read.
    """
    ignore = ('.DS_Store', '^')
    entries = os.listdir('%s/%s' % (root, path))
    return sorted(name for name in entries if name not in ignore)

# Names of the top-level folders directly under `root`; each one is later
# passed to process_case() as the group name and stored as the case's 'diagnosis'.
groups = cleandir()
# Bare expression so the notebook displays the list.
groups

['benigns', 'callbacks', 'cancers', 'normals']

In [3]:
import time

# Filled by process_case(): maps a case name (dashes replaced by
# underscores) to the dict of fields parsed for that case.
metadata = dict()

# Canonical digitizer labels. process_case() selects one by position
# (devices[0] .. devices[3]), so the order of this list matters.
devices = [
    'DBA-MGH',      # filename prefix 'A' and a DBA digitizer
    'HOWTEK-MGH',   # filename prefix 'A' and a HOWTEK digitizer
    'LUMISYS-WFU',  # filename prefix 'B' or 'C' (LUMISYS)
    'HOWTEK-ISMD',  # filename prefix 'D' and a HOWTEK digitizer
]

# countMarks[k] counts the overlay scans that carry exactly k markups.
countMarks = [0] * 10
# Step tables indexed by the direction codes found on overlay outline lines:
# code c contributes the step (_TRACE_DY[c], _TRACE_DX[c]).
# NOTE(review): this looks like an 8-direction chain code; confirm the
# orientation against the overlay file format description.
_TRACE_DX = (0, 1, 1, 1, 0, -1, -1, -1)
_TRACE_DY = (-1, -1, 0, 1, 1, 1, 0, -1)

# Errors that mean "this file could not be read"; such cases are reported and skipped.
_READ_ERRORS = (IOError, OSError, UnicodeDecodeError)


def _parse_ics(icsdata):
    """Split .ics text into lines and index each line by its first token.

    Returns ``(icslines, icslookup)`` where ``icslookup[first_token]`` is the
    rest of the line re-joined with single spaces. If a first token repeats,
    the last occurrence wins.
    """
    icslines = icsdata.split('\n')
    icslookup = {}
    for icsline in icslines:
        parts = icsline.split(' ')
        icslookup[parts[0]] = ' '.join(parts[1:])
    return icslines, icslookup


def _pick_digitizer(header, digitizer_raw):
    """Map the filename prefix letter and raw DIGITIZER text to a ``devices`` label (or None)."""
    if header == 'A' and 'DBA' in digitizer_raw:
        return devices[0]
    if header == 'A' and 'HOWTEK' in digitizer_raw:
        return devices[1]
    if header == 'B' or header == 'C':
        return devices[2]
    if header == 'D' and 'HOWTEK' in digitizer_raw:
        return devices[3]
    return None


def _parse_overlay(rawov):
    """Parse the lines of an .OVERLAY file into a details dict.

    Lines whose first token starts with a digit 1-9 are outlines: the first
    two integers are stored as ``'start'`` and the remaining integers are
    direction codes, stored as a JSON string of ``[dy, dx]`` steps under
    ``'trace'``. Every other non-empty line is stored as
    ``details[first_token.lower()]`` with the rest of the line as the value,
    converted to ``int`` when possible.
    """
    details = {'markups': []}
    for ovline in rawov:
        tokens = ovline.split(' ')
        ident = tokens[0]
        if ident == '':
            continue
        if ident[0] in '123456789':
            # Outline line; '#' terminators and empty tokens are dropped.
            anndata = [int(val) for val in tokens if val not in ('#', '')]
            steps = [(_TRACE_DY[code], _TRACE_DX[code]) for code in anndata[2:]]
            details['markups'].append({
                'start': anndata[:2],
                'trace': json.dumps(steps),
            })
        else:
            value = ' '.join(tokens[1:])
            try:
                value = int(value)
            except ValueError:
                pass  # keep non-numeric values as text
            details[ident.lower()] = value
    return details


def process_case(ii, jj, kk, groupName, volume, case):
    """Parse one case folder and record it in the module-level ``metadata``.

    Reads the case's ``.ics`` file and, for every scan flagged ``OVERLAY``,
    the matching ``<name>.<scan>.OVERLAY`` file. A case is stored only when
    exactly four scans were parsed. Problems are printed and the case is
    skipped rather than aborting the whole run.

    Args:
        ii, jj, kk: loop indices supplied by the caller; not used here.
        groupName: top-level group folder name; stored as ``'diagnosis'``.
        volume: volume folder name inside the group.
        case: case folder name inside the volume.

    Returns:
        None. Side effects: may add or replace ``metadata[<case name>]``,
        updates ``countMarks`` and prints progress and diagnostics.

    Raises:
        AssertionError: if the digitizer cannot be identified.
        KeyError / ValueError: if a required .ics field is absent or malformed.
    """
    fullpath = '%s/%s/%s/%s' % (root, groupName, volume, case)
    files = cleandir('%s/%s/%s' % (groupName, volume, case))

    icsfiles = [fl for fl in files if '.ics' in fl]
    if not icsfiles:
        # No .ics file at all: report it like any other broken case instead
        # of crashing the whole traversal with an IndexError.
        print('\n\nBROKEN? %s %s %s\n\n' % (groupName, volume, case))
        return

    try:
        with open('%s/%s' % (fullpath, icsfiles[0])) as fl:
            icsdata = fl.read()
    except _READ_ERRORS:
        print(fullpath)
        return

    icslines, icslookup = _parse_ics(icsdata)
    if 'DATE_OF_STUDY' not in icslookup:
        print('\n\nBROKEN? %s %s %s\n\n' % (groupName, volume, case))
        return

    meta = {}
    meta['diagnosis'] = groupName
    meta['date'] = icslookup['DATE_OF_STUDY']
    meta['density'] = int(icslookup['DENSITY'])
    meta['digitizer_raw'] = icslookup['DIGITIZER']

    header = icslookup['filename'].split('-')[0]
    digitizer = _pick_digitizer(header, icslookup['DIGITIZER'])
    assert digitizer is not None, 'unrecognised digitizer for %s' % fullpath

    meta['digitizer'] = digitizer
    meta['name'] = icslookup['filename']
    # Key used in `metadata` and in overlay file names: dashes become underscores.
    rawname = meta['name'].replace('-', '_')
    if icslookup['PATIENT_AGE'] == '':
        meta['age'] = None
    else:
        meta['age'] = int(icslookup['PATIENT_AGE'])
    meta['root'] = fullpath

    sequences = {}
    for icsline in icslines:
        if 'RIGHT_' not in icsline and 'LEFT_' not in icsline:
            continue
        scandata = icsline.split(' ')
        scanName = scandata[0]
        seqdata = {
            'name': rawname + '.' + scanName,
            'overlay': scandata[-1] == 'OVERLAY',
        }

        if seqdata['overlay']:
            overlay_path = '%s/%s.OVERLAY' % (meta['root'], seqdata['name'])
            try:
                # Text mode: the content is split on '\n' as str, which a
                # binary read cannot do on Python 3.
                with open(overlay_path, 'r') as fl:
                    rawov = fl.read().split('\n')
            except _READ_ERRORS:
                print(seqdata['name'], 'missing overlay')
                # The scan is dropped, so the case fails the 4-scan check below.
                continue
            seqdata['details'] = _parse_overlay(rawov)

            nmarks = len(seqdata['details']['markups'])
            # Grow the histogram in place if a scan has more markups than it has bins.
            while len(countMarks) <= nmarks:
                countMarks.append(0)
            countMarks[nmarks] += 1
        sequences[scanName] = seqdata
    meta['scans'] = sequences

    if len(sequences) != 4:
        print(len(sequences), meta['root'])
        print('Too many? %d' % len(sequences))
        return

    # '\r' keeps the progress report on one console line.
    sys.stdout.write('%d %s %s %s: %s\r' % (len(icsdata), groupName, volume, case, icslookup['DATE_OF_STUDY']))
    sys.stdout.flush()
    if rawname in metadata:
        print()
        print('COLLISION', rawname, metadata[rawname]['diagnosis'], metadata[rawname]['date'],
              groupName, meta['date'])
        if meta['diagnosis'] != 'cancers':
            return  # ignore unless developed into cancer
    metadata[rawname] = meta
        
# Visit every group / volume / case folder under `root` and parse each case,
# then report how many cases were visited and how long the walk took.
t0 = time.time()
counter = 0
for ii, gname in enumerate(groups):
    volumes = cleandir(gname)
    for jj, volume in enumerate(volumes):
        cases = cleandir('%s/%s' % (gname, volume))
        for kk, case in enumerate(cases):
            counter += 1
            process_case(ii, jj, kk, gname, volume, case)

# process_case() ends its progress line with '\r', so move past it first.
print('\n')
elapsed = time.time() - t0
print(counter, elapsed)

B_3102_1.LEFT_CC missing overlay23 8 19966
B_3102_1.LEFT_MLO missing overlay
B_3102_1.RIGHT_CC missing overlay
B_3102_1.RIGHT_MLO missing overlay
0 /media/ul1994/ssd1tb1/scans/benigns/benign_01/case3102
Too many? 0
499 benigns benign_01 case3185: 5 6 19988

BROKEN? benigns benign_01 case3186


509 benigns benign_07 case1512: 7 4 199342
COLLISION A_1512_1 benigns 7 4 1993 benigns 7 4 1993
500 benigns benign_13 case0412: 26 8 19963
COLLISION C_0412_1 benigns 28 5 1996 benigns 26 8 1996
501 benigns benign_14 case0478: 26 10 1994
COLLISION C_0478_1 benigns 7 4 1994 benigns 26 10 1994
500 benigns benign_14 case3470: 10 3 19987
COLLISION B_3470_1 benigns 7 4 1998 benigns 10 3 1998
500 callbacks benign_without_callback_01 case3159: 29 5 1998
COLLISION B_3159_1 benigns 29 5 1998 callbacks 29 5 1998
491 callbacks benign_without_callback_01 case3160: 9 6 1998
COLLISION B_3160_1 benigns 9 6 1998 callbacks 9 6 1998
491 callbacks benign_without_callback_01 case3162: 4 6 1998
COLLISION B_3162_1 beni

In [4]:
# Number of cases currently stored in `metadata`.
print(len(metadata))
# countMarks[k] is the number of overlay scans that had k markups;
# the sum is the total number of overlay scans tallied.
print(countMarks, np.sum(countMarks))

2586
[23, 3424, 416, 97, 30, 13, 11, 2, 0, 0] 4016


In [5]:
# Show one parsed record as a sanity check. next(iter(...)) picks the first
# key on both Python 2 and 3; dict.keys() is not subscriptable on Python 3.
pprint(metadata[next(iter(metadata))])

{'age': 40,
 'date': '27 11 1991',
 'density': 2,
 'diagnosis': 'benigns',
 'digitizer': 'HOWTEK-MGH',
 'digitizer_raw': 'HOWTEK 43.5',
 'name': 'A-1519-1',
 'root': '/media/ul1994/ssd1tb1/scans/benigns/benign_05/case1519',
 'scans': {'LEFT_CC': {'details': {'abnormality': 1,
                                   'assessment': 4,
                                   'boundary': '',
                                   'lesion_type': 'MASS SHAPE OVAL MARGINS OBSCURED',
                                   'markups': [{'start': [3494, 1540],
                                                'trace': '[[0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [1, 0], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1], [0, -1]

In [13]:
# For every case filed under 'benigns', print the assessment and lesion type
# of each scan that has overlay details.
for elem in metadata.values():
    if elem['diagnosis'] == 'benigns':
        for scan in elem['scans'].values():
            if not scan['overlay']:
                continue
            details = scan['details']
            print(details['assessment'], details['lesion_type'])

4 MASS SHAPE OVAL MARGINS OBSCURED
4 MASS SHAPE OVAL MARGINS OBSCURED
4 MASS SHAPE ROUND MARGINS ILL_DEFINED
4 MASS SHAPE ROUND MARGINS ILL_DEFINED
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
3 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
3 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
4 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
4 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
0 MASS SHAPE LOBULATED MARGINS OBSCURED
0 MASS SHAPE LOBULATED MARGINS OBSCURED
4 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
4 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION CLUSTERED
0 MASS SHAPE OVAL MARGINS OBSCURED
0 MASS SHAPE OVAL MARGINS OBSCURED
0 MASS SHAPE OVAL MARGINS OBSCURED
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE 

4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION CLUSTERED
4 MASS SHAPE IRREGULAR MARGINS SPICULATED
4 MASS SHAPE IRREGULAR MARGINS SPICULATED
4 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
4 MASS SHAPE LOBULATED MARGINS ILL_DEFINED
3 MASS SHAPE OVAL MARGINS CIRCUMSCRIBED
3 MASS SHAPE OVAL MARGINS CIRCUMSCRIBED
4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION REGIONAL
4 CALCIFICATION TYPE AMORPHOUS DISTRIBUTION REGIONAL
3 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
3 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
4 MASS SHAPE OVAL MARGINS CIRCUMSCRIBED
4 MASS SHAPE OVAL MARGINS CIRCUMSCRIBED
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION SEGMENTAL
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION SEGMENTAL
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
4 CALCIFICATION TYPE PLEOMORPHIC DISTRIBUTION CLUSTERED
4 MASS SHAPE

In [71]:
import json

# Persist the parsed case metadata as indented JSON in the working directory.
with open('metadata.json', 'w') as out_file:
    json.dump(metadata, out_file, indent=4)