In [9]:
import sys
import sqlite3
import codecs
import cPickle as pickle
import os
from enum import Enum

from scipy import ndimage

In [None]:
METADATA_DIR = '/storage/metadata'

# Parse nor -> eng translation from tags.txt

In [2]:
def parse_to_sqlite(f, conn):
    for nor, eng in parse_gen(f):
        try:
            conn.execute('INSERT INTO translation VALUES (?, ?)', (nor, eng))
        except sqlite3.IntegrityError as e:
            print e, ':', nor, '->', eng

def parse_to_json(f, out):
    json.dump(dict(parse_gen(f)), out)

def subj_to_tag(subj):
    return subj.strip().lower().replace(' ', '-').replace('(', '[').replace(')', ']').replace(',', '')

def parse_trans(f):
    nor_synonims = []
    curr_indent = 0
    eng = None
    result = {}
    for raw_line in f:
        line = raw_line.strip()
        if not line or  line[0] in ('"', '=', '/', ';', '$'):
            continue
        prev_indent = curr_indent
        curr_indent = len(raw_line) - len(raw_line.lstrip())
        if line[0].isalnum():
            for nor in nor_synonims:
                nor = subj_to_tag(nor)
                if nor in result:
                    continue
                elif eng is None:
                    raise ValueError("No translation for ", nor)
                else:
                    result[nor] = subj_to_tag(eng)
            nor_synonims = [line]
            eng = None
        elif line[0] == '#':
            nor_synonims.append(line[1:])
        elif line[0] == u'£':
            if curr_indent != prev_indent and nor_synonims:
                print curr_indent, prev_indent, raw_line, line
                raise ValueError("No translation for ", nor_synonims)
            eng = line.lstrip(u';£')
        elif line[0] == '*':
            # connected
            continue
        else:
            raise ValueError("Wrong line: {}".format(line))
    return result

In [3]:
with codecs.open(os.path.join(METADATA_DIR, 'tags.txt'), encoding='utf-8') as inp:
    trans = parse_trans(inp)

# Parse tfo files to metadata.json

In [13]:
BLACK_AND_WHITE_TAGS = {'svarthvitt', 'blackandwhite', 'monochrome', 'svartvit'}

def is_greyscale(image_dict):
    if set(image_dict.get('extra', [])).intersection(BLACK_AND_WHITE_TAGS):
        return True
    im = ndimage.imread(image_path(image_dict))
    return len(im.shape) == 2

def subj_fixes(subj):
    if subj == "pc'er":
        return "pc-er"
    subj = subj.replace(u'é', u'e')
    subj = subj.replace(u'è', 'e')
    subj = subj.rstrip(u'¨.*<>|')
    subj = subj.lstrip(u'¨.*<>|')
    return subj

class Indicator(Enum):
    newfile = "R^"
    filename = "1F^"
    subject = "70F^"
#     persons = "66F^"
#     caption = "63F^"
    extra = "47F^"

def read_tfo_field_gen(f):
    while True:
        entry = f.readline().strip()
        if not entry or not entry[1:-1]:
            return
        else:
            yield entry[1:-1].lower()

def parse_tfo(f, folder):
    images = {}
    current_image = {}
    skipped = 0
    not_found = 0
    while True:
        line = f.readline()
        if not line:
            print "{}: saved {}, skipped (G/S) {}, not found {}".format(folder, len(images), skipped, not_found)
            return images
        try:
            field = Indicator(line.strip())
        except ValueError:
            continue
        if field == Indicator.newfile:
            if current_image:
                current_image['folder'] = folder

                try:
                    is_image_greyscale = is_greyscale(current_image)
                except IOError:
                    not_found += 1

                if is_image_greyscale:
                    skipped += 1 # skipping greyscale images
                else:
                    current_image.setdefault('tags', [])
                    del current_image['extra'] # don't need this field anymore
                    images[current_image['filename']] = current_image
            current_image = {}
        elif field == Indicator.filename:
            filename = next(read_tfo_field_gen(f))
            current_image['filename'] = filename
        elif field == Indicator.subject:
            current_image['tags'] = list(set(map(subj_to_tag, map(subj_fixes, read_tfo_field_gen(f)))))
        else:
            current_image[field.name] = list(set(read_tfo_field_gen(f)))

def translate(trans, metadata):
    for pic in metadata.itervalues():
        pic['tags'] = filter(None, map(lambda t: trans.get(t, None), pic['tags']))

def missing_translations_gen(trans, metadata):
    for pic in metadata.itervalues():
        for tag in pic['tags']:
            if tag not in trans:
                yield tag

In [14]:
%%time
metadata = {}
for folder in PICS_FOLDERS:
    with codecs.open(os.path.join(METADATA_DIR, 'metadata_tfo_files', folder + '.tfo'), encoding='iso8859') as f:
        metadata.update(parse_tfo(f, folder))
print "Missing translations: ", len(list(missing_translations_gen(trans, metadata)))

1996_TO_2001: saved 81623, skipped (G/S) 29657, not found 1
2005_TO_2007: saved 142772, skipped (G/S) 12467, not found 0
2010_TO_2011: saved 123085, skipped (G/S) 291, not found 0
2013: saved 58253, skipped (G/S) 72, not found 0
2015: saved 74474, skipped (G/S) 50, not found 0
2002_TO_2004: saved 103985, skipped (G/S) 10432, not found 0
2008_TO_2009: saved 109473, skipped (G/S) 549, not found 0
2012: saved 71029, skipped (G/S) 251, not found 0
2014: saved 71864, skipped (G/S) 102, not found 0
2016_01_TO_10: saved 75768, skipped (G/S) 174, not found 0
Missing translations:  105
CPU times: user 41min 39s, sys: 1min 20s, total: 42min 59s
Wall time: 5h 42min 54s


Might need to create file and give write permissions for all befor running due to owner bug:

In [15]:
translate(trans, metadata)

In [16]:
with open(os.path.join(METADATA_DIR, 'metadata.pickle'), mode='w') as out:
        pickle.dump(metadata, out, protocol=pickle.HIGHEST_PROTOCOL)

# Parse tags file to newick file

In [5]:
def children_to_str(children):
    children_str = ','.join(children)
    if children_str:
        return '({})'.format(children_str)
    else:
        return ''

def read_children(ls, ident=0, max_children=None):
    children = []
    while True:
        if not ls:
            return children_to_str(children)
        line = ls.pop(0)
        curr_ident = len(line) - len(line.lstrip(' '))
        if curr_ident == ident:
            child_name = subj_to_tag(line)
            child = read_children(ls, ident + 2) + child_name
            children.append(child)
        elif curr_ident < ident:
            ls.insert(0, line)
            return children_to_str(children)
        else:
            print ident, children, line, curr_ident
            raise ValueError("Wrong tree")
    return tree, None

def read_tree(tree_str):
    return read_children(tree_str.splitlines())[1:-1] + ';'

Might need to create file and give write permissions for all befor running due to owner bug:

In [6]:
with open(os.path.join(METADATA_DIR, 'tags.eng.txt')) as in_file:
    tag_tree = read_tree(in_file.read())
with open(os.path.join(METADATA_DIR, 'tags.nw'), mode='w') as out_file:
    out_file.write(tag_tree)