In [34]:
import sys
import sqlite3
import codecs
import cPickle as pickle
import os
from enum import Enum

from ntb.constants import *
from ntb.util import *

# Parse nor -> eng translation from tags.txt

In [35]:
def parse_to_sqlite(f, conn):
    for nor, eng in parse_gen(f):
        try:
            conn.execute('INSERT INTO translation VALUES (?, ?)', (nor, eng))
        except sqlite3.IntegrityError as e:
            print e, ':', nor, '->', eng

def parse_to_json(f, out):
    json.dump(dict(parse_gen(f)), out)

def parse_trans(f):
    nor_synonims = []
    curr_indent = 0
    eng = None
    result = {}
    for raw_line in f:
        line = raw_line.strip()
        if not line or  line[0] in ('"', '=', '/', ';', '$'):
            continue
        prev_indent = curr_indent
        curr_indent = len(raw_line) - len(raw_line.lstrip())
        if line[0].isalnum():
            for nor in nor_synonims:
                nor = subj_to_tag(nor)
                if nor in result:
                    continue
                elif eng is None:
                    raise ValueError("No translation for ", nor)
                else:
                    result[nor] = subj_to_tag(eng)
            nor_synonims = [line]
            eng = None
        elif line[0] == '#':
            nor_synonims.append(line[1:])
        elif line[0] == u'£':
            if curr_indent != prev_indent and nor_synonims:
                print curr_indent, prev_indent, raw_line, line
                raise ValueError("No translation for ", nor_synonims)
            eng = line.lstrip(u';£')
        elif line[0] == '*':
            # connected
            continue
        else:
            raise ValueError("Wrong line: {}".format(line))
    return result

In [36]:
with codecs.open(os.path.join(BASE_DIR, 'tags.txt'), encoding='utf-8') as inp:
    trans = parse_trans(inp)

# Parse tfo files to metadata.json

In [38]:
BLACK_AND_WHITE_TAGS = {'svarthvitt', 'blackandwhite', 'monochrome', 'svartvit'}

def subj_fixes(subj):
    if subj == "pc'er":
        return "pc-er"
    subj = subj.replace(u'é', u'e')
    subj = subj.replace(u'è', 'e')
    subj = subj.rstrip(u'¨.*<>|')
    subj = subj.lstrip(u'¨.*<>|')
    return subj

class Indicator(Enum):
    newfile = "R^"
    filename = "1F^"
    subject = "70F^"
#     persons = "66F^"
#     caption = "63F^"
    extra = "47F^"

def read_tfo_field_gen(f):
    while True:
        entry = f.readline().strip()
        if not entry or not entry[1:-1]:
            return
        else:
            yield entry[1:-1].lower()

def parse_tfo(f, folder):
    images = {}
    current_image = {}
    skipped = 0
    while True:
        line = f.readline()
        if not line:
            print "{}: saved {}, skipped (B/W) {}".format(folder, len(images), skipped)
            return images
        try:
            field = Indicator(line.strip())
        except ValueError:
            continue
        if field == Indicator.newfile:
            if current_image:
                if set(current_image.get('extra', [])).intersection(BLACK_AND_WHITE_TAGS):
                    skipped += 1 # skipping black and white images
                else:
                    current_image.setdefault('tags', [])
                    current_image['folder'] = folder
                    del current_image['extra'] # don't need this field for now
                    images[current_image['filename']] = current_image
            current_image = {}
        elif field == Indicator.filename:
            filename = next(read_tfo_field_gen(f))
            current_image['filename'] = filename
        elif field == Indicator.subject:
            current_image['tags'] = list(set(map(subj_to_tag, map(subj_fixes, read_tfo_field_gen(f)))))
        else:
            current_image[field.name] = list(set(read_tfo_field_gen(f)))

def translate(trans, metadata):
    for pic in metadata.itervalues():
        pic['tags'] = filter(None, map(lambda t: trans.get(t, None), pic['tags']))

def missing_translations_gen(trans, metadata):
    for pic in metadata.itervalues():
        for tag in pic['tags']:
            if tag not in trans:
                yield tag

In [39]:
metadata = {}
for folder in PICS_FOLDERS:
    with codecs.open(os.path.join(BASE_DIR, 'metadata_tfo_files', folder + '.tfo'), encoding='iso8859') as f:
        metadata.update(parse_tfo(f, folder))
print "Missing translations: ", len(list(missing_translations_gen(trans, metadata)))

1996_TO_2001: saved 81737, skipped (B/W) 29543
2005_TO_2007: saved 142875, skipped (B/W) 12364
2010_TO_2011: saved 123335, skipped (B/W) 41
2013: saved 58305, skipped (B/W) 20
2015: saved 74474, skipped (B/W) 50
2002_TO_2004: saved 104066, skipped (B/W) 10351
2008_TO_2009: saved 109684, skipped (B/W) 338
2012: saved 71247, skipped (B/W) 33
2014: saved 71912, skipped (B/W) 54
2016_01_TO_10: saved 75799, skipped (B/W) 143
Missing translations:  105


Might need to create file and give write permissions for all befor running due to owner bug:

In [41]:
translate(trans, metadata)

In [42]:
with open(os.path.join(BASE_DIR, 'metadata.pickle'), mode='w') as out:
        pickle.dump(metadata, out, protocol=pickle.HIGHEST_PROTOCOL)

# Parse tags file to newick file

In [5]:
def children_to_str(children):
    children_str = ','.join(children)
    if children_str:
        return '({})'.format(children_str)
    else:
        return ''

def read_children(ls, ident=0, max_children=None):
    children = []
    while True:
        if not ls:
            return children_to_str(children)
        line = ls.pop(0)
        curr_ident = len(line) - len(line.lstrip(' '))
        if curr_ident == ident:
            child_name = subj_to_tag(line)
            child = read_children(ls, ident + 2) + child_name
            children.append(child)
        elif curr_ident < ident:
            ls.insert(0, line)
            return children_to_str(children)
        else:
            print ident, children, line, curr_ident
            raise ValueError("Wrong tree")
    return tree, None

def read_tree(tree_str):
    return read_children(tree_str.splitlines())[1:-1] + ';'

Might need to create file and give write permissions for all befor running due to owner bug:

In [6]:
with open(os.path.join(BASE_DIR, 'tags.eng.txt')) as in_file:
    tag_tree = read_tree(in_file.read())
with open(os.path.join(BASE_DIR, 'tags.nw'), mode='w') as out_file:
    out_file.write(tag_tree)