konko

#!/usr/bin/env python

import os
import sys
import cgi
import filtering
import naming
import pathabbr
import kconfig
import kexcel
import kstyle
import kutil
from io import open


# Progress granularity: File.feed prints a '-' mark each time the number of
# tokens collected for the current file reaches a multiple of this value.
REPORT = 10000


def write(s):
    """Print *s* on standard output and flush immediately.

    Used for the one-character progress marks, which should appear as soon
    as they are produced.
    """
    out = sys.stdout
    out.write(s)
    out.flush()


class Token(object):
    """Shared behaviour of the Word, Tag and Sep tokens cut from an input file.

    Instances are created without arguments; the creator assigns ``raw``,
    ``line`` and ``char`` and then calls ``process0``.  Subclasses provide
    ``html_class``.
    """

    def write(self, f, match):
        """Write the token to *f* as a single HTML ``<span>`` element.

        The ``class`` attribute is built from ``self.html_class(match)``;
        the element body is ``raw`` passed through ``filtering.printable_nl``.
        Both are HTML-escaped.
        """
        class_attr = u' '.join(self.html_class(match))
        body = filtering.printable_nl(self.raw)
        emit = f.write
        emit(u'<span class="')
        emit(cgi.escape(class_attr, quote=True))
        emit(u'">')
        emit(cgi.escape(body, quote=False))
        emit(u'</span>')

    def html_id(self):
        """Return the HTML id of the token, built from its line and column."""
        return u"l{}c{}".format(self.line, self.char)

    def process0(self):
        """Reset the per-token flags and cache a compact printable form of ``raw``."""
        self.delete = False
        self.merge_next = False
        compact = filtering.printable_compact(self.raw)
        self.simpletext = compact


class Word(Token):
    """Word token: the only token kind that is highlighted on a match."""

    def html_class(self, match):
        """Return the CSS classes: "w", plus "d" if deleted and "m" if matched."""
        flags = (
            (u"d", self.delete),
            (u"m", match),
        )
        return [u"w"] + [name for name, enabled in flags if enabled]

    def process(self, conc):
        """Words need no configuration-dependent processing."""

    def for_context_rich(self, match):
        """Return a ``(format, text)`` pair; the format is "hl" when matched."""
        fmt = u"hl" if match else u"normal"
        return fmt, self.simpletext


class Tag(Token):
    """Markup tag token; carries text/sample keys and pair-matching indices."""

    def html_class(self, match):
        """Return the CSS classes: "t", plus "d" if deleted and "i" if the
        tag has a sample key.

        *match* is part of the common token interface and is not used here.
        """
        flags = (
            (u"d", self.delete),
            (u"i", self.samplekey is not None),
        )
        return [u"t"] + [name for name, enabled in flags if enabled]

    def descr(self):
        """Return a human-readable description of the tag and its position."""
        template = u'tag {} on line {}, column {}'
        return template.format(self.raw, self.line, self.char)

    def process(self, conc):
        """Classify the tag against the patterns in ``conc.config``.

        Sets ``textkey`` and ``samplekey`` (results of ``kutil.try_capture``),
        marks the tag deleted if any ``delete`` pattern matches it exactly,
        and records in ``pairs`` which delete/compound pairs it opens or
        closes.
        """
        config = conc.config
        raw = self.raw
        self.textkey = kutil.try_capture(config.text, raw)
        self.samplekey = kutil.try_capture(config.sample, raw)
        if any(kutil.exact_match(pattern, raw) for pattern in config.delete):
            self.delete = True
        self.pairs = dict(
            delete=self.try_pairs(config.delete_pair),
            compound=self.try_pairs(config.compound_pair),
        )

    def try_pairs(self, pairs):
        """Return ``(opening, closing)``: the indices of the entries of
        *pairs* whose first, respectively second, pattern matches this tag.
        """
        raw = self.raw
        opening = [
            idx for idx, (first, _) in enumerate(pairs)
            if kutil.exact_match(first, raw)
        ]
        closing = [
            idx for idx, (_, second) in enumerate(pairs)
            if kutil.exact_match(second, raw)
        ]
        return opening, closing

    def for_context_rich(self, match):
        """Return a ``(format, text)`` pair; tags are always shown "light"."""
        return (u"light", self.simpletext)


class Sep(Token):
    """Separator token: the text found between words and tags."""

    def html_class(self, match):
        """Return the CSS classes; separators are always just "s"."""
        classes = [u"s"]
        return classes

    def process(self, conc):
        """Separators need no configuration-dependent processing."""

    def for_context_rich(self, match):
        """Return a ``(format, text)`` pair; separators are never highlighted."""
        return (u"normal", self.simpletext)


class Compound(object):
    """A non-empty run of consecutive tokens handled as one searchable unit.

    ``do_match`` must be called before any method that reads ``match``,
    ``has_word`` or ``lemma``.
    """

    def __init__(self, tokens):
        assert len(tokens) > 0
        self.tokens = tokens

    def write(self, f):
        """Write the compound to *f* as an HTML ``<span class="c">``.

        Matching compounds additionally get an ``id`` (that of their first
        token) so that they can be linked to.
        """
        matched = len(self.match) > 0
        if not matched:
            f.write(u'<span class="c">')
        else:
            f.write(u'<span class="c" id="')
            f.write(cgi.escape(self.tokens[0].html_id(), quote=True))
            f.write(u'">')
        for token in self.tokens:
            token.write(f, matched)
        f.write(u'</span>')

    def do_match(self, conc):
        """Assemble the compound's text and test it against every search.

        Sets ``raw`` (all token text), ``word_raw`` (text of non-deleted
        words, plus non-deleted separators when
        ``conc.config.separators_in_compound`` is set), ``has_word``,
        ``samplekey`` and ``match``.  When the compound contains a word it
        also sets ``compword``, ``simpletext`` and ``lemma``.
        """
        all_parts = []
        word_parts = []
        self.has_word = False
        self.samplekey = None
        for token in self.tokens:
            all_parts.append(token.raw)
            if isinstance(token, Word):
                if not token.delete:
                    self.has_word = True
                    word_parts.append(token.raw)
            elif isinstance(token, Tag):
                if token.samplekey is not None:
                    # At most one sample tag is expected per compound.
                    assert self.samplekey is None
                    self.samplekey = token.samplekey
            elif isinstance(token, Sep):
                if conc.config.separators_in_compound and not token.delete:
                    word_parts.append(token.raw)
        self.raw = ''.join(all_parts)
        self.word_raw = ''.join(word_parts)

        self.match = []
        if not self.has_word:
            return
        self.compword = filtering.printable_compact(self.word_raw)
        self.simpletext = filtering.printable_compact(self.raw)
        self.lemma = self.compword.lower()
        self.match = [
            search for search in conc.search
            if kutil.exact_match(search.re, self.word_raw)
        ]

    def link(self, text):
        """Return the URL of this compound inside the HTML page of *text*."""
        anchor = self.tokens[0].html_id()
        assert anchor is not None
        return u'{}#{}'.format(text.url, anchor)

    def set_context(self, text, i, sample):
        """Store the sample and the surrounding context of a matching compound.

        *i* is the index of this compound in ``text.compounds``.
        """
        assert text.compounds[i] == self
        self.sample = sample
        simplify = kexcel.rich_simplify
        before = self.get_context_rich(text, i, -1)
        here = self.get_this_rich()
        after = self.get_context_rich(text, i, +1)
        self.rich_before = simplify(before)
        self.rich = simplify(here)
        self.rich_after = simplify(after)
        self.left = self.get_context_simple(text, i, -1)
        self.right = self.get_context_simple(text, i, +1)

    def get_context_rich(self, text, i, d):
        """Collect formatted context in direction *d* (-1 left, +1 right).

        Neighbouring compounds are added, in reading order, until at least
        ``text.conc.config.context`` characters have been gathered or the
        text ends.
        """
        compounds = text.compounds
        limit = text.conc.config.context
        pieces = []
        gathered = 0
        pos = i + d
        while 0 <= pos < len(compounds) and gathered < limit:
            length, chunk = compounds[pos].for_context_rich()
            gathered += length
            if d > 0:
                pieces = pieces + chunk
            else:
                pieces = chunk + pieces
            pos += d
        return pieces

    def get_context_simple(self, text, i, d):
        """Return up to four neighbouring lemmas in direction *d*, nearest
        first, joined with single spaces.
        """
        max_words = 4
        compounds = text.compounds
        lemmas = []
        pos = i + d
        while len(lemmas) < max_words and 0 <= pos < len(compounds):
            neighbour = compounds[pos]
            if neighbour.has_word:
                lemmas.append(neighbour.lemma)
            pos += d
        return u' '.join(lemmas)

    def get_this_rich(self):
        """Return the formatted pieces of this compound itself."""
        return self.for_context_rich()[1]

    def for_context_rich(self):
        """Return ``(length, pieces)`` for the non-deleted tokens.

        *pieces* is a list of ``(format, text)`` pairs and *length* is the
        total number of characters in their texts.
        """
        matched = len(self.match) > 0
        visible = (
            token.for_context_rich(matched)
            for token in self.tokens
            if not token.delete
        )
        pieces = [(fmt, txt) for fmt, txt in visible]
        total = sum(len(txt) for _, txt in pieces)
        return total, pieces


class Sample(object):
    """A sample inside a text: a key plus the word compounds assigned to it."""

    def __init__(self, text, key):
        assert isinstance(key, tuple)
        self.key = key
        self.text = text
        # Compounds are appended here by the owning text while it is processed.
        self.words = []

    def set_name(self):
        """Look up the sample's name in the owning text's ``samplenames``."""
        names = self.text.samplenames
        self.name = names.get(self.key)


class Text(object):
    """One text of an input file: its tokens, compounds, samples and HTML page.

    Tokens are appended to ``tokens`` by the owning file.  ``process`` then
    groups them into compounds, runs the searches and reports the results;
    ``write`` renders the text as an HTML page.
    """

    def __init__(self, file, key):
        assert isinstance(key, tuple)
        self.file = file
        self.conc = file.conc
        self.key = key
        fullkey = (file.shortname,) + key
        self.safename = file.source.safenames.get(fullkey)
        self.htmlfile = self.safename + u".html"
        self.url = file.source.url + u'/' + self.htmlfile
        self.htmlpath = os.path.join(file.source.htmlpath, self.htmlfile)
        self.samplenames = naming.printable_naming()
        self.samplemap = {}
        self.tokens = []

    def set_name(self):
        """Set ``name`` from the file's text names and derive ``fullname``.

        ``fullname`` is the file's short name, followed by ``": " + name``
        unless the name is empty.
        """
        self.name = self.file.textnames.get(self.key)
        self.fullname = self.file.shortname
        if self.name != u'':
            self.fullname += u": " + self.name

    def process(self):
        """Run the whole analysis pipeline on the collected tokens."""
        self.process_delete()
        self.process_compound()
        self.do_match()
        self.process_context_sample()
        self.report()

    def write(self):
        """Render the text as an HTML page at ``htmlpath``.

        The page declares ``charset="UTF-8"``, so the file is written as
        UTF-8 explicitly instead of in the locale's default encoding.
        """
        HEAD = u'''<!DOCTYPE html>
<html lang="en">
<head>
<title>{}</title>
<meta charset="UTF-8">
<style>
{}
</style>
</head>
<body>
<pre>'''
        FOOT = u'''</pre>
</body>
</html>
'''
        with open(self.htmlpath, u"w", encoding=u"utf-8") as f:
            f.write(HEAD.format(
                cgi.escape(self.fullname, quote=False),
                cgi.escape(kstyle.css, quote=False)
            ))
            for c in self.compounds:
                c.write(f)
            f.write(FOOT)

    def process_delete(self):
        """Mark every token inside a delete pair (both tags included) as deleted."""
        for first, last in self.find_ranges("delete"):
            for token in self.tokens[first:last + 1]:
                token.delete = True

    def process_compound(self):
        """Decide which tokens are merged together and build the compounds."""
        self.process_compound_tags()
        if not self.conc.config.tag_breaks_word:
            self.merge_words_broken_with_tags()
        self.unmerge_if_needed()
        self.build_compounds()

    def process_compound_tags(self):
        """Merge all tokens from an opening compound tag up to its closing tag.

        The closing tag itself keeps ``merge_next`` unset, so it ends the
        compound.
        """
        for first, last in self.find_ranges("compound"):
            for token in self.tokens[first:last]:
                token.merge_next = True

    def merge_words_broken_with_tags(self):
        """Merge words that are separated only by tags (Word Tag ... Word)."""
        prev_word = None
        for j, t in enumerate(self.tokens):
            if isinstance(t, Tag):
                continue
            if isinstance(t, Word):
                if prev_word is not None and prev_word < j - 1:
                    # case: Word Tag ... Word
                    for token in self.tokens[prev_word:j]:
                        token.merge_next = True
                prev_word = j
            else:
                # Any other token kind breaks the chain.
                prev_word = None

    def unmerge_if_needed(self):
        """Split compounds at tags that carry a sample key."""
        # This handles strange cases where a sample identifier happens to be
        # in the middle of a compound: the compound is split after the tag.
        for t in self.tokens:
            if isinstance(t, Tag) and t.samplekey is not None:
                t.merge_next = False

    def build_compounds(self):
        """Group the tokens into ``Compound`` objects.

        A compound ends at the first token whose ``merge_next`` is false.
        """
        self.compounds = []
        start = None
        for j, t in enumerate(self.tokens):
            if start is None:
                start = j
            if not t.merge_next:
                self.compounds.append(Compound(self.tokens[start:j + 1]))
                start = None

    def find_ranges(self, kind):
        """Return ``(open_index, close_index)`` token ranges for paired tags.

        *kind* is ``"delete"`` or ``"compound"`` and selects both the pair
        list of the configuration and the matching entry of ``Tag.pairs``.
        One stack of open positions is kept per configured pair, sized from
        the pair list that belongs to *kind*.  Unbalanced tags are reported
        through ``conc.warn``.
        """
        config = self.conc.config
        pair_lists = {
            "delete": config.delete_pair,
            "compound": config.compound_pair,
        }
        stacks = [[] for _ in pair_lists[kind]]
        ranges = []
        for j, t in enumerate(self.tokens):
            if not isinstance(t, Tag):
                continue
            opening, closing = t.pairs[kind]
            for x in opening:
                stacks[x].append(j)
            for x in closing:
                if stacks[x]:
                    ranges.append((stacks[x].pop(), j))
                else:
                    self.conc.warn(self.file.filename, u"{}: no matching opening tag".format(t.descr()))
        for stack in stacks:
            for i in stack:
                t = self.tokens[i]
                self.conc.warn(self.file.filename, u"{}: no matching closing tag".format(t.descr()))
        return ranges

    def do_match(self):
        """Run the searches on every compound."""
        for c in self.compounds:
            c.do_match(self.conc)

    def process_context_sample(self):
        """Assign word compounds to samples and store context for matches.

        The current sample key starts as ``()`` and changes whenever a
        compound carries a sample key of its own.
        """
        self.words = []
        samplekey = ()
        for i, c in enumerate(self.compounds):
            if c.samplekey is not None:
                samplekey = c.samplekey
            if c.has_word:
                sample = self.get_sample(samplekey)
                if len(c.match) > 0:
                    c.set_context(self, i, sample)
                self.words.append(c)
                sample.words.append(c)

    def get_sample(self, samplekey):
        """Return the sample for *samplekey*, creating it on first use."""
        if samplekey not in self.samplemap:
            self.samplemap[samplekey] = Sample(self, samplekey)
        return self.samplemap[samplekey]

    def report(self):
        """Report the matches and the per-text and per-sample word counts."""
        self.samplelist = [self.samplemap[x] for x in sorted(self.samplemap)]
        for sample in self.samplelist:
            sample.set_name()
        for w in self.words:
            for search in w.match:
                search.add(w, self)
        self.report2(self.words)
        for sample in self.samplelist:
            self.report2(sample.words, sample)

    def report2(self, words, sample=None):
        """Count matching lemmas per search among *words* and pass the
        result to ``conc.add``.
        """
        counts = {s.key: kutil.Counter() for s in self.conc.search}
        for w in words:
            for search in w.match:
                counts[search.key].add(w.lemma)
        self.conc.add(self, sample, len(words), counts)


class File(object):
    """One input file: reads it, cuts it into tokens and assigns them to texts."""

    def __init__(self, source, filename, shortname):
        assert isinstance(shortname, unicode)
        self.source = source
        self.conc = source.conc
        self.filename = filename
        self.shortname = shortname
        self.textnames = naming.printable_naming()
        self.texts = []
        self.textmap = {}

    def read(self):
        """Read the whole file into ``data`` using the configured encoding."""
        with open(self.filename, encoding=self.conc.config.encoding) as f:
            self.data = f.read()

    def process(self):
        """Tokenise the data and distribute the tokens over texts."""
        self.parse()
        self.split()

    def parse(self):
        """Cut ``data`` into Sep, Tag and Word tokens, stored in ``tokens``.

        At every step the next tag and the next word are searched from the
        current position.  Whichever starts first is emitted, preceded by a
        separator for the skipped text; a word is cut short where a tag
        starts inside it.  When neither is found the rest is one separator.
        """
        self.tokens = []
        self.line = 1
        self.char = 1
        self.prev = 0
        config = self.conc.config
        data = self.data
        n = len(data)
        while self.prev < n:
            tag = kutil.try_search(config.tag, data, self.prev)
            word = kutil.try_search(config.word, data, self.prev)
            # The tag is taken when there is no word left at all, and also
            # when both start at the same offset: cutting the word at the
            # tag would leave it empty, and the loop would never advance.
            if tag is not None and (word is None or tag.start() <= word.start()):
                self.feed(Sep, tag.start())
                self.feed(Tag, tag.end())
            elif word is not None:
                end = word.end()
                if tag is not None and tag.start() < end:
                    # A tag starts inside the word: the word ends there.
                    end = tag.start()
                self.feed(Sep, word.start())
                self.feed(Word, end)
            else:
                self.feed(Sep, n)

    def feed(self, kind, b):
        """Emit ``data[prev:b]`` as a token of class *kind* and advance.

        Nothing is emitted for an empty span.  The token gets its raw text,
        start offset and 1-based line/column, after which the running
        line/column position is moved past the consumed text.
        """
        a = self.prev
        assert a <= b
        self.prev = b
        if a == b:
            return
        raw = self.data[a:b]
        t = kind()
        t.raw = raw
        t.start = a
        t.line = self.line
        t.char = self.char
        t.process0()
        t.process(self.conc)
        self.tokens.append(t)
        newlines = raw.count(u'\n')
        if newlines:
            self.line += newlines
            # Column is 1 plus the number of characters after the last newline.
            self.char = len(raw) - raw.rfind(u'\n')
        else:
            self.char += len(raw)
        if len(self.tokens) % REPORT == 0:
            write(u'-')

    def split(self):
        """Distribute the tokens over texts and name the texts.

        A tag with a text key switches to (or creates) that text; tokens
        seen before any such tag go to the text with the empty key ``()``.
        """
        text = None
        for t in self.tokens:
            if isinstance(t, Tag) and t.textkey is not None:
                text = self.get_text(t.textkey)
            if text is None:
                text = self.get_text(())
            text.tokens.append(t)
        for text in self.texts:
            text.set_name()

    def get_text(self, textkey):
        """Return the text for *textkey*, creating and registering it if new."""
        if textkey in self.textmap:
            return self.textmap[textkey]
        text = Text(self, textkey)
        self.textmap[textkey] = text
        self.texts.append(text)
        return text


class Source(object):
    """A named group of input files selected by glob patterns."""

    def __init__(self, conc, key, globs):
        self.conc = conc
        self.key = key
        self.globs = globs
        self.safename = conc.safenames.get(key)
        self.safenames = naming.safe_naming()

    def process(self):
        """Read, analyse and render every file of this source.

        Sets ``url`` and ``htmlpath``, creates the output directory, and
        processes each file matched by ``globs`` that is not also matched
        by the configured ``skip_files``.  Progress marks are printed along
        the way: ':' per file, '.' per text.

        Exits the program if no files remain after skipping.  Read and
        write failures are passed to ``kutil.exception_exit``.
        """
        self.url = self.conc.url + u'/' + self.safename
        self.htmlpath = os.path.join(self.conc.config.output_dir, self.safename)
        kutil.try_makedirs(self.htmlpath)
        write(self.key + u' ')
        skip = set(kutil.listglob(self.conc.config.skip_files))
        filenames = [x for x in kutil.listglob(self.globs) if x not in skip]
        if len(filenames) == 0:
            sys.exit(u"{}: after skipping, there are no files left".format(self.key))
        shortnames = pathabbr.pathabbr(filenames)
        self.files = []
        for filename in filenames:
            write(u':')
            f = File(self, filename, shortnames[filename])
            self.files.append(f)
            # Only real errors are reported as I/O failures; KeyboardInterrupt
            # and SystemExit are allowed to propagate unchanged.
            try:
                f.read()
            except Exception:
                write(u'\n')
                kutil.exception_exit(u'error reading input file: {}'.format(filename))
            f.process()
            for t in f.texts:
                write(u'.')
                t.process()
                try:
                    t.write()
                except Exception:
                    write(u'\n')
                    kutil.exception_exit(u'error writing output file: {}'.format(t.htmlfile))
        write(u'\n')


class Search(object):
    """One configured search and the Excel workbook listing its matches."""

    def __init__(self, conc, key, re):
        self.conc = conc
        self.key = key
        self.re = re

    def get_columns(self):
        """Return the column specifications of the "Concordance" sheet.

        Each entry is a tuple starting with the column title; the optional
        further items are passed on to ``kexcel`` unchanged.
        """
        plain = lambda title: (title,)
        columns = [
            plain(u"N"),
            (u"Before", u"right"),
            (u"Word", u"key"),
        ]
        columns.extend(plain(title) for title in (
            u"After",
            u"Simple",
            u"Lemma",
            u"Link",
            u"Source",
            u"File",
            u"Text",
            u"Sample",
            u"Line",
            u"Column",
        ))
        columns.append((u"Left", u"light", 10))
        columns.append((u"Right", u"light", 10))
        return columns

    def add(self, word, text):
        """Append one row describing the matching compound *word* of *text*.

        The cells are written in the order given by ``get_columns``.
        """
        sheet = self.xs
        first_token = word.tokens[0]
        sheet.write_number(sheet.r)
        for rich in (word.rich_before, word.rich, word.rich_after):
            sheet.write_rich(rich)
        # NOTE(review): the "Simple" and "Lemma" columns both receive
        # word.lemma -- confirm whether "Simple" was meant to differ.
        sheet.write_string(word.lemma)
        sheet.write_string(word.lemma)
        sheet.write_url(word.link(text), u"text")
        sheet.write_string(text.file.source.key)
        sheet.write_string(text.file.shortname)
        sheet.write_string(text.name)
        sheet.write_string(word.sample.name)
        sheet.write_number(first_token.line)
        sheet.write_number(first_token.char)
        sheet.write_string(word.left)
        sheet.write_string(word.right)
        sheet.next_row()

    def xl_open(self):
        """Create ``<key>.xlsx`` in the output directory with its sheet."""
        output_dir = self.conc.config.output_dir
        filename = os.path.join(output_dir, self.key + u".xlsx")
        self.xl = kexcel.Excel(filename)
        self.xs = self.xl.sheet(u"Concordance", self.get_columns())

    def xl_close(self):
        """Close the workbook opened by ``xl_open``."""
        workbook = self.xl
        workbook.close()


class Conc(object):
    """Top-level driver: configuration, sources, searches, log and summary."""

    def __init__(self, config_file):
        # Until log_open() is called, messages go to standard error.
        self.log_file = sys.stderr
        self.safenames = naming.safe_naming()
        self.config = kconfig.KConfig(config_file)
        self.source = [Source(self, key, val) for key, val in self.config.source]
        self.search = [Search(self, key, re) for key, re in self.config.search]

    def warn(self, filename, msg):
        """Print a '!' progress mark and log *msg* for *filename*."""
        write(u'!')
        self.log(filename, msg)

    def log(self, filename, msg):
        """Write ``"<filename>: <msg>"`` as one line to the log and flush it."""
        self.log_file.write(u'{}: {}\n'.format(filename, msg))
        self.log_file.flush()

    def log_open(self):
        """Redirect logging to ``log.txt`` in the output directory.

        The log is written as UTF-8, because messages quote raw text from
        the input files, which the locale's default encoding may be unable
        to represent.  Failures are passed to ``kutil.exception_exit``.
        """
        filename = os.path.join(self.config.output_dir, u"log.txt")
        try:
            self.log_file = open(filename, u"w", encoding=u"utf-8")
        except Exception:
            kutil.exception_exit(u'error creating log file: {}'.format(filename))

    def log_close(self):
        """Close the log file and fall back to standard error."""
        self.log_file.close()
        self.log_file = sys.stderr

    def get_columns(self, what):
        """Return the column specifications of a summary sheet.

        *what* is ``u"files"`` or ``u"samples"``; the samples sheet has an
        additional "Sample" column.  After the "Words" column every search
        contributes a "tokens" column, followed by a "types" column for
        every search.

        Raises ValueError for any other *what*.
        """
        if what not in (u"files", u"samples"):
            raise ValueError(u"unknown summary sheet: {!r}".format(what))
        cols = [(u"Link",), (u"Source",), (u"File",), (u"Text",)]
        if what == u"samples":
            cols.append((u"Sample",))
        cols.append((u"Words",))
        cols.extend((u"{} tokens".format(s.key),) for s in self.search)
        cols.extend((u"{} types".format(s.key),) for s in self.search)
        return cols

    def add(self, text, sample, words, counts):
        """Append one summary row for *text*, or for *sample* within it.

        Rows without a sample go to the "Files" sheet, the others to the
        "Samples" sheet.  *words* is the word count; *counts* maps each
        search key to a counter providing ``total`` and ``types()``.
        """
        xs = self.xsf if sample is None else self.xss
        xs.write_url(text.url, u"text")
        xs.write_string(text.file.source.key)
        xs.write_string(text.file.shortname)
        xs.write_string(text.name)
        if sample is not None:
            xs.write_string(sample.name)
        xs.write_number(words)
        for s in self.search:
            xs.write_number(counts[s.key].total)
        for s in self.search:
            xs.write_number(counts[s.key].types())
        xs.next_row()

    def xl_open(self):
        """Create ``summary.xlsx`` with its "Files" and "Samples" sheets."""
        xlsx = u"summary.xlsx"
        filename = os.path.join(self.config.output_dir, xlsx)
        self.xl = kexcel.Excel(filename)
        self.xsf = self.xl.sheet(u"Files", self.get_columns(u"files"))
        self.xss = self.xl.sheet(u"Samples", self.get_columns(u"samples"))

    def xl_close(self):
        """Close the summary workbook."""
        self.xl.close()

    def do(self):
        """Run the whole concordance: open outputs, process sources, close.

        ``url`` is set here, before any source is processed, because the
        sources build their own URLs from it.
        """
        self.url = u'http://localhost:{}'.format(self.config.server_port)
        kutil.try_makedirs(self.config.output_dir)
        self.log_open()
        self.xl_open()
        for search in self.search:
            search.xl_open()
        for source in self.source:
            source.process()
        for search in self.search:
            search.xl_close()
        self.xl_close()
        self.log_close()


def main():
    """Command-line entry point: run the concordance for one configuration.

    Exits with a usage message unless exactly one command-line argument
    (the configuration file) is given.
    """
    args = sys.argv[1:]
    if len(args) != 1:
        sys.exit(u'usage: {} CONFIGURATION'.format(sys.argv[0]))
    config_file, = args
    conc = Conc(config_file)
    conc.do()


# Run only when executed as a script, so that importing the module does not
# start a concordance run.
if __name__ == "__main__":
    main()