In [17]:
%load_ext autoreload
%autoreload 2

from pathlib2 import Path
import re
import pandas as pd
from difflib import SequenceMatcher

from collections import namedtuple

import splitword  # modules for tokenization

# token: one span of the document.
#   string - the text of the span
#   tag    - None for untagged text, otherwise a list of keytag entries
#   offset - (start, end) tuple of the span (see ReadFile.process)
token = namedtuple('token', 'string tag offset')
# keytag: one entry of a tag list; `tag` is the first key and `attrs` the first
# value of each dict in the tag list (see ReadFile.process).
# Note the namedtuple's type name is 'key', so instances repr as key(...).
keytag = namedtuple('key', 'tag attrs')

class ReadFile:
    """Load a text document together with its section and tag offset tables.

    The text is kept in ``self.doc``.  The section table (``self.df_section``)
    is read for the columns ``sec_title``, ``start`` and ``end``; the tag table
    (``self.df_tag``) for ``start``, ``end`` and ``taglist``.
    """

    def __init__(self, **kwargs):
        """Read the text file and both CSV tables.

        Keyword Args:
            fdir: directory path object (must support ``/``) holding the files.
            textfile: name of the UTF-8 text file.
            sectionfile: name of the section-offset CSV.
            tagfile: name of the tag-offset CSV.

        Other keywords are accepted and ignored.
        """
        self.textfile = kwargs['fdir'] / kwargs['textfile']
        with self.textfile.open(encoding='utf-8') as f:
            self.doc = f.read()
        self.section = kwargs['fdir'] / kwargs['sectionfile']
        self.tag = kwargs['fdir'] / kwargs['tagfile']
        self.df_section = pd.read_csv(self.section)
        self.df_tag = pd.read_csv(self.tag)

    def __repr__(self):
        """Return the raw document text."""
        return self.doc

    @property
    def sectionlist(self):
        """The section-offset DataFrame."""
        return self.df_section

    @property
    def taglist(self):
        """The tag-offset DataFrame."""
        return self.df_tag

    @staticmethod
    def _close_gap(frame, s, e):
        """Re-base the ``start``/``end`` columns of ``frame`` after text ``[s, e)`` was cut out."""
        for column in ('start', 'end'):
            col = frame[column]
            # Masks are computed up front so the two assignments cannot interact.
            inside = (col > s) & (col < e)
            after = col >= e
            # Positions inside the removed text collapse onto its start ...
            frame.loc[inside, column] = s
            # ... and positions behind it move left by the removed length.
            frame.loc[after, column] -= e - s

    def rmsections(self, sect):
        """Remove every section whose title matches ``sect`` and re-base all offsets.

        Args:
            sect: regular-expression string, matched case-insensitively against
                the start of each ``sec_title``.

        Returns:
            The document text with the matched sections cut out.  ``self.doc``
            itself is left untouched.

        Side effects:
            ``self.df_section`` and ``self.df_tag`` are updated: rows lying
            completely inside a removed section are dropped and the remaining
            ``start``/``end`` values are shifted so that they index into the
            returned text.  Calling the method a second time therefore operates
            on the already shifted tables.
        """
        is_match = self.df_section['sec_title'].str.match(sect, flags=re.I, na=False)
        secs = self.df_section[is_match]

        spans = []
        rm_secs = set()
        rm_tags = set()
        for sec in secs.itertuples():
            s, e = sec.start, sec.end
            spans.append((s, e))
            # Rows to discard are selected on the original, unshifted offsets.
            rm_secs.update(
                self.df_section[(self.df_section.start >= s) & (self.df_section.end <= e)].index)
            rm_tags.update(
                self.df_tag[(self.df_tag.start >= s) & (self.df_tag.end <= e)].index)

        # Merge overlapping/nested spans so that no character is removed twice.
        merged = []
        for s, e in sorted(spans):
            if merged and s <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], e)
            else:
                merged.append([s, e])

        newdoc = self.doc
        # Work from the end of the document so earlier offsets stay valid.
        for s, e in reversed(merged):
            # Cut by position; replacing by content could hit an identical
            # piece of text that occurs earlier in the document.
            newdoc = newdoc[:s] + newdoc[e:]
            self._close_gap(self.df_section, s, e)
            self._close_gap(self.df_tag, s, e)

        # DataFrame.drop returns a new frame, so the result must be kept.
        self.df_section = self.df_section.drop(index=sorted(rm_secs))
        self.df_tag = self.df_tag.drop(index=sorted(rm_tags))
        return newdoc

    def process(self, newdoc):
        """Split ``newdoc`` into per-section lists of ``token`` tuples.

        For every row of ``self.df_section`` the tags lying inside the section
        are walked in table order; text between tags becomes a token with
        ``tag=None`` and each tagged span becomes a token carrying a list of
        ``keytag`` entries.

        Args:
            newdoc: the text the offsets in the tables refer to.

        Returns:
            A list with one token list per section.
        """
        paratextlist = []
        for sec in self.df_section.itertuples():
            part_tag = self.df_tag.loc[
                (self.df_tag.start >= sec.start) & (self.df_tag.end <= sec.end)]
            cursor = sec.start  # start of the text not yet emitted
            textlist = []
            for tag in part_tag.itertuples():
                s, e = tag.start, tag.end
                # NOTE(review): eval() executes whatever expression is stored in
                # the CSV column.  If the column only ever holds plain literals,
                # ast.literal_eval would be the safer choice - confirm the data.
                keys = [keytag(list(it.keys())[0], list(it.values())[0])
                        for it in eval(tag.taglist)]
                if newdoc[cursor:s]:
                    textlist.append(token(newdoc[cursor:s], None, (cursor, s)))
                textlist.append(token(newdoc[s:e], keys, (s, e)))
                cursor = e
            # Trailing text is measured from the cursor, which is still the
            # section start when the section has no tags at all.
            if newdoc[cursor:sec.end]:
                textlist.append(token(newdoc[cursor:sec.end], None, (cursor, sec.end)))

            paratextlist.append(textlist)
        return paratextlist


class TokenList:
    """Thin wrapper around a list of tokens.

    The accessor methods (``__repr__``, ``getlist``, ``getkeys``) treat each
    element as a mapping and read its first key / first value.
    """

    def __init__(self, diclist):
        """Store ``diclist`` (the list of tokens) without copying it."""
        self.diclist = diclist

    def __getitem__(self, index):
        """Return the token (or slice of tokens) at ``index``."""
        return self.diclist[index]

    def __len__(self):
        """Number of tokens."""
        return len(self.diclist)

    def __repr__(self):
        """Concatenation of the first value of every token."""
        return ''.join([list(dic.values())[0] for dic in self.diclist])

    def getlist(self, attr):
        """Return the first value of every token that has ``attr`` among its keys."""
        return [list(dic.values())[0] for dic in self.diclist if attr in dic.keys()]

    def getkeys(self):
        """Return the set of first keys over all tokens."""
        return {list(dic.keys())[0] for dic in self.diclist}

    def splitby(self, mod, comms):
        """Run the tokens through a chain of splitter functions.

        Args:
            mod: the module providing the splitters, given either as an
                importable module name (string) or as the module object itself.
            comms: iterable of function names looked up in ``mod``; a dict is
                accepted and only its keys (in order) are used.  Each function
                is called with one token and must return an iterable of tokens.

        Returns:
            The flat list of tokens produced by the last splitter, or the
            original list when ``comms`` is empty.

        Raises:
            ImportError: ``mod`` is a string that cannot be imported.
            AttributeError: a name in ``comms`` does not exist in the module.
        """
        import importlib  # local import keeps this block self-contained

        # Resolve the module by name instead of eval()-ing a constructed string;
        # for an already imported module this returns the loaded module object.
        module = importlib.import_module(mod) if isinstance(mod, str) else mod

        textlist = self.diclist
        for command in comms:
            splitter = getattr(module, command)  # looked up once per command
            pieces = []
            for item in textlist:
                pieces.extend(splitter(item))
            textlist = pieces
        return textlist


def setparagraph(tokenlist):
    """Rebuild paragraph strings from a token sequence.

    A token whose first value is ``'\\n'`` closes a paragraph.  Each paragraph
    is the concatenation of all values of its tokens, including the closing
    newline.  Paragraphs that consist of nothing but the newline are skipped,
    and tokens after the last newline token are not returned.

    Args:
        tokenlist: sliceable sequence of mappings with string values.

    Returns:
        List of paragraph strings in document order.
    """
    # Exclusive end position of every paragraph (one past its newline token).
    stops = [pos + 1 for pos, tok in enumerate(tokenlist) if list(tok.values())[0] == '\n']
    # Every paragraph begins where the previous one stopped.
    begins = [0] + stops[:-1]
    chunks = (tokenlist[lo:hi] for lo, hi in zip(begins, stops))
    joined = (''.join(''.join(tok.values()) for tok in chunk) for chunk in chunks)
    return [text for text in joined if text != '\n']


def calcoffset(tokenlist, offset):
    """Assign character offsets and entity labels to a token sequence.

    Each token is a mapping whose first key is a token type (looked up in
    ``entity_dic``) and whose first value is the token text.

    Args:
        tokenlist: indexable sequence of such mappings.
        offset: character offset of the first token.

    Returns:
        dict keyed by ``(paragraph_index, token_index)`` (both 1-based).  The
        value is ``(start, end, text, entity)`` for ordinary tokens and the raw
        text for ``'LF'`` tokens.  ``entity`` is ``'_'`` for plain tokens,
        ``'<label>[<n>]'`` for entity tokens and ``None`` for unknown types.
        An ``'SP'`` token is normally not stored; its length is added to the
        offset when the preceding token is stored.
    """
    entity_dic = {
        'TK': '_',
        'PR': '_',
        'SR': '_',
        'PN': '_',
        'BL1': '_',
        'BL2': '_',
        'LF': '_',
        'SP': '_',
        'TK1': 'compound',
        'TK2': 'compound',
        'TK3': 'compound',
        'TK4': 'compound',
        'IN1': 'compound',
        'IN2': 'compound',
        'IN3': 'compound',
        'SUB': 'subscript',
        'SUP': 'superscript',
        'ITL': 'italic',
        'REF': 'reference',
        '_': '_'
    }
    dic_token = {}
    ipar = 1  # paragraph part of the dic_token key
    itok = 1  # token part of the dic_token key
    ient = 1  # running entity number; incremented before use, so labels start at [2]
    # Label of the previous token.  None for the first token: indexing
    # tokenlist[i - 1] at i == 0 would wrap around to the last token.
    previous_label = None
    for i in range(len(tokenlist)):
        value = list(tokenlist[i].values())[0]
        ent = list(tokenlist[i].keys())[0]
        label = entity_dic.get(ent)
        entity = label
        if label and label != '_':
            if previous_label != label:  # a new entity starts here
                ient += 1
            entity = f'{label}[{ient}]'
        previous_label = label

        if i < len(tokenlist) - 1 and 'LF' not in tokenlist[i].keys() and 'SP' in tokenlist[i + 1].keys():
            # Token followed by a space: store the token, then skip the space.
            s = offset
            e = offset + len(value)
            dic_token[ipar, itok] = (s, e, value, entity)  # words without space
            e += len(list(tokenlist[i + 1].values())[0])  # count for space
            offset = e
            itok += 1
        elif 'LF' in tokenlist[i].keys():
            # Line feed: store the raw text and start the next paragraph.
            dic_token[ipar, itok] = value
            offset += len(value)
            ipar += 1
            itok = 1
        elif 'SP' not in tokenlist[i].keys():
            # Token not followed by a space.
            s = offset
            e = offset + len(value)
            dic_token[ipar, itok] = (s, e, value, entity)
            offset = e
            itok += 1
    return dic_token


class OutFile:
    """Write a token list as a WebAnno TSV 3.2 file."""

    def __init__(self, tokenlist, **kwargs):
        """Remember the tokens and the output path.

        Args:
            tokenlist: token sequence accepted by ``setparagraph`` and
                ``calcoffset``.

        Keyword Args:
            webannotsv: path of the TSV file to write.

        Other keywords are accepted and ignored.
        """
        self.tokenlist = tokenlist
        self.outfile = kwargs['webannotsv']

    def outputtsv(self, orig_paragraph):
        """Validate the rebuilt paragraphs and write the TSV file.

        Args:
            orig_paragraph: sequence of the expected paragraph strings.

        Raises:
            AssertionError: the rebuilt paragraphs do not match
                ``orig_paragraph``.  The output file is not touched in that case.
        """
        paragraph_texts = setparagraph(self.tokenlist)
        dic_token = calcoffset(self.tokenlist, offset=0)

        # Validate before opening the output so that a failed check does not
        # leave a truncated file behind.
        # NOTE(review): quick_ratio() is only an upper bound on ratio(); it does
        # not detect paragraphs that are merely in a different order.
        matcher = SequenceMatcher()
        matcher.set_seq1(paragraph_texts)
        matcher.set_seq2(orig_paragraph)
        assert matcher.quick_ratio() == 1, "Paragraphs don't match!"

        # Group the token rows by paragraph index once instead of rescanning
        # the whole dict for every paragraph; insertion order is preserved.
        rows_by_paragraph = {}
        for keys, values in dic_token.items():
            rows_by_paragraph.setdefault(keys[0], []).append((keys, values))

        parts = [
            '#FORMAT=WebAnno TSV 3.2\n',
            '#T_SP=webanno.custom.Xml|xml_tag\n\n\n',
        ]
        # NOTE(review): setparagraph skips newline-only paragraphs while
        # calcoffset advances its paragraph counter on every 'LF' token, so the
        # i + 1 pairing below looks like it can drift when blank lines occur -
        # confirm against real input.
        for i, paragraph_text in enumerate(paragraph_texts):
            parts.append(f'#Text={paragraph_text}')
            for keys, values in rows_by_paragraph.get(i + 1, []):
                if values == '\n':
                    parts.append(values)
                else:
                    parts.append('{}-{}\t{}-{}\t{}\t{}\t\n'.format(*keys, *values))
        tsv_text = ''.join(parts)

        # The context manager guarantees the file is flushed and closed.
        with open(self.outfile, 'w', encoding='utf-8') as fw:
            print(tsv_text.rstrip('\n\n'), file=fw)


# main
# Input/output locations.  'fdir', 'textfile', 'sectionfile' and 'tagfile' are
# read by ReadFile; 'webannotsv' is read by OutFile as the output path.
filelist = {
    'fdir': Path('./xml2text'),
    'textfile': '10.1063_1.5004600_fulltext_20190516.txt',
    'sectionfile': '10.1063_1.5004600_section_offset_20190516.csv',
    'tagfile': '10.1063_1.5004600_xmltag_offset_20190516.csv',
    'webannotsv': '10.1063_1.5004600_webanno.tsv'
}

doc = ReadFile(**filelist)

section = r'TABLE(.+?)-body'  # remove sections (Table body)
# rmsections returns the shortened text and also rewrites the offset tables
# held by `doc`, so re-running this cell alone shifts the offsets a second time.
new_doc = doc.rmsections(section)
para_text_list = doc.process(new_doc)

# tokenize
# Splitter functions are looked up by name in the module named here.
module = 'splitword'
# Only the keys (function names) are used by TokenList.splitby, in this order;
# the values are not read there.
commands = {
    'splitbyspace': 'TX',
#     'splitbypunct': 'TK',
#     'splitbyblacket2': 'TK',
    'splitbyinfix': 'TK',
#     'splitbyprefix2': 'TK',
#     'splitbysurfix2': 'TK'
}

new_para_text_list = []
# NOTE(review): only the first three sections are tokenized here - looks like
# a debugging limit; confirm before producing real output.
for i, text_list in enumerate(para_text_list[:3]):
    tok = TokenList(text_list)
    text_list = tok.splitby(module, commands)
    print(i, len(text_list))
    new_para_text_list.append(text_list)

# output
# NOTE(review): `text_list` is the loop variable left over from the last
# iteration above, so OutFile receives the tokens of one section only (and the
# name is undefined if the loop never ran).  new_para_text_list is not used.
out = OutFile(text_list, **filelist)
# NOTE(review): ReadFile as defined in this notebook has no `newparagraph` method.
# out.outputtsv(doc.newparagraph(new_doc))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
[token(string='Influence', tag=None, offset=(0, 9))]
[token(string=' ', tag=['SP'], offset=(9, 10))]
[token(string='of', tag=None, offset=(10, 12))]
[token(string=' ', tag=['SP'], offset=(12, 13))]
[token(string='magnetic', tag=None, offset=(13, 21))]
[token(string=' ', tag=['SP'], offset=(21, 22))]
[token(string='frustration', tag=None, offset=(22, 33))]
[token(string=' ', tag=['SP'], offset=(33, 34))]
[token(string='and', tag=None, offset=(34, 37))]
[token(string=' ', tag=['SP'], offset=(37, 38))]
[token(string='structural', tag=None, offset=(38, 48))]
[token(string=' ', tag=['SP'], offset=(48, 49))]
[token(string='disorder', tag=None, offset=(49, 57))]
[token(string=' ', tag=['SP'], offset=(57, 58))]
[token(string='on', tag=None, offset=(58, 60))]
[token(string=' ', tag=['SP'], offset=(60, 61))]
[token(string='magnetocaloric', tag=None, offset=(61, 75))]
[token(string=' ', tag=['SP'], offset=(75,

In [None]:
# Re-read the input files and show the distinct values of the tag table's
# 'taglist' column (`a.taglist` is the DataFrame, `.taglist` its column).
a = ReadFile(**filelist)
set(a.taglist.taglist)

In [None]:
# Line-by-line diff between the paragraphs rebuilt from the tokens and the
# lines of the section-stripped document text.
import difflib
# NOTE(review): `prefix_tokenlist` is not defined in any cell visible here;
# this cell fails on a fresh kernel unless it is defined elsewhere.
paragraph_texts = setparagraph(prefix_tokenlist)
para = ''.join(paragraph_texts).rstrip().split('\n')
orig_parahgraph_texts = new_doc.rstrip().split('\n')
d = difflib.Differ()
diff = d.compare(para, orig_parahgraph_texts)
print('\n'.join(diff))

In [None]:
# Show every per-section token list together with its position.
list(enumerate(para_text_list))