In [1]:
import os
import numpy as np
import pandas as pd

### CoNLL-X and CoNLL-U file readers and writers
https://raw.githubusercontent.com/pnugues/ilppp/master/programs/labs/relation_extraction/python/conll.py

In [2]:
## Returns all the files in a folder ending with suffix. Recursive version. Return the list of file names.
def _get_files(dir = 'out', suffix = 'conll'):
    """Recursively collect the paths of files whose names end with ``suffix``.

    Args:
        dir: Folder to scan; its subfolders are scanned as well.
        suffix: File-name ending to match (for example ``'conll'``).

    Returns:
        A list of paths, each built as ``dir + '/' + name``. The order follows
        ``os.listdir`` and is therefore not guaranteed to be sorted.
    """
    files = []
    for file in os.listdir(dir):
        path = dir + '/' + file
        if os.path.isdir(path):
            # Descend into the subfolder by calling this very function.
            files += _get_files(path, suffix)
        elif os.path.isfile(path) and file.endswith(suffix):
            files.append(path)
    return files

Creates a list of sentences from the corpus.
Each sentence is a string

In [3]:
def read_sentences(file, encoding = 'utf-8'):
    """Read a CoNLL file and split it into sentences.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII characters are decoded the same way on every
            platform.

    Returns:
        A list of strings, one per sentence. Sentences are the chunks of the
        stripped file content separated by a blank line (``'\\n\\n'``).
    """
    # The context manager closes the handle even if reading fails.
    with open(file, encoding=encoding) as f:
        text = f.read().strip()
    return text.split('\n\n')

Creates a list of sentences, where each sentence is a list of lines.
Each line is a dictionary of columns.

In [4]:
# _column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
# _column_names_u = ['id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc']
def split_rows(sentences, column_names = ['id', 'form', 'lemma']):
    """Turn raw sentence strings into lists of column dictionaries.

    Args:
        sentences: Iterable of strings; each string holds the lines of one
            sentence separated by ``'\\n'``, with tab-separated fields.
        column_names: Names given to the fields of each line, in order.
            ``zip`` stops at the shorter sequence, so extra fields on a line
            are dropped and missing fields leave their keys absent.

    Returns:
        A list with one entry per input sentence. Each entry is a list of
        dictionaries mapping column name to field value, one per kept line.
        Blank lines and lines starting with ``'#'`` are skipped. No synthetic
        ROOT row is prepended.
    """
    new_sentences = []
    for sentence in sentences:
        rows = [
            dict(zip(column_names, row.split('\t')))
            for row in sentence.split('\n')
            # Blank lines are skipped explicitly: indexing row[0] on an empty
            # string would raise IndexError.
            if row and not row.startswith('#')
        ]
        new_sentences.append(rows)
    return new_sentences

In [5]:
def _save(file, formatted_corpus, column_names, encoding = 'utf-8'):
    """Write a corpus to ``file`` as tab-separated lines.

    Args:
        file: Path of the output file; it is overwritten.
        formatted_corpus: Iterable of sentences, each a list of dictionaries
            mapping column name to string value.
        column_names: Columns to write, in order. A column missing from a row
            is written as ``'_'``.
        encoding: Text encoding of the output file. Defaults to UTF-8.

    Each row becomes one line and each sentence is followed by an empty line.
    A leading row whose ``'id'`` is ``'0'`` is treated as a synthetic ROOT
    entry and is not written; every other row is kept.
    """
    # The context manager closes (and flushes) the file even if a write fails.
    with open(file, 'w', encoding=encoding) as f_out:
        for sentence in formatted_corpus:
            # Only skip the first row when it is a ROOT placeholder. Slicing
            # unconditionally would drop the first real token of sentences
            # that carry no such placeholder.
            if sentence and sentence[0].get('id') == '0':
                rows = sentence[1:]
            else:
                rows = sentence
            for row in rows:
                f_out.write('\t'.join(row.get(col, '_') for col in column_names) + '\n')
            f_out.write('\n')

In [6]:
path_in = 'in/treebank/'
# index.txt holds whitespace-separated entries. Read it with an explicit
# encoding (entries contain non-ASCII characters) and close the handle promptly.
with open(path_in + 'index.txt', encoding='utf-8') as index_file:
    files = index_file.read().strip().split()
files

['æls', 'apt', 'chrona', 'or', 'wscp', 'eustace']

In [7]:
# Flatten the per-file sentence lists with a plain comprehension. np.concatenate
# would copy every sentence into a fixed-width unicode array and raises when
# there are no files at all.
sentences = split_rows([
    sentence
    for name in files
    for sentence in read_sentences(path_in + name + '.conll')
])
len(sentences)

2673

In [8]:
# Display the first sentence as a table: one row per line, one column per field
pd.DataFrame(sentences[0])

Unnamed: 0,id,form,lemma
0,1,Mæg,mag
1,2,gehyran,gehyran
2,3,se,se
3,4,ðe,þe
4,5,wyle,willan
5,6,be,be
6,7,þam,se
7,8,halgan,halig
8,9,mædene,mægden
9,10,Eugenian,Eugenia


In [9]:
def get_sentence(sentence, column = 'form'):
    """Return one column of a sentence as a single space-separated string.

    Args:
        sentence: Iterable of row dictionaries.
        column: Key to read from every row.

    Returns:
        The selected values joined with single spaces, in row order.
    """
    values = [row[column] for row in sentence]
    return ' '.join(values)
# Print the first sentence twice: once from the 'form' column, once from 'lemma'
print('form   -> ', get_sentence(sentences[0]))
print('lemma - > ', get_sentence(sentences[0], 'lemma'))

form   ->  Mæg gehyran se ðe wyle be þam halgan mædene Eugenian Philyppus dæhter hu heo ðurh mægðhad mærlice þeah and þurh martyrdom þisne middaneard oferswað
lemma - >  mag gehyran se þe willan be se halig mægden Eugenia Philippus dohtor hu heo þurh mægþhad mærlice þeon and þurh martyrdom þes middangeard oferswiðan


In [10]:
# One row per sentence: column 0 holds the joined forms, column 1 the joined lemmas
data = pd.DataFrame(((get_sentence(s), get_sentence(s, 'lemma')) for s in sentences))
# The separator is passed by keyword: positional arguments after the path are
# deprecated in recent pandas releases.
data.to_csv('data/iswoc-treebank.tsv', sep='\t', index=False)
data

Unnamed: 0,0,1
0,Mæg gehyran se ðe wyle be þam halgan mædene Eu...,mag gehyran se þe willan be se halig mægden Eu...
1,Sum æþelboren þægn wæs Philippus gehaten ðone ...,sum æþelboren þegen wesan Philippus gehatan se...
2,And he hine gesette to heahgerefan ofer Alexan...,and he he gesettan to heahgerefa ofer Alexandr...
3,Ðæs ðægn Philippus n æs na gefullod on Gode fo...,þes þegen Philippus ne wesan na gefullian on G...
4,His wif wæs gecyged Claudia be þære he gestryn...,his wif wesan gecigan Claudia be se he gestryn...
...,...,...
2668,car je les aim ore mielz a soffrir,car je le amer or bon a sofrir
2669,mes done moi pooir e force de pacience que mes...,mais doner je pooir et force de pacience que m...
2670,Nostre Sires li respondi,nostre sieur il respondre
2671,Eustace soies forz e vainquerres,Eustace estre fort et veinqueor
