In [1]:
import os
import csv
import itertools
from bs4 import BeautifulSoup
# Base directory for all input data files. Keep the trailing slash: the
# loaders build paths by plain string concatenation (path_data + file).
path_data = '../../data/'

In [2]:
def normalize(s):
    """Replace the letters ġ, ƿ and ċ with plain ASCII g, w and c.

    All other characters are left untouched.

    Args:
        s: Text to normalise.

    Returns:
        A new string with the three letters substituted.
    """
    # Every replacement must be an ASCII letter. In particular the 'c' below
    # is U+0063, not the visually identical Cyrillic letter U+0441, which
    # would yield strings that look right but never compare equal to Latin
    # text.
    for old, new in (('ġ', 'g'), ('ƿ', 'w'), ('ċ', 'c')):
        s = s.replace(old, new)
    return s

In [3]:
# Example of a placeholder row in the data: nesan	''unattested''
def filter_forms(arr):
    """Return the distinct usable forms from ``arr`` as a sorted list.

    Empty strings, the token 'tō' and the ``''unattested''`` placeholder
    are dropped; duplicates collapse to a single occurrence.
    """
    skipped = ('tō', "''unattested''", '')
    unique_forms = {form for form in arr if form not in skipped}
    ordered = list(unique_forms)
    ordered.sort()
    return ordered

In [11]:
def load_oe(file = 'oe.lemmas'):
    """Load lemma entries from a line-oriented lemma file.

    Each usable line holds whitespace-separated fields: the lemma first,
    then its forms. The whole text is passed through ``normalize`` before
    parsing and the forms are cleaned with ``filter_forms``.

    Args:
        file: File name relative to ``path_data``.

    Returns:
        A list of dicts with the keys "file", "lemma" and "forms".
    """
    def make(line):
        fields = line.split()
        return {
            "file": file,
            "lemma": fields[0],
            # NOTE(review): only whitespace is split on, so comma-joined
            # variants such as 'maciaþ,macigaþ' stay a single form --
            # confirm whether they should be split further.
            "forms": filter_forms(fields[1:])
        }
    # The data contains non-ASCII letters; read it as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(path_data + file, 'r', encoding='utf-8') as f:
        lines = normalize(f.read()).split('\n')
    # Skip blank / whitespace-only lines (they have no lemma field) and the
    # literal '&\t&' rows.
    return [make(line) for line in lines if line.strip() and line != '&\t&']
# Sanity check: show the first entry parsed from the default lemma file.
load_oe()[0]

{'file': 'oe.lemmas',
 'lemma': 'macian',
 'forms': ['gemacod,macod',
  'maca',
  'macast',
  'macaþ',
  'macian',
  'maciaþ,macigaþ',
  'macie,macige',
  'macien,macigen',
  'maciende,macigende',
  'macienne',
  'macode',
  'macoden',
  'macodest',
  'macodon']}

In [12]:
def load_dict(file = 'ang.txt'):
    """Load lemma entries from a dictionary file with blank-line-separated entries.

    Entries are separated by an empty line. Within an entry every line is
    split on whitespace; the first field of the first line is the lemma and
    the second field of each line is one form. The whole text is passed
    through ``normalize`` first and forms are cleaned with ``filter_forms``.

    Args:
        file: File name relative to ``path_data``.

    Returns:
        A list of dicts with the keys "file", "lemma" and "forms".

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    def make(block):
        # Ignore blank lines inside an entry (e.g. a trailing newline at the
        # end of the file) instead of failing on the missing form field.
        rows = [row.split() for row in block.split('\n') if row.strip()]
        return {
            "file": file,
            "lemma": rows[0][0],
            "forms": filter_forms([fields[1] for fields in rows])
        }
    # The data contains non-ASCII letters; read it as UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open(path_data + file, 'r', encoding='utf-8') as f:
        blocks = normalize(f.read()).split('\n\n')
    # Whitespace-only chunks carry no lemma, so they are skipped as well.
    return [make(block) for block in blocks if block.strip()]
# Sanity check: show the first entry parsed from the default dictionary file.
load_dict()[0]

{'file': 'ang.txt',
 'lemma': 'feohtan',
 'forms': ['feaht',
  'feoht',
  'feohtan',
  'feohtaþ',
  'feohte',
  'feohten',
  'feohtende',
  'fieht',
  'fiehtst',
  'fohten',
  'fuhte',
  'fuhten',
  'fuhton']}

In [4]:
def load_node(limit = 100000000, file = 'datasets/node.csv'):
    """Load lemma entries from a ';'-delimited CSV export.

    Column index 6 is used as the lemma and column index 7 as markup that
    may contain a ``<grammar>`` element listing forms separated by ';' or
    ','. Only rows whose column 6 is upper-case and longer than one
    character are kept; the lemma is stored lower-cased.

    Args:
        limit: Maximum number of CSV rows to read (rows read, not entries
            returned). ``None`` reads the whole file.
        file: File name relative to ``path_data``.

    Returns:
        A list of dicts with the keys "file", "lemma" and "forms".
    """
    def pars_grammar(s):
        # Name the parser explicitly: without it bs4 picks whichever parser
        # is installed, so results can differ between environments.
        gr = BeautifulSoup(s, 'html.parser').find('grammar')
        if gr is None:
            return []
        # Only text nodes directly inside <grammar>; text of nested tags is
        # deliberately left out (recursive=False).
        text = ''.join(gr.find_all(string=True, recursive=False))
        return [item.strip() for group in text.split(';') for item in group.split(',')]
    def _filter(w):
        return w.isupper() and len(w) > 1
    def make(w):
        return {
            "file": file,
            "lemma": w[6].lower(),
            "forms": filter_forms(pars_grammar(w[7]))
        }
    # newline='' is what the csv module expects for files it reads.
    # NOTE(review): assumes the CSV is UTF-8 like the other data files --
    # confirm against the actual export.
    with open(path_data + file, 'r', encoding='utf-8', newline='') as f:
        gen = csv.reader(f, delimiter=';')
        # Rows with fewer than 8 columns (e.g. blank lines) cannot provide
        # both the lemma and the grammar column, so they are skipped.
        return [make(w) for w in itertools.islice(gen, limit)
                if len(w) > 7 and _filter(w[6])]
# Smoke test: number of entries kept from the first 1000 CSV rows.
# Requires datasets/node.csv to exist under path_data.
# [v["forms"] for v in load_node()]
len(load_node(1000))

FileNotFoundError: [Errno 2] No such file or directory: '../../data/datasets/node.csv'

In [13]:
def merge_array(base, add):
    """Return ``base`` followed by the entries of ``add`` with new lemmas.

    An entry from ``add`` is appended only if its "lemma" does not occur in
    ``base``; entries of ``base`` always win. Neither input is modified.

    Args:
        base: List of entry dicts (must be a list, it is concatenated with +).
        add: Iterable of entry dicts; it is traversed exactly once, so a
            generator works as well as a list.

    Returns:
        A new list: all of ``base`` in order, then the accepted entries of
        ``add`` in their original order.
    """
    known = {entry["lemma"] for entry in base}
    # Single pass over `add`: iterating it twice would silently drop every
    # entry when `add` is a one-shot iterator.
    return base + [entry for entry in add if entry["lemma"] not in known]
# ang.txt entries take precedence; oe.lemmas only contributes lemmas missing from it.
merge_array(load_dict(), load_oe())[0]

{'file': 'ang.txt',
 'lemma': 'feohtan',
 'forms': ['feaht',
  'feoht',
  'feohtan',
  'feohtaþ',
  'feohte',
  'feohten',
  'feohtende',
  'fieht',
  'fiehtst',
  'fohten',
  'fuhte',
  'fuhten',
  'fuhton']}

In [14]:
def sort_dict(arr):
    """Sort the entries of ``arr`` in place by "lemma" and return ``arr``.

    Ordering is plain string comparison of the "lemma" values; the very
    same list object that was passed in is returned.
    """
    arr.sort(key=lambda entry: entry["lemma"])
    return arr
# First ten lemmas after sorting; plain str ordering puts capitalised lemmas first.
[v['lemma'] for v in sort_dict(merge_array(load_dict(), load_oe()))[:10]]

['Bryttas',
 'Crist',
 'Dene',
 'Engle',
 'Englisc',
 'Nazarenisc',
 'Seaxisc',
 'Wendlas',
 'a',
 'abarimathia']

In [15]:
def get_all():
    """Build the combined lexicon, sorted by lemma.

    Loads the default dictionary file and the default lemma file, merges
    them with ``merge_array`` (dictionary entries first) and sorts the
    result with ``sort_dict``.
    """
    dictionary_entries = load_dict()
    lemma_entries = load_oe()
    combined = merge_array(dictionary_entries, lemma_entries)
    return sort_dict(combined)
# Sanity check: first entry of the merged, sorted lexicon.
get_all()[0]

{'file': 'ang.txt',
 'lemma': 'Bryttas',
 'forms': ['Brytta', 'Bryttas', 'Bryttum']}

In [16]:
def save_cltk(arr, file='out.txt'):
    """Write entries as tab-separated "lemma<TAB>form" lines.

    One line is written per form; an empty line follows each entry
    (including entries without any forms).

    Args:
        arr: Iterable of dicts with the keys "lemma" and "forms".
        file: Output path; an existing file is overwritten.
    """
    # Forms contain non-ASCII letters, so write UTF-8 explicitly rather than
    # depending on the platform's default encoding.
    with open(file, 'w', encoding='utf-8') as f:
        for entry in arr:
            lemma = entry["lemma"]
            f.writelines(f'{lemma}\t{form}\n' for form in entry["forms"])
            f.write("\n")
# Export the merged lexicon to the default output file 'out.txt'.
save_cltk(get_all())

In [76]:
def save_dict(arr, file='out_dict.txt'):
    """Write one line per entry: "lemma<TAB>form1|form2|...".

    Args:
        arr: Iterable of dicts with the keys "lemma" and "forms"
            (a generator is fine; it is consumed once).
        file: Output path; an existing file is overwritten.
    """
    # Forms contain non-ASCII letters, so write UTF-8 explicitly rather than
    # depending on the platform's default encoding.
    with open(file, 'w', encoding='utf-8') as f:
        for entry in arr:
            joined_forms = "|".join(entry["forms"])
            f.write(f'{entry["lemma"]}\t{joined_forms}\n')
# Export the merged lexicon to the default output file 'out_dict.txt'.
save_dict(get_all())

In [106]:
# Export only the entries contributed by node.csv, i.e. those whose lemma is
# absent from the merged ang.txt / oe.lemmas lexicon. Requires
# datasets/node.csv to exist under path_data.
save_dict((w for w in merge_array(get_all(), load_node()) if w['file'] == 'datasets/node.csv'), file='out_node.txt')

In [77]:
# Number of entries in the merged lexicon.
len(get_all())

6367