In [3]:
import re
import xml.etree.ElementTree as ET
import numpy as np
from collections import defaultdict

In [1]:
def getLemma(form):
    """Pick a ``(lemma, part)`` pair for a single word form.

    Resolution order:

    1. Closed-class words from three hard-coded lists are returned as their
       own lemma, tagged "CONJ", "PR" or "ADV".
    2. A form missing from the notebook-level ``formToLemma`` table is returned
       unchanged, tagged 'S' when a uniform draw exceeds 0.3 and "ADV" otherwise.
    3. Otherwise the candidates stored for the form are weighted by the
       notebook-level ``frequencies`` table. When a uniform draw exceeds 0.1
       the heaviest candidate is returned; otherwise one candidate is sampled
       with probability proportional to its weight.

    The function uses NumPy's global random state, so its output is only
    reproducible if the caller seeds that state.
    """
    closedClasses = (
        ("CONJ", ("и", "а", "но", "или", "чтобы", "что", "зато", 'тоже')),
        ("PR", ("по", "в", "с", "со", "на", "у", "за", "для", "при", "через", "до", "среди", "между",
                "возле")),
        ("ADV", ("не", "ни", "затем", "тогда", "итак", "наверно", "бы", "ли", "же", "вот", "только",
                 "уже", "видимо", "потом")),
    )
    for tag, words in closedClasses:
        if form in words:
            return form, tag

    if form not in formToLemma:
        # Unknown word: guess a noun most of the time, an adverb otherwise.
        guessedTag = 'S' if np.random.random() > 0.3 else "ADV"
        return form, guessedTag

    candidates = list(formToLemma[form])
    # A zero weight is replaced by 1 so that every candidate keeps a non-zero probability.
    weights = np.array([frequencies[form][candidate] or 1 for candidate in candidates])
    probs = weights / weights.sum()

    if np.random.random() <= 0.1:
        # Occasionally sample instead of always taking the most frequent reading.
        picked = np.random.choice(len(probs), 1, p=probs)[0]
    else:
        picked = np.argmax(probs)
    return candidates[picked]


def proccessTest(inputName, outputName):
    """Annotate the file ``inputName`` and save the result to ``outputName``.

    The strings returned by ``proccessFile`` are joined with newline characters
    and written in one call; no trailing newline is added. The output file is
    opened, and therefore truncated, before the input is processed.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default applies.
    with open(outputName, 'w') as sink:
        annotatedLines = proccessFile(inputName)
        sink.write('\n'.join(annotatedLines))


def proccessFile(filename):
    """Return a list holding the ``processLine`` result for each line of ``filename``.

    Lines are passed on exactly as read, including their trailing newline.
    """
    annotated = []
    # NOTE(review): no explicit encoding is passed, so the platform default applies.
    with open(filename, 'r') as source:
        for rawLine in source:
            annotated.append(processLine(rawLine))
    return annotated


def processLine(line):
    """Annotate every token of one input line.

    Commas, full stops, question marks, exclamation marks and newline
    characters are deleted. The remainder is split on single spaces, so
    consecutive spaces produce empty tokens. Each piece is lower-cased and
    passed to ``processToken``; the results are joined with single spaces.
    """
    dropPunctuation = str.maketrans('', '', ',.?!\n')
    pieces = line.translate(dropPunctuation).split(' ')
    return ' '.join(processToken(piece.lower()) for piece in pieces)

def processToken(token):
    """Render one token in the form ``token{lemma=PART}``.

    ``getLemma`` supplies the lemma and a part-of-speech tag for the token;
    the tag is passed through ``getCorrectPartName`` before being written.

    Parameters:
        token: the word form to annotate (``processLine`` passes it lower-cased).

    Returns:
        The annotated string, e.g. ``'кошки{кошка=S}'``.
    """
    lemma, part = getLemma(token)
    # The prefix must be the token being annotated, not some notebook-level name.
    return token + '{' + lemma + '=' + getCorrectPartName(part) + '}'

In [18]:
# Part-of-speech tags as they appear in the XML inputs -> the short tags written
# to the output. Tags that are not listed here are passed through unchanged.
_PART_NAME_BY_TAG = {
    'ADJF': 'A', 'ADJS': 'A', 'COMP': 'A',
    'NOUN': 'S',
    'VERB': 'V', 'INFN': 'V', 'PRTF': 'V', 'PRTS': 'V', 'GRND': 'V',
    'ADVB': 'ADV', 'PRCL': 'ADV', 'INTJ': 'ADV', 'PRED': 'ADV',
    'PREP': 'PR',
}


def getCorrectPartName(part):
    """Translate a part-of-speech tag into the short tag used in the output.

    Parameters:
        part: a tag such as 'NOUN' or 'PREP'; tags that are already short
            ('S', 'CONJ', ...) and ``None`` are accepted too.

    Returns:
        'A', 'S', 'V', 'ADV' or 'PR' for the tags listed in
        ``_PART_NAME_BY_TAG``; any other value is returned unchanged.
    """
    return _PART_NAME_BY_TAG.get(part, part)

In [186]:
# Locations of the two XML inputs, relative to the notebook's working directory:
# the dictionary and the annotated corpus parsed by the following cell.
dictPath = '../dict.opcorpora.xml'
corpPath = '../annot.opcorpora.no_ambig.nonmod.xml'

In [187]:
# Parse both XML documents up front; only their root elements are kept.
dictionary = ET.parse(dictPath).getroot()
corpus = ET.parse(corpPath).getroot()

lemmas = dictionary.findall('lemmata/lemma')
links = dictionary.findall('links/link')

# form -> set of (lemma, part) candidates; populated by later cells.
formToLemma = defaultdict(set)

# id -> <lemma> element, and id -> id of the lemma it is reported as.
# Every lemma starts out pointing at itself.
lemmasById = {lemma.get('id'): lemma for lemma in lemmas}
lemmasTrueId = {lemmaId: lemmaId for lemmaId in lemmasById}

# A link redirects its 'to' lemma to whatever its 'from' lemma resolves to at
# this moment, so the outcome depends on the order of the links in the file.
for link in links:
    fromId, toId = link.get('from'), link.get('to')
    lemmasTrueId[toId] = lemmasTrueId[fromId]

In [271]:
import csv

# Marker found in the second CSV column -> part-of-speech tag used by this notebook.
# Markers that are not listed fall back to 'A'.
partByMarker = {
    'со': 'S', 'с': 'S', 'мо-жо': 'S', 'мо': 'S', 'жо': 'S', 'мн.': 'S', 'ж': 'S', 'м': 'S',
    'предик.': 'S',
    'н': 'ADV', 'межд.': 'ADV', 'вводн.': 'ADV', 'част.': 'ADV',
    'союз': 'CONJ',
    'предл.': 'PR',
    'п': 'A', 'сравн.': 'A',
    'св-нсв': 'V', 'нсв': 'V', 'св': 'V',
}

# Each row is read as: lemma, marker, then any number of inflected forms.
with open('odict.csv', encoding='windows-1251', newline='') as f:
    for row in csv.reader(f):
        # csv.reader yields an empty list for a blank line; there is nothing to index.
        if not row:
            continue
        entry = (row[0].lower(), partByMarker.get(row[1], 'A'))
        # Every listed form, and the lemma itself, becomes a key for this entry.
        for word in row[2:]:
            formToLemma[word.lower()].add(entry)
        formToLemma[row[0].lower()].add(entry)

In [272]:
# Register every form of every dictionary lemma under the lemma it resolves to
# through lemmasTrueId.
for lemma in lemmas:
    resolved = lemmasById[lemmasTrueId[lemma.get('id')]]
    # The first child of the resolved lemma carries the lemma text; the first
    # child of that element carries the part-of-speech tag.
    head = resolved[0]
    entry = (head.get('t'), getCorrectPartName(head[0].get('v')))
    # Forms are taken from the current lemma, not from the resolved one.
    for child in lemma:
        formToLemma[child.get('t').lower()].add(entry)

In [329]:
# Count how often each (lemma, part) reading occurs for every lower-cased form
# in the corpus. Both levels are defaultdicts and a reading starts at 1, so the
# first occurrence stores 2 and merely looking a reading up inserts it with 1.
tokens = corpus.findall('./text/paragraphs/paragraph/sentence/tokens/token')
frequencies = defaultdict(lambda: defaultdict(lambda: 1))

for token in tokens:
    surface = token[0]
    form = surface.get('t').lower()
    # Two levels below the surface element sits the element holding the lemma
    # text; its first child holds the part-of-speech tag.
    reading = surface[0][0]
    key = (reading.get('t'), getCorrectPartName(reading[0].get('v')))
    frequencies[form][key] += 1

In [338]:
import os

# Annotate the downloaded dataset into output.txt, then delete the download.
# NOTE: the input file is removed, so this cell cannot be re-run without
# fetching the dataset again.
datasetPath = '/Users/vladimir.egorov/Downloads/dataset_37845_1.txt'
proccessTest(datasetPath, 'output.txt')
os.remove(datasetPath)