In [1]:
from yargy.relations import gnc_relation
from yargy import Parser, rule, and_, or_
from yargy.predicates import gram, is_capitalized, dictionary, normalized
from yargy.pipelines import morph_pipeline
from yargy.interpretation import fact

from yargy.predicates import (
    lte,
    gte,
    dictionary
)


# Agreement relation shared by the two tokens of a capitalisation-based name
# (both tokens are attached to it through `.match(gnc)` below).
gnc = gnc_relation()

# Structured result for a person name split into its two parts.
Person = fact(
    "Person",
    ["first", "last"]
)

# Structured result for one extracted "who / when / where born" statement.
Entry = fact(
    "Entry",
    ["name", "birth_date", "birth_place"]
)

# Capitalisation-based person name: an optional capitalised token followed by
# a mandatory one, interpreted as Person(first, last) through `.inflected()`.
#
# This rule used to be bound to NAME, but NAME is reassigned to a
# morphological-tag based rule later in this cell, so this definition was
# silently discarded before anything could use it. It is bound to a distinct
# name instead so both variants stay available; SENT keeps using NAME.
PERSON_NAME = rule(
    is_capitalized().match(gnc).interpretation(Person.first.inflected()).optional(),
    is_capitalized().match(gnc).interpretation(Person.last.inflected()),
).interpretation(
    Person
)

# Month names used to build the MONTH_NAME predicate via `dictionary(...)`.
# `dictionary` compares against the normalized (lemma) forms of a token, so
# entries are expected to be lemmas. May was listed only as the genitive form
# 'мая', unlike every other month; the lemma 'май' is added so that dates such
# as "31 мая 1992" are recognised.
# NOTE(review): 'мая' is kept only for backward compatibility - confirm it is
# not needed and drop it.
MONTHS = {
    'январь',
    'февраль',
    'март',
    'апрель',
    'май',
    'мая',
    'июнь',
    'июль',
    'август',
    'сентябрь',
    'октябрь',
    'ноябрь',
    'декабрь'
}


# Month written out as a word, looked up in MONTHS.
MONTH_NAME = dictionary(MONTHS)

# Numeric date components, each restricted to an inclusive range.
DAY = and_(gte(1), lte(31))
MONTH = and_(gte(1), lte(12))
YEAR = and_(gte(1900), lte(2100))

# Every date layout the grammar accepts, grouped under the name 'DATE'.
DATE = or_(
    rule(DAY, MONTH_NAME, YEAR),        # day, month as a word, year
    rule(YEAR, '-', MONTH, '-', DAY),   # year-month-day
    rule(YEAR, '.', MONTH, '.', DAY),   # year.month.day
    rule(DAY, '-', MONTH, '-', YEAR),   # day-month-year
    rule(DAY, '.', MONTH, '.', YEAR),   # day.month.year
    rule(YEAR, 'г', '.'),               # bare year followed by "г."
    rule(YEAR, 'году'),                 # bare year followed by "году"
).named('DATE')


# Full name: a token carrying the "Name" grammeme followed by one carrying
# the "Surn" grammeme.
NAME = rule(gram("Name"), gram("Surn"))

# Phrases that introduce a birth statement; built with `morph_pipeline` from
# the three key phrases below.
BIRTH_VERB = morph_pipeline(["родился", "дата рождения", "был рожден"])

# Place name: a run of capitalised nouns. The predicate is marked both
# optional and repeatable, so the rule may also match nothing.
BIRTH_PLACE = rule(
    and_(gram("NOUN"), is_capitalized()).optional().repeatable()
)

# Lead-in to a place name: a preposition, optionally followed by a settlement
# type word ("в", "в городе", "в селе", ...).
#
# `dictionary` compares against the normalized (lemma) forms of a token, so
# the settlement types must be listed as lemmas. The previous entries were
# inflected forms ("городе", "селе", "поселке") and therefore did not match
# the words they were meant for. Both spellings of "посёлок" are listed
# because the lemma may be reported with or without "ё".
BIRTH_PLACE_TYPE = rule(
    gram("PREP"),
    dictionary({
        "город",
        "село",
        "посёлок",
        "поселок",
        # NOTE(review): original inflected entries, kept only so that nothing
        # that matched before stops matching - confirm and remove.
        "городе",
        "селе",
        "поселке"
    }).optional(),
)

# Interpreted building blocks shared by the alternatives of SENT.
_BIRTH_DATE = DATE.interpretation(Entry.birth_date)
_BIRTH_PLACE = BIRTH_PLACE.interpretation(
    Entry.birth_place.normalized().custom(str.title)
)

# Top-level rule: a name, optionally followed by a birth statement carrying
# a date, a place, or both. The whole match is interpreted as an Entry.
SENT = rule(
    NAME.interpretation(Entry.name),
    or_(
        # verb + date + place
        rule(BIRTH_VERB, _BIRTH_DATE, BIRTH_PLACE_TYPE, _BIRTH_PLACE),
        # verb + date
        rule(BIRTH_VERB, _BIRTH_DATE),
        # verb + place
        rule(BIRTH_VERB, BIRTH_PLACE_TYPE, _BIRTH_PLACE),
    ).optional()
).interpretation(Entry)

# Quick check of the grammar on a single hand-written sentence: build the
# parser once and print every fact it extracts.
text = "Владимир Иванов был рожден 31.01.1992 в Москве"
parser = Parser(SENT)
for extracted in (found.fact for found in parser.findall(text)):
    print(extracted)

Entry(name='Владимир Иванов', birth_date='31.01.1992', birth_place='Москва')


In [2]:
import gzip

from dataclasses import dataclass
from typing import Iterator

@dataclass
class Text:
    """One record of the corpus, built from a single tab-separated line.

    ``read_texts`` fills the fields positionally from the columns of each
    line, in the order they are declared here.
    """
    label: str  # first column; presumably a category label - TODO confirm
    title: str  # second column
    text: str   # third column; the body that is later fed to the parser


def read_texts(fn: str) -> Iterator[Text]:
    """Lazily yield one ``Text`` per non-blank line of a gzipped TSV file.

    Each line is expected to hold three tab-separated columns: label, title
    and text. Compared with a plain ``line.strip().split("\\t")``:

    * blank lines are skipped instead of raising;
    * an empty first or last column is preserved (stripping the whole line
      would also remove the tab that delimits it);
    * tabs after the second one are kept inside the text column.

    Whitespace before the label and after the text is still removed.

    Args:
        fn: Path to a gzip-compressed, UTF-8 encoded file.

    Yields:
        A ``Text`` for every non-blank line, in file order.

    Raises:
        TypeError: If a non-blank line has fewer than three columns.
    """
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            # Drop only the line terminator so delimiting tabs survive.
            fields = line.rstrip("\r\n").split("\t", 2)
            if not any(field.strip() for field in fields):
                continue
            if len(fields) != 3:
                raise TypeError(
                    f"{fn}:{lineno}: expected 3 tab-separated fields, "
                    f"got {len(fields)}"
                )
            label, title, body = fields
            yield Text(label.lstrip(), title, body.rstrip())

texts = list(read_texts("../data/news.txt.gz"))

In [3]:
from tqdm import tqdm 
import codecs

# Run the parser over every loaded text and append each extracted fact to the
# results file, one fact per line.
#
# The file is opened once for the whole run rather than once per match, so a
# missing output directory fails immediately instead of being swallowed.
# NOTE(review): append mode is kept from the original, so re-running this cell
# duplicates earlier results - confirm whether "w" is what is wanted.
failures = []  # (title, exception) for every text the parser choked on

# newline="" writes "\n" untranslated, as the previous codecs.open call did.
with open("../task1/search_res.txt", "a", encoding="utf-8", newline="") as f:
    for text in tqdm(texts):
        try:
            for match in parser.findall(text.text):
                f.write(f'{match.fact}\n')
        except OSError:
            # A broken results file is not a per-text problem: stop the run.
            raise
        except Exception as exc:
            # Best effort: skip the offending text but remember why, instead
            # of hiding every error (including Ctrl-C) behind a bare except.
            failures.append((text.title, exc))

if failures:
    first_title, first_exc = failures[0]
    print(
        f"{len(failures)} of {len(texts)} texts raised while parsing; "
        f"first: {first_title!r}: {first_exc!r}"
    )

100%|██████████| 10000/10000 [02:04<00:00, 80.56it/s]
