In [37]:
import os
from dataclasses import dataclass
from typing import Optional

from yargy import Parser, rule, and_, or_
from yargy.predicates import gram, is_capitalized, dictionary, normalized, gte, lte, dictionary, caseless
from yargy.interpretation import fact
from yargy.pipelines import morph_pipeline
from yargy.relations import gnc_relation

In [38]:
@dataclass
class Entry:
    """One person record extracted from a text.

    The fields hold whatever the parser's fact exposes for the corresponding
    attribute (extract_person_data passes the fact attributes through
    unchanged), so they are not necessarily plain strings and the annotations
    are deliberately loose.
    """
    name: object
    # Optional details default to None so a record can be built from a name alone.
    birth_date: Optional[object] = None
    birth_place: Optional[object] = None

In [39]:
# Fact with two slots: an optional first name and a last name.
Name = fact("Name", ["first", "last"])

# One relation object shared by both name tokens through .match().
gnc = gnc_relation()

# Both parts are any capitalized token; the interpreted value is inflected.
_FIRST_NAME = is_capitalized().match(gnc).interpretation(Name.first.inflected())
_LAST_NAME = is_capitalized().match(gnc).interpretation(Name.last.inflected())

# A name is an optional first-name token followed by a mandatory last-name token.
NAME = rule(_FIRST_NAME.optional(), _LAST_NAME).interpretation(Name)

In [40]:
# Fact holding the pieces of a birth date; any slot may stay unset.
Birthdate = fact("Birthdate", ["day", "month", "year"])

# Month names fed to the `dictionary` predicate (see MONTH_NAME).
# "май" is the nominative lemma, consistent with the other eleven entries;
# the original genitive "мая" is kept so anything that matched before still does.
# NOTE(review): assumes `dictionary` compares against normalized token forms —
# confirm against the yargy documentation.
MONTHS = {
    "январь", "февраль", "март", "апрель",
    "май", "мая", "июнь", "июль", "август",
    "сентябрь", "октябрь", "ноябрь", "декабрь",
}
    
# Optional suffix after a year: the abbreviation "г" + "." or a form of "год".
_YEAR_ABBREVIATION = rule(caseless('г'), '.')
_YEAR_WORD = rule(normalized('год'))
YEAR_WORDS = or_(_YEAR_ABBREVIATION, _YEAR_WORD)

# Numeric components, each limited to an inclusive range.
DAY = and_(gte(1), lte(31)).interpretation(Birthdate.day)
MONTH = and_(gte(1), lte(12)).interpretation(Birthdate.month)
YEAR = and_(gte(1000), lte(2024)).interpretation(Birthdate.year)

# Month written as a word; fills the same slot as the numeric month.
MONTH_NAME = dictionary(MONTHS).interpretation(Birthdate.month)

# Preposition that introduces a bare year.
WORDS_DATE = morph_pipeline(["в"])

# The four accepted date shapes, listed in the order they are tried in or_().
_DAY_AND_MONTH_NAME = rule(DAY, MONTH_NAME)
_DAY_AND_MONTH_NUMBER = rule(DAY, MONTH)
_DAY_MONTH_NAME_YEAR = rule(DAY, MONTH_NAME, YEAR, YEAR_WORDS.optional())
_PREPOSITION_AND_YEAR = rule(WORDS_DATE, YEAR, YEAR_WORDS.optional())

BIRTH_DATE = or_(
    _DAY_AND_MONTH_NAME,
    _DAY_AND_MONTH_NUMBER,
    _DAY_MONTH_NAME_YEAR,
    _PREPOSITION_AND_YEAR,
).interpretation(Birthdate)

In [41]:
# Fact with a single slot for the place token.
Birthplace = fact("Birthplace", ["birth_place"])

# Prepositions that may introduce a place.
WORDS_PLACE = morph_pipeline(["в", "на"])

# A place is one capitalized token tagged as a noun.
_CAPITALIZED_NOUN = and_(is_capitalized(), gram('NOUN'))
PLACE = _CAPITALIZED_NOUN.interpretation(Birthplace.birth_place)

# Preposition followed by the place token.
BIRTH_PLACE = rule(WORDS_PLACE, PLACE).interpretation(Birthplace)

In [42]:
# Top-level fact combining the name, birth date and birth place.
Person = fact('Person', ['name', 'birth_date', 'birth_place'])

# Phrases that must appear between the name and the birth details.
_BIRTH_PHRASES = [
    "родился",
    "родилась",
    "день рождения",
    "дата рождения",
    "рожден",
    "рождена",
]
KEYWORDS = morph_pipeline(_BIRTH_PHRASES)

In [43]:
def _person_rule(first_detail, second_detail):
    """Build 'NAME KEYWORDS <first_detail> <second_detail>' interpreted as Person."""
    return rule(
        NAME.interpretation(Person.name),
        KEYWORDS,
        first_detail,
        second_detail,
    ).interpretation(Person)


# Date first, then place.
RULE1 = _person_rule(
    BIRTH_DATE.interpretation(Person.birth_date),
    BIRTH_PLACE.interpretation(Person.birth_place),
)

# Place first, then date.
RULE2 = _person_rule(
    BIRTH_PLACE.interpretation(Person.birth_place),
    BIRTH_DATE.interpretation(Person.birth_date),
)

# Either ordering of the birth details is accepted.
PERSON_RULE = or_(RULE1, RULE2).interpretation(Person)

In [44]:
# Build the parser for PERSON_RULE once at module level; extract_person_data()
# reads this global instead of creating a parser per call.
parser = Parser(PERSON_RULE)
def extract_person_data(file_path, text_column=2):
    """Collect person facts found in a tab-separated text file.

    Each line is split on tabs and the field at ``text_column`` is handed to
    the module-level ``parser``. Every match becomes one ``Entry`` built from
    the ``name``, ``birth_date`` and ``birth_place`` attributes of the match's
    fact.

    Args:
        file_path: Path of a UTF-8 encoded, tab-separated file.
        text_column: Zero-based index of the field holding the text to parse.
            Defaults to 2 (the third field), the index used previously.

    Returns:
        A list of ``Entry`` objects in the order the matches were found.
        Lines with too few fields or an empty text field contribute nothing.
    """
    entries = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Drop only the line terminator: a bare strip() would also eat
            # leading/trailing tabs, so an empty first field would shift every
            # column index and the text column would be missed.
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) <= text_column:
                continue
            text_to_parse = parts[text_column].strip()
            if not text_to_parse:
                continue

            try:
                for match in parser.findall(text_to_parse):
                    person = match.fact
                    entries.append(Entry(person.name, person.birth_date, person.birth_place))
            except AttributeError as e:
                # Best-effort: report the offending line and move on to the next one.
                print(f"Error processing line: {line.strip()} | Error: {e}")
    return entries

In [45]:
# Location of the tab-separated news file. The NEWS_PATH environment variable
# overrides the default so the notebook can run on machines where the original
# absolute path does not exist; without it the behaviour is unchanged.
file_path = os.environ.get("NEWS_PATH", "D:/xuexi/nlp/nlp-2023-master/data/news.txt")
person_entries = extract_person_data(file_path)

# One summary line per extracted person.
for entry in person_entries:
    print(f"Name: {entry.name}, Birth Date: {entry.birth_date}, Birth Place: {entry.birth_place}")

Name: Name(first='бетси', last='палмера'), Birth Date: Birthdate(day=None, month=None, year='1926'), Birth Place: Birthplace(birth_place='США')
Name: Name(first=None, last='трэмиел'), Birth Date: Birthdate(day=None, month=None, year='1928'), Birth Place: Birthplace(birth_place='Польше')
Name: Name(first=None, last='он'), Birth Date: Birthdate(day='11', month='сентября', year='1865'), Birth Place: Birthplace(birth_place='Польше')
Name: Name(first=None, last='ребёнок'), Birth Date: Birthdate(day='25', month='декабря', year=None), Birth Place: Birthplace(birth_place='Калифорнии')
Name: Name(first='дмитрий', last='чернявский'), Birth Date: Birthdate(day='5', month='марта', year='1992'), Birth Place: Birthplace(birth_place='Артемовске')
Name: Name(first=None, last='миллиардер'), Birth Date: Birthdate(day=None, month=None, year='1938'), Birth Place: Birthplace(birth_place='Лондоне')
Name: Name(first='яковлевюрий', last='яковлев'), Birth Date: Birthdate(day=None, month=None, year='1928'), Bir