In [2]:
from bs4 import BeautifulSoup
from lxml import etree
import urllib.request
from html import unescape
import requests
import re

In [3]:
url = "http://az.lib.ru/s/sumarokow_a_p/text_0310oldorfo.shtml"
response = urllib.request.urlopen(url)
soup = BeautifulSoup(response, "html.parser")

In [4]:
heroes = []

# Go through each <dd> tag (which contains lines of dialogue)
for item in soup.find_all("dd"):
    line = item.get_text(strip=True)

    # Try to match a speaker name at the beginning of the line
    match = re.match(r'^([А-ЯЁ]{3,15}Ъ?)[\.:,\s]', line)
    if match:
        hero = match.group(1)
        if hero != 'КОНЕЦЪ':  # Exclude 'КОНЕЦЪ'
            heroes.append(hero.capitalize() + '.' if not hero.endswith('.') else hero.capitalize())

# Remove duplicates and sort
unique_heroes = sorted(set(heroes))

# Print results
for hero in unique_heroes:
    print(hero)

Ангелика.
Геростратъ.
Демифонъ.
Дромонъ.
Исмена.
Клитандръ.
Купецъ.
Менедемъ.
Парменонъ.
Секретарь.


In [5]:
# Extract stage directions from <i> tags (italic text)
stages_all = set()

# Words that usually indicate stage directions
stage_verbs = [
    'одинъ', 'идучи', 'входитъ', 'выходитъ', 'въходитъ', 'вынимаетъ',
    'обнимая', 'спрашивая', 'бросяся', 'нарочно', 'тихонько', 'особливо', 'тяжко'
]

# Common garbage indicators
blacklist_keywords = [
    'программист', 'собрані', 'ордена', 'статскаго', 'университет', 'сайт',
    'совѣтника', 'стихахъ', 'вѣдомств', 'академіи', 'конецъ', 'томъ', 'императорск'
]

for item in soup.find_all("i"):
    text = item.text.strip()

    # Skip clearly irrelevant lines
    if any(bad_word in text.lower() for bad_word in blacklist_keywords):
        continue

    # Normalize whitespace and clean trailing punctuation
    text = re.sub(r"\s+", " ", text).strip(" .,;:\n") + "."

    # Positive filters
    words = text.split()
    likely_stage = (
        re.match(r'[А-Яа-яЁёѣъ҃]{2,}(,|\s).*', text)
        and (
            any(verb in text.lower() for verb in stage_verbs) or
            ',' in text or
            ' и ' in text
        )
        and 2 <= len(words) <= 30
        and len(text) <= 150
    )

    if likely_stage:
        stages_all.add(text)

# Print clean, pretty output and modify the set during iteration
for stage in sorted(stages_all):
    if stage == 'Геростратъ, Ангелика, Дромонъ и.':
        # Remove the old value from the set and add the modified one
        stages_all.remove(stage)
        stages_all.add('Геростратъ, Ангелика, Дромонъ и Исмена.')

# Print the updated set
for stage in sorted(stages_all):
    print(stage)

Ангелика бросяся къ нему.
Геростратъ вынимаетъ шпагу.
Геростратъ и Дромонъ.
Геростратъ и подъячій.
Геростратъ одинъ.
Геростратъ особливо.
Геростратъ, Ангелика, Дромонъ и Исмена.
Геростратъ, идучи за нею.
Демифонъ обнимая ее.
Дромонъ Исменѣ тихонько.
Дромонъ и Исмена.
Дромонъ тяжко нарочно воздыхаетъ.
Исмена Дромону тихонько.
Клитандръ, Ангелика, Геростратъ, Дромонъ, Исмена, Секретарь и Подъячій.
Клитандръ, Ангелика, Дромонъ и Исмена.
Клитандръ, Геростратъ, Дромонъ и Исмена.
Менедемъ и Демифонъ.
Менедемъ одинъ.
Менедемъ, Демифонъ и Геростратъ.
Менедемъ, Демифонъ и Дромонъ.
Менедемъ, Демифонъ, Клитандръ, Ангелика, Геростратъ, Дромонъ, Исмена, Секретарь и Подъячій.
Секретарь, спрашивая Менедема.
Тѣ же, Парменонъ и Купецъ.
Тѣ же, кромѣ Секретаря и Подъячаго.


In [6]:
# Download raw HTML for line-by-line processing
sumarokov_play = requests.get(url).text
sumarokov_lines = sumarokov_play.split('\n')

In [8]:
root = etree.Element("TEI")
new_div = None     # Initialize div
new_speech = None  # Initialize speech element

for line in sumarokov_lines:
    # Replace encoded 'ѣ'
    line = re.sub("&#1123;", "ѣ", line)

    # Remove HTML-like tags such as &lt;sup&gt;
    line = re.sub(r"&lt;[^>]+&gt;", "", line)

    # Match speaker name like "Менедемъ."
    name = re.search(r"[А-Я][а-я]{6,10}\.", line)

    # Match paragraph (general line)
    paragraph = re.search(r"[А-Я].?\s?[а-яѣ]+,?\s?.*", line)

    # Match predefined stage directions
    for stage in stages_all:
        if stage in line and new_div is not None:
            new_stage = etree.SubElement(new_div, 'stage')
            new_stage.text = stage

    # Title and author block
    if 'ЯДОВИТЫЙ.' in line:
        new_title = etree.SubElement(root, 'titleStmt')
        new_title_element = etree.SubElement(new_title, 'title')
        new_title_element.set("type", "main")
        new_title_element.text = line.strip(".").capitalize()

        new_title_sub = etree.SubElement(new_title, 'title')
        new_title_sub.set("type", "sub")
        new_title_sub.text = "Комедия"

        new_author = etree.SubElement(root, 'author')
        persname = etree.SubElement(new_author, 'persName')

        forename = etree.SubElement(persname, 'forename')
        forename.text = "Александр"

        patronym = etree.SubElement(persname, 'forename')
        patronym.set("type", "patronym")
        patronym.text = "Петрович"

        surname = etree.SubElement(persname, 'surname')
        surname.text = "Сумароков"

    # New division for each act
    if 'ЯВЛЕНІЕ' in line:
        new_div = etree.SubElement(root, 'div')
        act_head = etree.SubElement(new_div, 'head')
        act_head.text = line
        new_speech = None  # Reset speech when act changes

    # Match speaker
    elif name and new_div is not None:
        if name.group() in unique_heroes:
            new_speech = etree.SubElement(new_div, 'sp')
            new_speaker = etree.SubElement(new_speech, 'speaker')
            new_speaker.text = name.group()

    # Match paragraph spoken by character
    elif paragraph and new_speech is not None:
        new_p = etree.SubElement(new_speech, 'p')

        # Extract and tag inline stage directions in parentheses
        inline_stages = re.findall(r"\([^()]+\)", paragraph.group())
        cleaned_text = paragraph.group()
        for inline in inline_stages:
            cleaned_text = cleaned_text.replace(inline, '').strip()

        new_p.text = cleaned_text

        # Add stage directions after speech
        for inline in inline_stages:
            inline_stage = etree.SubElement(new_p, "stage")
            inline_stage.text = inline

    # End of play
    if 'КОНЕЦЪ КОМЕДІИ.' in line and new_div is not None:
        ending_tag = etree.SubElement(new_div, 'trailer')
        ending_tag.text = line
        break

# Output XML
print(etree.tostring(root, pretty_print=True, encoding='unicode'))

<TEI>
  <titleStmt>
    <title type="main">Ядовитый</title>
    <title type="sub">Комедия</title>
  </titleStmt>
  <author>
    <persName>
      <forename>Александр</forename>
      <forename type="patronym">Петрович</forename>
      <surname>Сумароков</surname>
    </persName>
  </author>
  <div>
    <head>ЯВЛЕНІЕ I.</head>
    <stage>Менедемъ одинъ.</stage>
  </div>
  <div>
    <head>ЯВЛЕНІЕ   ІІ.</head>
    <stage>Менедемъ и Демифонъ.</stage>
    <sp>
      <speaker>Демифонъ.</speaker>
    </sp>
    <sp>
      <speaker>Менедемъ.</speaker>
      <p>А! другъ мой! отколѣ ты взялся? сколько лѣтъ мы съ тобою не видались! какъ ты скрылся? гдѣ былъ? обойми меня, и раскажи мнѣ свое подробно похожденіе.</p>
    </sp>
    <sp>
      <speaker>Демифонъ.</speaker>
      <p>Благоденствуешь ли возлюбленный мой другъ? въ добромъ ли ты здоровьи, и въ хорошихъ ли обстоятельствахъ?</p>
    </sp>
    <sp>
      <speaker>Менедемъ.</speaker>
      <p>И такъ и сякъ; однако, какъ ты видишъ, живъ еще по сіе

In [9]:
with open ('sumarokov_dracor.xml', 'w', encoding = 'utf-8') as dracor:
  dracor.write (etree.tostring(root, pretty_print=True, encoding='unicode'))