In [40]:
from datasets import load_dataset, concatenate_datasets

In [41]:
# Load the `slon-hk/BooksSummarizationRU` dataset; every later cell derives from this object.
dataset = load_dataset("slon-hk/BooksSummarizationRU")

In [42]:
# Display the loaded object to inspect its splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Title', 'Author', 'Summary', 'Full Text'],
        num_rows: 1271
    })
    test: Dataset({
        features: ['ID', 'Title', 'Author', 'Summary', 'Full Text'],
        num_rows: 142
    })
})

In [43]:
def _with_text_length(example):
    """Return the extra ``len`` field: the character count of the book's ``Full Text``."""
    return {"len": len(example["Full Text"])}


# Add a `len` column to every split so books can be filtered by size later on.
dataset = dataset.map(_with_text_length)

In [44]:
# Merge both splits into one dataset, keeping the train rows first and the test rows after them.
all_splits = [dataset[split_name] for split_name in ("train", "test")]
full_dataset = concatenate_datasets(all_splits)

In [45]:
import pandas as pd

# Materialise the combined dataset as a pandas DataFrame for the analysis cells below.
df = pd.DataFrame(full_dataset)
# Sanity check: (number of rows, number of columns).
df.shape

(1413, 6)

In [46]:
# Count missing values per column.
df.isnull().sum()

ID           0
Title        0
Author       0
Summary      0
Full Text    0
len          0
dtype: int64

In [47]:
def _drop_duplicate_titles(frame):
    """Return ``frame`` reduced to a single row per ``Title``.

    When several rows share a title, the row whose ``Author`` occurs most
    often in ``frame`` is the one that is kept. The result is ordered by
    title, and the temporary ranking column is removed before returning.
    """
    books_per_author = frame['Author'].value_counts()
    ranked = frame.assign(author_books=frame['Author'].map(books_per_author))
    # Within each title, put the most prolific author first so `keep='first'` selects it.
    ranked = ranked.sort_values(by=['Title', 'author_books'], ascending=[True, False])
    one_per_title = ranked.drop_duplicates(subset='Title', keep='first')
    return one_per_title.drop(columns='author_books')


df = _drop_duplicate_titles(df)
df.shape

(647, 6)

In [48]:
text_lengths = df['len']

# Central tendency of the text length (in characters).
median_len = text_lengths.median()
print(f"Median length: {median_len}")
mean_len = text_lengths.mean()
print(f"Mean length: {mean_len}")

# Selection bounds, not true extrema: `max_len` is the 50th percentile (equal to
# the median) and `min_len` is the 10th percentile of the text length.
max_len, min_len = text_lengths.quantile(0.5), text_lengths.quantile(0.1)
print(f"Max length: {max_len}")
print(f"Min length: {min_len}")

Median length: 134369.0
Mean length: 299835.64760432765
Max length: 134369.0
Min length: 12415.800000000001


In [49]:
import math

# Assumed number of characters on one page, used to convert text length to pages.
CHARS_PER_PAGE = 2000


def _pages_for(char_count):
    """Return how many pages of ``CHARS_PER_PAGE`` characters are needed, rounding up."""
    return math.ceil(char_count / CHARS_PER_PAGE)


df['num_pages'] = df['len'].apply(_pages_for)

# Keep only the books that span more than one page.
spans_multiple_pages = df['num_pages'].gt(1)
df = df[spans_multiple_pages]

In [50]:
# Keep the books whose length lies strictly between the two bounds; `.copy()`
# detaches the selection so that columns can be added to it later.
above_lower_bound = df['len'].gt(min_len)
below_upper_bound = df['len'].lt(max_len)
df_selected = df.loc[below_upper_bound & above_lower_bound].copy()
print(df_selected.shape)
df_selected['len'].describe()

(258, 7)


count       258.000000
mean      53796.259690
std       37108.223272
min       12437.000000
25%       21913.250000
50%       40690.000000
75%       85124.000000
max      134216.000000
Name: len, dtype: float64

In [51]:
from razdel import sentenize

def _count_sentences(text):
    """Return the number of sentences that ``sentenize`` yields for ``text``."""
    return sum(1 for _ in sentenize(text))


df_selected['num_sentences'] = df_selected['Full Text'].apply(_count_sentences)

In [52]:
# Replace the index carried over from `df` with a fresh 0..n-1 range (the old index is dropped).
df_selected.reset_index(drop=True, inplace=True)
# Preview the title and the derived size columns only.
df_selected[['Title', 'len', 'num_pages', 'num_sentences']].head()

Unnamed: 0,Title,len,num_pages,num_sentences
0,«Простите нас!»,25903,13,302
1,«Человек за бортом»,36979,19,489
2,Адский житель,59232,30,435
3,Аленький цветочек,43553,22,252
4,Альберт,50643,26,650


In [53]:
import time
import requests
import os

def process_text_udpipe(text, model='russian-syntagrus-ud-2.15-241121', output_format='conllu',
                        timeout=(10, 300)):
    """Send ``text`` to the LINDAT UDPipe REST service and return the processed output.

    The request enables tokenization, tagging/lemmatization and dependency
    parsing with the given model.

    Args:
        text: Input text to process.
        model: Name of the UDPipe model to use.
        output_format: Output format requested from the service.
        timeout: Value passed to ``requests.post(timeout=...)``: either a
            single number of seconds or a ``(connect, read)`` tuple. Pass
            ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        The value stored under the ``'result'`` key of the JSON response. If
        the response has no such key, the whole decoded JSON payload is
        returned instead, so callers that need text should check the type.

    Raises:
        RuntimeError: If the service answers with a status code other than 200.
        requests.exceptions.Timeout: If the service does not answer within ``timeout``.
    """
    url = 'https://lindat.mff.cuni.cz/services/udpipe/api/process'

    params = {
        'tokenizer': 'ranges',  # Enables default tokenization and keeps token ranges
        'tagger': '',           # Enables POS tagging, morphological analysis and lemmatization ("Tag and Lemmatize")
        'parser': '',           # Enables dependency parsing ("Parse")
        'model': model,         # Model used for processing
        'data': text,           # Input text
        'output': output_format # Output format
    }

    # Without an explicit timeout `requests` waits forever on a stalled
    # connection, which blocks the notebook until it is interrupted by hand.
    response = requests.post(url, data=params, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"Ошибка запроса: {response.status_code}, {response.text}")

    payload = response.json()
    if 'result' in payload:
        return payload['result']
    return payload

# Directory that receives one `<ID>.conllu` file per selected book.
BOOKS_DIR = 'books'
os.makedirs(BOOKS_DIR, exist_ok=True)

for index, row in df_selected.iterrows():
    book_id = row['ID']
    out_path = os.path.join(BOOKS_DIR, f'{book_id}.conllu')

    # Resume support: a book that was already parsed in an earlier (possibly
    # interrupted) run is not sent to the remote service again.
    if os.path.exists(out_path):
        continue

    print(index, row['Title'], row['len'], row['num_sentences'])
    text = row['Full Text']
    conllu_output = process_text_udpipe(text)

    # Write with an explicit UTF-8 encoding: the platform default may be unable
    # to represent Cyrillic text. Going through a temporary file means an
    # interrupted write never leaves a truncated `.conllu` that a later run
    # would mistake for a finished book.
    tmp_path = out_path + '.part'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(conllu_output)
    os.replace(tmp_path, out_path)

    # Pause between requests to avoid overloading the public service.
    time.sleep(5)

0 «Простите нас!» 25903 302
1 «Человек за бортом» 36979 489
2 Адский житель 59232 435
3 Аленький цветочек 43553 252


KeyboardInterrupt: 