In [1]:
import re

from docx import Document
import fitz
import pydjvu
import zipfile

In [None]:
import spacy
from spacy import displacy

# Load spaCy's `en_core_web_trf` pipeline once at module level; `resolve_coreferences`
# below calls this shared `nlp` object.
nlp = spacy.load('en_core_web_trf')
# Append the pipeline component registered under the name 'coreferee'; it provides the
# `doc._.coref_chains` extension that the coreference step reads.
nlp.add_pipe('coreferee')

In [4]:
# Text preprocessing
def preprocess_text(text):
    """Normalize raw text into a lowercase, single-spaced string.

    The following substitutions are applied in order:

    1. every newline becomes a space;
    2. ``<...>`` markup tags are deleted;
    3. every character that is not a word character, whitespace or one of
       ``.,!?;:`` is deleted;
    4. each run of whitespace collapses to a single space.

    The result is then lowercased and stripped of leading and trailing
    whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; order matters because later steps run on
    # the output of earlier ones.
    substitutions = (
        (r'\n', ' '),
        (r'<[^>]+>', ''),
        (r'[^\w\s.,!?;:]', ''),
        (r'\s+', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.lower().strip()

In [None]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF document.

    Args:
        pdf_path: Location of the document; passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text("text")`` output of all pages concatenated in page
        order, with no separator inserted between pages. An empty string is
        returned for a document without pages.
    """
    # Open the document as a context manager so it is closed on every exit
    # path, including when text extraction of some page raises.
    with fitz.open(pdf_path) as doc:
        # A single join avoids re-copying the accumulated string on each page.
        return "".join(page.get_text("text") for page in doc)

In [None]:
def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx document, one paragraph per line.

    Only the entries of the document's ``paragraphs`` collection are read.

    Args:
        file_path: Location of the document; passed unchanged to ``Document``.

    Returns:
        The ``text`` of each paragraph, in document order, joined with
        ``'\\n'``. An empty string is returned when there are no paragraphs.
    """
    document = Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

In [None]:
def extract_text_from_djvu(file_path):
    """Return the text layer of a DjVu document, one page per line.

    Args:
        file_path: Location of the document; passed unchanged to
            ``pydjvu.DjVuDocument``.

    Returns:
        The ``text`` of each page, in page order, joined with ``'\\n'``.
        Pages whose ``text`` is falsy (``None`` or empty) are left out.
    """
    document = pydjvu.DjVuDocument(file_path)

    # Pages are fetched by index, one at a time, and each page's text is read once.
    page_texts = (
        document.pages[index].text for index in range(document.num_pages)
    )

    return '\n'.join(chunk for chunk in page_texts if chunk)

In [6]:
def extract_text_from_zip(zip_path):
    """Concatenate the contents of every ``.txt`` member of a ZIP archive.

    Args:
        zip_path: Path (or file-like object) of the archive; passed unchanged
            to ``zipfile.ZipFile``.

    Returns:
        The decoded text of each member whose name ends with ``'.txt'``, in
        archive order, joined with ``'\\n'``. An empty string is returned
        when the archive holds no such member.

    Raises:
        UnicodeDecodeError: If a ``.txt`` member is not valid UTF-8.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # The join consumes the generator while the archive is still open.
        return '\n'.join(
            # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8' but also
            # drops a leading byte-order mark, so BOM-prefixed files do not
            # leak a stray '\ufeff' into the extracted text.
            zip_ref.read(file_name).decode('utf-8-sig')
            for file_name in zip_ref.namelist()
            if file_name.endswith('.txt')
        )

In [7]:
# Anaphora resolution
def resolve_coreferences(text):
    """Return ``text`` with resolvable pronouns replaced by their referents.

    The text is parsed with the module-level ``nlp`` pipeline, which must
    contain the ``coreferee`` component. Every token tagged ``PRON`` is
    looked up with ``doc._.coref_chains.resolve``; when that yields referent
    tokens, the pronoun is replaced by their texts (several referents are
    joined with ``' and '``). All other tokens, and the original whitespace,
    are reproduced unchanged.

    The output is rebuilt token by token rather than with ``str.replace`` on
    the whole text: a global replace would also rewrite unrelated occurrences
    of the pronoun string, including inside other words (``he`` in ``the``).
    Using ``resolve`` also avoids assuming that the first mention of a chain
    is the antecedent, since a chain may begin with a pronoun.

    Args:
        text: The text in which to resolve pronouns.

    Returns:
        The text with resolved pronouns substituted. Only the referent
        token's own text is inserted, so possessive pronouns lose their
        possessive form.
    """
    doc = nlp(text)
    chains = doc._.coref_chains

    pieces = []
    for token in doc:
        # resolve() is only consulted for pronouns; a falsy result (None or an
        # empty list) means there is nothing to substitute for this token.
        referents = chains.resolve(token) if token.pos_ == 'PRON' else None
        if referents:
            pieces.append(' and '.join(referent.text for referent in referents))
        else:
            pieces.append(token.text)
        # Re-attach the token's trailing whitespace so spacing matches the input.
        pieces.append(token.whitespace_)

    return ''.join(pieces)