In [20]:
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer, MBartForConditionalGeneration, pipeline
import os


# The recorded run of this cell is flooded with huggingface/tokenizers
# "process just got forked" warnings; the warning's own advice is to set
# TOKENIZERS_PARALLELISM explicitly. setdefault keeps any value that was
# exported on purpose before the kernel started.
os.environ.setdefault('TOKENIZERS_PARALLELISM', 'false')

# Tell Tesseract where its language data lives before any OCR call is made.
os.environ['TESSDATA_PREFIX'] = '/opt/homebrew/share/tessdata'

# Tokenizer and model weights are both loaded from the same local directory.
tokenizer = AutoTokenizer.from_pretrained('./ru-mbart-large-summ')
model = MBartForConditionalGeneration.from_pretrained('./ru-mbart-large-summ')

# Render every page of the PDF to an image at 300 DPI.
pages = convert_from_path('test3.pdf', dpi=300)

# OCR page by page, prefixing each page with a "--- Страница N ---" marker.
# Chunks are collected and joined once instead of growing a string in the loop.
page_chunks = []
for i, page in enumerate(pages):
    text = pytesseract.image_to_string(page, lang='rus+eng')
    page_chunks.append(f'--- Страница {i+1} ---\n' + text + '\n')
extracted_text = ''.join(page_chunks)

summ = pipeline("summarization", model=model, tokenizer=tokenizer)

# truncation=True makes the pipeline's tokenizer cut over-long OCR text down to
# the tokenizer's maximum input length instead of feeding the model more tokens
# than it accepts.
res = summ(extracted_text, truncation=True)[0]['summary_text']
res

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

'Министр образования и науки РФ Владимир ПРЕОБРАЖЕНСКИЙ: «Высшее профессиональное образование Москвы — это школа дополнительного образования»'

In [29]:
import pytesseract
from pdf2image import convert_from_path
from transformers import AutoTokenizer, MBartForConditionalGeneration, pipeline
import os
import torch

# Tesseract reads its language data from the directory named in TESSDATA_PREFIX.
os.environ['TESSDATA_PREFIX'] = '/opt/homebrew/share/tessdata'

# Use CUDA when torch reports it as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device('cpu')

# Tokenizer and model are both loaded from the 'tuned/final_model' directory;
# the model is then moved onto the selected device.
tokenizer = AutoTokenizer.from_pretrained('tuned/final_model')
model = MBartForConditionalGeneration.from_pretrained('tuned/final_model')
model = model.to(device)


def convert_docx_to_txt(docx_path):
    """
    Read a .docx document and return its content as a single string.

    Args:
        docx_path: Path to the .docx file; passed unchanged to ``Document``.

    Returns:
        str: The ``text`` of every entry in ``doc.paragraphs``, joined with
        newline characters.
    """
    # Imported at function scope because the cell's import block never brings
    # `Document` into scope; without this the call below raises NameError when
    # the notebook is run on a fresh kernel.
    from docx import Document  # python-docx

    doc = Document(docx_path)
    return "\n".join(para.text for para in doc.paragraphs)

def ocr_pdf(pdf_path, dpi=300, lang='rus+eng'):
    """
    Run OCR over every page of a .pdf document and return the extracted text.

    Args:
        pdf_path: Path to the .pdf file; passed to ``convert_from_path``.
        dpi: Resolution used when rendering the pages to images. Defaults to
            300, the value that was previously hard-coded.
        lang: Language specification handed to ``pytesseract.image_to_string``.
            Defaults to 'rus+eng', the value that was previously hard-coded.

    Returns:
        str: The OCR output of all pages concatenated in page order, with no
        separator inserted between pages.
    """
    # Set on every call so the function also works when no earlier cell has
    # configured the Tesseract data directory.
    os.environ['TESSDATA_PREFIX'] = '/opt/homebrew/share/tessdata'
    pages = convert_from_path(pdf_path, dpi=dpi)
    # Join once instead of growing a string page by page.
    return ''.join(pytesseract.image_to_string(page, lang=lang) for page in pages)

def get_text_from_file(file_path):
    """
    Extract text from a file; the .txt, .docx and .pdf formats are supported.

    The format is chosen from the lower-cased file extension. Plain-text files
    are read as UTF-8, .docx files go through ``convert_docx_to_txt`` and .pdf
    files go through ``ocr_pdf``. For any other extension a message is printed
    and an empty string is returned.
    """
    suffix = os.path.splitext(file_path)[1].lower()

    if suffix == ".pdf":
        return ocr_pdf(file_path)
    if suffix == ".docx":
        return convert_docx_to_txt(file_path)
    if suffix != ".txt":
        # Unsupported format: report it and hand back an empty string.
        print(f"Неподдерживаемый формат файла: {file_path}")
        return ""

    with open(file_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents
    

# Summarization pipeline on the selected device: sampling is disabled and the
# generated summary is bounded by min_length=60 / max_length=120.
summ = pipeline("summarization", device=device, model=model, tokenizer=tokenizer, do_sample=False, max_length=120, min_length=60)

text = get_text_from_file("proc_data/7.txt")
# Only `truncation` is passed at call time. The pipeline forwards `truncation`
# to the tokenizer but treats other call-time keywords as generation arguments,
# so the previous `max_length=1024` here overrode the max_length=120 configured
# above instead of limiting the input length.
# NOTE(review): the input is truncated to the tokenizer's own maximum length;
# confirm that this is 1024 for the 'tuned/final_model' tokenizer.
tokenizer_kwargs = {'truncation': True}
res = summ(text, **tokenizer_kwargs)[0]['summary_text']
res

'Настоящее положение дел в Российском государственном научно-техническом университете создано в соответствии с законом «О государственной политике Российской Федерации» и уставом Национального исследовательского университета «Высшая школа экономики» (НОУ) «Управление правовой политики НИУ ВШЭ».'

In [14]:
# NOTE(review): this cell defines neither `summ` nor `extracted_text`; it relies
# on both already existing in the kernel from other cells.
# Only `truncation` is passed: the pipeline treats a call-time `max_length` as a
# generation argument (the summary length cap), not as the tokenizer's input
# limit, so the previous `max_length=1024` did not do what its name suggested.
tokenizer_kwargs = {'truncation': True}
summ(extracted_text, **tokenizer_kwargs)

[{'summary_text': 'Эксперты «МК» узнали, как будет развиваться отечественный интеллектуальный интеллект'}]