In [None]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import HuggingFaceEmbeddings
import aspose.words as aw
import os
import pdfminer
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json
import pytesseract
# Make the Tesseract OCR install folder discoverable by appending it to PATH.
# os.pathsep replaces the hard-coded ';', and the (substring) membership check
# keeps the cell idempotent: re-running it does not append the folder again.
TESSERACT_DIR = r'C:\Program Files\Tesseract-OCR'
if TESSERACT_DIR not in os.environ.get('PATH', ''):
    os.environ['PATH'] = os.environ.get('PATH', '') + os.pathsep + TESSERACT_DIR

**Parsiranje pdf fajla u elemente pomoću partition_pdf, hi_res strategije i yolox modela. Elemente ćemo sačuvati u folder results/ da bi se mogle isprobavati različite strategije chunkovanja i veličine chunkova.**

In [None]:
# Split the PDF into elements; the options are collected first and then
# passed to partition_pdf in a single call.
partition_options = dict(
    filename='../data/ISLP.pdf',    # path to the PDF file
    strategy="hi_res",              # strategy used to process the PDF document
    infer_table_structure=True,     # detect tables in the document and structure them
    model_name="yolox",             # model used to detect and analyse objects in images
    extract_images_in_pdf=True,     # extract images from the PDF into the figures/ folder
)
elements = partition_pdf(**partition_options)

In [None]:
import pickle

# Save the partitioned elements so different chunking strategies can be tried
# later from the stored copy. The target folder is created first so the cell
# does not fail with FileNotFoundError when results/ does not exist yet.
elements_cache_path = "results/pdf_elements.pkl"
os.makedirs(os.path.dirname(elements_cache_path), exist_ok=True)
with open(elements_cache_path, "wb") as file:
    pickle.dump(elements, file)

In [None]:
# Read the stored elements back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("results/pdf_elements.pkl", "rb") as cache_file:
    elements = pickle.load(cache_file)

In [None]:
# Number of items currently held in `elements`.
len(elements)

8555

**Vidimo da nisu svi elementi koji su Header/Footer prepoznati kao Header/Footer, već da su mnogi Header/Footer elementi prepoznati kao elementi klase ListItem, pa je potrebno po sadržaju odrediti koji elementi remete grupisanje u chunkove. Zato će sadržaj svih ListItem, Header, Footer elemenata biti skladišten u .csv fajl da bi se istrenirao klasifikator koji prepoznaje po sadržaju da li je element Header/Footer ili regularni ListItem (bullet u nabrajanju).**

In [None]:
import csv

# Collect the text of every Header, Footer and ListItem element. The 'junk'
# column is written empty (presumably to be labelled afterwards — confirm).
candidate_types = {"Footer", "Header", "ListItem"}
fieldnames = ['text', 'content_type', 'junk']

train_data = [
    {'text': element.text, 'content_type': type(element).__name__, 'junk': ''}
    for element in elements
    if type(element).__name__ in candidate_types
]

filename = "../data/junk_classification.csv"
with open(filename, mode='w', newline='', encoding='utf-8') as file:
    # Explicit field names: the header row is still written when no element
    # matched, instead of raising IndexError on train_data[0].
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(train_data)

**U fajlu bert_for_text_classification nalazi se fine-tunovan bert-base-uncased model.**

In [None]:
from bert_for_text_classification import load_model, predict_probabilities

# Load the model and tokenizer, then run one sample string through the
# classifier as a sanity check.
model, tokenizer = load_model()
sample_text = "1. Introduction 11"
predictions = predict_probabilities(sample_text, model, tokenizer)

In [None]:
model, tokenizer = load_model()

# Drop Header / Footer / ListItem elements that the classifier scores as junk.
# A new list is built instead of calling elements.remove() inside the loop:
# removing while iterating skips the element that follows each removal, and
# remove() deletes the first *equal* element rather than the current one.
candidate_types = {'Header', 'Footer', 'ListItem'}
kept_elements = []
for element in elements:
    if type(element).__name__ in candidate_types:
        probs = predict_probabilities(element.text, model, tokenizer)
        if probs[0][1].float() > 0.5:  # second score above 0.5 means junk
            continue
    kept_elements.append(element)
elements = kept_elements

In [None]:
len(elements) # author's note: 585 elements were removed — NOTE(review): the recorded outputs (8555 before, 7975 after) differ by 580; confirm

7975

**Sačuvaćemo elemente koji su redukovani za Header / Footer sadržaj u odnosu na početni skup pdf elemenata.**

In [None]:
# Save the elements that remain after the Header / Footer content was removed.
# The target folder is created first so the cell does not fail with
# FileNotFoundError when results/ does not exist yet.
filtered_elements_path = "results/elements_without_header.pkl"
os.makedirs(os.path.dirname(filtered_elements_path), exist_ok=True)
with open(filtered_elements_path, "wb") as file:
    pickle.dump(elements, file)

In [None]:
import pickle

# Read the filtered elements back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("results/elements_without_header.pkl", "rb") as filtered_file:
    elements = pickle.load(filtered_file)

**Povezivanje tekstualnih elemenata u instance klase CompositeElement kao skup tekstualnih elemenata**

In [None]:
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title

# Options for title-based chunking, collected first and passed in one call.
# The text separator is fixed by title-based chunking (i.e. \n\n), so there is
# no option to set it here.
# NOTE(review): `overlap` looks like it expects a character count in
# unstructured; True would then behave as 1 — confirm whether an integer or
# `overlap_all=True` was intended.
title_chunking_options = dict(
    combine_text_under_n_chars=2500,  # chunks smaller than 2500 characters get combined
    max_characters=4000,              # maximum chunk size
    multipage_sections=True,
    new_after_n_chars=3800,           # a chunk larger than this stops growing
    overlap=True,
)
elements_chunk_by_title = chunk_by_title(elements, **title_chunking_options)

In [None]:
# Save the title-based chunks. The nested target folder is created first so the
# cell does not fail with FileNotFoundError when it does not exist yet.
chunked_elements_path = "results/chunking_strategy_4/chunked_elements.pkl"
os.makedirs(os.path.dirname(chunked_elements_path), exist_ok=True)
with open(chunked_elements_path, "wb") as file:
    pickle.dump(elements_chunk_by_title, file)

In [None]:
# Basic chunking of the same elements; this cell is kept as a place to try
# other chunking strategies.
basic_chunking_options = {"max_characters": 8000, "new_after_n_chars": 7800}
elements_after_chunking = chunk_elements(elements, **basic_chunking_options)

**U sledećoj ćeliji nalazi se celokupni kod ubačen u jednu funkciju**

In [None]:
def parse_pdf(pdf_path='../data/ISLP.pdf', junk_threshold=0.5):
    """Partition a PDF, drop junk header/footer elements and chunk the rest by title.

    Args:
        pdf_path: Path to the PDF file to parse. Defaults to the file the
            notebook works on.
        junk_threshold: A Header, Footer or ListItem element is dropped when
            its junk score (``probs[0][1]``) is greater than this value.

    Returns:
        The result of ``chunk_by_title`` applied to the elements that were kept.
    """
    # Text extraction.
    elements = partition_pdf(
        filename=pdf_path,           # path to the PDF file
        strategy="hi_res",           # strategy used to process the PDF document
        infer_table_structure=True,  # detect tables in the document and structure them
        model_name="yolox",          # model used to detect and analyse objects in images
        extract_images_in_pdf=True,
    )

    # Header / footer elimination. Only Header, Footer and ListItem elements are
    # scored, and the junk test sits inside that branch so every other element
    # type is always kept. A new list is built because removing from `elements`
    # while iterating over it skips the element that follows each removal.
    model, tokenizer = load_model()
    candidate_types = {'Header', 'Footer', 'ListItem'}
    kept_elements = []
    for element in elements:
        if type(element).__name__ in candidate_types:
            probs = predict_probabilities(element.text, model, tokenizer)
            if probs[0][1].float() > junk_threshold:  # scored as junk
                continue
        kept_elements.append(element)

    # Chunking. The text separator is fixed by title-based chunking (i.e. \n\n),
    # so there is no option to set it here.
    # NOTE(review): `overlap` looks like it expects a character count in
    # unstructured; True would then behave as 1 — confirm the intended value.
    elements_chunk_by_title = chunk_by_title(
        kept_elements,
        combine_text_under_n_chars=1000,  # chunks smaller than 1000 characters get combined
        max_characters=1600,              # maximum chunk size
        multipage_sections=True,
        new_after_n_chars=1200,           # a chunk larger than this stops growing
        overlap=True,
    )
    return elements_chunk_by_title

In [None]:
# Run the whole pipeline defined in parse_pdf and keep the resulting chunks.
chunks = parse_pdf()

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
