In [None]:
%pip install unstructured
%pip install trasformers
%pip install "unstructured[all-docs]"
%pip install chromadb
%pip install sentence_transformers

In [None]:
import os
import pickle

import chromadb
import requests
import torch
from PIL import Image
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from transformers import AutoTokenizer, CLIPTextModelWithProjection
from unstructured.partition.pdf import partition_pdf


In [None]:
def get_chunks_of_texts_and_images_from_pdf(path_file, image_path="figures", max_characters=1000, overlap=100):
    """Partition a PDF into text chunks and dump its images to a folder.

    The work is delegated to ``partition_pdf`` with the ``hi_res`` strategy and
    ``by_title`` chunking; image blocks are written to ``image_path`` rather
    than embedded in the returned elements
    (``extract_image_block_to_payload=False``).

    Args:
        path_file: Path of the PDF to partition.
        image_path: Folder handed to ``partition_pdf`` as the image output
            directory. Defaults to ``"figures"``.
        max_characters: Forwarded to ``partition_pdf`` as ``max_characters``.
        overlap: Forwarded to ``partition_pdf`` as ``overlap``.

    Returns:
        A ``(chunks_of_texts, image_path)`` tuple: whatever ``partition_pdf``
        returned, and the folder name that was passed in.
    """
    chunks_of_texts = partition_pdf(
        path_file,
        chunking_strategy="by_title",
        strategy="hi_res",
        extract_images_in_pdf=True,
        extract_image_block_to_payload=False,
        extract_image_block_output_dir=image_path,
        max_characters=max_characters,
        overlap=overlap,
    )

    return chunks_of_texts, image_path

In [None]:
def emdedd_images(image_path, embedding_model):
    """Load every readable image in a folder and embed the images.

    Args:
        image_path: Folder that holds the image files (without trailing slash).
        embedding_model: Object exposing ``encode(list_of_images)``.

    Returns:
        A tuple ``(images_original, image_embeddings, image_paths)`` whose
        entries are index-aligned. Files are visited in sorted name order so
        the result is reproducible. Files that cannot be opened or decoded are
        reported on stdout and skipped. If the folder does not exist or holds
        no readable image, three empty lists are returned and
        ``embedding_model`` is not called.
    """

    def get_images_and_paths_from_folder(folder_path):
        """Return the decodable images in ``folder_path`` and their paths."""
        images = []
        image_paths = []
        # No folder means there is nothing to load; report that as "no images"
        # instead of letting os.listdir raise.
        if not os.path.isdir(folder_path):
            return images, image_paths
        # os.listdir order is arbitrary; sort for a deterministic result.
        for filename in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, filename)
            try:
                img = Image.open(file_path)
                # Image.open is lazy: force decoding now so corrupt files are
                # caught here and the underlying file handle can be released.
                img.load()
            except Exception as e:  # best effort: skip anything that is not a readable image
                print(f"Error loading image {filename}: {e}")
                continue
            images.append(img)
            image_paths.append(file_path)
        return images, image_paths

    # The trailing '/' is kept so the stored paths look like "<folder>/<file>".
    images_original, image_paths = get_images_and_paths_from_folder(image_path + '/')
    if not images_original:
        return [], [], []
    image_embeddings = embedding_model.encode(images_original)

    return images_original, image_embeddings, image_paths

In [None]:
def embedd_texts(chunks_of_texts,summarize_model,embedding_model,embedding_tokenizer):
    """Summarize each text chunk and embed the summaries.

    Args:
        chunks_of_texts: Iterable of elements exposing ``.text`` and ``.id``.
        summarize_model: Callable taking a list of strings plus
            ``max_length``/``min_length`` and returning one dict per input with
            a ``'summary_text'`` key.
        embedding_model: Callable taking the tokenizer output as keyword
            arguments and returning an object with a ``text_embeds`` attribute.
        embedding_tokenizer: Callable turning a list of strings into the model
            inputs.

    Returns:
        ``(original_texts, text_embeddings, text_ids)``: the unsummarized chunk
        texts, one embedding (nested Python lists) per chunk, and the chunk
        ids, all index-aligned. Three empty lists are returned when there are
        no chunks, without calling either model.
    """
    elements = list(chunks_of_texts)
    original_texts = [element.text for element in elements]
    text_ids = [element.id for element in elements]
    if not original_texts:
        return [], [], []

    # NOTE(review): max_length=77 presumably mirrors CLIP's text length limit;
    # the tokenizer call below truncates anyway - confirm the intent.
    summaries = summarize_model(original_texts, max_length=77,min_length=10)
    short_original_texts = [summary['summary_text'] for summary in summaries]

    inputs = embedding_tokenizer(short_original_texts, padding=True,  return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        outputs = embedding_model(**inputs)
    text_embeddings = outputs.text_embeds.tolist()

    return original_texts, text_embeddings, text_ids
    

In [None]:

class SearchEngine():
    """Index one PDF (text chunks and extracted images) and answer text queries.

    Construction downloads the PDF, partitions it, embeds its images and the
    summaries of its text chunks, and stores everything in the persistent
    Chroma collection ``pdf_collection`` under ``chroma_collections/``.
    Because the collection is persistent and its name is fixed, entries from
    earlier runs remain in it.
    """

    def __init__(self,url) -> None:
        """Download the PDF at ``url`` and build the index.

        Args:
            url: HTTP(S) address of the PDF; it is saved as ``paper.pdf`` in
                the current working directory.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        path_file = 'paper.pdf'
        # A timeout keeps the cell from hanging on a stalled connection, and
        # the status check prevents saving an HTML error page as the PDF.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(path_file, 'wb') as f:
            f.write(response.content)

        chunks_of_texts, image_path = get_chunks_of_texts_and_images_from_pdf(path_file)

        self.embedding_model_images = SentenceTransformer('clip-ViT-B-32')
        summarize_model = pipeline("summarization", model="facebook/bart-large-cnn")
        self.embedding_model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        self.embedding_tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        images_original, image_embeddings, image_paths = emdedd_images(image_path, self.embedding_model_images)

        original_texts, text_embeddings, text_ids = embedd_texts(chunks_of_texts,summarize_model,self.embedding_model,self.embedding_tokenizer)

        client = chromadb.PersistentClient(path="chroma_collections/")
        self.collection = client.get_or_create_collection(name="pdf_collection")

        # Text embeddings arrive as nested lists; bring image embeddings to
        # the same form so both batches are stored consistently.
        if hasattr(image_embeddings, "tolist"):
            image_embeddings = image_embeddings.tolist()

        # upsert (not add) keeps re-running the notebook idempotent: the
        # collection is persistent, so the same ids are already stored on the
        # second run. Empty batches are skipped.
        if len(image_paths) > 0:
            # Images are stored with their file path as both document and id.
            self.collection.upsert(
                embeddings = image_embeddings,
                documents = image_paths,
                ids = image_paths
            )

        if len(text_ids) > 0:
            self.collection.upsert(
                embeddings = text_embeddings,
                documents = original_texts,
                ids = text_ids
            )

    def retriewe_information(self,query_text:str,n_results):
        """Return the ``n_results`` stored entries closest to the query.

        Args:
            query_text: Query text handed to the tokenizer (the notebook passes
                a one-element list of strings).
            n_results: Number of matches requested from the collection.

        Returns:
            The result of ``collection.query`` for the embedded query.
        """
        inputs = self.embedding_tokenizer(query_text, padding=True,  return_tensors="pt", truncation=True)
        # Inference only: skip autograd bookkeeping for the forward pass.
        with torch.no_grad():
            outputs = self.embedding_model(**inputs)
        query_embeddings = outputs.text_embeds.tolist()
        results = self.collection.query(query_embeddings=query_embeddings, n_results=n_results)

        return results


In [None]:
# Build the search engine: SearchEngine downloads the PDF at `url` and indexes it.
# (A local path is not used by this cell; the line below is a leftover.)
#path_file = "docs/2307.06435.pdf"
url = 'https://arxiv.org/pdf/2307.06435.pdf'
engine = SearchEngine(url)

In [None]:
# Number of matches to request from the index.
n_answers = 3
# The query is passed as a one-element list; the result is kept in the
# module-level `query_result`, which vizualize_results() reads.
query_result = engine.retriewe_information(['What is the trend of papers released over the years containing the keywords Large Language Model'],n_answers)

def vizualize_results(results=None):
    """Show the documents of a query result: images are opened, text is printed.

    Args:
        results: Mapping whose ``'documents'`` entry is a list holding the list
            of matched documents for the first query. When omitted, the
            module-level ``query_result`` is used, so the original no-argument
            call keeps working.
    """
    if results is None:
        results = query_result
    for document in results['documents'][0]:
        # Image entries are recognized by a document string that starts with
        # the 'figures' folder; every other document is treated as text.
        if document.split('/')[0]=='figures':
            # The context manager releases the file handle after showing.
            with Image.open(document) as img:
                img.show()
        else:
            print(document)

# Display the matches of the query above: image hits are shown with PIL's Image.show(), text hits are printed.
vizualize_results()
               