In [None]:
#@title Instalation des packages
# Install the third-party packages used by the notebook.
# "&> /dev/null" discards pip's output, so installation errors are not shown.
!pip install salesforce-lavis &> /dev/null
!pip install faiss-cpu &> /dev/null
!pip install xmltodict &> /dev/null
!pip install gradio &> /dev/null
# Stop the current kernel; the notebook text below asks the user to wait for
# the environment to restart before running the next cells.
exit()
# NOTE(review): this line follows exit(), so it is presumably never reached - confirm.
print("OK")

🟧 **WAIT RESTART** 🟧
**Attendez que l'environnement redémarre avant de lancer les cellules suivantes**

In [1]:
#@title Configuration GoogleDrive

from google.colab import drive
# Mount the user's Google Drive under /content/drive/.
drive.mount('/content/drive/')

# Root folder of the application data on the mounted drive. The trailing "/"
# matters: SearchMantic builds its sub-folder paths by plain string
# concatenation with this value.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content - confirm.
SMANTIC_DIR = "drive/MyDrive/smantic/"

Mounted at /content/drive/


In [2]:
#@title Imports
from enum import Enum
import os
from math import dist
import time
from transformers import MarianMTModel, MarianTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import csv
import requests
import xmltodict
from PIL import Image
import shutil
import faiss
import numpy as np
from lavis.models import load_model_and_preprocess
import zipfile
from io import BytesIO
import ast
import sys
import traceback
import io
import gradio as gr

class SearchMantic:
    class Source(Enum):
        """Kind of corpus source.

        Lists the possible kinds of corpus to process; each member's value is
        a short string label.
        """
        ARK = "Ark"
        URL = "Url"
        LOCAL = "Local"

    def __init__(self):
        """Prepare a search session: settings, folders, device and models.

        Records the model identifiers, the CSV column names and the working
        folders (all located under the module-level ``SMANTIC_DIR``), then
        selects the compute device, creates the folders, loads the models and
        lists the index files that are already available.
        """
        # NOTE(review): "__file__" is a quoted string here, so the resolved
        # folder is the current working directory - confirm this is intended.
        here = os.path.abspath("__file__")
        os.chdir(os.path.dirname(here))

        # Model identifiers and default batch size.
        self.translationModelName = 'Helsinki-NLP/opus-mt-fr-en'
        self.blip2ModelName = 'Salesforce/blip2-opt-2.7b-coco'
        self.default_batch_size = 1

        # Column names used in the corpus CSV files.
        (self.arkColName,
         self.urlImageColName,
         self.LegendColName,
         self.LocalImageColName,
         self.EnLegendColName,
         self.LegendEmbeddingColName,
         self.ImageEmbeddingColName) = (
            "ark",
            "image",
            "legend",
            "local_image",
            "en_legend",
            "legend_embedding",
            "image_embedding",
        )

        # Folder names, then their full paths under SMANTIC_DIR.
        self.indexsDirName, self.imagesDirname, self.datasetsDirname = "INDEXS", "IMAGES", "TMP"
        self.indexsDir = f"{SMANTIC_DIR}{self.indexsDirName}"
        self.imagesDir = f"{SMANTIC_DIR}{self.imagesDirname}"
        self.datasetsDir = f"{SMANTIC_DIR}{self.datasetsDirname}"

        self.CPU = "cpu"
        self.device = self.get_device()

        # Order matters: the folders must exist before the indexes are listed.
        for setup_step in (self.create_directories, self.load_models, self.get_local_indexs):
            setup_step()

    def create_directories(self):
        if not os.path.exists(self.indexsDir):
            os.makedirs(self.indexsDir)
        if not os.path.exists(self.imagesDir):
            os.makedirs(self.imagesDir)
        if not os.path.exists(self.datasetsDirname):
            os.makedirs(self.datasetsDirname)

    def nombre_pages(self, ark):
        """Retourne le nombre de pages du document)

        Args:
            ark (str): Identifiant ark

        Returns:
            int: Nombre de pages dans le document
        """
        # In : identifiant ark | Out : nombre de pages (int)
        PAGINATION_BASEURL = 'https://gallica.bnf.fr/services/Pagination?ark='
        url = "".join([PAGINATION_BASEURL, ark])

        s = requests.get(url, stream=True)

        paginationdic = xmltodict.parse(s.text)
        nb_pages = int(paginationdic["livre"]["structure"]["nbVueImages"])
        return nb_pages

    def rect_distance(self, rect1, rect2):
        """Calcul la distance entre le milieu-bas du rectangle superieur (image) et le milieu-haut du rectangle inferieur(légende)

        Args:
            rect1 ([int]): Coordonées de l'image
            rect2 ([int]): Coordonnées de la possible légende

        Returns:
            float: Distance entre
        """
        x1, y1, x1b, y1b = rect1
        x2, y2, x2b, y2b = rect2
        # Coordonnées des milieux des côtés
        milieu_haut_rect2 = ((x2 + x2b) / 2, y2)
        milieu_bas_rect1 = ((x1 + x1b) / 2, y1b)
        # Calcul de la distance entre les milieux
        distance = dist(milieu_haut_rect2, milieu_bas_rect1)
        return distance

    def get_device(self):
        """Détecte la présence d'un GPU

        Returns:
            _type_: _description_
        """
        if torch.cuda.is_available():
            print(f"Using GPU : {torch.cuda.get_device_name(0)}")
            device = torch.device("cuda:0")
        else:
            print("Using CPU")
            device = torch.device("cpu")
        return device
    def reload_indexes(self):
        self.indexs = {filename: os.path.join(self.indexsDir, filename)  for filename in os.listdir(self.indexsDir) if filename.endswith(".smantic")}

    def get_local_indexs(self):
        """Liste les corpus indexés disponibles
        """
        if not os.path.exists(self.indexsDir):
            os.makedirs(self.indexsDir)
            print(f"\nLe dossier d'indexs {self.indexsDir} n'existe pas et a été créé.")
        #print(f"\nDossier d'indexs : {self.indexsDir}")
        _index_file_names = os.listdir(self.indexsDir)
        #self.indexs = {{filename: os.path.join(self.indexsDir, filename)} for filename in _index_file_names if filename.endswith(".smantic")}
        self.indexs = {filename: os.path.join(self.indexsDir, filename)  for filename in _index_file_names if filename.endswith(".smantic")}
        #print(f"\t{len(self.indexs)} corpus trouvé(s)")
        #for index in self.indexs:
            #print(f"\t\t- {index[0]} : {index[1]}")

    def load_corpus(self, corpus_name):
        print(f"Loading {corpus_name}")
        options = ["Image"]
        with zipfile.ZipFile(self.indexs[corpus_name], "r") as zipf:
            _serialized_index_image = zipf.read("cpu_image.index")
            try:
                _serialized_index_legend = zipf.read("cpu_legend.index")
                _serialized_index_mean = zipf.read("cpu_mean.index")

                options+=["Texte & Image"]
                options+=["Texte"]
            except:
                pass
            with zipf.open("dataset.csv") as dataframe_file:
                self.dataset = pd.read_csv(dataframe_file)

        _serialized_index_image = BytesIO(_serialized_index_image)
        _serialized_index_image = np.frombuffer(_serialized_index_image.getvalue(), dtype=np.uint8)
        self.index_image = faiss.deserialize_index(_serialized_index_image)
        print("Image index deserialized")

        try:

            _serialized_index_legend = BytesIO(_serialized_index_legend)
            _serialized_index_legend = np.frombuffer(_serialized_index_legend.getvalue(), dtype=np.uint8)
            self.index_legend = faiss.deserialize_index(_serialized_index_legend)
            print("Legend index deserialized")

            _serialized_index_mean = BytesIO(_serialized_index_mean)
            _serialized_index_mean = np.frombuffer(_serialized_index_mean.getvalue(), dtype=np.uint8)
            self.index_mean = faiss.deserialize_index(_serialized_index_mean)
            print("Mean index deserialized")
            print(f"'{corpus_name}' loaded")
        except:pass
        return options

    def embedd_input(self, input_txt, translate = True):

        if translate:
            print("Input translation")

            input_txt = self.translation_tokenizer(input_txt, return_tensors="pt", padding=True, truncation=True)
            input_txt = input_txt.to(self.device)
            input_txt = self.translation_model.generate(**input_txt)
            input_txt = self.translation_tokenizer.decode(input_txt[0], skip_special_tokens=True)
            print("Input translation done.")
        print("Input Embdding")
        sample = {"text_input": input_txt}
        text_emb = self.blip2_model.extract_features(sample, mode="text").text_embeds_proj[:,0,:] # size (1, 256)
        text_emb /= text_emb.norm(dim=-1, keepdim=True)
        print("Input embedding done.")
        return text_emb

    def search_sim_images(self, input_embedding, search_type, img_count, use_local_images = True):
        if search_type == "legend_embedding":
            index = self.index_legend
        elif search_type == "image_embedding":
            index = self.index_image
        else:
            index = self.index_mean
        distances, indices = index.search(input_embedding.cpu(), img_count)
        distances = distances[0]
        indices = indices[0]

        indices_distances = list(zip(indices, distances))
        indices_distances.sort(key=lambda x: x[1], reverse=True)

        if use_local_images and self.LocalImageColName in self.dataset.columns:
            print([SMANTIC_DIR+self.dataset.loc[indices,self.LocalImageColName] for indices, distances in indices_distances])
            if self.LegendColName in self.dataset.columns:
                return [(Image.open(SMANTIC_DIR+self.dataset.loc[indices, self.LocalImageColName]).convert("RGB"), f"{distances} - {self.dataset.loc[indices, self.LegendColName]}") for indices, distances in indices_distances]
            elif self.EnLegendColName in self.dataset.columns:
                return [(Image.open(SMANTIC_DIR+self.dataset.loc[indices, self.LocalImageColName]).convert("RGB"), f"{distances} - {self.dataset.loc[indices, self.EnLegendColName]}") for indices, distances in indices_distances]
            else:
                return [(Image.open(SMANTIC_DIR+self.dataset.loc[indices, self.LocalImageColName]).convert("RGB"), f"{distances}") for indices, distances in indices_distances]

        elif self.urlImageColName in self.dataset.columns:
            print([self.dataset.loc[indices,self.urlImageColName] for indices, distances in indices_distances])

            if self.LegendColName in self.dataset.columns:
                return [(Image.open(requests.get(self.dataset.loc[indices, self.urlImageColName], stream=True).raw), f"{distances} - {self.dataset.loc[indices, self.LegendColName]}") for indices, distances in indices_distances]
            elif self.EnLegendColName in self.dataset.columns:
                return [(Image.open(requests.get(self.dataset.loc[indices, self.urlImageColName], stream=True).raw), f"{distances} - {self.dataset.loc[indices, self.EnLegendColName]}") for indices, distances in indices_distances]
            else:
                return [(Image.open(requests.get(self.dataset.loc[indices, self.urlImageColName], stream=True).raw), f"{distances}") for indices, distances in indices_distances]


    def load_models(self):
        """Load the translation model and the BLIP-2 feature extractor.

        Sets ``self.translation_tokenizer`` and ``self.translation_model``
        (moved to ``self.device``), and ``self.blip2_model``,
        ``self.blip2_image_processor`` and ``self.blip2_text_processor``.
        """
        print("Loading translation model...")
        # A tokenizer holds no weights, so it takes no device argument; the
        # tokenized batches are moved to the device where they are used.
        self.translation_tokenizer = MarianTokenizer.from_pretrained(self.translationModelName)
        self.translation_model = MarianMTModel.from_pretrained(self.translationModelName)
        self.translation_model.to(self.device)
        print("Done.")

        print("Loading Blip2 model...")
        # is_eval=True puts the model in evaluation mode: it is only used for
        # feature extraction, and embeddings must be reproducible between
        # indexing time and query time.
        self.blip2_model, self.blip2_image_processor, self.blip2_text_processor = load_model_and_preprocess(
            name="blip2_feature_extractor",
            model_type="coco",
            is_eval=True,
            device=self.device,
        )
        print("Done.")

    def get_data_from_arks(self, arks, corpus_name):
        """Collecte les images et les légendes dans les pages des documents ark sur Gallica

        Args:
            arks ([str]): Liste d'identifiants ark  sur  Gallica
            corpus_name (str): Nom du corpus
        """
        if os.path.exists(os.path.join(self.datasetsDir, f"{corpus_name}.csv")):
            os.remove(os.path.join(self.datasetsDir, f"{corpus_name}.csv"))
        with open(os.path.join(self.datasetsDir, f"{corpus_name}.csv"), "a", encoding='utf-8') as data_file:
            csv_writer = csv.writer(data_file)
            csv_writer.writerow([self.urlImageColName, self.LegendColName])
            for ark in arks:
                print("\nARK :", ark)
                links = {}
                try:
                    pages = self.nombre_pages(ark)
                except:
                    print(f"Pas de page trouvé pour {ark}. Skipped")
                    continue

                for page in range(1, pages+1) :
                    images = []
                    texts = []
                    alto_url = 'https://gallica.bnf.fr/RequestDigitalElement?O={}&E=ALTO&Deb={}'.format(ark, page)
                    # Boucle de requête de l'alto de la page. Si erreur sleep 15 secondes. Skip la page à la 3eme erreur
                    fail_counter = 0
                    while True:
                        try:
                            s = requests.get(alto_url, stream=True)
                            break
                        except:
                            fail_counter += 1
                            if fail_counter > 2:
                                print("Echec de collecte de l'alto avec l'url :", alto_url, "Echecs:", fail_counter)
                                print("Top d'echecs, page is skiped")
                                break
                            else:
                                print("Echec de collecte de l'alto avec l'url :", alto_url, "Echecs:", fail_counter)
                                print("Nouvel essai dans 15 secondes")
                                time.sleep(15)
                                continue

                    # Vérifier si la page est est océrisée, sinon la page est skiped
                    try:
                        altodic = xmltodict.parse(s.text)
                    except :
                        print(ark, "Document non océrisé. Skiped")
                        break

                    # Collecte des images et légendes
                    print("==========", "Page", page,"==========")
                    cbs = altodic["alto"]["Layout"]["Page"].get("PrintSpace", {}).get("TextBlock", [])
                    if not isinstance(cbs, list): cbs = [cbs]

                    for cb in cbs:
                        content = []
                        textLines = cb.get("TextLine",[])
                        if not isinstance(textLines, list): textLines = [textLines]
                        for textLine in textLines:
                            strings = textLine.get("String",[])
                            if not isinstance(strings, list): strings = [strings]
                            content.extend(string.get("@CONTENT") for string in strings)
                        texts.append(((int(cb["@HPOS"]), int(cb["@VPOS"]), int(cb["@HPOS"])+int(cb["@WIDTH"]), int(cb["@VPOS"])+int(cb["@HEIGHT"])), " ".join(content)))

                    cbs = altodic["alto"]["Layout"]["Page"].get("PrintSpace", {}).get("Illustration", [])
                    if not isinstance(cbs, list): cbs = [cbs]
                    for cb in cbs:
                        images.append(((int(cb["@HPOS"]), int(cb["@VPOS"]), int(cb["@HPOS"])+int(cb["@WIDTH"]), int(cb["@VPOS"])+int(cb["@HEIGHT"])), cb))

                    cbs = altodic["alto"]["Layout"]["Page"].get("PrintSpace", {}).get("ComposedBlock", [])
                    if not isinstance(cbs, list): cbs = [cbs]
                    for cb in cbs:
                        illustration = cb.get("Illustration", [])
                        if not isinstance(illustration, list):
                            illustration = [illustration]
                        textBlocks = cb.get("TextBlock", [])
                        if not isinstance(textBlocks, list):
                            textBlocks = [textBlocks]
                        for cb in textBlocks:
                            content = []
                            textLines = cb.get("TextLine",[])
                            if not isinstance(textLines, list):
                                textLines = [textLines]
                            for textLine in textLines:
                                strings = textLine.get("String",[])
                                if not isinstance(strings, list): strings = [strings]
                                content.extend(string.get("@CONTENT") for string in strings)
                            texts.append(((int(cb["@HPOS"]), int(cb["@VPOS"]), int(cb["@HPOS"])+int(cb["@WIDTH"]), int(cb["@VPOS"])+int(cb["@HEIGHT"])), " ".join(content)))
                        for cb in illustration:
                            images.append(((int(cb["@HPOS"]), int(cb["@VPOS"]), int(cb["@HPOS"])+int(cb["@WIDTH"]), int(cb["@VPOS"])+int(cb["@HEIGHT"])),cb))

                    # Récupérer toutes les images (et leur légende) identifiées sur la page
                    for i,img in enumerate(images) :
                        url = "https://gallica.bnf.fr/iiif/ark:/12148/{}/f{}/{},{},{},{}/{}/0/native.jpg".format(ark,page,img[1]["@HPOS"],img[1]["@VPOS"],img[1]["@WIDTH"],img[1]["@HEIGHT"],"full")

                        # Identifier et récupérer la légende de l'image (si trouvée)
                        txt_rank = []
                        legend = []
                        for txt in texts:
                            distance = self.rect_distance(img[0], txt[0])
                            if distance <100 : legend.append(txt[1])
                            txt_rank.append((distance, txt[1]))
                        txt_rank.sort(key= lambda x : x[0])
                        if legend != []:
                            txt_legned = " ".join(legend)
                        else:
                            txt_legend = None
                        print("Image :", i, "| Description :", txt_legned)
                        csv_writer.writerow([url, txt_legned])

    def create_new_corpus(self, corpus_name, csv_path, translate_legends = True, translation_batch_size = None, embedding_batch_size = None, keep_images = True, sep=";", col_ark=None):
        """Créer un nouveau corpus d'images et de légendes à rechercher :
            - Collecte des urls pour les corpus d'identifiants arks
            - Traduction des légendes si option activée
            - Embedding des légendes (si présentes)
            - Embedding des images
            - Créationdes l'index FAISS à 3 canaux
            - Sauvegarde de l'index

        Args:
            corpus_name (str): Nom du corpus
            csv_path (str): chemin du fichier csv de corpus
            translate_legends (bool, optional): Traduire les légendes. Defaults to True.
            translation_batch_size (int, optional): Taille du batch pour la traduction. Defaults to None.
            embedding_batch_size (int, optional): Taille du batch pour les embeddings. Defaults to None.
        """
        if translation_batch_size is None:
            translation_batch_size = self.default_batch_size
        if embedding_batch_size is None:
            embedding_batch_size = self.default_batch_size

        print(f"\nCréation du corpus \"{corpus_name}\" à partir de \"{csv_path}\"")
        corpus_dataframe = pd.read_csv(csv_path, encoding="utf-8", sep = sep)
        """
        if self.arkColName in corpus_dataframe.columns:
            source = self.Source.ARK
        elif self.urlImageColName in corpus_dataframe.columns:
            source = self.Source.URL
        elif self.LocalImageColName in corpus_dataframe.columns:
            source = self.Source.LOCAL
        else:
            print("Erreur : Impossible de définir le type de source.")
            return
        """
        #print(f"\tType de source détecté : {source.value}")

        arks = corpus_dataframe[col_ark].tolist()
        if len(arks) == 0:
            print("Erreur : Aucun ark touvé")
            return
        else:
            print(f"\t{len(arks)} arks trouvés")
            self.get_data_from_arks(arks, corpus_name)
        corpus_csv_path = os.path.join(self.datasetsDir, f"{corpus_name}.csv")

        self.create_new_index(corpus_name,
                              csv_file = corpus_csv_path,
                              translate_legends = translate_legends,
                              translation_batch_size = translation_batch_size,
                              embedding_batch_size = embedding_batch_size,
                              keep_images=keep_images,
                              img_col = self.urlImageColName,
                              legend_col=self.LegendColName,
                              _sep=",",
                              local=False)

    def create_new_index(self, corpus_name, csv_file, translate_legends = True, translation_batch_size = None, embedding_batch_size = None, keep_images=True, img_col = None, legend_col=None, _sep=";", local=False):
        """ Crée un index FAISS pour un corpus embeddé.

        Args:
            corpus_name (str): Nom du corpus
            csv_file (str): path du corpus csv
            translate_legends (bool, optional): Traduire les légendes du corpus du françaos vers l'anglais. Defaults to True.
            translation_batch_size (int, optional): Taille du batch pour la traducrion. Defaults to None.
            embedding_batch_size (int, optional): Taille du batch pour les embeddings (textes et images). Defaults to None.
        """

        dataset = pd.read_csv(csv_file, encoding ="utf-8", sep=_sep)
        if translate_legends is True:
            dataset = dataset.rename(columns={legend_col: self.LegendColName})
        elif translate_legends is False:
            dataset = dataset.rename(columns={legend_col: self.EnLegendColName})
            print("titi",legend_col, local)
        if local is False:
            dataset = dataset.rename(columns={img_col: self.urlImageColName})
        else:
            dataset = dataset.rename(columns={img_col: self.LocalImageColName})



        if translation_batch_size is None:
            translation_batch_size = self.default_batch_size
        if embedding_batch_size is None:
            embedding_batch_size = self.default_batch_size

        # Traduction des legendes (s'il y a des légendes et que l'optio de traduction est activée)
        if self.LegendColName in dataset.columns and translate_legends and not self.EnLegendColName in dataset.columns :
            print("Traduction des légendes...")
            num_batches = len(dataset) // translation_batch_size + 1
            batches = [dataset.iloc[i*translation_batch_size:(i+1)*translation_batch_size] for i in range(num_batches)]
            dataset = pd.concat([self.translate_legend(batch) for batch in batches])
            print("Done.")
            dataset.to_csv(os.path.join(self.datasetsDir, f"{corpus_name}.csv"), encoding ="utf-8", index=False)

        if self.EnLegendColName in dataset.columns :
            print("toto")
            legendForEmbedding = self.EnLegendColName
        elif self.LegendColName in dataset.columns:
            legendForEmbedding = self.LegendColName
        else:
            legendForEmbedding = None
        print(legendForEmbedding)
        print(dataset.columns)
        # Embedding des legendes
        if legendForEmbedding is not None and self.LegendEmbeddingColName not in dataset.columns:
            num_batches = len(dataset) // embedding_batch_size + 1
            batches = [dataset.iloc[i*embedding_batch_size:(i+1)*embedding_batch_size] for i in range(num_batches)]
            dataset = pd.concat([self.legendsEmbedding(batch, legendForEmbedding) for batch in batches])
            dataset.to_csv(os.path.join(self.datasetsDir, f"{corpus_name}.csv"), encoding ="utf-8", index=False)

        # Embedding des images
        if not self.ImageEmbeddingColName in dataset.columns:
            if keep_images:
                IIIF_folder_path = os.path.join(self.imagesDir, corpus_name)
                if not os.path.exists(IIIF_folder_path):
                    os.makedirs(IIIF_folder_path)
                else:
                    #shutil.rmtree(IIIF_folder_path)
                    os.makedirs(IIIF_folder_path)

            num_batches = len(dataset) // embedding_batch_size + 1
            batches = [dataset.iloc[i*embedding_batch_size:(i+1)*embedding_batch_size] for i in range(num_batches)]
            dataset = pd.concat([self.imagesEmbedding(batch, corpus_name, keep_images, local) for batch in batches])
            dataset.to_csv(os.path.join(self.datasetsDir, f"{corpus_name}.csv"), encoding ="utf-8", index=False)

        # Création des indexs
        print("Création des indexs")
        if legendForEmbedding is not None:

            array = np.array(dataset[self.LegendEmbeddingColName].apply(lambda x:ast.literal_eval(str(x))).to_list())
            cpu_index = faiss.IndexFlatIP(array.shape[1])
            cpu_index.add(array)
            serialized_index_legend = faiss.serialize_index(cpu_index)
            serialized_index_legend = BytesIO(serialized_index_legend)

            legend_array = np.array(dataset[self.LegendEmbeddingColName].apply(lambda x:ast.literal_eval(str(x))).to_list())
            image_array = np.array(dataset[self.ImageEmbeddingColName].apply(lambda x:ast.literal_eval(str(x))).to_list())
            array_mean = (legend_array + image_array) / 2
            cpu_index = faiss.IndexFlatIP(array_mean.shape[1])
            cpu_index.add(array_mean)
            serialized_index_mean = faiss.serialize_index(cpu_index)
            serialized_index_mean = BytesIO(serialized_index_mean)

        array = np.array(dataset[self.ImageEmbeddingColName].apply(lambda x:ast.literal_eval(str(x))).to_list())
        cpu_index = faiss.IndexFlatIP(array.shape[1])
        cpu_index.add(array)
        serialized_index_image = faiss.serialize_index(cpu_index)
        serialized_index_image = BytesIO(serialized_index_image)


        print(f"Crétions des indexs pour {corpus_name}")
        dataset = pd.read_csv(os.path.join(self.datasetsDir, f"{corpus_name}.csv"), encoding ="utf-8")
        dataset_buffer = BytesIO()
        dataset.to_csv(dataset_buffer, index=False, encoding="utf-8")
        dataset_buffer.seek(0)

        with zipfile.ZipFile(os.path.join(self.indexsDir,f"{corpus_name}.smantic"), "w") as zipf:
            if legendForEmbedding is not None:
                zipf.writestr("cpu_legend.index", serialized_index_legend.getvalue())
                zipf.writestr("cpu_mean.index", serialized_index_mean.getvalue())
            zipf.writestr("cpu_image.index", serialized_index_image.getvalue())
            zipf.writestr("dataset.csv", dataset_buffer.getvalue())
        print("Done.")
        print(f"Index file : {corpus_name}.smantic")

    def translate_legend(self, batch):
        """Traduit les légendes par batch

        Args:
            batch (pd.DataFrame): Batch du corpus pour la traduction
        Returns:
            pd.DataFrame: Batch du corpuis traduit.
        """
        legends = batch[self.LegendColName].fillna("").tolist()
        legends_tokenized = self.translation_tokenizer(legends, return_tensors="pt", padding=True, truncation=True)
        legends_tokenized = legends_tokenized.to(self.device)
        translated_tokenized = self.translation_model.generate(**legends_tokenized)
        en_legends = [self.translation_tokenizer.decode(t, skip_special_tokens=True) for t in translated_tokenized]
        batch = batch.copy()
        batch[self.EnLegendColName] = en_legends
        print("\n\tTranslation batch done.")
        return batch

    def legendsEmbedding(self, batch, legendForEmbedding):
        """Embedding des légendes par batch

        Args:
            batch (pd.DataFrame): Batch du corpus pour l'embedding des légendes
            legendForEmbedding (str): Nom de la colonne de légende pour l'embedding
        Returns:
            pd.DataFrame: Batch du corpuis avec les légendes embeddés.
        """
        #text_proj = nn.Linear(2560, 256, device = self.device, dtype=torch.float16)
        base = batch[legendForEmbedding].fillna("").tolist()
        #inputs_text = self.blip2_tokenizer(base, padding=True, return_tensors="pt")
        #inputs_text = inputs_text.to(self.device)
        #text_features = self.blip2_model.get_text_features(**inputs_text, output_hidden_states = True)
        #text_embeddings = F.normalize(text_proj(text_features.hidden_states[-1][:, -1, :]), dim=-1, )

        text_input = [self.blip2_text_processor["eval"](txt) for txt in base]
        sample = {"text_input": text_input}

        text_emb = self.blip2_model.extract_features(sample, mode="text").text_embeds_proj[:,0,:] # size (1, 256)
        text_emb /= text_emb.norm(dim=-1, keepdim=True)

        batch = batch.copy()
        batch[self.LegendEmbeddingColName] = text_emb.tolist()
        print("\n\tLegend embedding batch done.")

        return batch

    def imagesEmbedding(self, batch, corpus_name, keep_images = False, local=False):
        """Embedding des images par batch

        Args:
            batch (pd.DataFrame): Batch du corpus pour l'embedding des images
        Returns:self.LocalImageColName
            pd.DataFrame: Batch du corpuis avec les images embeddés.
        """
        #text_proj = nn.Linear(1408, 256, device = self.device, dtype=torch.float16)
        if local is True:
            base = batch[self.LocalImageColName].tolist()
        else:
            base = batch[self.urlImageColName].tolist()
        indexs = batch.index.values.tolist()
        print(base)
        local_image_paths = []
        retry_count = 3
        while True:
            try:
                if local is False:
                    local_image_paths = []
                    local_image_paths_short_list = []
                    images = []
                    for i,image_url in enumerate(base):
                        img = requests.get(image_url, stream=True).content
                        images.append(img)

                        if keep_images:
                            local_image_path = os.path.join(self.imagesDir, corpus_name,f"{indexs[i]}.jpg")
                            local_image_path_short = os.path.join(self.imagesDirname, corpus_name,f"{indexs[i]}.jpg")
                            local_image_paths.append(local_image_path)
                            local_image_paths_short_list.append(local_image_path_short)
                            with open(local_image_path, 'wb') as out_file:
                                shutil.copyfileobj(io.BytesIO(img), out_file)
                    samples = [{"image": self.blip2_image_processor["eval"](Image.open(io.BytesIO(image)).convert("RGB")).unsqueeze(0).to(self.device)} for image in images]
                    base_images = [self.blip2_model.extract_features(sample, mode="image").image_embeds_proj[:,0,:].tolist()[0] for sample in samples]
                else:
                    samples = [{"image": self.blip2_image_processor["eval"](Image.open(image).convert("RGB")).unsqueeze(0).to(self.device)} for image in base]
                    base_images = [self.blip2_model.extract_features(sample, mode="image").image_embeds_proj[:,0,:].tolist()[0] for sample in samples]
                break
            except Exception:
                print(traceback.format_exc())
                if retry_count > 0:
                    print("Error : retry in 15 seconds.")
                    print(f"{retry_count} retry before skip")
                    retry_count-=1
                    time.sleep(15)
                else:
                    print(f"Error - images skipped")
                    break
        batch = batch.copy()
        batch[self.ImageEmbeddingColName] = base_images
        if keep_images and local is False:
            batch[self.LocalImageColName] = local_image_paths_short_list
        elif local is True:
            batch[self.LocalImageColName] = base

        print("\n\tImage embedding batch done.")

        return batch

class Logger:
    """Tee-like text stream that duplicates every write to a log file.

    The instance keeps a reference to the ``sys.stdout`` that was active when
    it was created and forwards every message both to that stream and to the
    file ``filename``. It exposes the small subset of the file protocol
    (``write``, ``flush``, ``isatty``, ``close``) needed to stand in for
    ``sys.stdout``.

    The log file is owned by this object: call :meth:`close` (or use the
    instance as a context manager) to release the handle.
    """

    def __init__(self, filename):
        """Remember the current ``sys.stdout`` and open ``filename`` for writing.

        Args:
            filename: Path of the log file. It is opened in ``"w"`` mode, so an
                existing file is truncated.
        """
        self.terminal = sys.stdout
        self.log = open(filename, "w")

    def write(self, message):
        """Write ``message`` to the original stream and to the log file.

        Once the log file has been closed, messages are still forwarded to the
        original stream so that printing keeps working.

        Returns:
            int: Number of characters in ``message``, as file objects do.
        """
        self.terminal.write(message)
        if not self.log.closed:
            self.log.write(message)
        return len(message)

    def flush(self):
        """Flush the original stream and, if still open, the log file."""
        self.terminal.flush()
        if not self.log.closed:
            self.log.flush()

    def isatty(self):
        """Always report a non-interactive stream."""
        return False

    def close(self):
        """Flush and close the log file; the original stream is left untouched.

        Safe to call several times.
        """
        if not self.log.closed:
            self.log.flush()
            self.log.close()

    def __enter__(self):
        """Return the logger itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        """Close the log file on leaving the ``with`` block; never swallows exceptions."""
        self.close()
        return False
# Visual confirmation that the cell ran to its end (the class definitions above were executed).
print("OK")

OK


In [None]:
#@title Lancer l'interface

def main():
    """Build and launch the Gradio interface of the application.

    The interface has two tabs:

    * "Recherche": pick an existing corpus, type a query and display the most
      similar images in a gallery.
    * "Créer un corpus": create a new corpus from a folder of images, from a
      CSV of Gallica ARK identifiers, or from a CSV of images and legends.

    All the heavy lifting is delegated to a single ``SearchMantic`` instance
    created here; the nested functions below are only Gradio event handlers.
    The call blocks in ``demo.launch(debug=True)``.
    """
    # Uncomment (together with the ``demo.load`` line at the end of the UI
    # definition) to mirror stdout into "output.log" and stream it to the
    # "Logs" panel.
    #sys.stdout = Logger("output.log")

    # Batch size used for both translation and embedding when creating a corpus.
    batch_size = 3

    # Radio label that selects the comma separator; any other label means ";".
    comma_label = '[ , ] virgule'

    def read_logs():
        """Return the current content of "output.log" (only used when log streaming is enabled)."""
        sys.stdout.flush()
        with open("output.log", "r") as f:
            return f.read()

    # Maps the search type shown in the UI to the name of the index to query.
    type_search = {
        "Texte & Image":"mean_embedding",
        "Image":"image_embedding",
        "Texte":"legend_embedding"
    }

    smantic = SearchMantic()

    def csv_separator(label):
        """Translate the "Séparateur CSV" radio label into the actual separator character."""
        return "," if label == comma_label else ";"

    def search_csv_fn(coll_name_image, file_image, dropdown_image, keep_local_image, file_sep_image, r2, dropdown_legend, r):
        """Create a corpus from a CSV file listing images (local paths or URLs) and optional legends.

        ``r`` is the image path type radio value and ``r2`` the legend language
        radio value.
        """
        local = r != "Urls"
        sep = csv_separator(file_sep_image)

        if r2 == "Pas de légende":
            col_lgd = None
            trad = None
        else:
            col_lgd = dropdown_legend
            # Only French legends need to be translated to English.
            trad = r2 == "Français"
        smantic.create_new_index(coll_name_image, file_image, trad, translation_batch_size = batch_size, embedding_batch_size = batch_size, keep_images=keep_local_image, _sep=sep, img_col=dropdown_image, legend_col=col_lgd, local=local)

    def search_dir_fn(coll_name_dir_img, img_dir, use_filename_dir):
        """Create a corpus from every png/jpg/jpeg file found directly inside ``img_dir``.

        A temporary CSV listing the images (and, optionally, their file names
        as legends) is written and then handed to ``create_new_index``.
        """
        img_dir = img_dir.replace("\\","/")
        # List the directory once so that paths and names are guaranteed to line up.
        image_extensions = {".png", ".jpg", ".jpeg"}
        noms_seulement = [fichier for fichier in os.listdir(img_dir) if os.path.splitext(fichier)[1].lower() in image_extensions]
        noms_fichiers = [os.path.join(img_dir, fichier) for fichier in noms_seulement]

        if use_filename_dir:
            df = pd.DataFrame({smantic.LocalImageColName: noms_fichiers, smantic.LegendColName: noms_seulement})
            legend_col = smantic.LegendColName
        else:
            df = pd.DataFrame({smantic.LocalImageColName: noms_fichiers})
            legend_col = None
        csv_file_name = smantic.datasetsDirname+"/"+coll_name_dir_img+".csv"

        # ``to_csv`` does not create missing directories.
        os.makedirs(os.path.dirname(csv_file_name), exist_ok=True)
        df.to_csv(csv_file_name, encoding="utf-8")
        smantic.create_new_index(coll_name_dir_img, csv_file_name, translate_legends=True, translation_batch_size = batch_size, embedding_batch_size = batch_size, keep_images=True, _sep=",", img_col=smantic.LocalImageColName, legend_col=legend_col, local=True)

    def search_ark_fn(coll_name_ark, file_ark, dropdown_ark, keep_local_ark, file_sep_ark):
        """Create a corpus from a CSV file containing Gallica ARK identifiers."""
        # The radio delivers its label ('[ ; ] point-virgule' / '[ , ] virgule'),
        # not the separator itself: convert it, as the CSV preview handler does.
        # NOTE(review): assumes create_new_corpus forwards ``sep`` to
        # pandas.read_csv unchanged - confirm against its implementation.
        sep = csv_separator(file_sep_ark)
        smantic.create_new_corpus(coll_name_ark, file_ark, translation_batch_size=batch_size, embedding_batch_size=batch_size, keep_images = keep_local_ark, sep=sep, col_ark=dropdown_ark)

    def set_search_controls(enabled):
        """Return one ``interactive`` update per component of ``search_controls``.

        ``search_controls`` is assigned further down, once the components
        exist; it is only read when an event actually fires.
        """
        return tuple(gr.update(interactive=enabled) for _ in search_controls)

    def deactivate():
        """Lock the search tab while a long operation is running."""
        return set_search_controls(False)

    def activate():
        """Unlock the search tab once the operation is over."""
        return set_search_controls(True)

    def update_corpus():
        """Reload the indexes and refresh the list of selectable corpora."""
        smantic.reload_indexes()
        return gr.update(choices=list(smantic.indexs))

    def load_corpus(corpus_select):
        """Load the selected corpus and offer the values it returns as search types."""
        st = smantic.load_corpus(corpus_select)
        # Avoid an IndexError when nothing is returned for this corpus.
        return gr.update(choices=st, value=st[0] if st else None)

    def search(input_search_txt, input_search_type, img_count, search_translate, use_local_images):
        """Embed the query text and return the result of the similarity search for the gallery."""
        search_embedd = smantic.embedd_input(input_search_txt, search_translate)
        images_legend = smantic.search_sim_images(search_embedd,type_search[input_search_type], img_count, use_local_images)
        return images_legend

    def update_legend_choice(r2):
        """Hide the legend column selector when the corpus has no legend."""
        return gr.update(visible=(r2 != "Pas de légende"))

    def update_image_choice(r1):
        """Show the "keep images locally" option only when images come from URLs."""
        return gr.update(visible=(r1 == "Urls"))

    def change_row(search_type):
        """Show the form matching the chosen corpus type (folder, ARK, images & legends)."""
        print(search_type)
        if search_type == "Images & légendes":
            return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
        elif search_type == "Identifiants ARK (Gallica)" :
            return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
        else:
            return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)

    def load_df(file_path, file_ark_sep):
        """Preview the uploaded images & legends CSV and fill both column selectors.

        Returns exactly one update per wired output: preview table, legend
        column dropdown, image column dropdown, and the options group.
        """
        sep = csv_separator(file_ark_sep)
        print(file_path)
        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep =sep)
        except Exception:
            # A cleared file input also lands here; only report real failures.
            if file_path is not None:
                gr.Info("Le fichier csv n'a pas pu être chargé.Vérifiez le séparateur csv utilisé et assurez vous que le fichier est encodé en utf-8")
            return gr.update(visible=False), gr.update(choices=[], visible=True), gr.update(choices=[], visible=True), gr.update(visible=False)
        columns = list(df.columns)
        return gr.update(value=df.head(5), visible=True), gr.update(choices=columns, visible=True), gr.update(choices=columns, visible=True), gr.update(visible=True)

    def load_df_ark(file_path, sep):
        """Preview the uploaded ARK CSV and fill the ARK column selector.

        Returns one update each for the preview table, the ARK column dropdown
        and the options group.
        """
        sep = csv_separator(sep)
        print(file_path)
        try:
            df = pd.read_csv(file_path, encoding="utf-8", sep =sep)
        except Exception:
            # A cleared file input also lands here; only report real failures.
            if file_path is not None:
                gr.Info("Le fichier csv n'a pas pu être chargé. Vérifiez le séparateur csv utilisé et assurez vous que le fichier est encodé en utf-8")
            return gr.update(visible=False), gr.update(visible=False),  gr.update(visible=False)
        return gr.update(value=df.head(5), visible=True), gr.update(choices=list(df.columns), visible=True),  gr.update(visible=True)

    def load_dir_image(img_dir):
        """Reveal the folder options as soon as a folder path has been typed."""
        return gr.update(visible=(img_dir != ""))

    # Gradio UI
    with gr.Blocks() as demo:
        with gr.Tab("Recherche"):
            with gr.Row():
                corpus_select = gr.Dropdown([], label="Corpus", info="Choisissez un corpus", value=0)
            with gr.Row():
                update_btn = gr.Button("Rafraîchir la liste des corpus")
            with gr.Row():
                search_type = gr.Dropdown(["Texte & Image", "Image", "Texte"], label="Type de recherche", info="Choisissez un type de recherche", value="Texte & Image", interactive = False)
                with gr.Column():
                    search_txt = gr.Textbox(label="Recherche", info="Texte pour la recherche d'images", interactive = False)
                    search_translate = gr.Checkbox(label="Traduire la recherche en anglais", value=True, interactive = False)
                    use_local_images = gr.Checkbox(label="Utiliser les images locales (plus rapide)", value=True, interactive = False)
            with gr.Row():
                img_count = gr.Slider(1, 25, value=5, label="Nombre d'images", info="Choisissez le nombre d'images à rechercher", step=1, interactive = False)
            with gr.Row():
                search_btn = gr.Button("Rechercher des images", interactive = False)
            with gr.Row():
                gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", columns=[3], rows=[1], object_fit="fill", height="auto", interactive = False)
        with gr.Tab("Créer un corpus") :
            with gr.Column():
                training_type = gr.Dropdown(["Dossier d'images", "Identifiants ARK (Gallica)", "Images & légendes"], label="Type de corpus", info="Choisissez un type de corpus", interactive = True)

            # Type : folder of images
            with gr.Column(visible=False) as groupDir:
                gr.HTML(value="<h1 style=\"margin-top:5px\"> Corpus d'images dans un dossier</h1>")
                with gr.Row():
                    coll_name_dir_img = gr.Textbox(label="Nom de collection", info="Entrez le nom de votre collection", interactive = True)
                with gr.Group():
                    with gr.Column():
                        img_dir = gr.Textbox(label="Chemin du dossier d'images", info="/chemin/du/dossier/",interactive = True)
                with gr.Column(visible=False) as group_dir_img_columns:
                    with gr.Row():
                        use_filename_dir = gr.Checkbox(label = "Utiliser les noms de fichiers comme légendes", value=False, visible=True, interactive = True)
                    create_btn_dir = gr.Button("Créer et entrainer le corpus", interactive = True, visible=True)

            # Type : ARK identifiers
            with gr.Column(visible=False) as groupArk:
                gr.HTML(value="<h1 style=\"margin-top:5px\"> Corpus Gallica (identifiants ARK)</h1>")
                with gr.Row():
                    coll_name_ark = gr.Textbox(label="Nom de collection", info="Entrez le nom de votre collection", interactive = True)
                with gr.Group():
                    with gr.Column():
                        file_sep_ark = gr.Radio(choices=['[ ; ] point-virgule', '[ , ] virgule'], label = "Séparateur CSV",  interactive = True, value='[ ; ] point-virgule')
                        file_ark = gr.File(label="Fichier CSV du corpus", file_types=[".csv"])
                with gr.Column(visible=False) as group_ark_columns:
                    df_ark = gr.Dataframe(interactive = False, visible=True)
                    with gr.Row():
                        dropdown_ark = gr.Dropdown(choices=[], label="Colonne ARK", info="Sélectionnez une colonne", interactive = True)
                    with gr.Row():
                        keep_local_ark = gr.Checkbox(label = "Conserver les images en local", value=True, visible=True, interactive = True)

                    create_btn_ark = gr.Button("Créer et entrainer le corpus", interactive = True, visible=True)

            # Type : CSV of images & legends
            with gr.Column(visible=False) as groupImages:
                gr.HTML(value="<h1 style=\"margin-top:5px\"> Corpus images & légendes</h1>")
                with gr.Row():
                    coll_name_image = gr.Textbox(label="Nom du corpus", info="Entrez le nom de votre corpus", interactive = True)
                with gr.Group():
                    with gr.Column():
                        file_image = gr.File(label="Fichier CSV du corpus", file_types=[".csv"])
                        file_sep_image = gr.Radio(choices=['[ ; ] point-virgule', '[ , ] virgule'], label = "Séparateur CSV",  interactive = True, value='[ ; ] point-virgule')
                with gr.Column(visible=False) as groupImages_columns:
                    df = gr.Dataframe(interactive = False, visible=False)
                    with gr.Column():
                        with gr.Column():
                            path_type_radio = gr.Radio(label="Type des chemins d'images", choices = ["Chemins locaux", "Urls"], value="Chemins locaux", interactive = True)
                            dropdown_image = gr.Dropdown(choices=[], label="Colonne", info="Sélectionnez une colonne", interactive = True)
                            keep_local_image = gr.Checkbox(label = "Conserver les images en local", value=True, visible=False, interactive = True)
                        with gr.Column():
                            legend_lang_radio = gr.Radio(label = "Langue des légendes",choices = ["Français", "Anglais", "Pas de légende"], value="Français", interactive = True)
                            dropdown_legend = gr.Dropdown(choices=[], label="Colonne", info="Sélectionnez une colonne", interactive = True)
                    create_btn_csv = gr.Button("Lancer la collecte et les traitements", interactive = True, visible=True)
        with gr.Accordion(label="Logs", open=False):
            logs = gr.Textbox()

        # Components locked by ``deactivate`` and unlocked by ``activate``.
        search_controls = [corpus_select, update_btn, search_type, search_txt, img_count, search_btn, search_translate, use_local_images]

        # Search tab: lock the controls, run the action, unlock the controls.
        search_btn.click(fn=deactivate, outputs=search_controls).then(search, inputs=[search_txt,search_type, img_count, search_translate, use_local_images], outputs=[gallery]).then(fn=activate, outputs=search_controls)
        update_btn.click(fn=deactivate, outputs=search_controls).then(update_corpus, outputs=[corpus_select]).then(fn=activate, outputs=search_controls)
        corpus_select.change(fn=deactivate, outputs=search_controls).then(load_corpus, inputs=[corpus_select], outputs=[search_type], show_progress=True).then(fn=activate, outputs=search_controls)

        # Corpus creation buttons.
        create_btn_csv.click(fn=search_csv_fn, inputs=[coll_name_image, file_image, dropdown_image, keep_local_image, file_sep_image, legend_lang_radio, dropdown_legend, path_type_radio])
        create_btn_ark.click(fn=search_ark_fn, inputs=[coll_name_ark, file_ark, dropdown_ark, keep_local_ark,file_sep_ark])
        create_btn_dir.click(fn=search_dir_fn, inputs=[coll_name_dir_img, img_dir, use_filename_dir])

        # Dynamic visibility of the creation forms.
        training_type.change(fn=change_row, inputs=[training_type], outputs=[groupDir, groupArk, groupImages])
        file_image.change(fn=load_df, inputs=[file_image, file_sep_image], outputs=[df, dropdown_legend, dropdown_image, groupImages_columns])
        file_ark.change(fn=load_df_ark, inputs=[file_ark, file_sep_ark], outputs=[df_ark, dropdown_ark, group_ark_columns])
        img_dir.change(fn=load_dir_image, inputs=[img_dir], outputs=[group_dir_img_columns])
        legend_lang_radio.change(fn=update_legend_choice, inputs=[legend_lang_radio], outputs=[dropdown_legend])
        path_type_radio.change(fn=update_image_choice, inputs=[path_type_radio], outputs=[keep_local_image])
        #demo.load(read_logs, None, logs, every=1)
    demo.launch(debug=True)

# Start the interface when this cell/script is executed directly (not when it is imported).
if __name__ == "__main__":
    main()

Using GPU : Tesla T4
Loading translation model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Done.
Loading Blip2 model...


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 1.89G/1.89G [01:29<00:00, 22.5MB/s]


Position interpolate from 16x16 to 26x26


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 4.37G/4.37G [03:24<00:00, 23.0MB/s]


Done.




Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://726d457be51691108d.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Identifiants ARK (Gallica)
/tmp/gradio/0299218e2f29f4a5d4732f6fab699580f03aa630/corpus_ark.csv

Création du corpus "" à partir de "/tmp/gradio/0299218e2f29f4a5d4732f6fab699580f03aa630/corpus_ark.csv"
	1 arks trouvés


  corpus_dataframe = pd.read_csv(csv_path, encoding="utf-8", sep = sep)



ARK : bpt6k9818982g
Image : 0 | Description : Gaule indépendante,
Image : 0 | Description : Druide.
Image : 0 | Description : Statue de 'Vereingétorix.
Image : 0 | Description : ' ' •« ! , Arc de triomphe d'Orange.
Image : 0 | Description : La cueillette du gui.
Image : 0 | Description : ... Vereingétorix devant César. ..
Image : 0 | Description : Le Colisée.
Image : 0 | Description : Le Colisée.
Image : 0 | Description : La Gaule, à l'avènement de Clovis.
Image : 0 | Description : Guerrier gerni,,tiii.
Image : 0 | Description : Clovis et' Clotilde. '
Image : 0 | Description : Clovis et' Clotilde. '
Image : 0 | Description : Mahomet.
Image : 0 | Description : Les moines défrichant les forêts,
Image : 0 | Description : Le vase de Soissons.
Image : 0 | Description : Le vœu de Clovis il la bataille de Tolbiac.
Image : 0 | Description : La bataille de Poitiers.
Image : 0 | Description : Empire de Charlemagne.
Image : 0 | Description : Charlemagne se faisant instruire.
Image : 0 | Descript




	Translation batch done.

	Translation batch done.





	Translation batch done.

	Translation batch done.





	Translation batch done.

	Translation batch done.





	Translation batch done.

	Translation batch done.





	Translation batch done.

	Translation batch done.





	Translation batch done.

	Translation batch done.





	Translation batch done.

	Translation batch done.

	Translation batch done.
Done.




toto
en_legend
Index(['image', 'legend', 'en_legend'], dtype='object')

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.

	Legend embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f8/250,732,1404,1170/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f9/862,820,707,917/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f10/235,566,711,1170/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f11/279,960,1171,830/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f15/127,696,1340,987/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f19/77,753,1378,1099/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f20/445,989,1175,854/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f22/388,425,1358,733/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f27/137,714,1447,1175/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f28/252,790,616,1377/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f29/942,1124,638,945/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f30/297,486,1392,696/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f31/948,1525,615,781/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f33/89,723,1498,972/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f36/495,817,1124,976/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f37/93,1395,1361,960/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f41/89,890,1456,986/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f47/137,811,1452,1192/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f48/360,759,1288,1081/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f50/267,1072,1426,541/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f53/818,1238,733,1195/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f56/349,787,1273,1158/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f59/137,711,1422,1118/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f61/158,689,1355,1368/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f67/170,644,1419,1486/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f68/290,741,1402,712/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f69/358,747,955,1222/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f70/502,775,1057,1292/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f72/457,669,1081,1319/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f73/322,662,1042,1307/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f75/351,635,948,1453/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f77/327,1026,1073,830/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f78/236,835,1497,1171/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f85/416,1270,903,1137/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f86/255,577,516,1313/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f87/793,784,723,1246/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f88/299,832,1448,1129/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f89/113,1298,1447,521/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f90/328,741,1441,1125/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f92/303,504,1425,717/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f92/328,1541,1361,747/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f93/1045,452,474,1204/full/0/native.jpg']





	Image embedding batch done.
['https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f96/308,829,1428,1150/full/0/native.jpg', 'https://gallica.bnf.fr/iiif/ark:/12148/bpt6k9818982g/f98/574,1006,929,1328/full/0/native.jpg']





	Image embedding batch done.
Création des indexs
Crétions des indexs pour 
Done.
Index file : .smantic
Loading .smantic
Image index deserialized
Legend index deserialized
Mean index deserialized
'.smantic' loaded
Input translation
Input translation done.
Input Embdding
Input embedding done.
['drive/MyDrive/smantic/IMAGES/29.jpg', 'drive/MyDrive/smantic/IMAGES/27.jpg', 'drive/MyDrive/smantic/IMAGES/28.jpg', 'drive/MyDrive/smantic/IMAGES/16.jpg', 'drive/MyDrive/smantic/IMAGES/42.jpg']




Dossier d'images
