In [1]:
try:import gradio as gr
except:
  %pip install gradio
  import gradio as gr

try:import xmltodict
except:
  %pip install xmltodict
  import xmltodict

try:from lavis.models import load_model_and_preprocess
except:
  %pip install salesforce-lavis
  from lavis.models import load_model_and_preprocess

try:from transformers import MarianMTModel, MarianTokenizer
except:
  %pip install transformers
  from transformers import MarianMTModel, MarianTokenizer

try:from sentence_transformers import SentenceTransformer, util
except:
  %pip install sentence-transformers
  from sentence_transformers import SentenceTransformer, util

import csv
import os
import time
import urllib.request, urllib.error, urllib.parse
from math import dist
from urllib.error import HTTPError, URLError

import numpy as np
import pandas as pd
import requests
import torch
from PIL import Image
from torch import tensor

# Root folder for downloaded images; each collection gets its own sub-folder.
DIR_BASE = "IMG_CORPUS"
# Folder holding one CSV index (paths, legends, embeddings) per collection.
DIR_INDEX = "IMG_INDEXS"

# BLIP-2 feature extractor plus its image/text preprocessors, placed on the GPU (a CUDA device is required).
model, vis_processors, txt_processors = load_model_and_preprocess(name="blip2_feature_extractor", model_type="coco", device="cuda")

# Marian French -> English translation model, used on legends and on search queries.
model_name = 'Helsinki-NLP/opus-mt-fr-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
marian = MarianMTModel.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


Position interpolate from 16x16 to 26x26


  return self.fget.__get__(instance, owner)()


In [2]:
def nombre_pages(ark):
  """Return the number of page views of a Gallica document.

  Args:
    ark: ark identifier, appended to the Gallica Pagination service URL.

  Returns:
    int: the `nbVueImages` value found under `livre/structure` in the
    XML answer of the service.
  """
  pagination_url = 'https://gallica.bnf.fr/services/Pagination?ark=' + ark
  response = requests.get(pagination_url, stream=True)
  structure = xmltodict.parse(response.text)["livre"]["structure"]
  return int(structure["nbVueImages"])
    
def rect_distance(rect1, rect2):
    """Distance between the bottom-edge midpoint of `rect1` and the top-edge midpoint of `rect2`.

    Args:
        rect1: (x, y, x_end, y_end) tuple; only its bottom edge is used.
        rect2: (x, y, x_end, y_end) tuple; only its top edge is used.

    Returns:
        float: Euclidean distance between the two midpoints.
    """
    left1, _, right1, bottom1 = rect1
    left2, top2, right2, _ = rect2

    bottom_mid_rect1 = ((left1 + right1) / 2, bottom1)
    top_mid_rect2 = ((left2 + right2) / 2, top2)

    return dist(top_mid_rect2, bottom_mid_rect1)
    
def translate_legend(row):
    """Translate the French legend of an index row to English with the Marian model.

    Args:
        row: mapping (e.g. a DataFrame row) exposing a "legend" entry.

    Returns:
        str: the translated legend, or "" when the legend is missing
        (None, NaN, any non-string value) or contains only whitespace.
    """
    legend = row["legend"]
    # A missing value can be None or *any* float NaN, not only the `np.nan`
    # singleton, so test the type instead of relying on object identity.
    if not isinstance(legend, str) or legend.strip() == "":
        print(legend, "")
        return ""

    inputs = tokenizer.encode(legend, return_tensors="pt")
    outputs = marian.generate(inputs, num_beams=4, max_length=50, early_stopping=True)
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(legend, translated_text)
    return translated_text

def get_embedding(row):
    """Compute the BLIP-2 embeddings of one index row (image + legends).

    Args:
        row: mapping with "img_path" (image file), "legend" (French legend)
            and "en_legend" (English legend); non-string legends are treated
            as empty text.

    Returns:
        list of five tensors, in this order:
        [english text, french text, image,
         mean(english text, image), mean(french text, image)].
        Each text/image embedding is L2-normalised; the original author notes
        their size as (1, 768).
    """
    img_path = row["img_path"]
    txt = row["en_legend"] if isinstance(row["en_legend"], str) else ""
    txt_fr = row["legend"] if isinstance(row["legend"], str) else ""

    # convert() returns an independent image, so the file can be closed right away.
    with Image.open(img_path) as source:
        image = source.convert("RGB")
    image_processed = vis_processors["eval"](image).unsqueeze(0).to("cuda")

    def embed_text(text):
        # Projected text embedding of `text`, normalised by its OWN norm.
        sample_text = {"image": image_processed, "text_input": txt_processors["eval"](text)}
        embedding = model.extract_features(sample_text, mode="text").text_embeds_proj[:,0,:]
        return embedding / embedding.norm(dim=-1, keepdim=True)

    # English embedding
    text_emb = embed_text(txt)

    # Image embedding
    sample = {"image": image_processed, "text_input": txt_processors["eval"](txt)}
    image_emb = model.extract_features(sample, mode="image").image_embeds_proj[:,0,:]
    image_emb = image_emb / image_emb.norm(dim=-1, keepdim=True)

    # French embedding: built from the French legend (previously the English
    # text was embedded a second time and scaled with the English norm).
    text_fr_emb = embed_text(txt_fr)

    print(img_path, txt)

    return [
        text_emb,
        text_fr_emb,
        image_emb,
        torch.stack([text_emb, image_emb]).mean(dim=0),
        torch.stack([text_fr_emb, image_emb]).mean(dim=0),
    ]

# Index of the corpus currently selected in the UI (None until load_csv runs).
base = None


def load_csv(corpus_select):
    """Load the CSV index `corpus_select` from DIR_INDEX into the global `base`.

    Args:
        corpus_select: file name of the index, relative to DIR_INDEX.

    The "embedding" column is stored as text in the CSV and is turned back
    into Python objects with `eval`.
    """
    global DIR_INDEX
    global base

    def _parse_embedding(cell):
        # SECURITY NOTE(review): eval executes whatever the CSV contains; only
        # load index files produced by this notebook. It must stay in a function
        # defined in this module so the evaluated text can resolve module-level
        # names (the file imports `tensor` from torch at the top).
        return eval(cell)

    print(corpus_select)
    csv_path = DIR_INDEX + "/" + corpus_select
    base = pd.read_csv(csv_path)
    base['embedding'] = base['embedding'].apply(_parse_embedding)

# Maximum distance (in ALTO coordinates) between the bottom of an illustration
# and the top of a text block for the text to be taken as part of its legend.
_LEGEND_MAX_DISTANCE = 80


def _as_list(node):
    """Return `node` as a list (xmltodict gives a bare item for one child, a list for several)."""
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _bbox(block):
    """Return (x, y, x_end, y_end) built from the HPOS/VPOS/WIDTH/HEIGHT attributes of an ALTO block."""
    x = int(block["@HPOS"])
    y = int(block["@VPOS"])
    return (x, y, x + int(block["@WIDTH"]), y + int(block["@HEIGHT"]))


def _text_block_entry(text_block):
    """Return (bbox, text) for an ALTO TextBlock, joining the CONTENT of its String elements with spaces."""
    words = []
    for text_line in _as_list(text_block.get("TextLine")):
        for string in _as_list(text_line.get("String")):
            word = string.get("@CONTENT")
            if word is not None:  # a String without CONTENT would make str.join fail
                words.append(word)
    return (_bbox(text_block), " ".join(words))


def _download_image(url, destination, max_attempts=5, delay=10):
    """Download `url` to `destination`, retrying on HTTP/URL errors; return True on success."""
    for attempt in range(1, max_attempts + 1):
        try:
            urllib.request.urlretrieve(url, destination)
            return True
        except (HTTPError, URLError) as erreur:
            print(str(erreur.reason))
            if attempt < max_attempts:
                print("wait", delay, "seconds")
                time.sleep(delay)
    return False


def search_ark_fn(search_ark, coll_name):
    """Collect the illustrations of Gallica documents and build the collection index.

    For every ark and every page, the ALTO OCR is fetched, the illustrations
    are downloaded through the IIIF endpoint into DIR_BASE/<coll_name>/, and
    the text blocks located just under an illustration are joined to form its
    legend. Legends are then translated to English, embeddings are computed
    with `get_embedding`, and the index is written to DIR_INDEX/<coll_name>.csv.

    Args:
        search_ark: comma-separated ark identifiers (surrounding spaces are ignored).
        coll_name: name of the new collection; its folder must not exist yet.

    Returns:
        An error message (str) when the collection folder cannot be created or
        when no image could be collected; None otherwise.
    """
    arks = [ark.strip() for ark in search_ark.split(",") if ark.strip()]

    os.makedirs(DIR_BASE, exist_ok=True)
    try:
        os.makedirs(DIR_BASE+"/"+coll_name)
    except (OSError, TypeError, ValueError):
        return "Erreur : ce nom de collection exite déjà ou n'est pas valide !"
    os.makedirs(DIR_INDEX, exist_ok=True)

    base_img = []
    for ark in arks:
        pages = nombre_pages(ark)
        print("\nARK :", ark)
        for page in range(1, pages+1):
            alto_url = 'https://gallica.bnf.fr/RequestDigitalElement?O={}&E=ALTO&Deb={}'.format(ark, page)
            s = requests.get(alto_url, stream=True)
            try:
                altodic = xmltodict.parse(s.text)
            except Exception:
                # The answer is not valid XML: the document has no OCR, skip its remaining pages.
                print(ark, "non océrisé. Skiped")
                break
            print("==========", "Page", page,"==========")

            print_space = altodic["alto"]["Layout"]["Page"].get("PrintSpace") or {}

            # Blocks placed directly in the print space ...
            texts = [_text_block_entry(text_block) for text_block in _as_list(print_space.get("TextBlock"))]
            images = [(_bbox(illustration), illustration) for illustration in _as_list(print_space.get("Illustration"))]
            # ... followed by the ones nested in composed blocks.
            for composed_block in _as_list(print_space.get("ComposedBlock")):
                texts.extend(_text_block_entry(text_block) for text_block in _as_list(composed_block.get("TextBlock")))
                images.extend((_bbox(illustration), illustration) for illustration in _as_list(composed_block.get("Illustration")))

            for i, (img_box, illustration) in enumerate(images):
                url = "https://gallica.bnf.fr/iiif/ark:/12148/{}/f{}/{},{},{},{}/{}/0/native.jpg".format(ark,page,illustration["@HPOS"],illustration["@VPOS"],illustration["@WIDTH"],illustration["@HEIGHT"],"full")
                print("Image", i)
                nomfichier = ark+"_"+str(page)+"_"+illustration["@ID"]+".jpg"
                cheminout = DIR_BASE+"/"+coll_name+"/"+nomfichier

                if not _download_image(url, cheminout):
                    # Do not index a file that is not on disk: get_embedding would fail on it.
                    print("download failed, image skipped :", url)
                    continue

                legend = [text for text_box, text in texts if rect_distance(img_box, text_box) < _LEGEND_MAX_DISTANCE]
                txt_legend = " ".join(legend)
                print("Description :",txt_legend)
                base_img.append((cheminout, txt_legend))

    if not base_img:
        # Nothing to translate or embed; the tokenizer cannot handle an empty batch.
        return "Erreur : aucune image n'a pu être collectée pour ces arks !"

    index_df = pd.DataFrame(base_img, columns=["img_path", "legend"])

    legends = index_df["legend"].tolist()
    # truncation=True keeps over-long legends within the tokenizer's maximum input length.
    translated = marian.generate(**tokenizer(legends, return_tensors="pt", padding=True, truncation=True))
    en_legends = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    index_df["en_legend"] = np.array(en_legends)
    # Rows without a legend get an empty translation (single .loc assignment, no chained indexing).
    no_legend = index_df["legend"].isna() | (index_df["legend"].str.strip() == "")
    index_df.loc[no_legend, "en_legend"] = ""
    index_df["embedding"] = index_df.apply(get_embedding, axis = 1)
    index_df.to_csv(DIR_INDEX+"/"+coll_name+".csv", index=False, encoding="utf-8")

def update_corpus():
    """Refresh the corpus dropdown with the CSV indexes found in DIR_INDEX.

    Returns:
        A `gr.update` whose `choices` are the `.csv` file names of DIR_INDEX in
        sorted order, or an empty list when DIR_INDEX does not exist yet
        (no collection has been built so far).
    """
    if not os.path.isdir(DIR_INDEX):
        return gr.update(choices=[])
    corpus = sorted(name for name in os.listdir(DIR_INDEX) if name.lower().endswith('.csv'))
    return gr.update(choices=corpus)

    
def deactivate():
    """Return nine `gr.update(interactive=False)` values, one per output component wired to this callback."""
    return tuple(gr.update(interactive=False) for _ in range(9))

def activate():
    """Return nine `gr.update(interactive=True)` values, one per output component wired to this callback."""
    return tuple(gr.update(interactive=True) for _ in range(9))

def search(input_search_txt, input_search_type, img_count):
    """Rank the images of the loaded corpus against a text query.

    The query is translated with the Marian model, embedded with the BLIP-2
    text encoder and compared (cosine similarity) with the five embeddings
    stored for every row of the global `base` index.

    Args:
        input_search_txt: query text typed by the user.
        input_search_type: "Texte & Image" ranks on `total_mean`, "Image" on
            `image_sim`, any other value on `text_sim`.
        img_count: number of images to return.

    Returns:
        list of (PIL image, legend) tuples, best match first; an empty list
        when no corpus has been loaded yet.

    Side effects:
        Writes the columns `sim`, `text_sim`, `text_fr_sim`, `image_sim`,
        `mean_sim`, `mean_fr_sim` and `total_mean` on the global `base`.
    """
    if base is None:
        # No corpus selected yet: nothing to rank.
        return []

    print(input_search_txt)

    # Translate the query, then embed it the same way the legends were embedded.
    inputs = tokenizer.encode(input_search_txt, return_tensors="pt")
    outputs = marian.generate(inputs, num_beams=4, max_length=50, early_stopping=True)
    input_search_tk = tokenizer.decode(outputs[0], skip_special_tokens=True)

    input_search = txt_processors["eval"](input_search_tk)
    input_search_sample = {"image": None, "text_input": [input_search]}
    input_search_embedding = model.extract_features(input_search_sample, mode="text").text_embeds_proj[:,0,:]
    input_search_embedding /= input_search_embedding.norm(dim=-1, keepdim=True)

    def has_text(value):
        # pd.isna covers None and every NaN object, unlike an identity test against np.nan.
        return not pd.isna(value) and value != ""

    def cosine(embedding):
        # Cosine similarity with the query, as a plain Python float.
        return util.cos_sim(embedding, input_search_embedding).item()

    # One [text, text_fr, image, mean, mean_fr] list per row; NaN marks a row
    # without legend (a numeric sentinel could collide with a real cosine value).
    sims = []
    for legend, en_legend, embedding in zip(base["legend"], base["en_legend"], base["embedding"]):
        sims.append([
            cosine(embedding[0]) if has_text(en_legend) else np.nan,
            cosine(embedding[1]) if has_text(legend) else np.nan,
            cosine(embedding[2]),
            cosine(embedding[3]),
            cosine(embedding[4]),
        ])

    score_columns = ["text_sim", "text_fr_sim", "image_sim", "mean_sim", "mean_fr_sim"]
    scores = pd.DataFrame(sims, columns=score_columns, index=base.index, dtype=float)

    # Rows without legend receive the average text score so they are neither
    # favoured nor penalised by the text criterion.
    for column in ("text_sim", "text_fr_sim"):
        scores[column] = scores[column].fillna(scores[column].mean())
    scores["total_mean"] = (scores["text_sim"] + scores["image_sim"] + scores["mean_sim"]) / 3

    base["sim"] = pd.Series(sims, index=base.index, dtype=object)
    for column in scores.columns:
        base[column] = scores[column]

    if input_search_type == "Texte & Image":
        sort_column = "total_mean"
    elif input_search_type == "Image":
        sort_column = "image_sim"
    else:
        sort_column = "text_sim"

    images = []
    best_rows = base.sort_values(sort_column, ascending=False).head(int(img_count))
    for _, row in best_rows.iterrows():
        # convert() returns an independent image, so the file can be closed right away.
        with Image.open(row["img_path"]) as source:
            image = source.convert("RGB")
        caption = "" if pd.isna(row["legend"]) else str(row["legend"])
        images.append((image, caption))
    return images

In [3]:
# Gradio interface: a search tab over an indexed corpus and a collection tab
# that downloads and indexes new Gallica documents.
with gr.Blocks() as demo:
    with gr.Tab("Recherche"):
        with gr.Row():
            # No corpus is selected until the list has been refreshed (0 was not a valid choice).
            corpus_select = gr.Dropdown([], label="Corpus", info="Choisissez un corpus", value=None)
            update_btn = gr.Button("Rafraîchir la liste des corpus")
        with gr.Row():
            # The default must be one of the choices; 0 was not.
            search_type = gr.Dropdown(["Texte & Image", "Image", "Texte"], label="Type de recherche", info="Choisissez un type de recherche", value="Texte & Image", interactive = False)
            search_txt = gr.Textbox(label="Recherche", info="Texte pour la recherche d'images", interactive = False)
        with gr.Row():
            img_count = gr.Slider(1, 25, value=5, label="Nombre d'images", info="Choisissez le nombre d'images à rechercher", step=1, interactive = False)
        with gr.Row():
            search_btn = gr.Button("Rechercher des images", interactive = False)
        with gr.Row():
            gallery = gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", columns=[3], rows=[1], object_fit="fill", height="auto", interactive = False)

    with gr.Tab("Collecte"):
        coll_name = gr.Textbox(label="Nom de collection", info="Entrez le nom de votre collection", interactive = True)
        search_ark = gr.Textbox(label="Arks", info="Identifiants Arks à collecter", interactive = True)
        search_ark_btn = gr.Button("Lancer la collecte et les traitements", interactive = True)
        output_ark = gr.Textbox(label="Console", info="Logs de la collecte", interactive = False)

    # The nine controls locked while a callback runs; deactivate() and
    # activate() return exactly one update per entry of this list.
    controls = [corpus_select, update_btn, search_type, search_txt, img_count, search_btn, coll_name, search_ark, search_ark_btn]

    # Every action follows the same chain: lock the controls, do the work, unlock the controls.
    search_btn.click(fn=deactivate, outputs=controls).then(search, inputs=[search_txt,search_type, img_count], outputs=[gallery]).then(fn=activate, outputs=controls)
    update_btn.click(fn=deactivate, outputs=controls).then(update_corpus, outputs=[corpus_select]).then(fn=activate, outputs=controls)
    search_ark_btn.click(fn=deactivate, outputs=controls).then(search_ark_fn, inputs=[search_ark, coll_name], outputs=[output_ark]).then(fn=activate, outputs=controls)
    corpus_select.change(fn=deactivate, outputs=controls).then(load_csv, inputs=[corpus_select], show_progress=True).then(fn=activate, outputs=controls)

demo.launch()



Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




manuels_histoire.csv
Château
cheval
cheval
chevaux
cartes geographie
Moustache
outils artisanat
Charlemagne
