## Python package installation

In [None]:
! pip install SPARQLWrapper
! pip install pandas
! pip install requests
! pip install Pillow
! pip3 install -U scikit-learn
! pip install numpy
! pip install ipywidgets

# Images Download

In [None]:
import sys
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import requests
import shutil
import os

# SPARQL endpoint of the Wikidata Query Service.
endpoint_url = "https://query.wikidata.org/sparql"

# Select up to 100 items that are a subclass of (P279) wd:Q144 and that have an
# image (P18), together with their English label.
# NOTE(review): wd:Q144 is presumably "dog" (the tags used later describe dogs) - confirm.
query = """SELECT ?item ?itemLabel ?image WHERE{

  ?item wdt:P279 wd:Q144;
          wdt:P18 ?image.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}LIMIT 100"""


def get_results(endpoint_url, query):
    """Run a SPARQL query against an endpoint and return the decoded JSON.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        The converted result of the query, requested in JSON format.
    """
    # The user agent advertises the major.minor version of the running Python.
    major = sys.version_info[0]
    minor = sys.version_info[1]
    user_agent = "WDQS-example Python/{}.{}".format(major, minor)

    client = SPARQLWrapper(endpoint_url, agent=user_agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()


# Fetch the query results and echo the raw response for inspection.
results = get_results(endpoint_url, query)

print(results)

# Flatten every binding into a (label, item URI, image URL) tuple.
array = [
    (
        binding["itemLabel"]["value"],
        binding["item"]["value"],
        binding["image"]["value"],
    )
    for binding in results["results"]["bindings"]
]

dataframe = pd.DataFrame(array, columns=["itemLabel", "item", "image"])


def download_image(url, directory='./content/images', timeout=30):
    """Download one image into `directory`, best effort.

    The file is stored under the last path component of `url`. Nothing is
    written unless the server answers with HTTP 200.

    Args:
        url: Address of the image to download.
        directory: Destination folder; created when it does not exist.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the whole download run.

    Returns:
        The HTTP status code of the response, or None when the download
        failed with an exception (the error is printed, not raised).
    """
    try:
        os.makedirs(directory, exist_ok=True)

        headers = {"User-Agent": "Mozilla/5.0"}
        # A streamed response keeps its connection open until it is closed,
        # hence the context manager.
        with requests.get(
            url,
            allow_redirects=True,
            headers=headers,
            stream=True,
            timeout=timeout,
        ) as response:
            if response.status_code == 200:
                image_path = os.path.join(directory, os.path.basename(url))
                with open(image_path, "wb") as image:
                    # Let urllib3 undo gzip/deflate transfer encoding while
                    # the raw stream is copied to disk.
                    response.raw.decode_content = True
                    shutil.copyfileobj(response.raw, image)
            return response.status_code
    except Exception as e:
        # Deliberately broad: one failing URL must not abort the batch.
        print("An error occurred:", str(e))
        return None


# Download every image URL of the dataframe; yields one status code per row.
dataframe["image"].apply(download_image)

# Exif Extraction

In [None]:
from re import I
import os, sys
from PIL import Image ,TiffImagePlugin
from PIL.ExifTags import TAGS
from urllib.parse import urlparse, unquote
import json


folder_path = './content/images'
folder_dataexif_path = './content/exif/'

# Value types that are skipped because they are not kept in the JSON export.
exif_skipped_types = (TiffImagePlugin.IFDRational, bytes, tuple, dict)
# Only tags whose name contains one of these fragments are exported.
exif_wanted_tags = ('ImageDescription', 'Orientation')

# Paths of the files whose EXIF data could not be read.
errors = []

file_count = 0
exif_count = 0

# Maps a file name to the EXIF tags kept for it.
exif_images = {}

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if not os.path.isfile(file_path):
        continue
    file_count += 1

    try:
        # The context manager releases the file handle right after reading.
        with Image.open(file_path) as imgfile:
            exif_data = imgfile._getexif()
    except Exception:
        # Unreadable file or a format without `_getexif`: record and go on.
        errors.append(file_path)
        continue

    if not exif_data:
        continue
    exif_count += 1

    exif_dict = {}
    for tag, value in exif_data.items():
        if isinstance(value, exif_skipped_types):
            continue
        tagname = TAGS.get(tag, tag)
        # Unknown tags keep their numeric id; they cannot match a tag name
        # and must not abort the extraction for the whole file.
        if not isinstance(tagname, str):
            continue
        if any(wanted in tagname for wanted in exif_wanted_tags):
            exif_dict[tagname] = value

    if exif_dict:
        exif_images[filename] = exif_dict

# Always write the file (possibly as an empty object) so that later cells
# reading ./content/exif.json do not fail when no image carries EXIF data.
with open("./content/exif.json", "w") as file:
    json.dump(exif_images, file, indent=4)


print(
    "Summary: {} files scanned, {} has exif data -> {} errors ".format(
        file_count, exif_count, len(errors)
    )
)
print(errors)

# Dominant Colors Extraction

In [2]:
import os
import json
import numpy as np
from PIL import Image
from sklearn.cluster import KMeans


def get_dominant_color(image_path, num_colors=3, random_state=0):
    """Return the dominant RGB colour of an image.

    The image is converted to RGB, shrunk to 150x150 and its pixels are
    clustered with k-means; the centre of the most populated cluster is the
    dominant colour.

    Args:
        image_path: Path of the image file.
        num_colors: Number of k-means clusters.
        random_state: Seed given to k-means so that results are reproducible.

    Returns:
        A numpy array of three floats (red, green, blue).
    """
    with Image.open(image_path) as image:
        # Force three channels: RGBA, greyscale or palette images would
        # otherwise be reshaped into meaningless pixel triples.
        thumbnail = image.convert('RGB').resize((150, 150))

    pixels = np.asarray(thumbnail).reshape(-1, 3)
    kmeans = KMeans(n_clusters=num_colors, n_init=10, random_state=random_state)
    labels = kmeans.fit_predict(pixels)

    # Cluster order is arbitrary, so pick the cluster holding the most pixels.
    largest_cluster = np.bincount(labels, minlength=num_colors).argmax()
    return kmeans.cluster_centers_[largest_cluster]

image_dir = './content/images'

num_colors = 3

# Maps a file name to its dominant colour as a [red, green, blue] list.
metadata = {}

for filename in os.listdir(image_dir):
    image_path = os.path.join(image_dir, filename)
    if not os.path.isfile(image_path):
        continue

    try:
        dominant_color = get_dominant_color(image_path, num_colors)
    except (OSError, ValueError, Image.DecompressionBombError) as e:
        # A file that is not a readable raster image is skipped instead of
        # aborting the run before anything has been written.
        print("An error occurred:", str(e))
        continue

    metadata[filename] = dominant_color.tolist()

with open('./content/dominants_colors.json', 'w') as f:
    json.dump(metadata, f, indent=4)


# Image Tagging

In [None]:
import ipywidgets as widgets
from PIL import Image
import os
import json

class ImageTaggerApp:
    """Notebook widget used to tag images by hand, one image at a time.

    One checkbox is shown per tag. The "previous" and "next" buttons store
    the ticked tags of the displayed image and move through the image list
    (wrapping around at both ends). The "finish" button writes every stored
    tag set to ./content/tags.json.

    Attributes are plain instance fields:
      matrix         image path -> {tag description: bool}
      images         list of image paths
      tags           list of tag descriptions
      current_index  index in `images` of the displayed image
    """

    def __init__(self, images, tags):
        """Build the widgets, wire the buttons and display the first image.

        Args:
            images: List of image paths to tag.
            tags: List of tag names; one checkbox is created for each.
        """
        self.matrix = {}

        self.images = images
        self.tags = tags
        self.current_index = 0

        self.image_output = widgets.Output()
        self.tag_checkboxes = [widgets.Checkbox(description=tag, value=False) for tag in tags]
        self.prev_button = widgets.Button(description='Précédent')
        self.next_button = widgets.Button(description='Suivant')
        self.finish_button = widgets.Button(description='Terminer')

        self.prev_button.on_click(self.show_previous_image)
        self.next_button.on_click(self.show_next_image)
        self.finish_button.on_click(self.finish)

        self.show_image()
        self.show()

    def show_image(self):
        """Render the current image, scaled to a height of 600 pixels."""
        height = 600
        with self.image_output:
            self.image_output.clear_output(wait=True)
            # resize() returns an independent image, so the file handle can
            # be released before the picture is displayed.
            with Image.open(self.images[self.current_index]) as img:
                width = int(img.width * (height / img.height))
                resized = img.resize((width, height))
            display(resized)

    def show(self):
        """Display the image area, the tag checkboxes and the buttons."""
        display(self.image_output)
        display(widgets.VBox(self.tag_checkboxes))
        display(widgets.HBox([self.prev_button, self.next_button, self.finish_button]))

    def show_next_image(self, _):
        """Store the current tags and move to the next image (wraps around)."""
        self.save_tags()
        self.current_index = (self.current_index + 1) % len(self.images)
        self.set_tags()
        self.show_image()

    def show_previous_image(self, _):
        """Store the current tags and move to the previous image (wraps around)."""
        self.save_tags()
        self.current_index = (self.current_index - 1) % len(self.images)
        self.set_tags()
        self.show_image()

    def finish(self, _):
        """Write all stored tags to ./content/tags.json.

        The tags of the image currently on screen are stored first;
        otherwise they would be lost unless the user had navigated away
        from that image before finishing.
        """
        self.save_tags()
        os.makedirs("./content", exist_ok=True)
        with open("./content/tags.json", "w") as file:
            json.dump(self.matrix, file)

    def save_tags(self):
        """Record the checkbox states for the current image in `matrix`."""
        image_path = self.images[self.current_index]
        self.matrix[image_path] = {
            checkbox.description: checkbox.value
            for checkbox in self.tag_checkboxes
        }

    def set_tags(self):
        """Restore the checkboxes for the current image, or untick them all."""
        image_path = self.images[self.current_index]
        saved = self.matrix.get(image_path)

        for checkbox in self.tag_checkboxes:
            if saved is not None:
                checkbox.value = saved[checkbox.description]
            else:
                checkbox.value = False

def main():
    """Start the tagging widget over every entry of ./content/images."""
    directory = "./content/images"

    image_paths = []
    for entry in os.listdir(directory):
        image_paths.append(os.path.join(directory, entry))

    tags = [
        'petit',
        'moyen',
        'grand',
        'poils courts',
        'poils long',
        'oreilles tombantes',
        'oreilles relevés',
        'museau plat',
        'museau moyen',
        'museau alongé',
    ]
    ImageTaggerApp(image_paths, tags)


if __name__ == "__main__":
    main()

# User rating app & Data Analysis 

In [None]:
from sklearn import tree
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import json
import random
import ipywidgets as widgets
from IPython.display import display, clear_output
import os
from PIL import Image

def extract_data_from(path):
    """Read the JSON document stored at `path` and return the parsed value."""
    with open(path) as handle:
        content = handle.read()
    return json.loads(content)



# Reads one tag as 0/1, accepting several spellings of its key.
def _tag_flag(image_tags, *spellings):
    """Return 1 when the tag is set and 0 otherwise.

    Args:
        image_tags: Mapping of tag name to boolean for one image.
        *spellings: Accepted key spellings, tried in order.

    Raises:
        KeyError: When none of the spellings is present.
    """
    for key in spellings:
        if key in image_tags:
            return int(bool(image_tags[key]))
    raise KeyError(spellings[0])


# Extraction of the feature matrix of the tagged images
def extract_matrix():
    """Build one feature vector per tagged image.

    Reads ./content/tags.json, ./content/exif.json and
    ./content/dominants_colors.json. A missing exif.json is treated as
    "no EXIF data for any image".

    Returns:
        A dict mapping each key of tags.json to a 14-item list: ten 0/1 tag
        flags (petit, moyen, grand, poils courts, poils long, oreilles
        tombantes, oreilles relevés, museau plat, museau moyen, museau
        alongé), the EXIF orientation (0 when unknown) and the red, green
        and blue components of the dominant colour ([0, 0, 0] when unknown).

    Raises:
        KeyError: When an image lacks one of the ten tags.
    """
    tags = extract_data_from('./content/tags.json')
    try:
        exifs = extract_data_from('./content/exif.json')
    except FileNotFoundError:
        exifs = {}
    dominants_colors = extract_data_from('./content/dominants_colors.json')

    # The accented keys are also accepted in their mis-decoded form
    # (UTF-8 bytes read as Latin-1), which earlier tags.json files contain.
    tag_spellings = (
        ("petit",),
        ("moyen",),
        ("grand",),
        ("poils courts",),
        ("poils long",),
        ("oreilles tombantes",),
        ("oreilles relevés", "oreilles relevÃ©s"),
        ("museau plat",),
        ("museau moyen",),
        ("museau alongé", "museau alongÃ©"),
    )

    matrix = {}
    for image, image_tags in tags.items():
        # exif.json and dominants_colors.json are keyed by bare file name.
        image_name = os.path.basename(image)

        exif = exifs.get(image_name) or {}
        orientation = exif.get('Orientation')
        if orientation is None:
            orientation = 0

        # Fall back to a neutral colour rather than reusing the colour of the
        # previously processed image (or failing on the very first one).
        dominant_color = dominants_colors.get(image_name)
        if dominant_color is None:
            dominant_color = [0, 0, 0]

        node = [_tag_flag(image_tags, *spellings) for spellings in tag_spellings]
        node.extend(
            [
                orientation,
                dominant_color[0],
                dominant_color[1],
                dominant_color[2],
            ]
        )

        matrix[image] = node

    return matrix

# Random selection of directory entries
def pick_random_images(path, nb):
    """Return `nb` distinct entry names drawn at random from directory `path`."""
    candidates = os.listdir(path)
    selection = random.sample(candidates, k=nb)
    return selection


def transform_matrix(data):
    """Turn binary feature rows into rows of categorical labels.

    Each input row holds, by index:
      0-2   size flags    (petit, moyen, grand)
      3-4   coat flags    (courts, longs)
      5-6   ear flags     (tombantes, relevés)
      7-9   muzzle flags  (plat, moyen, alongé)
      10    orientation value
      11-13 red, green and blue components

    Example:
      [0,0,1,0,1,1,0,0,0,1, 1, 123, 321, 2]
        -> ["grand", "longs", "tombantes", "alongé", "vertical", 123, 321, 2]

    Within a group the first flag equal to 1 gives the label; every further
    flag equal to 1 in the same group prints an error message. A group with
    no flag set yields None. Orientation is 'vertical' when the value is 1
    and 'paysage' otherwise.

    Returns:
        A list with one 8-item list per input row.
    """
    # (index, label) pairs grouped by attribute, in output column order.
    attribute_groups = (
        ((0, 'petit'), (1, 'moyen'), (2, 'grand')),
        ((3, 'courts'), (4, 'longs')),
        ((5, 'tombantes'), (6, 'relevés')),
        ((7, 'plat'), (8, 'moyen'), (9, 'alongé')),
    )

    transformed_data = []
    for row in data:
        labels = []
        for group in attribute_groups:
            chosen = None
            for index, label in group:
                if row[index] != 1:
                    continue
                if chosen is None:
                    chosen = label
                else:
                    print("Error during attributes set")
            labels.append(chosen)

        orientation = 'vertical' if row[10] == 1 else 'paysage'

        transformed_data.append(labels + [orientation, row[11], row[12], row[13]])

    return transformed_data
    


class ImageRatingApp:
    """Notebook widget collecting like/dislike ratings and recommending images.

    The user rates each image of `image_list` in turn. After the last one a
    logistic regression is trained on the rated images and the ten unrated
    images with the highest predicted probability of 'like' are displayed.

    Attributes are plain instance fields:
      matrix         image path -> rating ('like' or 'dislike')
      ratings        index in `image_list` -> rating
      image_list     list of image paths to rate
      current_index  index of the image currently displayed
    """

    def __init__(self, image_list):
        """Build the widgets and display the first image.

        Args:
            image_list: List of image paths the user is asked to rate.
        """
        self.matrix = {}

        self.image_list = image_list
        self.current_index = 0
        self.ratings = {}
        self.image_output = widgets.Output()
        self.rating_dropdown = widgets.RadioButtons(options=['like', 'dislike'])
        self.next_button = widgets.Button(description='Suivant')
        self.next_button.on_click(self.next_image)

        self.show_image()
        self.show()

    def show_image(self):
        """Render the current image (600 px high), the rating choice and the button."""
        height = 600
        with self.image_output:
            clear_output(wait=True)

            # resize() returns an independent image, so the file handle can
            # be released before the picture is displayed.
            with Image.open(self.image_list[self.current_index]) as img:
                width = int(img.width * (height / img.height))
                resized = img.resize((width, height))
            display(resized)

            display(self.rating_dropdown)
            display(self.next_button)

    def show(self):
        """Display the output area holding the image and the controls."""
        display(self.image_output)

    def next_image(self, b):
        """Store the rating of the displayed image and move on.

        The rating is stored before the index advances so that it is
        attached to the image the user was actually looking at. A click
        without a selected rating, or after the last image, is ignored.

        Args:
            b: The clicked button (unused).
        """
        if self.current_index >= len(self.image_list):
            return
        rating = self.rating_dropdown.value
        if rating is None:
            return

        self.ratings[self.current_index] = rating
        self.save_rating()

        self.current_index += 1
        if self.current_index < len(self.image_list):
            self.show_image()
            self.set_rating()
        else:
            self.finish_rating()

    def finish_rating(self):
        """Train the model and display the ten best unrated images."""
        sorted_images_predictions = self.compute_proba()

        # Keep the ten highest probabilities among images the user has not rated.
        images_to_display = [
            (path, proba)
            for path, proba in sorted_images_predictions.items()
            if path not in self.matrix
        ][:10]

        clear_output(wait=True)

        for path, proba in images_to_display:
            with open(path, "rb") as file:
                img = file.read()
            display(widgets.Image(
                value=img,
                format='png',
                width=300,
                height=400,
            ))
            display(widgets.Label(value=str(proba)))

    def compute_proba(self):
        """Predict the probability of 'like' for every tagged image.

        Trains a logistic regression on the rated images, scores every image
        returned by `extract_matrix` and writes the scores to
        ./content/prediction.json.

        Returns:
            A dict mapping image key to probability, sorted by decreasing
            probability.
        """
        # The vector extraction
        images_matrix = extract_matrix()

        # Features and labels are collected together so that they stay
        # aligned when a rated image has no feature vector.
        dataframe_array = []
        result_array = []
        for image, rating in self.matrix.items():
            vector = images_matrix.get(image)
            if vector is None:
                print("Error during vector extraction")
                continue
            dataframe_array.append(vector)
            result_array.append(rating)

        dataframe_array = transform_matrix(dataframe_array)

        # LabelEncoder sorts its classes: 'dislike' -> 0, 'like' -> 1.
        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(result_array)

        # The first five columns are categorical labels, the last three are
        # the colour components.
        categorical_features = [0, 1, 2, 3, 4]
        numerical_features = [5, 6, 7]

        preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
                ('num', StandardScaler(), numerical_features)
            ])

        X_encoded = preprocessor.fit_transform(dataframe_array)

        X_train, _, y_train, _ = train_test_split(
            X_encoded, labels, test_size=0.2, random_state=42
        )

        # Train a logistic regression model
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)

        # Score all images in one batch; column 1 is the positive class.
        images_predictions = {}
        image_names = list(images_matrix)
        if image_names:
            vectors = transform_matrix([images_matrix[name] for name in image_names])
            probabilities = model.predict_proba(preprocessor.transform(vectors))[:, 1]
            images_predictions = {
                name: float(proba)
                for name, proba in zip(image_names, probabilities)
            }

        sorted_images_predictions = dict(
            sorted(images_predictions.items(), key=lambda item: item[1], reverse=True)
        )

        # All predictions are saved in prediction.json
        with open("./content/prediction.json", "w") as file:
            json.dump(sorted_images_predictions, file)

        return sorted_images_predictions

    def save_rating(self):
        """Record the selected rating for the current image in `matrix`."""
        image_path = self.image_list[self.current_index]
        self.matrix[image_path] = self.rating_dropdown.value

    def set_rating(self):
        """Show the stored rating of the current image, or clear the selection."""
        image_path = self.image_list[self.current_index]
        self.rating_dropdown.value = self.matrix.get(image_path)



def main():
    """Pick seven random images and start the rating widget on them."""
    image_folder = "./content/images"

    # Random images pick up, each prefixed with the folder
    selected_images = [
        image_folder + "/" + image
        for image in pick_random_images(image_folder, 7)
    ]

    # Rating app launch
    ImageRatingApp(selected_images)


if __name__ == "__main__":
    main()

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xe1\x1eKExif\x00\x00II*\x00\x…

Label(value='0.9897312549399508')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xe1\x130Exif\x00\x00II*\x00\x…

Label(value='0.9896756574608203')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x02\x01\x00\xf0\x00\xf0\x00\x00\xff\xe1\x15<Exif\x00\x00II*…

Label(value='0.9884291892360776')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x02\x01\x00H\x00H\x00\x00\xff\xe1\x14\xf5Exif\x00\x00MM\x00…

Label(value='0.9832578631391183')

Image(value=b'\xff\xd8\xff\xe1\x1b\x87Exif\x00\x00II*\x00\x08\x00\x00\x00\x0f\x00\x00\x01\x03\x00\x01\x00\x00\…

Label(value='0.9765353405523279')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00`\x00`\x00\x00\xff\xe1\x87\x0bExif\x00\x00II*\x0…

Label(value='0.9763321458942955')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x06\x04\x05\x06\…

Label(value='0.976039322758252')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xe1\x00\x82Exif\x00\x00II*\x0…

Label(value='0.9738356511397653')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x02\x01\x01,\x01,\x00\x00\xff\xe1 \xf6Exif\x00\x00MM\x00*\x…

Label(value='0.9722343086895832')

Image(value=b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x02\x01\x00H\x00H\x00\x00\xff\xe1,{Exif\x00\x00MM\x00*\x00\…

Label(value='0.9707504879324116')