# Imagetools
A collection of machine learning techniques to make sense of large image datasets.

In [None]:
%%capture
!rm -rf toolbox
!git clone https://github.com/zentralwerkstatt/toolbox
!pip3 install git+https://github.com/openai/CLIP.git
!pip3 install umap-learn

import clip
from toolbox import toolbox
import numpy as np
import PIL.Image
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans
from datetime import datetime
import pandas as pd

In [None]:
# Sample CSV file: Rijksmuseum collection data
# Simply upload your own CSV file to the notebook instance, or download it programmatically as in the example below.
# NOTE(review): the name of the CSV inside the archive is not visible here - make sure `csv_path` in the next cell points to the extracted file.
!wget https://github.com/Rijksmuseum/rijksmuseum.github.io/releases/download/1.0.0/202001-rma-csv-collection.zip
# Extract the archive into the current working directory.
!unzip 202001-rma-csv-collection.zip -d .

In [None]:
#@title Path to your CSV file with metadata and URLs
dropna = True #@param {type:"boolean"}

csv_path = "urls.csv" #@param {type:"string"}

# Read the metadata table. When `dropna` is checked, every row that has at
# least one missing value is discarded.
df = pd.read_csv(csv_path)
df = df.dropna() if dropna else df

# By default the whole table is selected, so the filter cell can be skipped.
selection = df

print(len(df), "data points")
df.head()

In [None]:
#@title CSV filters
column = "objectTitle[1]" #@param {type:"string"}
#@markdown contains (can contain multiple values separated by comma)
value = "draak, slang" #@param {type:"string"}

# Split the form input into individual search terms. Blank entries (e.g. from
# a trailing comma) are dropped because an empty term would match every row.
values = [term.strip() for term in value.split(",") if term.strip()]

# A row is selected when the column contains ANY of the terms. The per-term
# masks are OR-ed together so that a row matching several terms is kept once
# (concatenating one sub-frame per term would duplicate it) and rows stay in
# `df` order.
#   regex=False -> terms are literal substrings, matched case-insensitively
#   na=False    -> missing cells never match; a NaN inside the mask would make
#                  the boolean indexing below fail
mask = pd.Series(False, index=df.index)
for term in values:
    mask |= df[column].str.contains(term, case=False, regex=False, na=False)

# With no usable term at all, no filter is applied.
selection = df.loc[mask] if values else df

# Add additional filters here

print(len(selection), "data points")
selection.head()

In [None]:
#@title Download images for selected rows
column = "url" #@param {type:"string"}
#@markdown Resize to max.:
size = 200 #@param {type:"integer"}

# Images are stored in a folder named after the CSV file, without directories
# or extension (e.g. "data/urls.csv" -> "urls").
data_path = csv_path.split("/")[-1].split(".")[0]
toolbox.new_dir(data_path)

urls = selection[column]
for i, url in enumerate(tqdm(urls)):
    img = toolbox.img_from_url(url)
    # LANCZOS is the filter that used to be exposed as ANTIALIAS. The ANTIALIAS
    # alias was removed in Pillow 10, where using it raises AttributeError.
    img.thumbnail((size, size), PIL.Image.LANCZOS)
    # The file name carries the position of the row within `selection`.
    img.save(f"{data_path}/{i:05d}.jpg")
    # Do this every iteration to allow early stopping
    img_paths = toolbox.get_all_files(data_path, ext="jpg")

In [None]:
# Alternative to the download cell: collect image files that already exist on
# disk, capped at the first 500 paths returned.
# Only .jpg files are requested, as in the download cell, so that other files
# (CSV, archives, ...) cannot end up in the list of images.
# NOTE(review): the "" folder argument is presumably the working directory - confirm.
# NOTE(review): running this replaces the `img_paths` produced by the download
# cell, so positions may no longer correspond to rows of `selection`.
img_paths = toolbox.get_all_files("", ext="jpg")[:500]

In [None]:
#@title Sample the dataset
no_samples = 3 #@param {type:"integer"}

for sample_no in range(no_samples):
    # Pick one image path at random and shrink the loaded image via thumbnail().
    img = toolbox.load_img(np.random.choice(img_paths))
    img.thumbnail((200, 200))

    # One row per pixel, one column per color plane.
    pixels = np.array(img).reshape(-1, 3)

    # Cluster the pixel colors into 5 groups; the cluster centers are the
    # colors handed to the palette helper.
    color_model = KMeans(n_clusters=5)
    color_model.fit(pixels)
    palette = toolbox.make_palette(color_model.cluster_centers_)

    toolbox.show_img(img)
    toolbox.show_img(palette)

In [None]:
#@title Generate overview grid plot
thumb_size = 30 #@param {type:"integer"}
show_in_notebook = True #@param {type:"boolean"}
#@markdown (If not checked will save to file only to save notebook memory.)

# Build one grid image from all collected image paths.
grid_plot = toolbox.plot_imgs_grid(img_paths, thumb_size)

# The plot is always written to disk under a timestamped file name;
# showing it inline is optional.
timestamp = datetime.now().strftime("%d-%b-%Y-%H-%M-%S")
grid_file = f"{timestamp}_grid_plot.jpg"
grid_plot.save(grid_file)

if show_in_notebook:
    toolbox.show_img(grid_plot)

In [None]:
#@title Generate brightness cluster plot
show_in_notebook = True #@param {type:"boolean"}
#@markdown (If not checked will save to file only to save notebook memory.)

# Size argument passed to toolbox.flatten_img; every image fills one feature
# row of flat_side * flat_side * 3 values.
flat_side = 32

print("Extracting features")
features = np.zeros((len(img_paths), flat_side * flat_side * 3))
for row_no, img_path in enumerate(tqdm(img_paths)):
    features[row_no] = toolbox.flatten_img(toolbox.load_img(img_path), flat_side)

print("Reducing dimensionality")
reduced_features = toolbox.reduce_features(features)

print("Plotting images")
brightness_plot = toolbox.plot_imgs_features(img_paths, 50, reduced_features)

# Always save under a timestamped name; inline display is optional.
timestamp = datetime.now().strftime("%d-%b-%Y-%H-%M-%S")
brightness_file = f"{timestamp}_brightness_plot.jpg"
brightness_plot.save(brightness_file)

if show_in_notebook:
    toolbox.show_img(brightness_plot)

In [None]:
#@title Generate CLIP cluster plot
show_in_notebook = True #@param {type:"boolean"}
#@markdown (If not checked will save to file only to save notebook memory.)

# Width of one feature row.
# NOTE(review): presumably the length of the vector returned by toolbox.CLIP_img - confirm.
embedding_dim = 512

print("Extracting features")
features = np.zeros((len(img_paths), embedding_dim))
for row_no, img_path in enumerate(tqdm(img_paths)):
    features[row_no] = toolbox.CLIP_img(toolbox.load_img(img_path))

print("Reducing dimensionality")
reduced_features = toolbox.reduce_features(features)

print("Plotting images")
clip_plot = toolbox.plot_imgs_features(img_paths, 50, reduced_features)

# Always save under a timestamped name; inline display is optional.
timestamp = datetime.now().strftime("%d-%b-%Y-%H-%M-%S")
clip_file = f"{timestamp}_CLIP_plot.jpg"
clip_plot.save(clip_file)

if show_in_notebook:
    toolbox.show_img(clip_plot)

In [None]:
#@title Extract clusters
n_clusters = 5 #@param {type:"integer"}

# Clusters the `features` matrix currently in memory, i.e. the one produced by
# whichever feature-extraction cell was run last.
km = KMeans(n_clusters=n_clusters)
km.fit(features)

# Group the image paths by their assigned label in a single pass. Every
# cluster id gets an entry, and paths keep the order of `img_paths`.
clusters = {c: [] for c in range(n_clusters)}
for i, img_path in enumerate(img_paths):
    clusters[int(km.labels_[i])].append(img_path)

# Show one grid plot per cluster, in cluster-id order.
for members in clusters.values():
    toolbox.show_img(toolbox.plot_imgs_grid(members, 50))

In [None]:
#@title Add back metadata
column = "objectTitle[1]" #@param {type:"string"}
#@markdown contains
value = "draak" #@param {type:"string"}
show_in_notebook = True #@param {type:"boolean"}
#@markdown (If not checked will save to file only to save notebook memory.)

# One border color per image: p[0] when the metadata cell contains `value`
# (case-insensitive), p[1] otherwise.
# Image i is paired with the i-th row of `selection`, so `img_paths` has to be
# in the same order as the rows the images were downloaded from.
borders = []
p = toolbox.random_palette(2) # No. of colors
needle = value.lower()
for i, img_path in enumerate(img_paths):
    row = selection.iloc[i]
    cell = row[column]
    # Missing cells (NaN) never match, and non-string cells are compared via
    # their text form; calling .lower() on them directly would raise.
    is_match = (not pd.isna(cell)) and needle in str(cell).lower()
    if is_match:
        borders.append(p[0])
    # Add other conditions/colors here
    else:
        borders.append(p[1])

print("Plotting images")
plot = toolbox.plot_imgs_features(img_paths, 50, reduced_features, borders)

# Always save under a timestamped name; inline display is optional.
timestamp = datetime.now().strftime("%d-%b-%Y-%H-%M-%S")
plot.save(f"{timestamp}_CLIP_plot_classes.jpg")
if show_in_notebook:
    toolbox.show_img(plot)