## Import Libraries

In [38]:
from PIL import Image
from tqdm import tqdm
import numpy as np
from tqdm import tqdm
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
import tensorflow as tf
import glob
from umap import UMAP
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [3]:
base_model = VGG16(weights='imagenet')

model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc1').output)

In [20]:
files = glob.glob("../data/images/*.jpg")
files = [file.replace("\\", "/") for file in files]
len(files)

196

## Download the Image Data

In [36]:
def get_images(url = "https://collections.ushmm.org/search/?f%5Bf_images%5D%5B%5D=all_images&f%5Bf_images%5D%5B%5D=indiv_photographs&page=1&per_page=50"):
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    all_data = []
    total_page_nums = int(soup.find("ul", {"class": "pagination"}).find_all("li")[-1].text.strip())
    for i in range(1, 5):
        url = f"https://collections.ushmm.org/search/?f%5Bf_images%5D%5B%5D=all_images&f%5Bf_images%5D%5B%5D=indiv_photographs&page={i}&per_page=50"
        print(url)
        r = requests.get(url)
        soup = BeautifulSoup(r.content)
        items = soup.find_all("div", {"class": "document"})
        for item in items:
            try:
                title = item.find("a").text.strip()
                page_url = "https://collections.ushmm.org"+item.find("a")["href"]
                image_url = item.find("img")["src"]
                image_name = url.split("/")[-1]
                # image_file = f"../data/images/{image_name}.jpg"
                all_data.append(
                    (
                        title, page_url, image_url
                    )
                )
                # download_image(image_url)
            except:
                TypeError

    return all_data
image_data = get_images()

https://collections.ushmm.org/search/?f%5Bf_images%5D%5B%5D=all_images&f%5Bf_images%5D%5B%5D=indiv_photographs&page=1&per_page=50
https://collections.ushmm.org/search/?f%5Bf_images%5D%5B%5D=all_images&f%5Bf_images%5D%5B%5D=indiv_photographs&page=2&per_page=50
https://collections.ushmm.org/search/?f%5Bf_images%5D%5B%5D=all_images&f%5Bf_images%5D%5B%5D=indiv_photographs&page=3&per_page=50
https://collections.ushmm.org/search/?f%5Bf_images%5D%5B%5D=all_images&f%5Bf_images%5D%5B%5D=indiv_photographs&page=4&per_page=50


In [37]:
image_df = pd.DataFrame(image_data, columns=["title", "page_url", "image_url"])
image_df

Unnamed: 0,title,page_url,image_url
0,Jewish prisoners load textiles onto a truck fr...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
1,View of a destroyed building in Kovno which ho...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
2,Undertakers from the funeral home of M.B. Pink...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
3,German civilians carry corpses on stretchers t...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
4,"The Yiddish words, ""Jews Revenge!"" scrawled in...",https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
...,...,...,...
193,Studio portrait of a family of Dutch rescuers ...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
194,Two young children sit next to each other on c...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
195,Identification card issued to Moniek Szmulewic...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...
196,Close-up portrait of a religious Jewish couple.,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...


## Encode Images

In [49]:
def extract(model, img):
    # Resize the image
    img = img.resize((224, 224))
    
    # Convert the image color space
    img = img.convert('RGB')
    
    # Reformat the image
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    
    # Extract Features
    feature = model.predict(x)[0]
    return feature / np.linalg.norm(feature)


def get_feature(model, image_df):
    image_data = image_df.image_url.tolist()
    features = []
    for url in tqdm(image_data): # Iterate through images 
        # Extract Features
        try:
            feature = extract(model, img = Image.open(urlopen(url)))
            features.append(feature)
        except:
            features.append(None)
            continue
    return features

In [50]:
features = get_feature(model, image_df)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 198/198 [03:02<00:00,  1.09it/s]


In [43]:
# image_data = image_df.image_url.tolist()

In [48]:
# img = Image.open(urlopen(image_data[0]))
# feature = extract(model, img)
# feature

array([0.01323685, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [51]:
image_df["features"] = features
image_df

Unnamed: 0,title,page_url,image_url,features
0,Jewish prisoners load textiles onto a truck fr...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.01323685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.009764..."
1,View of a destroyed building in Kovno which ho...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03189113..."
2,Undertakers from the funeral home of M.B. Pink...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0008594266, 0.023347693, 0.0, 0.0, 0.0, 0.0..."
3,German civilians carry corpses on stretchers t...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.053543463, 0...."
4,"The Yiddish words, ""Jews Revenge!"" scrawled in...",https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.013709405, 0.0, 0.0, 0.0, 0.0, 0.01438..."
...,...,...,...,...
193,Studio portrait of a family of Dutch rescuers ...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.022584368, 0.0, 0.0, 0.0302029, 0..."
194,Two young children sit next to each other on c...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.010912351, 0...."
195,Identification card issued to Moniek Szmulewic...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.006256158, 0.0, 0.0, 0.0, 0.01771..."
196,Close-up portrait of a religious Jewish couple.,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.009990039, 0.0, 0.0, 0.0, 0.0074017523..."


## Clean the Features

In [78]:
cleaned_df = image_df.loc[image_df['features'].isna() == False]
cleaned_df

Unnamed: 0,title,page_url,image_url,features
0,Jewish prisoners load textiles onto a truck fr...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.01323685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.009764..."
1,View of a destroyed building in Kovno which ho...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03189113..."
2,Undertakers from the funeral home of M.B. Pink...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0008594266, 0.023347693, 0.0, 0.0, 0.0, 0.0..."
3,German civilians carry corpses on stretchers t...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.053543463, 0...."
4,"The Yiddish words, ""Jews Revenge!"" scrawled in...",https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.013709405, 0.0, 0.0, 0.0, 0.0, 0.01438..."
...,...,...,...,...
193,Studio portrait of a family of Dutch rescuers ...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.022584368, 0.0, 0.0, 0.0302029, 0..."
194,Two young children sit next to each other on c...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.010912351, 0...."
195,Identification card issued to Moniek Szmulewic...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.0, 0.006256158, 0.0, 0.0, 0.0, 0.01771..."
196,Close-up portrait of a religious Jewish couple.,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.0, 0.009990039, 0.0, 0.0, 0.0, 0.0074017523..."


## Dimensionality Reduction

In [88]:
output_file = "../data/ushmm_image_coords.csv"

In [81]:
clean_features = cleaned_df.features.tolist()

In [89]:
# Reduce the dimensions with UMAP
umap = UMAP()
X_tfm = umap.fit_transform(clean_features)

# Apply coordinates
cleaned_df['x'] = X_tfm[:, 0]
cleaned_df['y'] = X_tfm[:, 1]
cleaned_df.to_csv(output_file, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['x'] = X_tfm[:, 0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df['y'] = X_tfm[:, 1]


In [91]:
cleaned_df.head(1)

Unnamed: 0,title,page_url,image_url,features,x,y
0,Jewish prisoners load textiles onto a truck fr...,https://collections.ushmm.org/search/catalog/p...,https://collections.ushmm.org/iiif-b/assets/th...,"[0.01323685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.009764...",4.065477,9.364377


## Function for Performing the Above Tasks

## Visualizing our Clusters

In [84]:
from bokeh.io import curdoc
from bokeh.layouts import column, row
from bokeh.models import (Button, ColumnDataSource, DataTable, TableColumn, TextInput)
from bokeh.plotting import figure, show
from bokeh.models import DataTable, TableColumn, ColorBar, HTMLTemplateFormatter, Spinner, RangeSlider
from bokeh.io import output_notebook
from bokeh.application import Application
from bokeh.application.handlers import FunctionHandler
import numpy as np

In [85]:
output_notebook()

In [98]:
def bulk_text(path, keywords=None):
    df = pd.read_csv(path)
    df['alpha'] = 0.5
    if keywords:
        df['color'] = [determine_keyword(str(t), keywords) for t in df['text']]
        df['alpha'] = [0.4 if c == 'none' else 1 for c in df['color']]

    highlighted_idx = []

    # mapper, df = get_color_mapping(df)
    columns = [
        TableColumn(field="title", title="title", width=500),
        TableColumn(field="image_url", title="image", formatter=HTMLTemplateFormatter(template='<img src="<%= image_url %>" width=60>')),
        TableColumn(field="image_url", title="download", formatter=HTMLTemplateFormatter(template=r'<a href="<%= image_url %>", target="_blank">Download Image</a>')),
    ]

    def update(attr, old, new):
        """Callback used for plot update when lasso selecting"""
        global highlighted_idx
        subset = df.iloc[new]
        highlighted_idx = new
        subset = subset.iloc[np.random.permutation(len(subset))]
        source.data = subset

    def save():
        """Callback used to save highlighted data points"""
        global highlighted_idx
        df.iloc[highlighted_idx][['text']].to_csv(text_filename.value, index=False)

    source = ColumnDataSource(data=dict())
    source_orig = ColumnDataSource(data=df)

    data_table = DataTable(source=source, columns=columns, width=700, height=700)
    source.data = df

    p = figure(title="", sizing_mode="scale_both", tools=["lasso_select", "box_select", "pan", "box_zoom", "wheel_zoom", "reset"])
    p.toolbar.active_drag = None
    p.toolbar.active_inspect = None

    circle_kwargs = {"x": "x", "y": "y", "size": 1, "source": source_orig, "alpha": "alpha"}

    scatter = p.circle(**circle_kwargs)
    p.plot_width = 500
    if "color" in df.columns:
        p.plot_width=350
    p.plot_height = 700
    ## Spinner for Node Size
    spinner = Spinner(title="Circle Size", low = 1, high=60, step=1, value=scatter.glyph.size, width=200)
    spinner.js_link("value", scatter.glyph, "size")
    
    ## Adjust Row Height
    row_spinner = Spinner(title="Row Height", low = 50, high=1000, step=10, value=data_table.row_height, width=200)
    row_spinner.js_link("value", data_table, "row_height")
        
    scatter.data_source.selected.on_change('indices', update)

    text_filename = TextInput(value="out.csv", title="Filename:")
    save_btn = Button(label="SAVE")
    save_btn.on_click(save)

    plot = column(p)
    controls_main = column(spinner, text_filename)
    controls = column(row_spinner, save_btn)
    
    def make_doc(doc):
        doc.add_root(row(spinner, row_spinner))
        doc.add_root(row(plot, data_table))
        doc.add_root(row(text_filename))
        doc.add_root(row(save_btn))
    handler = FunctionHandler(make_doc)
    app=Application(handler)
    return app


app = bulk_text(output_file)
show(app)

ERROR:tornado.application:Uncaught exception GET /autoload.js?bokeh-autoload-element=2472&bokeh-absolute-url=http://localhost:63407&resources=none (127.0.0.1)
HTTPServerRequest(protocol='http', host='localhost:63407', method='GET', uri='/autoload.js?bokeh-autoload-element=2472&bokeh-absolute-url=http://localhost:63407&resources=none', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
  File "C:\Users\wma22\anaconda3\lib\site-packages\tornado\web.py", line 1704, in _execute
    result = await result
  File "C:\Users\wma22\anaconda3\lib\site-packages\bokeh\server\views\autoload_js_handler.py", line 62, in get
    session = await self.get_session()
  File "C:\Users\wma22\anaconda3\lib\site-packages\bokeh\server\views\session_handler.py", line 144, in get_session
    session = await self.application_context.create_session_if_needed(session_id, self.request, token)
  File "C:\Users\wma22\anaconda3\lib\site-packages\bokeh\server\contexts.py", line 243, in create_s