This work is inspired by the blog post of Maciej D. Korzec: https://towardsdatascience.com/recommending-similar-images-using-pytorch-da019282770c

In [2]:
import os
from PIL import Image
from torchvision import transforms

In [3]:

# Target size passed to transforms.Resize before the images are fed to the CNN.
inputDim = (224,224)
# Directories (absolute, machine-specific paths; adjust them when running elsewhere).
# inputDir: folder with the original cropped illustration images.
inputDir = "C:/Users/telmi/Documents/dhh23/EarlyModernReuse/early_modern_data-main/data/all_images/cropped/illustration" # alternative path kept for reference: "/scratch/project_2005488/DHH23/early_modern_samples/similarity"
# inputDirCNN: folder that receives the resized copies which are later read by the CNN.
inputDirCNN = "C:/Users/telmi/Documents/dhh23/EarlyModernReuse/data"

In [None]:
os.makedirs(inputDirCNN, exist_ok = True)

# Resizing only; tensor conversion and normalisation happen later, when the vectors are computed.
transformationForCNNInput = transforms.Compose([transforms.Resize(inputDim)])

for imageName in os.listdir(inputDir):
    sourcePath = os.path.join(inputDir, imageName)
    # os.listdir also returns sub-directories, which Image.open cannot read
    if not os.path.isfile(sourcePath):
        continue

    # the context managers close both images even when resizing or saving raises
    with Image.open(sourcePath) as I:
        with transformationForCNNInput(I) as newI:
            # NOTE: EXIF metadata (e.g. the rotation tag) is not copied, so images that rely on
            # an EXIF orientation may appear rotated in the resized copy
            newI.save(os.path.join(inputDirCNN, imageName))


In [4]:
import torch
from tqdm import tqdm
from torchvision import models
import json
import numpy as np

In [8]:
# For this prototype we run on the CPU only (no CUDA) and use ResNet-18 to obtain the feature vectors.

class Img2VecResnet18():
    """Turn an image into a feature vector with a pretrained ResNet-18.

    The vector is the output of the network's ``avgpool`` layer, captured with a
    forward hook while the image is passed through the model on the CPU.
    """

    def __init__(self):
        """Load the pretrained network and set up the preprocessing transforms."""
        self.device = torch.device("cpu")
        # length of the vector copied out of the avgpool layer
        self.numberFeatures = 512
        self.modelName = "resnet-18"
        self.model, self.featureLayer = self.getFeatureLayer()
        self.model = self.model.to(self.device)
        # evaluation mode: the network is only used for inference here
        self.model.eval()
        self.toTensor = transforms.ToTensor()

        # normalize the resized images as expected by resnet18
        # [0.485, 0.456, 0.406] --> normalized mean value of ImageNet, [0.229, 0.224, 0.225] std of ImageNet
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def getVec(self, img):
        """Return the feature vector of ``img`` as a 1-D numpy array.

        Args:
            img: image accepted by ``self.toTensor``; the notebook passes an already
                resized RGB PIL image.

        Returns:
            numpy array of length ``self.numberFeatures``.
        """
        image = self.normalize(self.toTensor(img)).unsqueeze(0).to(self.device)
        embedding = torch.zeros(1, self.numberFeatures, 1, 1)

        def copyData(m, i, o):
            # keep a copy of the feature layer's output, detached from autograd
            embedding.copy_(o.detach())

        h = self.featureLayer.register_forward_hook(copyData)
        try:
            # no gradients are needed for feature extraction
            with torch.no_grad():
                self.model(image)
        finally:
            # always unregister the hook, otherwise a failed forward pass would leave it
            # attached and every later call would stack another one on top
            h.remove()

        return embedding.numpy()[0, :, 0, 0]

    def getFeatureLayer(self):
        """Return the pretrained ResNet-18 together with its ``avgpool`` layer."""
        # NOTE(review): recent torchvision releases deprecate ``pretrained=True`` in favour of
        # the ``weights=`` argument; kept as-is because the installed version is not known here.
        cnnModel = models.resnet18(pretrained=True)
        layer = cnnModel._modules.get('avgpool')
        # kept in sync with numberFeatures instead of repeating the literal
        self.layer_output_size = self.numberFeatures

        return cnnModel, layer
        

# generate vectors for all the images in the set
img2vec = Img2VecResnet18()

# The vectors go into a "vectors" sub-folder of inputDirCNN; create it up front so the
# writes below cannot fail on a missing directory.
vectorDir = os.path.join(inputDirCNN, "vectors")
os.makedirs(vectorDir, exist_ok=True)

DATA_FILENAME = inputDirCNN+"/vectors/vectors.json"
with open(DATA_FILENAME, mode='w', encoding='utf-8') as feedsjson:
    # placeholder content only; the actual vectors are saved as one text file per image below
    entry = {"sth": 0}
    json.dump(entry, feedsjson)

allVectors = {}
print("Converting images to feature vectors:")
for image in tqdm(os.listdir(inputDirCNN)):
    imagePath = os.path.join(inputDirCNN, image)
    # skip the "vectors" sub-folder (and any other directory) that lives next to the images
    if not os.path.isfile(imagePath):
        continue

    # the context manager closes the file even if the model raises
    with Image.open(imagePath) as I:
        vec = img2vec.getVec(I.convert("RGB"))
    allVectors[image] = vec

    # One text file per image, named after the part of the file name before the first ".".
    # The default float format is used on purpose: fmt='%d' would truncate every
    # feature to an integer.
    filename = os.path.join(vectorDir, image.split(".")[0] + ".txt")
    np.savetxt(filename, vec)


Converting images to feature vectors:


 72%|███████████████████████████████████████████████████████▋                     | 9504/13152 [21:09<09:24,  6.46it/s]

In [23]:
# Persist the vectors as JSON; a later cell reloads them from this file.
fn= "C:/Users/telmi/Documents/dhh23/EarlyModernReuse/vector_dict.json"
# numpy arrays are not JSON serialisable, so every vector is converted to a plain list
vectors_dict= {k:v.tolist() for k,v in allVectors.items()}

with open(fn, 'w') as f:
    json.dump(vectors_dict, f)

# number of vectors written (last expression, so the notebook displays it)
len(allVectors)

NameError: name 'allVectors' is not defined

In [9]:
# now let us define a function that calculates the cosine similarity entries in the similarity matrix
import pandas as pd
import numpy as np

In [None]:
# Reload the image-name -> vector mapping from its JSON file.
fn = "C:/Users/telmi/Documents/dhh23/EarlyModernReuse/vector_dict.json"
with open(fn, mode='r') as f:
    vectors_dict = json.load(f)

# number of entries that were loaded
len(vectors_dict)

In [None]:
# turn the JSON lists back into numpy arrays, keeping the same keys
allVectors = dict((name, np.array(values)) for name, values in vectors_dict.items())


In [None]:
# peek at the first five keys; each key is an image name
print(list(allVectors)[:5])

In [None]:
# a subset: the first 500 entries of allVectors, in dictionary order
someKeys = list(allVectors)[:500]
someValues = [allVectors[key] for key in someKeys]
notAllVectors = dict(zip(someKeys, someValues))

In [10]:
# read in the necessary data based on metadata criteria

import glob
import pandas as pd

# every entry found in the vectors folder
path = "C:/Users/telmi/Documents/dhh23/EarlyModernReuse/vectors/"
mynp = glob.glob(path + "*")

# location of the metadata tables
dpath = "C:/Users/telmi/Documents/dhh23/EarlyModernReuse/early_modern_data-main/"
clip = dpath + "clip_classification.csv"

# load the metadata table, reading the two identifier columns as strings
meta = pd.read_csv(dpath + "metadata.csv", dtype={"page_id": str, "ecco_id": str})

# distinct values of the "gatherings" column
meta.gatherings.unique()

array(['8vo', '4to', '2fo', nan, '12mo', '16mo', '32mo', '8long', '18mo',
       '24mo', '12long'], dtype=object)

In [22]:
# number of entries glob found in the vectors folder
len(mynp)

13151

In [26]:
# load in the wanted vectors
meta_=meta[meta["gatherings"].isin(["12mo","8vo","4to","2fo"])]
print(len(meta_), meta_["page_id"].nunique())

# Build the lookup once: testing membership against a freshly built list for every
# file made the loop quadratic.
wantedPageIds = set(meta_["page_id"])

# wanted files
wlist=[]
filteredVectors={}
err=[]
for pic in list(mynp):
    # Work on the file name only. Splitting the full path leaves the directory in front of
    # the page id, which then cannot match the metadata with the current file layout.
    # The ".json" split still strips a "...json" prefix from file names that carry one.
    pic_id = os.path.basename(pic).split(".json")[-1]
    # the page id is the part of the name before the first "_"
    page_id = pic_id.split("_")[0]

    if page_id not in wantedPageIds:
        continue

    try:
        vector = np.loadtxt(pic)
    except (OSError, ValueError):
        # unreadable or malformed vector file: remember it and carry on
        err.append(pic)
        continue

    wlist.append(pic)
    filteredVectors[pic_id] = vector
print(len(wlist), len (mynp))

95346 95346
11896 13151


In [None]:


def getSimilarityMatrix(vectors):
    """Return the pairwise cosine similarity of all vectors as a DataFrame.

    Args:
        vectors: mapping from a name to a 1-D numeric vector; all vectors must have
            the same length.

    Returns:
        Square DataFrame whose index and columns are the keys of ``vectors`` (in
        iteration order). An all-zero vector gets similarity 0 to everything
        (itself included) instead of NaN; an empty mapping gives an empty frame.
    """
    keys = list(vectors.keys())
    if not keys:
        return pd.DataFrame(index = keys, columns = keys, dtype = float)

    v = np.array(list(vectors.values()))  # one row per vector
    norms = np.linalg.norm(v, axis=1, keepdims=True)
    # a zero vector has norm 0; dividing by 1 instead keeps its (all-zero) dot products at 0
    safeNorms = np.where(norms == 0, 1.0, norms)
    sim = np.inner(v, v) / (safeNorms * safeNorms.T)

    return pd.DataFrame(sim, columns = keys, index = keys)
        
# pairwise cosine similarities of the vectors selected above
similarityMatrix = getSimilarityMatrix(filteredVectors)

In [None]:
from numpy.testing import assert_almost_equal
import pickle

k = 5 # the number of top similar images to be stored

# one row per image, one column per rank (column 0 holds the highest similarity)
rowLabels = similarityMatrix.index
similarNames = pd.DataFrame(index = rowLabels, columns = range(k))
similarValues = pd.DataFrame(index = rowLabels, columns = range(k))

for j in tqdm(range(similarityMatrix.shape[0])):
    # order row j from most to least similar and keep the first k entries
    ranked = similarityMatrix.iloc[j, :].sort_values(ascending = False)
    kSimilar = ranked.head(k)
    similarNames.iloc[j, :] = list(kSimilar.index)
    similarValues.iloc[j, :] = kSimilar.values

# store both tables on disk
similarNames_path = "/scratch/project_2005488/DHH23/model/similarNames.pkl"
similarValues_path = "/scratch/project_2005488/DHH23/model/similarValues.pkl"
similarNames.to_pickle(similarNames_path)
similarValues.to_pickle(similarValues_path)

In [None]:
# save the vectors



In [None]:
# Reload the pickled top-k tables.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
# The with-blocks close each file even if unpickling fails.
with open(similarNames_path, 'rb') as file:
    simNames = pickle.load(file)

with open(similarValues_path, 'rb') as file:
    simValues = pickle.load(file)

In [None]:
def setAxes(ax, image, query = False, **kwargs):
    """Label one subplot with the image name and hide its tick marks.

    Args:
        ax: axes object to decorate.
        image: image name shown in the label.
        query: when True the label marks the query image; otherwise it shows the
            similarity value.
        **kwargs: ``value`` - similarity value to print (needed when ``query`` is False).
    """
    if query:
        label = "Query Image\n{0}".format(image)
    else:
        label = "Similarity value {1:1.3f}\n{0}".format(image, kwargs.get("value", None))
    ax.set_xlabel(label, fontsize = 8)

    # the panels show pictures, so axis ticks carry no information
    ax.set_xticks([])
    ax.set_yticks([])
    
def getSimilarImages(image, simNames, simVals):
    """Look up the stored most-similar images for ``image``.

    Args:
        image: image name, expected to be a label of ``simNames.index``.
        simNames: DataFrame of most-similar image names, one row per image.
        simVals: DataFrame of the matching similarity values, same layout as ``simNames``.

    Returns:
        ``(names, values)`` as two lists of equal length with the query image itself
        removed. For an unknown image a message is printed and two empty lists are
        returned, so callers can always unpack the result.
    """
    if image not in simNames.index:
        print("'{}' Unknown image".format(image))
        return [], []

    imgs = list(simNames.loc[image, :])
    vals = list(simVals.loc[image, :])
    if image in imgs:
        assert_almost_equal(max(vals), 1, decimal = 5)
        # drop the name and the value at the SAME position so both lists stay aligned
        ownPosition = imgs.index(image)
        del imgs[ownPosition]
        del vals[ownPosition]
    return imgs, vals
        
def plotSimilarImages(image, simiarNames, similarValues):
    """Show the query image next to its most similar images in one figure.

    Relies on the notebook-level names ``numRow``, ``numCol``, ``inputDir`` and ``plt``.

    Args:
        image: file name of the query image inside ``inputDir``.
        simiarNames: table of most-similar image names (the misspelt parameter name is
            kept so existing keyword callers keep working).
        similarValues: table of the matching similarity values.
    """
    # use the tables that were passed in, not the notebook-level ``similarNames``
    result = getSimilarImages(image, simiarNames, similarValues)
    if result is None:
        # unknown image: getSimilarImages already printed a message
        return
    simImages, simValues = result

    # never ask for more panels than there are images to show (query + matches)
    panelCount = min(numCol * numRow, len(simImages) + 1)
    fig = plt.figure(figsize=(10, 20))

    # panel 0 is the query image, the remaining panels are its matches in stored order
    for j in range(panelCount):
        ax = fig.add_subplot(numRow, numCol, j + 1)
        if j == 0:
            name = image
            setAxes(ax, image, query = True)
        else:
            name = simImages[j-1]
            setAxes(ax, name, value = simValues[j-1])

        # convert() returns an independent in-memory copy, so the file can be closed right away
        with Image.open(os.path.join(inputDir, name)) as img:
            ax.imshow(img.convert('RGB'))

    plt.show()
        

In [None]:
import matplotlib.pyplot as plt
import random    
# Pick up to num_files random file names from folder_path and plot each one next to
# its stored most-similar images.
folder_path = "/scratch/project_2005488/DHH23/early_modern_samples/similarity"
# grid layout of every figure; numCol and numRow are read by plotSimilarImages
numCol = 5
numRow = 1
# upper bound on how many query images are plotted
num_files = 100
all_files = os.listdir(folder_path)
# Shuffle the list of files randomly (no seed is set, so the selection differs between runs)
random.shuffle(all_files)
selected_files = all_files[:num_files]





# one figure per selected file
for image in selected_files:
    plotSimilarImages(image, simNames, simValues)

In [None]:
# display the table of most-similar image names (one row per image, one column per rank)
similarNames