# Image similarity model


In [1]:
from tensorflow.keras.layers import Flatten, Dense, Input,concatenate
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
import tensorflow as tf
import cv2
from scipy.spatial import distance
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# from os import listdir
# from os.path import isfile, join


### Import VGG16 model from Tensorflow

In [3]:
vgg16 = tf.keras.applications.VGG16(weights='imagenet', include_top=True, pooling='max', input_shape=(224, 224, 3))

In [4]:
basemodel = Model(inputs=vgg16.input, outputs=vgg16.get_layer('fc2').output)

In [5]:
#basemodel.summary()

### Vectorizer for images

In [6]:
def get_feature_vector(img):
 img1 = cv2.resize(img, (224, 224))
 feature_vector = basemodel.predict(img1.reshape(1, 224, 224, 3))
 return feature_vector

### Calculate similarity function

In [7]:
def calculate_similarity(vector1, vector2):
 return distance.cosine(vector1, vector2)

### Create vectors for all images in local folder

In [8]:
# directory = '/home/beres/code/tklein98/crate_scanner/notebooks/images/'
# vectors = []

# for filename in os.listdir(directory):
#     img = cv2.imread(f'images/{filename}')
#     img1 = get_feature_vector(img)
#     vectors.append([filename, img1.copy()])

### The vector file can be saved and retrieved as follows;

In [9]:
# The vectors array has been saved to the notebooks folder in the project repo 
# array = np.array(vectors)
# np.save('array', array)

In [131]:
## Load saved file using the following code; x_loaded = np.load(path/'x.npy')
#vectors = np.load('array.npy', allow_pickle = True)

### Import test image and find similar image

In [129]:
img1 = cv2.imread("test_images/Arcade Fire_Funeral_38.jpg")

f1 = get_feature_vector(img1)
comparison = []

for vector in vectors:
    for i in vector[1]:
        comparison.append(calculate_similarity(f1, i))
        

### Retrieve details for closest match

In [130]:
index = comparison.index(min(comparison))
album = vectors[0]
print(album)
#album = cv2.imread(f'images/{album}')[:,:,::-1]
#plt.imshow(album)

['I Want to See the Bright Lights Tonight Richard & Linda Thompson.jpg'
 array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)]


### Loop over test images and find matches

In [140]:
# This filepath is local, Beres: 1st, Tobias: 2nd

# Beres
directory = '/home/beres/code/tklein98/crate_scanner/notebooks/test_images'

# Tobi
# directory = "/Users/Tobias/Desktop/project_weeks/Pictures/"
full_vectors = np.load('full_array.npy', allow_pickle = True)

comparisons = []
import ipdb


for filename in os.listdir(directory):
    img = cv2.imread(f'test_images/{filename}')
    img1 = get_feature_vector(img)
    comparison = {}
    counter = 0
    for vector in full_vectors:
        for j in vector[1]:
            #ipdb.set_trace()
            if len(comparison) == 0:
                #ipdb.set_trace()
                comparison[filename] = (counter, calculate_similarity(img1, j))
            elif comparison[filename][1] > calculate_similarity(img1, j):
                #ipdb.set_trace()
                comparison[filename] = (counter, calculate_similarity(img1, j))
            counter += 1
    comparisons.append(comparison)


error: OpenCV(4.5.1) /tmp/pip-req-build-ms668fyv/opencv/modules/imgproc/src/resize.cpp:4051: error: (-215:Assertion failed) !ssize.empty() in function 'resize'


### Accuracy model function

In [152]:
# directory is the length of the testing images set

def accuracy_model(testing_dataframe):
# introduce score to calculate the accuracy 
    score = 0 

    for i in range(len(comparisons)):
        # look in comparisons, take the index of the image, get the key value through testing_dataframe['filename'][i] and get the first value of the tuple (index)
        index_matched_image = list(comparisons[i].values())[0][0]
        # Get the predicted album from all the vectors with the defined index, this assumes that the label is stored in the first place or 0
        predicted_album = full_vectors[index_matched_image][0][3]
        
        # TODO: Clean the album string as the output will be something like this: 'Blonde on Blonde Bob Dylan'
        predicted_album_cleaned = predicted_album.lower().replace(' ','')
        # If the predicted label matches the label from the labeled Dataframe with the test images, increase score by one
        if testing_dataframe['label'][i] == predicted_album_cleaned:
            score += 1
        print(index_matched_image, predicted_album_cleaned, testing_dataframe['label'][i])
    return score/len(comparisons)
        

### Get testing dataframe function

In [61]:

def get_testing_dataframe(directory_path):
    '''directory_path: Your directory path where you have stored all the testing images'''
    digits = ['0','1','2','3','4','5','6','7','8','9']

    # Create list of all filenames
    filenames = [f for f in listdir(directory_path) if isfile(join(directory_path, f))]
    #filenames.sort()
    # Omit .DOS file
    #filenames = filenames[1:]

    # Create Dataframe
    df = pd.DataFrame(columns=["picture_path","filename", "label"])

    # Creating labeled dataframe with picture filepath, filename and the respective label
    for i in range(len(filenames)):
        # Clear string of spaces, the numbers from the jpeg (not possible album names with numbers) and the underscores

        # remove .jpg and lowercase
        cleaned_string = filenames[i][:-4].lower()
        # Delete the numbers from the scraping naming convention, do this twice as there are can be two digit numbers
        if cleaned_string[-1] in digits:
            cleaned_string = cleaned_string[:-1]

        if cleaned_string[-1] in digits:
            cleaned_string = cleaned_string[:-1]

        # remove whitespace and underscores
        cleaned_string = cleaned_string.replace(" ", "").replace('_','')

        # Appending each row with image filepath,filename and its label
        df = df.append({
             "picture_path": f'{directory_path}{filenames[i]}',
             "filename": filenames[i],
             # Clear the remaining 'jpg' string at the end of every cleaned string
             "label": cleaned_string
              }, ignore_index=True)
    return df

In [150]:
testing_dataframe = get_testing_dataframe('/home/beres/code/tklein98/crate_scanner/notebooks/test_images')

### Result

In [153]:
accuracy_model(testing_dataframe)

3679 juliaholterekstasis joydivisionunknownpleasures
596 ericclaptonslowhand35thanniversary(superdeluxe) queenanightattheopera
375 dispatchbangbang neutralmilkhotelintheaeroplaneoverthesea
6316 genesisfromgenesistorevelation joydivisionunknownpleasures
1083 lcdsoundsystemsoundofsilver lcdsoundsystemsoundofsilver
1874 bobdylananothersideofbobdylan bobdylanhighway61revisited
1068 nasillmatic nasillmatic
1072 neilyoungafterthegoldrush(2009remaster) neilyoungafterthegoldrush
3537 eaglesthelongrun(2013remaster) joydivisionunknownpleasures
1874 bobdylananothersideofbobdylan bobdylanhighway61revisited
5065 thebeachboysloveyou(remastered) thestonerosesthestoneroses
8742 thecoralrootsandechoes pinkfloydthewall
8699 thebooradleyseverything'salrightforever jonimitchellblue
5029 thelemonheadscomeonfeelthelemonheads nasillmatic
5432 thesaintseternallyyours blacksabbathparanoid
8966 plumtreepredictsthefuture portisheaddummy
5938 sororitynoiseyou'renotas_____asyouthink weezerpinkerton
8742 thecoralro

5524 toddrundgrenhermitofminkhollow milesdaviskindofblue
439 pixiessurferrosa pixiessurferrosa
1034 talkingheadsremaininlight(deluxeversion) talkingheadsremaininlight
4168 justintimberlakejustified thebeatlesrubbersoul
1069 loveforeverchanges(2015remasteredversion) loveforeverchanges
1165 bobdylanthefreewheelin'bobdylan thewhowho'snext
1020 thevelvetunderground&nicothevelvetunderground&nico45thanniversary thevelvetunderground&nicothevelvetunderground&nico
8399 leonardcohenthanksforthedance joydivisionunknownpleasures
8629 brianwilsonbrianwilson michaeljacksonthriller
7738 xtcgo2 pinkfloydthewall
2379 samcookeonenightstand-samcookeliveattheharlemsquareclub,1963 thecuredisintegration
703 blacksabbathparanoid(remaster) blacksabbathparanoid
9191 gatobarbierichapterone:latinamerica pinkfloydthewall
1715 rushafarewelltokings boniverforemma,foreverago
1581 parquetcourtswideawake! coldplayarushofbloodtothehead
1034 talkingheadsremaininlight(deluxeversion) talkingheadsremaininlight
4052 mariahc

4052 mariahcareymusicbox thestrokesisthisit
1308 themoodybluesdaysoffuturepassed(deluxeversion) nirvananevermind
6935 queengreatesthitsii thecuredisintegration
1250 mgmtoracularspectacular davidbowielow
9398 elcuartetodenosraro jeffbuckleygrace
1083 lcdsoundsystemsoundofsilver lcdsoundsystemsoundofsilver
1138 leonardcohensongsofleonardcohen jeffbuckleygrace
6159 jack'smannequintheglasspassenger johncoltranealovesupreme
1069 loveforeverchanges(2015remasteredversion) loveforeverchanges
1078 interpolturnonthebrightlights interpolturnonthebrightlights
1165 bobdylanthefreewheelin'bobdylan thewhowho'snext
1225 godspeedyou!blackemperorf♯a♯∞ pinkfloydthewall
1084 thezombiesodesseyandoracle thezombiesodesseyandoracle
1581 parquetcourtswideawake! u2thejoshuatree
2907 sturgillsimpsonmetamodernsoundsincountrymusic vanmorrisonastralweeks
4355 cigarettesaftersexcigarettesaftersex radioheadokcomputer
1060 therollingstonesstickyfingers(remastered) therollingstonesstickyfingers
3308 lilwaynethacarterii

6769 matthewgoodavalanche theclashlondoncalling
1800 thegaslightanthemthe'59sound televisionmarqueemoon
1055 r.e.m.automaticforthepeople r.e.m.automaticforthepeople
8699 thebooradleyseverything'salrightforever thecuredisintegration
1034 talkingheadsremaininlight(deluxeversion) talkingheadsremaininlight
5595 alcestshelter(deluxeedition) thestrokesisthisit
1059 therollingstonesletitbleed therollingstonesletitbleed
1086 coldplayarushofbloodtothehead coldplayarushofbloodtothehead
6640 dredgelcielo yesclosetotheedge
5762 oingoboingodeadman'sparty thebeatlesmagicalmysterytour
7337 howtodresswell"whatisthisheart?"(deluxeedition) radioheadthebends
1079 radioheadamoonshapedpool radioheadamoonshapedpool
4822 mainsourcebreakingatoms(2017remasteredversion) blacksabbathparanoid
1082 slintspiderland slintspiderland
4212 johnlennonwallsandbridges coldplayarushofbloodtothehead
400 jodibensonjodibensonsingssongsfromthebeginner'sbible televisionmarqueemoon
2893 thebeachboyswildhoney(remastered) thezombi

1043 therollingstonesexileonmainstreet(2010re-mastered) therollingstonesexileonmainst.
7028 jimmyreedi'mjimmyreed thesmithsthequeenisdead
1049 wilcoyankeehotelfoxtrot wilcoyankeehotelfoxtrot
1049 wilcoyankeehotelfoxtrot wilcoyankeehotelfoxtrot
6378 garymoorewildfrontier brucespringsteenborntorun
4588 isaiahrashadcilviademo ledzeppelinledzeppelinii
1581 parquetcourtswideawake! thesmashingpumpkinssiamesedream
5938 sororitynoiseyou'renotas_____asyouthink weezerpinkerton
8297 therentalsreturnoftherentals michaeljacksonthriller
6309 theloungelizardstheloungelizards kendricklamartopimpabutterfly
6977 sebadohharmacy pinkfloydanimals
3552 otisreddingpaininmyheart johncoltranealovesupreme
128 patrickhuntamongmysouvenirs r.e.m.automaticforthepeople
4335 thedoorsanamericanprayer tomwaitsraindogs
1087 queenanightattheopera(deluxeremasteredversion) queenanightattheopera
384 alejandrosanzalejandrosanz3 slintspiderland
1053 oasis(what'sthestory)morningglory? oasis(what'sthestory)morningglory
1043 the

0.22474460839954596