# Image similarity model


In [9]:
from tensorflow.keras.layers import Flatten, Dense, Input,concatenate
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
import tensorflow as tf
import cv2
from scipy.spatial import distance
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
from os.path import isfile, join


### Import VGG16 model from Tensorflow

In [2]:
# Load VGG16 with ImageNet weights and the fully connected top included, for 224x224 3-channel input.
# NOTE(review): per the Keras docs `pooling` only takes effect when include_top=False, so
# pooling='max' is presumably ignored here — confirm.
vgg16 = tf.keras.applications.VGG16(weights='imagenet', include_top=True, pooling='max', input_shape=(224, 224, 3))

In [3]:
# Feature extractor: takes the same input as `vgg16` but outputs the activations of the layer named 'fc2'.
basemodel = Model(inputs=vgg16.input, outputs=vgg16.get_layer('fc2').output)

In [93]:
basemodel.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

### Vectorizer for images

In [4]:
def get_feature_vector(img):
    """Return the `basemodel` prediction for a single image.

    Args:
        img: Three-channel image array, e.g. as returned by ``cv2.imread``.

    Returns:
        The output of ``basemodel.predict`` for a batch containing only this image.

    Raises:
        ValueError: If ``img`` is None, which is what ``cv2.imread`` returns
            when a file is missing or cannot be decoded.
    """
    if img is None:
        raise ValueError(
            "img is None; cv2.imread returns None when the file is missing or unreadable"
        )
    # Resize to the 224x224 input size the model was built with.
    resized = cv2.resize(img, (224, 224))
    # NOTE(review): pixels are fed to the network exactly as loaded, without
    # tf.keras.applications.vgg16.preprocess_input. Left unchanged so new vectors stay
    # comparable with the vectors already saved to disk — confirm how those were produced.
    # reshape adds the leading batch dimension expected by predict().
    return basemodel.predict(resized.reshape(1, 224, 224, 3))

### Calculate similarity function

In [5]:
def calculate_similarity(vector1, vector2):
    """Return the cosine distance between two feature vectors.

    Despite the name, the result is a *distance*: 0 means the vectors point in
    the same direction and larger values mean less similar, which is why the
    callers look for the minimum.

    Args:
        vector1: Array-like feature vector; any shape, it is flattened first.
        vector2: Array-like feature vector; any shape, it is flattened first.

    Returns:
        The value of ``scipy.spatial.distance.cosine`` for the flattened inputs.
    """
    # distance.cosine expects 1-D input; predict() output has a leading batch
    # dimension of 1, so flatten both sides before comparing.
    return distance.cosine(np.ravel(vector1), np.ravel(vector2))

### Create vectors for all images in local folder

In [8]:
# directory = '/home/beres/code/tklein98/crate_scanner/notebooks/images/'
# vectors = []

# for filename in os.listdir(directory):
#     img = cv2.imread(f'images/{filename}')
#     img1 = get_feature_vector(img)
#     vectors.append([filename, img1.copy()])

### The vector file can be saved and retrieved as follows:

In [9]:
# The vectors array has been saved to the notebooks folder in the project repo 
# array = np.array(vectors)
# np.save('array', array)

In [52]:
# Load the previously saved vectors from disk (general form: x_loaded = np.load(path/'x.npy')).
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects — only load files you trust.
vectors = np.load('full_array.npy', allow_pickle = True)

In [73]:
# Number of reference entries loaded in the previous cell. This uses `vectors` because
# `full_vectors` is only created further down and is undefined here on a fresh kernel.
len(vectors)

9402

### Import test image and find similar image

In [91]:
# Compare one test image against every stored reference entry.
img1 = cv2.imread("images/The_Rolling_Stones_Aftermath.jpg")

f1 = get_feature_vector(img1)

# One distance per entry of `vectors` (the smallest over that entry's rows), so a position in
# `comparison` is always a valid position in `vectors`, even if an entry holds several rows.
# An entry without rows gets +inf so it can never be selected as the closest match.
comparison = [
    min((calculate_similarity(f1, row) for row in vector[1]), default=float("inf"))
    for vector in vectors
]
        

### Retrieve details for closest match

In [92]:
# Position of the smallest distance in `comparison`; ties resolve to the first occurrence.
index = min(range(len(comparison)), key=comparison.__getitem__)
# Entry of `vectors` stored at that position.
album = vectors[index]
print(album)
#album = cv2.imread(f'images/{album}')[:,:,::-1]
#plt.imshow(album)

[Unnamed: 0                                                   5922
artists                                              Otis Redding
album_name                                       Pain in My Heart
artist+album                        Otis Redding Pain in My Heart
album_id                                   2BFOk5b8jjm2xmsbx7qXq3
album_cover     https://i.scdn.co/image/ab67616d00004851cd404b...
Name: 3581, dtype: object
 array([[1.1700733, 1.5782962, 0.       , ..., 0.       , 0.       ,
        3.4891472]], dtype=float32)]


### Loop over test images and find matches

In [84]:
# Folder with the test images. The path is machine-specific: keep exactly one `directory` line active.

# Beres
directory = '/home/beres/code/tklein98/crate_scanner/notebooks/images'

# Tobi
# directory = "/Users/Tobias/Desktop/project_weeks/Pictures/"

# NOTE(review): allow_pickle=True unpickles arbitrary Python objects — only load files you trust.
full_vectors = np.load('full_array_highres.npy', allow_pickle = True)

# One dict per test image: {filename: (index into full_vectors, cosine distance of the best match)}.
comparisons = []

for filename in os.listdir(directory):
    # Read from `directory` itself rather than a path relative to the working directory.
    image_path = join(directory, filename)
    # Skip anything that is not a regular file, like the isfile() filter in get_testing_dataframe.
    if not isfile(image_path):
        continue

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None instead of raising.
        raise ValueError(f"Could not read image file: {image_path}")
    img1 = get_feature_vector(img)

    best_match = None
    for entry_index, vector in enumerate(full_vectors):
        for j in vector[1]:
            # Compute each distance once and reuse it for both the comparison and the update.
            current_distance = calculate_similarity(img1, j)
            if best_match is None or best_match[1] > current_distance:
                # Store the entry index (not a running row count) because it is later used
                # to index `full_vectors` directly.
                best_match = (entry_index, current_distance)

    # With no reference vectors there is nothing to match; keep an empty dict in that case.
    comparisons.append({} if best_match is None else {filename: best_match})


In [85]:
comparisons

[{'Red_Hot_Chili_Peppers_Californication.jpg': (4247, 0.5060388147830963)},
 {'The_Rolling_Stones_Aftermath.jpg': (2436, 0.33827537298202515)},
 {'U2_Boy.jpg': (5116, 0.2472863793373108)},
 {'Neil_Young_After_the_Gold_Rush.jpg': (5929, 0.3985966444015503)}]

### Accuracy model function

In [86]:
# accuracy_model scores the matches stored in `comparisons` against the labels of the test images in `directory`

def accuracy_model(testing_dataframe, matches=None, reference_vectors=None):
    """Return the fraction of test images whose closest match has the expected label.

    Args:
        testing_dataframe: DataFrame with a 'label' column and, ideally, a
            'filename' column. When 'filename' is present the expected label is
            looked up by filename; otherwise the row at the same position as
            the match is used.
        matches: List of ``{filename: (index, distance)}`` dicts. Defaults to
            the notebook-level ``comparisons``.
        reference_vectors: Indexable collection whose items hold the album
            metadata at position 0, with the 'artist+album' string as the
            fourth metadata field. Defaults to the notebook-level ``full_vectors``.

    Returns:
        ``score / len(matches)`` as a float; 0.0 when there are no matches.
    """
    if matches is None:
        matches = comparisons
    if reference_vectors is None:
        reference_vectors = full_vectors

    # Avoid a ZeroDivisionError when nothing was compared.
    if len(matches) == 0:
        return 0.0

    labels = testing_dataframe['label']
    # Prefer a filename lookup so the result does not depend on both lists having the same order.
    if 'filename' in testing_dataframe.columns:
        label_by_filename = dict(zip(testing_dataframe['filename'], labels))
    else:
        label_by_filename = {}

    # introduce score to calculate the accuracy
    score = 0

    for position, match in enumerate(matches):
        # An empty dict means no reference vector was available; it counts as a miss.
        if not match:
            continue
        filename, (index_matched_image, _distance) = next(iter(match.items()))

        expected_label = label_by_filename.get(filename)
        if expected_label is None:
            expected_label = labels.iloc[position]

        # The fourth metadata field is the 'artist+album' string. Use .iloc when the metadata
        # is a pandas Series, because plain integer indexing on a labelled Series is deprecated.
        metadata = reference_vectors[index_matched_image][0]
        predicted_album = metadata.iloc[3] if hasattr(metadata, 'iloc') else metadata[3]

        # TODO: Clean the album string further; punctuation is kept, so e.g.
        # 'st.germaintourist(remastered)' can never equal a filename-derived label.
        predicted_album_cleaned = predicted_album.lower().replace(' ','')

        # If the predicted label matches the label of the test image, increase score by one
        if expected_label == predicted_album_cleaned:
            score += 1
        print(index_matched_image, predicted_album_cleaned, expected_label)
    return score/len(matches)
        

In [42]:
full_vectors[5929][0]

Unnamed: 0                                                   5966
artists                                                Del Amitri
album_name                                      Change Everything
artist+album                         Del Amitri Change Everything
album_id                                   0Gcfpfph9iIFaLIzAlw7Yn
album_cover     https://i.scdn.co/image/ab67616d0000b273be91f3...
Name: 5966, dtype: object

### Get testing dataframe function

In [87]:

def get_testing_dataframe(directory_path):
    '''Build a labeled dataframe for the test images stored in a folder.

    directory_path: Your directory path where you have stored all the testing images
        (with or without a trailing path separator).

    Returns a DataFrame with one row per regular file and the columns
    "picture_path" (directory_path joined with the filename), "filename" and
    "label". The label is the filename without its extension, lowercased, with
    up to two trailing digits removed and all spaces and underscores stripped.
    '''
    digits = '0123456789'

    # Collect plain records and build the frame once; DataFrame.append was removed in
    # pandas 2.0 and re-copied the whole frame on every call.
    records = []
    for filename in listdir(directory_path):
        picture_path = join(directory_path, filename)
        # Only regular files are test images; skip sub-directories.
        if not isfile(picture_path):
            continue

        # Drop the extension whatever its length ('.jpg', '.jpeg', ...) and lowercase.
        stem = os.path.splitext(filename)[0].lower()

        # Delete the numbers from the scraping naming convention; done twice because the
        # suffix can have two digits. Album names that really end in digits lose them too.
        for _ in range(2):
            if stem and stem[-1] in digits:
                stem = stem[:-1]

        # remove whitespace and underscores
        label = stem.replace(" ", "").replace('_', '')

        records.append({
            "picture_path": picture_path,
            "filename": filename,
            "label": label,
        })

    # Passing `columns` keeps the three columns even when the folder has no files.
    return pd.DataFrame(records, columns=["picture_path", "filename", "label"])

In [88]:
# Reuse `directory` from the matching loop so the labels and `comparisons` always refer to the same folder.
testing_dataframe = get_testing_dataframe(directory)

In [89]:
testing_dataframe

Unnamed: 0,picture_path,filename,label
0,/home/beres/code/tklein98/crate_scanner/notebo...,Red_Hot_Chili_Peppers_Californication.jpg,redhotchilipepperscalifornication
1,/home/beres/code/tklein98/crate_scanner/notebo...,The_Rolling_Stones_Aftermath.jpg,therollingstonesaftermath
2,/home/beres/code/tklein98/crate_scanner/notebo...,U2_Boy.jpg,u2boy
3,/home/beres/code/tklein98/crate_scanner/notebo...,Neil_Young_After_the_Gold_Rush.jpg,neilyoungafterthegoldrush


### Result

In [15]:
accuracy_model(testing_dataframe)

3802 modernbaseballyou'regonnamissitall joydivisionunknownpleasures
2642 courtneybarnetttellmehowyoureallyfeel queenanightattheopera
5066 manchesterorchestrai'mlikeavirginlosingachild neutralmilkhotelintheaeroplaneoverthesea
8467 carseatheadrestnervousyoungman joydivisionunknownpleasures
2289 petergabrielpassion:musicforthelasttemptationofchrist lcdsoundsystemsoundofsilver
1140 bobdylanhighway61revisited bobdylanhighway61revisited
2185 samcookenightbeat nasillmatic
1893 midlakethetrialsofvanoccupanther neilyoungafterthegoldrush
8751 nilsfrahmfelt(specialedition) joydivisionunknownpleasures
1140 bobdylanhighway61revisited bobdylanhighway61revisited
8646 steveearletranscendentalblues thestonerosesthestoneroses
1084 theloniousmonk','johncoltranetheloniousmonkwithjohncoltrane pinkfloydthewall
1084 theloniousmonk','johncoltranetheloniousmonkwithjohncoltrane jonimitchellblue
5650 thecorrstalkoncorners nasillmatic
1084 theloniousmonk','johncoltranetheloniousmonkwithjohncoltrane blacksabbathpa

8803 ulverkveldssanger thecuredisintegration
7319 manicstreetpreachersresistanceisfutile davidbowielow
445 mimithomaagermancabaretstar[1935-1941] jeffbuckleygrace
2873 riseagainstthesufferer&thewitness lcdsoundsystemsoundofsilver
741 talatmahmoodbestoftalatmahmood:hisevergreenbollywoodhithindifilmsongs&ghazals,vol.2 jeffbuckleygrace
81 carlwoitschachcarlwoitschachrarerecordings johncoltranealovesupreme
7733 thetemptationsalldirections loveforeverchanges
6397 liarswixiw interpolturnonthebrightlights
1234 thewhowho'snext thewhowho'snext
4369 thursdayfullcollapse pinkfloydthewall
1237 thezombiesodesseyandoracle thezombiesodesseyandoracle
1084 theloniousmonk','johncoltranetheloniousmonkwithjohncoltrane u2thejoshuatree
4053 sonicyouththeeternal vanmorrisonastralweeks
5496 johnfoxxmetamatic...plus radioheadokcomputer
3440 mötleycrüetoofastforlove therollingstonesstickyfingers
1492 johnnypaycheckjohnnypaycheck-16biggesthits radioheadthebends
6241 lloydcoleandthecommotionsmainstream radioheadt

485 natalinoottohounsassolinonellascarpa radioheadthebends
8605 aceyaloneabookofhumanlanguage r.e.m.automaticforthepeople
8386 jefflynnelongwave oasis(what'sthestory)morningglory
89 agustinmagaldiagustinmagaldi-consejosdeoro- marvingayewhat'sgoingon
7718 sibafulorestadosamba kendricklamartopimpabutterfly
2231 nickcave&thebadseedshenry'sdream(2010remasteredversion) sufjanstevensillinois
2096 bandofhorsesceasetobegin interpolturnonthebrightlights
7073 plastikmanconsumed wilcoyankeehotelfoxtrot
4692 angelsoflighthowilovedyou davidbowiehunkydory
511 mannadey','sabitachowdhuryalltimegreats-sabitachowdhury therollingstonesletitbleed
7131 lfofrequencies interpolturnonthebrightlights
8172 elvispresleysunrise arcticmonkeyswhateverpeoplesayiam,that'swhati'mnot
560 bingcrosby','theandrewssistersamerrychristmaswithbingcrosby&theandrewssisters(remastered) thebeatlesrubbersoul
3008 sunralanquidity thedarksideofthemoonpinkfloyd
7533 traderhornemorningway loveforeverchanges
3711 unwoundnewplasticideas

701 francispoulenc','pierrebernacpierrebernac-francispoulenc bobdylanhighway61revisited
6241 lloydcoleandthecommotionsmainstream radioheadthebends
8509 witchfindergeneraldeathpenalty pearljamten
7936 ryanadams29 tameimpalalonerism
8837 kool&thegangspiritoftheboogie thebeatlesmagicalmysterytour
5743 beastieboystothe5boroughs wilcoyankeehotelfoxtrot
2642 courtneybarnetttellmehowyoureallyfeel thedarksideofthemoonpinkfloyd
8126 thecoupstealthisdoublealbum tameimpalalonerism
982 perrycomogreatestchristmassongs joydivisioncloser
8939 paoloconteaguaplano mybloodyvalentineloveless
9181 ramshackleglorylivethedream fleetwoodmacrumours
2451 suzannevegasuzannevega bobdylanhighway61revisited
4557 arcadiumbreatheawhile thewhowho'snext
5532 wyclefjeanthecarnivalextras-ep arcticmonkeyswhateverpeoplesayiam,that'swhati'mnot
5201 billybraggdon'ttrythisathome r.e.m.automaticforthepeople
2185 samcookenightbeat thebeatlesrubbersoul
7239 harveymilkcourtesyandgoodwilltowardmen/liveatttthebear's jonimitchellbl

1084 theloniousmonk','johncoltranetheloniousmonkwithjohncoltrane yesclosetotheedge
5201 billybraggdon'ttrythisathome r.e.m.automaticforthepeople
4314 vanillafudgevanillafudge fleetwoodmacrumours
7533 traderhornemorningway loveforeverchanges
5454 loureedmetalmachinemusic thesmithsthequeenisdead
1132 stangetz','charliebyrdjazzsamba kanyewestmybeautifuldarktwistedfantasy
2884 womenpublicstrain boniverforemma,foreverago
1955 billieholidayladysingstheblues davidbowielow
2761 sturgillsimpsonmetamodernsoundsincountrymusic vanmorrisonastralweeks
4561 jonnygreenwoodphantomthread(originalmotionpicturesoundtrack) nasillmatic
3086 paulmccartneyflamingpie(archivecollection) godspeedyou!blackemperorliftyr.skinnyfistslikeantennastoheaven
1084 theloniousmonk','johncoltranetheloniousmonkwithjohncoltrane pearljamten
176 jeffbeatmanjeffbeatmanisinthemoodforsinging brucespringsteenborntorun
5623 ericclaptonfromthecradle pinkfloydanimals
3306 godisanastronautallisviolent,allisbright(2011remasterededition) 

0.040310077519379844

In [90]:
accuracy_model(testing_dataframe)

4247 st.germaintourist(remastered) redhotchilipepperscalifornication
2436 thekinkskinks therollingstonesaftermath
5116 britneyspearsinthezone u2boy
5929 delamitrichangeeverything neilyoungafterthegoldrush


0.0