In [2]:
import os
import numpy as np
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import Binarizer
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image

# Build the feature extractor: VGG16 with ImageNet weights, re-wrapped so
# that the network's output is the output of its 'fc1' layer.
base_model = VGG16(weights='imagenet')
model = Model(
    inputs=base_model.input,
    outputs=base_model.get_layer('fc1').output,
)

def extract_image_features(img_path):
    """Return the module-level ``model``'s output for one image as a flat array.

    The image at ``img_path`` is loaded at 224x224, converted to an array,
    given a leading batch axis, passed through VGG16's ``preprocess_input``
    and then fed to ``model.predict``.

    Args:
        img_path: Path of the image file to load.

    Returns:
        The prediction for the single-image batch, flattened to 1-D.
    """
    pil_img = image.load_img(img_path, target_size=(224, 224))

    # model.predict expects a batch, so wrap the single image in a leading axis.
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    batch = preprocess_input(batch)

    return model.predict(batch).flatten()

def calculate_cosine_similarity(features1, features2):
    """Return the cosine similarity of two 1-D feature vectors as a scalar.

    Args:
        features1: First feature vector.
        features2: Second feature vector.

    Returns:
        The single entry of the 1x1 similarity matrix.
    """
    # cosine_similarity compares rows of 2-D inputs, so each vector is wrapped
    # as a one-row batch and the only cell of the result is extracted.
    similarity_matrix = cosine_similarity([features1], [features2])
    return similarity_matrix[0][0]

def calculate_jaccard_similarity(features1, features2):
    """Return the Jaccard similarity of the active (> 0) positions of two vectors.

    Each vector is reduced to a boolean mask marking the entries that are
    strictly greater than zero; the result is
    ``|mask1 AND mask2| / |mask1 OR mask2|``.

    Args:
        features1: First feature vector (array-like of numbers).
        features2: Second feature vector, with the same number of elements.

    Returns:
        A float in ``[0.0, 1.0]``. When neither vector has any positive entry
        the union is empty and ``0.0`` is returned.

    Raises:
        ValueError: If the two vectors do not have the same number of elements.
    """
    # Strictly-greater-than-zero matches the threshold the previous
    # Binarizer()-based implementation used (its default threshold is 0.0).
    active1 = np.asarray(features1).ravel() > 0
    active2 = np.asarray(features2).ravel() > 0

    if active1.shape != active2.shape:
        raise ValueError(
            "feature vectors must have the same length, got "
            f"{active1.size} and {active2.size}"
        )

    union = np.count_nonzero(active1 | active2)
    if union == 0:
        # Nothing is active in either vector: define the similarity as 0.0
        # instead of dividing by zero.
        return 0.0

    intersection = np.count_nonzero(active1 & active2)
    return float(intersection / union)

def calculate_combined_similarity(features1, features2, weight_cosine=0.7, weight_jaccard=0.3):
    """Return a weighted sum of the cosine and Jaccard similarities.

    Args:
        features1: First feature vector.
        features2: Second feature vector.
        weight_cosine: Multiplier applied to the cosine similarity.
        weight_jaccard: Multiplier applied to the Jaccard similarity.

    Returns:
        ``weight_cosine * cosine + weight_jaccard * jaccard``. The weights are
        used as given; they are not normalised or validated here.
    """
    weighted_cosine = weight_cosine * calculate_cosine_similarity(features1, features2)
    weighted_jaccard = weight_jaccard * calculate_jaccard_similarity(features1, features2)
    return weighted_cosine + weighted_jaccard

# Directories holding the reference ("training") and query ("test") images
train_folder = "./train"
test_folder = "./test"

# File names to read from each directory
train_images = [
    "2024.03.15_0954.jpg",
    "2024.03.15_1145.jpg",
    "Faller_8.jpg",
    "invoice_77073.jpg",
    "invoice_102856.jpg",
]
test_images = ["invoice_77098.jpg", "invoice_102857.jpg"]

# Reference database: training file name -> extracted feature vector,
# filled in the order the names are listed above.
database = {
    img_name: extract_image_features(os.path.join(train_folder, img_name))
    for img_name in train_images
}

# For every test image, report the training image with the highest combined
# similarity score.
for test_img in test_images:
    query_features = extract_image_features(os.path.join(test_folder, test_img))

    # Start from "no match" with a score of 0: a candidate is only recorded
    # when its score is strictly greater than the best seen so far, so the
    # first of several equally-scored candidates is the one kept.
    best_match, highest_similarity = None, 0

    for candidate_name, candidate_features in database.items():
        score = calculate_combined_similarity(query_features, candidate_features)
        if score > highest_similarity:
            best_match, highest_similarity = candidate_name, score

    print(f"Test Image: {test_img}")
    print(f"Most Similar Image: {best_match}")
    print(f"Similarity Score: {highest_similarity}\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 284ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 157ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
Test Image: invoice_77098.jpg
Most Similar Image: invoice_77073.jpg
Similarity Score: 0.9861023215324117

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
Test Image: invoice_102857.jpg
Most Similar Image: invoice_102856.jpg
Similarity Score: 0.9614406268093565

