In [6]:
import os
import time
import numpy as np
import psutil
from PIL import Image
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image as keras_image

# Define global variables
TARGET_SIZE = (299, 299)
FEATURES_CACHE_FILE = 'image_features.npy'

def resize_image(img, size):
    """Resize image to the given size."""
    return img.resize(size)

def convert_to_rgb(img):
    """Convert image to RGB mode."""
    return img.convert('RGB')  # Ensure image is in RGB mode

def preprocess_image(img):
    """Preprocess image for InceptionV3 model."""
    img = img.resize(TARGET_SIZE)
    img = keras_image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    return img

def load_inception_model():
    """Load pre-trained InceptionV3 model."""
    return InceptionV3(weights='imagenet', include_top=False, pooling='avg')

def extract_features(img, model):
    """Extract features using the InceptionV3 model."""
    return model.predict(img)

def measure_memory_usage():
    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / 1024 / 1024  # in MB
    return memory_usage

def cache_image_features(folder):
    """Cache image features to a file for faster processing."""
    model = load_inception_model()
    feature_dict = {}

    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')):
                file_path = os.path.join(root, file)

                try:
                    with Image.open(file_path) as img:
                        rgb_img = convert_to_rgb(img)
                        preprocessed_img = preprocess_image(rgb_img)
                        img_features = extract_features(preprocessed_img, model)
                        feature_dict[file_path] = img_features

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Save feature dictionary to file
    np.save(FEATURES_CACHE_FILE, feature_dict)

def load_cached_features():
    """Load cached image features from file."""
    if os.path.exists(FEATURES_CACHE_FILE):
        return np.load(FEATURES_CACHE_FILE, allow_pickle=True).item()
    else:
        return None

def find_duplicate_images(folder):
    # Load or cache image features
    cached_features = load_cached_features()
    if cached_features is None:
        cache_image_features(folder)
        cached_features = load_cached_features()

    # Dictionary to store image features and their paths
    feature_dict = cached_features

    # Identify duplicate images based on feature similarity
    duplicates = []
    processed_files = set()

    for file_path, features in feature_dict.items():
        if file_path not in processed_files:
            # Compare current image with others
            duplicate_group = [file_path]
            processed_files.add(file_path)

            for other_path, other_features in feature_dict.items():
                if other_path not in processed_files:
                    similarity = np.linalg.norm(features - other_features)
                    if similarity < 0.1:  # Adjust this threshold based on your needs
                        duplicate_group.append(other_path)
                        processed_files.add(other_path)

            if len(duplicate_group) > 1:
                duplicates.append(duplicate_group)

    return duplicates

if __name__ == "__main__":
    folder_path = r'C:\Users\DELL\Desktop\hsshing\Deduplication'
    
    # Measure memory usage before execution
    memory_before = measure_memory_usage()
    
    # Measure execution time
    start_time = time.time()
    duplicate_pairs = find_duplicate_images(folder_path)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Measure memory usage after execution
    memory_after = measure_memory_usage()
    memory_used = memory_after - memory_before
    
    print(f"Time taken: {elapsed_time:.2f} seconds")
    print(f"Memory used: {memory_used:.2f} MB")

    print("Duplicate image groups:")
    for group in duplicate_pairs:
        print(group)


Time taken: 1.20 seconds
Memory used: 1.07 MB
Duplicate image groups:
['C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.1\\Abdiel HMS 1940 1983.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.1\\Abdiel HMS 1940 1983_15_1.jpg']
['C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.1\\Achilles HMS 1905  24594  4.7pc.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.1\\Achilles HMS 1905.jpg']
['C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.1\\Achilles(3) - Copy.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.2\\Achilles(3).jpg']
['C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.2\\Achiles(1) - Copy.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.1\\1.1.2\\Achiles(1).jpg']
['C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.2\\1.2.2\\Ajax HMS 5.6.1935 W&L 412 B1.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\final\\1\\1.2\\1.2.2\\Ajax HMS 5.6.1935.jpg']
['C:\\Users\\DELL\\Desktop\\hsshing\\final\\2\\2.1\\Unidentified-09 - Copy.jpg', 

In [7]:
def calculate_accuracy(ground_truth, duplicate_pairs):
    # Convert ground truth pairs to a set of tuples without paths
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    
    # Convert detected pairs to a set of tuples without paths
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    
    # Calculate intersection (common pairs)
    common_pairs = ground_truth_set.intersection(detected_pairs_set)
    
    # Calculate accuracy as the ratio of correctly detected pairs to total ground truth pairs
    accuracy = len(common_pairs) / len(ground_truth)
    
    return accuracy

if __name__ == "__main__":
    # Your ground truth pairs
    ground_truth = [
        ('Abdiel HMS 1940 1983.jpg', 'Abdiel HMS 1940 1983_15_1.jpg'),
        ('Achilles HMS 1905  24594  4.7pc.jpg', 'Achilles HMS 1905.jpg'),
        ('Ajax HMS 5.6.1935 W&L 412 B1.jpg', 'Ajax HMS 5.6.1935.jpg'),
        ('Brocklesby.jpg', 'Brocklesby.jpg'),
        ('Cardiff - Copy.jpg', 'Cardiff.jpg'),
        ('Exeter(2) - Copy.jpg', 'Exeter(2).jpg'),
        ('Exeter(8) - Copy.jpg', 'Exeter(8).jpg'),
        ('Glasgow(1) - Copy.jpg', 'Glasgow(1).jpg'),
        ('Achiles(1) - Copy.jpg', 'Achiles(1).jpg'),
        ('Phoebe(1) - Copy.jpg', 'Phoebe(1).jpg'),
        ('Sirius(5) - Copy.jpg', 'Sirius(5).jpg'),
        ('London(1) - Copy.jpg', 'London(1).jpg'),
        ('Sheffield(6) - Copy.jpg', 'Sheffield(6).jpg'),
        ('Barricade-05 - Copy.jpg', 'Barricade-05.jpg'),
        ('Barricade-07 - Copy.jpg', 'Barricade-07.jpg'),
        ('Brave - Copy.jpg', 'Brave.jpg'),
        ('Bristol - Copy.jpg', 'Bristol.jpg'),
        ('Powerful - Copy.jpg', 'Powerful.jpg'),
        ('Ulm-2 - Copy.jpg', 'Ulm-2.jpg'),
        ('Van Kinsbergen - Copy.jpg', 'Van Kinsbergen.jpg'),
        ('Exeter-2.jpg', 'Liverpool-2.jpg'),
        ('Active-1.jpg', 'Amazon-1.jpg'),
        ('Adept - Copy.jpg', 'Adept.jpg'),
        ('Liverpool - Copy.jpg', 'Liverpool.jpg'),
        ('Salmaid - Copy.jpg', 'Salmaid.jpg'),
        ('Shavington-1 - Copy.jpg', 'Shavington-1.jpg'),
        ('Southampton - Copy.jpg', 'Southampton.jpg'),
        ('Endurance - Copy.jpg', 'Endurance.jpg'),
    ]
    

    # Calculate accuracy
    accuracy = calculate_accuracy(ground_truth, duplicate_pairs)
    print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 35.71%


Custom model

In [1]:
import os
import time
import numpy as np
import psutil
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image as keras_image

# Define global variables
TARGET_SIZE = (299, 299)
FEATURES_CACHE_FILE = 'image_features.npy'

def resize_image(img, size):
    """Resize image to the given size."""
    return img.resize(size)

def convert_to_rgb(img):
    """Convert image to RGB mode."""
    return img.convert('RGB')  # Ensure image is in RGB mode

def preprocess_image(img):
    """Preprocess image for InceptionV3 model."""
    img = img.resize(TARGET_SIZE)
    img = keras_image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    return img

def load_inception_model():
    """Load customized InceptionV3 model."""
    base_model = InceptionV3(weights='imagenet', include_top=False, pooling='avg')
    x = base_model.output
    x = tf.keras.layers.Dense(496, activation='relu')(x)
    x = tf.keras.layers.Dense(496, activation='relu')(x)
    x = tf.keras.layers.Dense(496, activation='relu')(x)
    predictions = tf.keras.layers.Dense(3, activation='softmax')(x)
    
    model = tf.keras.models.Model(inputs=base_model.input, outputs=predictions)
    return model

def extract_features(img, model):
    """Extract features using the customized InceptionV3 model."""
    return model.predict(img)

def measure_memory_usage():
    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / 1024 / 1024  # in MB
    return memory_usage

def cache_image_features(folder):
    """Cache image features to a file for faster processing."""
    model = load_inception_model()
    feature_dict = {}

    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')):
                file_path = os.path.join(root, file)

                try:
                    with Image.open(file_path) as img:
                        rgb_img = convert_to_rgb(img)
                        preprocessed_img = preprocess_image(rgb_img)
                        img_features = extract_features(preprocessed_img, model)
                        feature_dict[file_path] = img_features

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Save feature dictionary to file
    np.save(FEATURES_CACHE_FILE, feature_dict)

def load_cached_features():
    """Load cached image features from file."""
    if os.path.exists(FEATURES_CACHE_FILE):
        return np.load(FEATURES_CACHE_FILE, allow_pickle=True).item()
    else:
        return None

def find_duplicate_images(folder):
    # Load or cache image features
    cached_features = load_cached_features()
    if cached_features is None:
        cache_image_features(folder)
        cached_features = load_cached_features()

    # Dictionary to store image features and their paths
    feature_dict = cached_features

    # Identify duplicate images based on feature similarity
    duplicates = []
    processed_files = set()

    for file_path, features in feature_dict.items():
        if file_path not in processed_files:
            # Compare current image with others
            duplicate_group = [file_path]
            processed_files.add(file_path)

            for other_path, other_features in feature_dict.items():
                if other_path not in processed_files:
                    similarity = np.linalg.norm(features - other_features)
                    if similarity < 0.1:  # Adjust this threshold based on your needs
                        duplicate_group.append(other_path)
                        processed_files.add(other_path)

            if len(duplicate_group) > 1:
                duplicates.append(duplicate_group)

    return duplicates

if __name__ == "__main__":
    folder_path = r'C:\Users\DELL\Desktop\hsshing\Deduplication'
    
    # Measure memory usage before execution
    memory_before = measure_memory_usage()
    
    # Measure execution time
    start_time = time.time()
    duplicate_pairs = find_duplicate_images(folder_path)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Measure memory usage after execution
    memory_after = measure_memory_usage()
    memory_used = memory_after - memory_before
    
    print(f"Time taken: {elapsed_time:.2f} seconds")
    print(f"Memory used: {memory_used:.2f} MB")

    print("Duplicate image groups:")
    for group in duplicate_pairs:
        print(group)


Time taken: 0.02 seconds
Memory used: 0.54 MB
Duplicate image groups:
['C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Cadiz1951.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Cadmus 1946.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Caesar 1961.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Cairo 1939.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Caistor Castle 1953 .jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Calcutta 1918.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Caledon 1935 .jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Calypso 1935 .jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Cambrian 1963.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Campania 1952.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Campbell 1935.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Camperdown 1953.jpg', 'C:\\Users\\DELL\\Desktop\\hsshing\\Deduplication\\Capetown 1919.jpg', 'C:\\Use

In [2]:
def calculate_accuracy(ground_truth, duplicate_pairs):
    # Convert ground truth pairs to a set of tuples without paths
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    
    # Convert detected pairs to a set of tuples without paths
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    
    # Calculate intersection (common pairs)
    common_pairs = ground_truth_set.intersection(detected_pairs_set)
    
    # Calculate accuracy as the ratio of correctly detected pairs to total ground truth pairs
    accuracy = len(common_pairs) / len(ground_truth)
    
    return accuracy

if __name__ == "__main__":
    # Your ground truth pairs
    ground_truth = [
        ('Abdiel HMS 1940 1983.jpg', 'Abdiel HMS 1940 1983_15_1.jpg'),
        ('Achilles HMS 1905  24594  4.7pc.jpg', 'Achilles HMS 1905.jpg'),
        ('Ajax HMS 5.6.1935 W&L 412 B1.jpg', 'Ajax HMS 5.6.1935.jpg'),
        ('Brocklesby.jpg', 'Brocklesby.jpg'),
        ('Cardiff - Copy.jpg', 'Cardiff.jpg'),
        ('Exeter(2) - Copy.jpg', 'Exeter(2).jpg'),
        ('Exeter(8) - Copy.jpg', 'Exeter(8).jpg'),
        ('Glasgow(1) - Copy.jpg', 'Glasgow(1).jpg'),
        ('Achiles(1) - Copy.jpg', 'Achiles(1).jpg'),
        ('Phoebe(1) - Copy.jpg', 'Phoebe(1).jpg'),
        ('Sirius(5) - Copy.jpg', 'Sirius(5).jpg'),
        ('London(1) - Copy.jpg', 'London(1).jpg'),
        ('Sheffield(6) - Copy.jpg', 'Sheffield(6).jpg'),
        ('Barricade-05 - Copy.jpg', 'Barricade-05.jpg'),
        ('Barricade-07 - Copy.jpg', 'Barricade-07.jpg'),
        ('Brave - Copy.jpg', 'Brave.jpg'),
        ('Bristol - Copy.jpg', 'Bristol.jpg'),
        ('Powerful - Copy.jpg', 'Powerful.jpg'),
        ('Ulm-2 - Copy.jpg', 'Ulm-2.jpg'),
        ('Van Kinsbergen - Copy.jpg', 'Van Kinsbergen.jpg'),
        ('Exeter-2.jpg', 'Liverpool-2.jpg'),
        ('Active-1.jpg', 'Amazon-1.jpg'),
        ('Adept - Copy.jpg', 'Adept.jpg'),
        ('Liverpool - Copy.jpg', 'Liverpool.jpg'),
        ('Salmaid - Copy.jpg', 'Salmaid.jpg'),
        ('Shavington-1 - Copy.jpg', 'Shavington-1.jpg'),
        ('Southampton - Copy.jpg', 'Southampton.jpg'),
        ('Endurance - Copy.jpg', 'Endurance.jpg'),
    ]
    

    # Calculate accuracy
    accuracy = calculate_accuracy(ground_truth, duplicate_pairs)
    print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 35.71%


In [1]:
import os
import time
import numpy as np
import psutil
from PIL import Image
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import image as keras_image
from tensorflow.keras.applications.inception_v3 import preprocess_input
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

def resize_image(img, size):
    """Resize image to the given size."""
    return img.resize(size)

def convert_to_rgb(img):
    """Convert image to RGB mode."""
    return img.convert('RGB')

def preprocess_image(img, target_size):
    """Preprocess image for InceptionV3 model."""
    img = img.resize(target_size)
    img = keras_image.img_to_array(img)
    img = np.expand_dims(img, axis=0)
    img = preprocess_input(img)
    return img

def create_custom_inception_model():
    """Create a customized InceptionV3 model."""
    base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(299, 299, 3))

    # Add custom layers
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(496, activation='relu')(x)
    x = Dense(496, activation='relu')(x)
    x = Dense(496, activation='relu')(x)
    
    # Output layer
    predictions = Dense(128, activation='relu')(x)

    # Create and return the model
    model = Model(inputs=base_model.input, outputs=predictions)
    return model

def load_custom_model():
    """Load the customized InceptionV3 model."""
    return create_custom_inception_model()

def extract_features(img, model):
    """Extract features using the customized InceptionV3 model."""
    return model.predict(img)

def measure_memory_usage():
    """Measure the current memory usage of the process."""
    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / 1024 / 1024  # in MB
    return memory_usage

def find_duplicate_images(folder, model, target_size=(299, 299)):
    """Find duplicate images in a folder based on feature similarity."""
    
    feature_list = []
    file_paths = []

    # Traverse the directory
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')):
                file_path = os.path.join(root, file)
                try:
                    with Image.open(file_path) as img:
                        rgb_img = convert_to_rgb(img)
                        preprocessed_img = preprocess_image(rgb_img, target_size)
                    img_features = extract_features(preprocessed_img, model).flatten()
                    
                    feature_list.append(img_features)
                    file_paths.append(file_path)

                except Exception as e:
                    print(f"Error processing {file_path}: {e}")

    # Normalize features and reduce dimensionality
    feature_list = np.array(feature_list)
    pca = PCA(n_components=50)  # Reduce dimensions to 50 components
    reduced_features = pca.fit_transform(feature_list)

    # Calculate cosine similarity between feature vectors
    similarity_matrix = cosine_similarity(reduced_features)
    threshold = 0.9  # Adjust this threshold based on your needs
    
    # Identify duplicates based on similarity
    duplicates = []
    visited = set()
    
    for i in range(len(file_paths)):
        if i in visited:
            continue
        group = [file_paths[i]]
        visited.add(i)
        for j in range(i + 1, len(file_paths)):
            if similarity_matrix[i, j] > threshold:
                group.append(file_paths[j])
                visited.add(j)
        if len(group) > 1:
            duplicates.append(group)

    return duplicates

if __name__ == "__main__":
    folder_path = r'C:\Users\uu1n23\Downloads\DeduplicationFinal'
    
    model = load_custom_model()
    
    memory_before = measure_memory_usage()
    
    start_time = time.time()
    duplicate_pairs = find_duplicate_images(folder_path, model)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    memory_after = measure_memory_usage()
    memory_used = memory_after - memory_before
    
    print(f"Time taken: {elapsed_time:.2f} seconds")
    print(f"Memory used: {memory_used:.2f} MB")

    print("Duplicate image groups:")
    for group in duplicate_pairs:
        print(group)










Time taken: 117.40 seconds
Memory used: 378.40 MB
Duplicate image groups:
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\hmp_rmmus_006_74.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_6.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_006_74.tif']
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\hmp_rmmus_008_71.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_008_71 - Copy.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_008_71.tif']
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\hmp_rmmus_008_74.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_8.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_008_74.tif']
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\hmp_rmmus_009_74.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_009_74.tif']
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\hmp_rmmus_010_74.tif', 'C:\\Users\\u

In [3]:
def calculate_accuracy(ground_truth, duplicate_pairs):
    # Convert ground truth pairs to a set of tuples without paths
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    
    # Convert detected pairs to a set of tuples without paths
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    
    # Calculate intersection (common pairs)
    common_pairs = ground_truth_set.intersection(detected_pairs_set)
    
    # Calculate accuracy as the ratio of correctly detected pairs to total ground truth pairs
    accuracy = len(common_pairs) / len(ground_truth)
    
    return accuracy
if __name__ == "__main__":
    # Your ground truth pairs
    ground_truth =[
    ('hmp_rmmus_011_74.tif', 'RMM 1974_11.tif'),
    ('RMM 1974_13.tif', 'RMM 1974_13_o2.jpg'),
    ('hmp_rmmus_006_74.tif', 'RMM 1974_6.tif'),
    ('RMM 1974_7.tif', 'RMM 1974_7_o4.jpg'),
    ('hmp_rmmus_008_74.tif', 'RMM 1974_8.tif'),
    ('hmp_rmmus_006_74.tif', 'hmp_rmmus_006_74.tif'),
    ('RMM 1974_7.tif', 'hmp_rmmus_007_74.tif'),
    ('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71 - Copy.tif'),
    ('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71.tif'),
    ('hmp_rmmus_008_74.tif', 'hmp_rmmus_008_74.tif'),
    ('hmp_rmmus_009_74.tif', 'hmp_rmmus_009_74.tif'),
    ('hmp_rmmus_010_74.tif', 'hmp_rmmus_010_74.tif'),
    ('hmp_rmmus_011_74.tif', 'hmp_rmmus_011_74.tif'),
    ('RMM 1974_13.tif', 'hmp_rmmus_13_74.tif'),
    ('RMM 1973_170.tif', 'hmp_rmmus_170_73.tif'),
    ('RMM 1974_19.tif', 'hmp_rmmus_19_74.tif'),
    ('RMM 1974_20.tif', 'hmp_rmmus_20_74.tif'),
    ('RMM 1971_278.tif', 'hmp_rmmus_278_71.tif'),
    ('RMM 1967_304.tif', 'hmp_rmmus_304_67.tif'),
    ('hmp_rmmus_88_02o - Copy.tif', 'hmp_rmmus_88_02o.tif'),
    ('hmp_rmmus_88_02t.tif', 'hmp_rmmus_88_02ta.tif'),
    ('NMRN  2017_106_178_unframed.jpg', 'NMRN  2017_106_178_unframed.tif'),
    ('NMRN 2016_26_5 Commander Daniel Little Couch.jpg', 'NMRN 2016_26_5 Commander Daniel Little Couch.tif'),
    ('NMRN 2016_29_1 Cdr Crooke RNR low res.jpg', 'NMRN 2016_29_1 Cdr Crooke RNR.jpg'),
    ('RNM 1982_1572 - Copy.tif', 'RNM 1982_1572.tif'),
    ('RNM 2000_45_1 - Copy.tif', 'RNM 2000_45_1.tif'),
    ('Cardiff - Copy.jpg', 'Cardiff.jpg'),
    ('Exeter(2) - Copy.jpg', 'Exeter(2).jpg'),
    ('Exeter(8) - Copy.jpg', 'Exeter(8).jpg'),
    ('Glasgow(1) - Copy.jpg', 'Glasgow(1).jpg'),
    ('Black Rover(4).jpg', 'Black Rover(4).jpg'),
    ('Regent(3).jpg', 'Regent(3).jpg'),
    ('Achiles(1) - Copy.jpg', 'Achiles(1).jpg'),
    ('Phoebe(1) - Copy.jpg', 'Phoebe(1).jpg'),
    ('Sirius(2).jpg', 'Sirius(2).jpg'),
    ('Sirius(5) - Copy.jpg', 'Sirius(5).jpg'),
    ('Theseus.jpg', 'Theseus.jpg'),
    ('St Austell.jpg', 'St Austell.jpg'),
    ('London(1) - Copy.jpg', 'London(1).jpg'),
    ('Sheffield(6) - Copy.jpg', 'Sheffield(6).jpg'),
    ('Fearless.jpg', 'Fearless.jpg'),
    ('Lynx-1.jpg', 'Lynx-1.jpg'),
    ('Argus-1.jpg', 'Argus-1.jpg'),
    ('Brinton.jpg', 'Brinton.jpg'),
    ('Active-1.jpg', 'Amazon-1.jpg'),
    ('Exeter-2.jpg', 'Liverpool-2.jpg'),
    ('Sheffield-1.jpg', 'Sheffield-1.jpg'),
    ('Bangor - Copy.jpg', 'Bangor - Copy.jpg'),
    ('Bangor - Copy.jpg', 'Bangor.jpg'),
    ('Cumberland.jpg', 'Cumberland.jpg'),
    ('Gloucester.jpg', 'Gloucester.jpg'),
]
    
# Calculate accuracy
accuracy = calculate_accuracy(ground_truth, duplicate_pairs)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 80.39%


In [4]:
from sklearn.metrics import jaccard_score

def calculate_jaccard_index(ground_truth, duplicate_pairs):
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    
    y_true = []
    y_pred = []

    for pair in ground_truth_set:
        y_true.append(1)
        y_pred.append(1 if pair in detected_pairs_set else 0)
    
    for pair in detected_pairs_set - ground_truth_set:
        y_true.append(0)
        y_pred.append(1)
    
    jaccard = jaccard_score(y_true, y_pred)
    return jaccard

jaccard_index = calculate_jaccard_index(ground_truth, duplicate_pairs)
print(f"Jaccard Index: {jaccard_index:.2f}")


Jaccard Index: 0.75
