In [1]:
!pip install pillow imagehash




In [1]:
import os
import time
from PIL import Image
import imagehash
import psutil
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

def resize_image(img, size):
    """Resize image to the given size."""
    return img.resize(size)

def convert_to_grayscale(img):
    """Convert image to grayscale."""
    return img.convert('L')

def process_image(file_path, target_size):
    """Process an image: convert to grayscale, resize, and compute pHash."""
    try:
        with Image.open(file_path) as img:
            grayscale_img = convert_to_grayscale(img)
            resized_img = resize_image(grayscale_img, target_size)
            img_hash = imagehash.phash(resized_img)
        return img_hash, file_path
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def find_duplicate_images(folder, target_size=(298, 298)):
    # Dictionary to store image hashes and their paths
    hash_dict = {}
    hash_dict_lock = Lock()

    # List to store all image file paths
    image_files = []
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.tif', '.tiff')):
                file_path = os.path.join(root, file)
                image_files.append(file_path)

    # Use ThreadPoolExecutor to process images in parallel
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_image, file_path, target_size): file_path for file_path in image_files}
        for future in as_completed(futures):
            result = future.result()
            if result:
                img_hash, file_path = result
                with hash_dict_lock:
                    if img_hash in hash_dict:
                        hash_dict[img_hash].append(file_path)
                    else:
                        hash_dict[img_hash] = [file_path]

    # Identify duplicate images
    duplicates = []
    for key, value in hash_dict.items():
        if len(value) > 1:
            duplicates.append(value)

    return duplicates

def measure_memory_usage():
    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / 1024 / 1024  # in MB
    return memory_usage

if __name__ == "__main__":
    folder_path = r'C:\Users\uu1n23\Downloads\DeduplicationFinal'
    
    # Measure memory usage before execution
    memory_before = measure_memory_usage()
    
    # Measure execution time
    start_time = time.time()
    duplicate_pairs = find_duplicate_images(folder_path)
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Measure memory usage after execution
    memory_after = measure_memory_usage()
    memory_used = memory_after - memory_before
    
    print(f"Time taken: {elapsed_time:.2f} seconds")
    print(f"Memory used: {memory_used:.2f} MB")

    print("Duplicate image pairs:")
    for pair in duplicate_pairs:
        print(pair)




Time taken: 4.13 seconds
Memory used: 22.41 MB
Duplicate image pairs:
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_7_o4.jpg', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_7.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_007_74.tif']
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_10_o4.jpg', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\hmp_rmmus_010_74.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_010_74.tif']
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_13_o2.jpg', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_13.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_13_74.tif']
['C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\RMM 1974_8.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\hmp_rmmus_008_74.tif', 'C:\\Users\\uu1n23\\Downloads\\DeduplicationFinal\\Art\\hmp_rmmus_008_74.tif']
['C:\\Users\\uu1n23\\Downloads\\Deduplicati

In [2]:
def calculate_accuracy(ground_truth, duplicate_pairs):
    # Convert ground truth pairs to a set of tuples without paths
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    
    # Convert detected pairs to a set of tuples without paths
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    
    # Calculate intersection (common pairs)
    common_pairs = ground_truth_set.intersection(detected_pairs_set)
    
    # Calculate accuracy as the ratio of correctly detected pairs to total ground truth pairs
    accuracy = len(common_pairs) / len(ground_truth)
    
    return accuracy
if __name__ == "__main__":
    # Your ground truth pairs
    ground_truth = [('hmp_rmmus_010_74.tif', 'RMM 1974_10_o4.jpg'),
('hmp_rmmus_011_74.tif', 'RMM 1974_11.tif'),
('RMM 1974_13.tif', 'RMM 1974_13_o2.jpg'),
('hmp_rmmus_006_74.tif', 'RMM 1974_6.tif'),
('RMM 1974_7.tif', 'RMM 1974_7_o4.jpg'),
('hmp_rmmus_008_74.tif', 'RMM 1974_8.tif'),
('Agincourt HMS 3_1956 W&L 1036P.jpg', 'RNM 2001_2_1036_P_o2.jpg'),
('Abdiel HMS 1940 1983.jpg', 'Abdiel HMS 1940 1983_15_1.jpg'),
('Achilles HMS 1905  24594  4.7pc.jpg', 'Achilles HMS 1905.jpg'),
('RNM 2001_2_2428_B2_o2.jpg', 'Achilles HMS 3.1971 Leander Gun Frigate W&L 2428B2.jpg'),
('Active HMS  1911 light cruiser 1985_32 - Copy.jpg', 'Active HMS  1911 light cruiser 1985_32.jpg'),
('Agincourt HMS 3_1956 W&L 1036P.jpg', 'Agincourt HMS 3_1956 W&L 1036P.jpg'),
('Ajax HMS 5.6.1935 W&L 412 B1.jpg', 'Ajax HMS 5.6.1935.jpg'),
('hmp_rmmus_006_74.tif', 'hmp_rmmus_006_74.tif'),
('RMM 1974_7.tif', 'hmp_rmmus_007_74.tif'),
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71 - Copy.tif'),
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71.tif'),
('hmp_rmmus_008_74.tif', 'hmp_rmmus_008_74.tif'),
('hmp_rmmus_009_74.tif', 'hmp_rmmus_009_74.tif'),
('hmp_rmmus_010_74.tif', 'hmp_rmmus_010_74.tif'),
('hmp_rmmus_011_74.tif', 'hmp_rmmus_011_74.tif'),
('RMM 1974_13.tif', 'hmp_rmmus_13_74.tif'),
('RMM 1973_170.tif', 'hmp_rmmus_170_73.tif'),
('RMM 1974_19.tif', 'hmp_rmmus_19_74.tif'),
('RMM 1974_20.tif', 'hmp_rmmus_20_74.tif'),
('RMM 1971_278.tif', 'hmp_rmmus_278_71.tif'),
('RMM 1967_304.tif', 'hmp_rmmus_304_67.tif'),
('hmp_rmmus_88_02o - Copy.tif', 'hmp_rmmus_88_02o.tif'),
('hmp_rmmus_88_02t.tif', 'hmp_rmmus_88_02ta.tif'),
('NMRN  2017_106_178_unframed.jpg', 'NMRN  2017_106_178_unframed.tif'),
('NMRN 2016_26_5 Commander Daniel Little Couch.jpg', 'NMRN 2016_26_5 Commander Daniel Little Couch.tif'),
('NMRN 2016_29_1 Cdr Crooke RNR low res.jpg', 'NMRN 2016_29_1 Cdr Crooke RNR.jpg'),
('RNM 1982_1572 - Copy.tif', 'RNM 1982_1572.tif'),
('RNM 1985_301.tif', 'RNM 1985_301.tif'),
('RNM 2000_45_1 - Copy.tif', 'RNM 2000_45_1.tif'),
('Cardiff - Copy.jpg', 'Cardiff.jpg'),
('Exeter(2) - Copy.jpg', 'Exeter(2).jpg'),
('Exeter(8) - Copy.jpg', 'Exeter(8).jpg'),
('Glasgow(1) - Copy.jpg', 'Glasgow(1).jpg'),
('Black Rover(4).jpg', 'Black Rover(4).jpg'),
('Regent(3).jpg', 'Regent(3).jpg'),
('Achiles(1) - Copy.jpg', 'Achiles(1).jpg'),
('Phoebe(1) - Copy.jpg', 'Phoebe(1).jpg'),
('Sirius(2).jpg', 'Sirius(2).jpg'),
('Sirius(5) - Copy.jpg', 'Sirius(5).jpg'),
('Theseus.jpg', 'Theseus.jpg'),
('St Austell.jpg', 'St Austell.jpg'),
('London(1) - Copy.jpg', 'London(1).jpg'),
('Sheffield(6) - Copy.jpg', 'Sheffield(6).jpg'),
('Fearless.jpg', 'Fearless.jpg'),
('Lynx-1.jpg', 'Lynx-1.jpg'),
('Argus-1.jpg', 'Argus-1.jpg'),
('Brinton.jpg', 'Brinton.jpg'),
('Active-1.jpg', 'Amazon-1.jpg'),
('Exeter-2.jpg', 'Liverpool-2.jpg'),
('Sheffield-1.jpg', 'Sheffield-1.jpg'),
('Bangor - Copy.jpg', 'Bangor - Copy.jpg'),
('Bangor - Copy.jpg', 'Bangor.jpg'),
('Cumberland.jpg', 'Cumberland.jpg'),
('Gloucester.jpg', 'Gloucester.jpg'),]
    
# Calculate accuracy
accuracy = calculate_accuracy(ground_truth, duplicate_pairs)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 51.67%


In [3]:
from sklearn.metrics import jaccard_score

def calculate_jaccard_index(ground_truth, duplicate_pairs):
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    
    y_true = []
    y_pred = []

    for pair in ground_truth_set:
        y_true.append(1)
        y_pred.append(1 if pair in detected_pairs_set else 0)
    
    for pair in detected_pairs_set - ground_truth_set:
        y_true.append(0)
        y_pred.append(1)
    
    jaccard = jaccard_score(y_true, y_pred)
    return jaccard

jaccard_index = calculate_jaccard_index(ground_truth, duplicate_pairs)
print(f"Jaccard Index: {jaccard_index:.2f}")


Jaccard Index: 0.43


In [4]:
from sklearn.metrics import roc_curve, auc

def calculate_roc_auc(ground_truth, duplicate_pairs):
    # Convert to sets for easy comparison
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    
    y_true = []
    y_scores = []

    for pair in ground_truth_set:
        y_true.append(1)  # True positive
        y_scores.append(1 if pair in detected_pairs_set else 0)
    
    for pair in detected_pairs_set - ground_truth_set:
        y_true.append(0)  # False positive
        y_scores.append(1)
    
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)
    
    return fpr, tpr, roc_auc

fpr, tpr, roc_auc = calculate_roc_auc(ground_truth, duplicate_pairs)
print(f"AUC: {roc_auc:.2f}")


AUC: 0.33


In [6]:
!pip install matplotlib


Collecting matplotlib
  Downloading matplotlib-3.7.5-cp38-cp38-win_amd64.whl.metadata (5.8 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.1.1-cp38-cp38-win_amd64.whl.metadata (5.9 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.53.1-cp38-cp38-win_amd64.whl.metadata (165 kB)
Collecting kiwisolver>=1.0.1 (from matplotlib)
  Downloading kiwisolver-1.4.5-cp38-cp38-win_amd64.whl.metadata (6.5 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.1.2-py3-none-any.whl.metadata (5.1 kB)
Collecting importlib-resources>=3.2.0 (from matplotlib)
  Downloading importlib_resources-6.4.3-py3-none-any.whl.metadata (3.9 kB)
Downloading matplotlib-3.7.5-cp38-cp38-win_amd64.whl (7.5 MB)
   ---------------------------------------- 0.0/7.5 MB ? eta -:--:--
   - -------------------------------------- 0.3/7.5 MB ? eta -:--:--


In [5]:
ground_truth = [('hmp_rmmus_010_74.tif', 'RMM 1974_10_o4.jpg'),
('hmp_rmmus_011_74.tif', 'RMM 1974_11.tif'),
('RMM 1974_13.tif', 'RMM 1974_13_o2.jpg'),
('hmp_rmmus_006_74.tif', 'RMM 1974_6.tif'),
('RMM 1974_7.tif', 'RMM 1974_7_o4.jpg'),
('hmp_rmmus_008_74.tif', 'RMM 1974_8.tif'),
('Agincourt HMS 3_1956 W&L 1036P.jpg', 'RNM 2001_2_1036_P_o2.jpg'),
('Abdiel HMS 1940 1983.jpg', 'Abdiel HMS 1940 1983_15_1.jpg'),
('Achilles HMS 1905  24594  4.7pc.jpg', 'Achilles HMS 1905.jpg'),
('RNM 2001_2_2428_B2_o2.jpg', 'Achilles HMS 3.1971 Leander Gun Frigate W&L 2428B2.jpg'),
('Active HMS  1911 light cruiser 1985_32 - Copy.jpg', 'Active HMS  1911 light cruiser 1985_32.jpg'),
('Agincourt HMS 3_1956 W&L 1036P.jpg', 'Agincourt HMS 3_1956 W&L 1036P.jpg'),
('Ajax HMS 5.6.1935 W&L 412 B1.jpg', 'Ajax HMS 5.6.1935.jpg'),
('hmp_rmmus_006_74.tif', 'hmp_rmmus_006_74.tif'),
('RMM 1974_7.tif', 'hmp_rmmus_007_74.tif'),
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71 - Copy.tif'),
('hmp_rmmus_008_71.tif', 'hmp_rmmus_008_71.tif'),
('hmp_rmmus_008_74.tif', 'hmp_rmmus_008_74.tif'),
('hmp_rmmus_009_74.tif', 'hmp_rmmus_009_74.tif'),
('hmp_rmmus_010_74.tif', 'hmp_rmmus_010_74.tif'),
('hmp_rmmus_011_74.tif', 'hmp_rmmus_011_74.tif'),
('RMM 1974_13.tif', 'hmp_rmmus_13_74.tif'),
('RMM 1973_170.tif', 'hmp_rmmus_170_73.tif'),
('RMM 1974_19.tif', 'hmp_rmmus_19_74.tif'),
('RMM 1974_20.tif', 'hmp_rmmus_20_74.tif'),
('RMM 1971_278.tif', 'hmp_rmmus_278_71.tif'),
('RMM 1967_304.tif', 'hmp_rmmus_304_67.tif'),
('hmp_rmmus_88_02o - Copy.tif', 'hmp_rmmus_88_02o.tif'),
('hmp_rmmus_88_02t.tif', 'hmp_rmmus_88_02ta.tif'),
('NMRN  2017_106_178_unframed.jpg', 'NMRN  2017_106_178_unframed.tif'),
('NMRN 2016_26_5 Commander Daniel Little Couch.jpg', 'NMRN 2016_26_5 Commander Daniel Little Couch.tif'),
('NMRN 2016_29_1 Cdr Crooke RNR low res.jpg', 'NMRN 2016_29_1 Cdr Crooke RNR.jpg'),
('RNM 1982_1572 - Copy.tif', 'RNM 1982_1572.tif'),
('RNM 1985_301.tif', 'RNM 1985_301.tif'),
('RNM 2000_45_1 - Copy.tif', 'RNM 2000_45_1.tif'),
('Cardiff - Copy.jpg', 'Cardiff.jpg'),
('Exeter(2) - Copy.jpg', 'Exeter(2).jpg'),
('Exeter(8) - Copy.jpg', 'Exeter(8).jpg'),
('Glasgow(1) - Copy.jpg', 'Glasgow(1).jpg'),
('Black Rover(4).jpg', 'Black Rover(4).jpg'),
('Regent(3).jpg', 'Regent(3).jpg'),
('Achiles(1) - Copy.jpg', 'Achiles(1).jpg'),
('Phoebe(1) - Copy.jpg', 'Phoebe(1).jpg'),
('Sirius(2).jpg', 'Sirius(2).jpg'),
('Sirius(5) - Copy.jpg', 'Sirius(5).jpg'),
('Theseus.jpg', 'Theseus.jpg'),
('St Austell.jpg', 'St Austell.jpg'),
('London(1) - Copy.jpg', 'London(1).jpg'),
('Sheffield(6) - Copy.jpg', 'Sheffield(6).jpg'),
('Fearless.jpg', 'Fearless.jpg'),
('Lynx-1.jpg', 'Lynx-1.jpg'),
('Argus-1.jpg', 'Argus-1.jpg'),
('Brinton.jpg', 'Brinton.jpg'),
('Active-1.jpg', 'Amazon-1.jpg'),
('Exeter-2.jpg', 'Liverpool-2.jpg'),
('Sheffield-1.jpg', 'Sheffield-1.jpg'),
('Bangor - Copy.jpg', 'Bangor - Copy.jpg'),
('Bangor - Copy.jpg', 'Bangor.jpg'),
('Cumberland.jpg', 'Cumberland.jpg'),
('Gloucester.jpg', 'Gloucester.jpg'),]

In [6]:
def calculate_precision(duplicate_pairs, ground_truth):
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    # Convert lists of duplicate pairs into sets of tuples (to handle unordered pairs)
    identified_set = set()
    for group in detected_pairs_set:
        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                identified_set.add(tuple(sorted([group[i], group[j]])))

    ground_truth_set = set()
    for group in ground_truth:
        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                ground_truth_set.add(tuple(sorted([group[i], group[j]])))

    # Calculate the number of correct predictions
    true_positives = identified_set.intersection(ground_truth_set)

    # Calculate precision
    precision = len(true_positives) / len(identified_set) if identified_set else 0

    return precision

# Example usage
precision = calculate_precision(duplicate_pairs, ground_truth)
print(f"Precision: {precision:.2f}")


Precision: 1.00


In [7]:
import os

def calculate_recall(duplicate_pairs, ground_truth):
    detected_pairs_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in duplicate_pairs)
    ground_truth_set = set((os.path.basename(pair[0]), os.path.basename(pair[1])) for pair in ground_truth)
    # Convert lists of duplicate pairs into sets of tuples (to handle unordered pairs)
    identified_set = set()
    for group in detected_pairs_set:
        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                identified_set.add(tuple(sorted([group[i], group[j]])))

    ground_truth_set = set()
    for group in ground_truth:
        for i in range(len(group)):
            for j in range(i + 1, len(group)):
                ground_truth_set.add(tuple(sorted([group[i], group[j]])))

    # Calculate the number of correct predictions
    true_positives = identified_set.intersection(ground_truth_set)

    # Calculate recall
    recall = len(true_positives) / len(ground_truth_set) if ground_truth_set else 0

    return recall

# Example usage
recall = calculate_recall(duplicate_pairs, ground_truth)
print(f"Recall: {recall:.2f}")


Recall: 0.72


In [8]:
def calculate_f1_score(duplicate_pairs, ground_truth):
    precision = calculate_precision(duplicate_pairs, ground_truth)
    recall = calculate_recall(duplicate_pairs, ground_truth)

    # Calculate F1 Score
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return f1_score

# Example usage
f1_score = calculate_f1_score(duplicate_pairs, ground_truth)
print(f"F1 Score: {f1_score:.2f}")


F1 Score: 0.83
