# In [None]:
import glob
import os
import re
import shutil
from collections import defaultdict

import cv2
import numpy as np
from scipy.sparse.csgraph import connected_components
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model

# Feature extractor: an ImageNet-weighted VGG16 truncated at its 'fc1' layer,
# so predictions are that layer's activations rather than class scores.
base_model = VGG16(weights='imagenet')
fc1_output = base_model.get_layer('fc1').output
model = Model(inputs=base_model.input, outputs=fc1_output)

def load_and_process_image(image_path):
    """Load an image from disk and prepare it as a single-image VGG16 batch.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The result of ``preprocess_input`` applied to a batch holding the one
        image resized to 224x224.

    Raises:
        ValueError: If OpenCV cannot read or decode the file.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than inside cv2.resize.
        raise ValueError(f"Could not read image: {image_path}")
    # OpenCV decodes to BGR, but Keras' VGG16 preprocess_input expects RGB
    # (it performs the RGB->BGR flip itself). Without this conversion the
    # red and blue channels reach the network swapped.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = np.expand_dims(img, axis=0)  # add the leading batch axis
    return preprocess_input(img)

def extract_features(img):
    """Run ``img`` through the module-level ``model`` and drop size-1 axes.

    Args:
        img: Input passed unchanged to ``model.predict``.

    Returns:
        The prediction with all length-1 dimensions squeezed out.
    """
    return np.squeeze(model.predict(img))

def move_duplicates(duplicates_dict, target_dir):
    """Move every file listed in ``duplicates_dict`` into ``target_dir``.

    Moving is best-effort per file: a file that cannot be moved is reported
    on stdout and the remaining files are still processed.

    Args:
        duplicates_dict: Mapping whose values are lists of file paths to
            move. The keys are not used.
        target_dir: Destination directory; created (with parents) if missing.

    Raises:
        FileExistsError: If ``target_dir`` exists but is not a directory.
            Without this check ``shutil.move`` would rename each file onto
            that path, silently overwriting the previous one.
    """
    # exist_ok=True avoids the check-then-create race and, unlike a bare
    # os.path.exists() test, rejects a regular file sitting at target_dir.
    os.makedirs(target_dir, exist_ok=True)
    for file_paths in duplicates_dict.values():
        for file_path in file_paths:
            try:
                shutil.move(file_path, target_dir)
            except Exception as e:
                print(f"Could not move image {file_path}. Error: {e}")

src_dir = '../../compare-method-for-filtering-out-duplicated-image/image-source2'

# Calculate highest_existed_number based on existing directories
trial_pattern = '../../faiss/duplicate/trial*'
trial_list = glob.glob(trial_pattern)


def parse_trial_number(trial_path):
    """Return N from the first 'trialN' found in ``trial_path``, else None.

    Args:
        trial_path: Path string to inspect.

    Returns:
        The trial number as an int, or None when the path has no 'trial'
        immediately followed by digits.
    """
    match = re.search(r'trial(\d+)', trial_path)
    return int(match.group(1)) if match else None


# The glob also matches names such as 'trial_old' that carry no number;
# skip those instead of crashing on a failed regex match.
trial_numbers = [parse_trial_number(trial) for trial in trial_list]
highest_existed_number = max(
    (number for number in trial_numbers if number is not None),
    default=0,
)

# Extract one feature vector per readable .jpg/.png image in src_dir.
# `features` and `paths` are kept index-aligned: features[i] belongs to paths[i].
features = []
paths = []
# os.listdir returns entries in arbitrary order; sorting makes the run
# reproducible, including which image of a duplicate pair is kept later on.
for filename in sorted(os.listdir(src_dir)):
    if not filename.endswith((".jpg", ".png")):
        continue
    image_path = os.path.join(src_dir, filename)
    try:
        img = load_and_process_image(image_path)
        feature = extract_features(img)
    except Exception as e:
        # Best-effort: report the unreadable/failed image and keep going.
        print(f"Could not process image {filename}. Error: {e}")
        continue
    features.append(feature)
    paths.append(image_path)
    print(f"Processed image {filename}")

# Stack the collected feature vectors into one array, compute the pairwise
# cosine distances as a square matrix, and convert them to similarities.
features = np.asarray(features)
cosine_distances = squareform(pdist(features, metric='cosine'))
cosine_similarities = 1 - cosine_distances

# Pick the cosine-similarity threshold whose induced grouping has the best
# silhouette score.
#
# For each candidate threshold the images are grouped into the connected
# components of the graph that links two images when their similarity exceeds
# the threshold. (Previously the thresholded matrix was computed but never
# used: the same KMeans fit was scored every iteration, so all scores were
# equal and the "optimal" threshold was always 0.0.)

# Distances for the silhouette computation, derived once from the similarity
# matrix. Clipping and zeroing the diagonal guard against float round-off,
# since a precomputed metric must be non-negative with a zero diagonal.
cosine_distance_matrix = np.clip(1 - cosine_similarities, 0, None)
np.fill_diagonal(cosine_distance_matrix, 0)
n_samples = len(features)

silhouette_scores = []
thresholds = np.linspace(0.0, 1.0, 100)
for threshold in thresholds:
    # Apply a threshold to the similarity matrix to create a binary adjacency matrix
    adjacency_matrix = (cosine_similarities > threshold).astype(int)

    # Group images that are linked, directly or transitively, at this threshold
    n_clusters, cluster_labels = connected_components(adjacency_matrix, directed=False)

    # The silhouette score is only defined for 2 <= n_clusters <= n_samples - 1;
    # record NaN for thresholds that produce any other grouping.
    if 2 <= n_clusters <= n_samples - 1:
        score = silhouette_score(cosine_distance_matrix, cluster_labels, metric='precomputed')
    else:
        score = np.nan
    silhouette_scores.append(score)

if np.isnan(silhouette_scores).all():
    # No threshold produced a scorable grouping. Fall back to the strictest
    # threshold so that nothing is flagged as a duplicate by accident.
    optimal_threshold_index = len(thresholds) - 1
else:
    optimal_threshold_index = np.nanargmax(silhouette_scores)
optimal_threshold = thresholds[optimal_threshold_index]
print(f"Optimal cosine similarity threshold: {optimal_threshold}")

# Group duplicates: for every pair (row, col) above the threshold with
# row < col, record paths[col] as a duplicate of paths[row]. Each image is
# recorded as a duplicate at most once (under the first match encountered).
indices = np.where(cosine_similarities > optimal_threshold)
duplicates_dict = defaultdict(list)
handled_images = set()
for row, col in zip(*indices):
    if row >= col:
        # Skip the diagonal and the mirrored half of the symmetric matrix.
        continue
    duplicate_path = paths[col]
    if duplicate_path in handled_images:
        continue
    duplicates_dict[paths[row]].append(duplicate_path)
    handled_images.add(duplicate_path)

# Duplicates go into a new trial directory numbered one past the highest
# existing trial.
save_dir = f'../../faiss/duplicate/trial{highest_existed_number+1}/silhouette_score'
duplicate_count = sum(len(group) for group in duplicates_dict.values())

print(f"Duplicates will be moved to: {save_dir}")
print(f"Found {duplicate_count} duplicates.")

move_duplicates(duplicates_dict, save_dir)