
# ỨNG DỤNG CLUSTERING ĐỂ PHÁT HIỆN ẢNH TRÙNG

1. Yêu cầu chung: Dùng kỹ thuật clustering để tạo công cụ hỗ trợ phát hiện các ảnh trùng nhau

2. Yêu cầu cụ thể:
  - Input: Danh sách các ảnh được lưu trong tập tin, ví dụ CarDataset-Splits-1-Train.csv (xem mô tả https://colab.research.google.com/drive/1gf0GzvW0tHddKtuvMUNIvglUT-J6oW6S?usp=sharing)
  - Output: Danh sách các clusters và hiển thị các ảnh trong cluster

3. Hướng dẫn:
  - Bước 1:
    - Mỗi ảnh cần thực hiện bước rút trích đặc trưng (feature extraction), biểu diễn dưới dạng một vector đặc trưng d chiều (d-dimension).
    - Có nhiều công cụ hỗ trợ bước rút trích đặc trưng, trong bài tập này, chúng ta sẽ chọn một công cụ sao cho tốc độ xử lý nhanh nhưng kết quả tốt. Các mô hình MobileNet (https://keras.io/api/applications/mobilenet/) có thể được dùng vì đáp ứng các tiêu chí này.
  - Bước 2:
    - Chọn một thuật toán clustering - ví dụ K-Means (số lượng clusters K=5)
    - Ghi kết quả clustering ra tập tin - thay CategoryID bằng ClusterID
  - Bước 3:
    - Hiển thị kết quả clustering - kế thừa kết quả của bài tập Hiển thị dữ liệu https://colab.research.google.com/drive/1rHbKlJd7O9E49SsJlHnZNKcyTbwXT_Ls?usp=sharing
    - Từ kết quả hiển thị, nếu các ảnh nhìn trùng nhau, nhưng tên tập tin khác nhau thì có thể đưa vào danh sách hậu kiểm.

HN \\
Trần Trọng Nhân 21522924 \\
Nguyễn Văn Đức Huy  21520930 \\


In [None]:
import cv2
import os
import torch
import random

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import plotly.express as px

from tensorflow.keras.applications import (
    MobileNet, MobileNetV2, MobileNetV3Small, MobileNetV3Large,
    ResNet50, ResNet101, ResNet152,
    VGG16, VGG19,
    EfficientNetB0, EfficientNetB1, EfficientNetB7,
    InceptionV3, Xception
)

from PIL import Image
from tqdm import tqdm
from skimage.io import imread
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.spatial.distance import euclidean
from tensorflow.keras.preprocessing import image
from sklearn.metrics.pairwise import cosine_similarity
from skimage.metrics import structural_similarity as ssim
from tensorflow.keras.applications.mobilenet import preprocess_input

In [None]:
# Mount Google Drive at /content/drive so the dataset files on Drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder that each image's relative path is joined onto when loading images.
base_dir = './drive/MyDrive/Public'
# Folder holding the dataset CSV; saved features and clustering results are written here too.
dataset_dir = './drive/MyDrive/Dataset'

# Category names; a name's position in this list is used as its numeric index.
categories = ['Others', 'Honda', 'Hyundai', 'KIA', 'Mazda', 'Mitsubishi', 'Suzuki', 'Toyota', 'VinFast']

def get_indexing(categories):
    """Build forward and reverse lookup tables for a sequence of category names.

    Args:
        categories: Iterable of hashable category names.

    Returns:
        A ``(name_to_id, id_to_name)`` pair of dicts. ``name_to_id`` maps each
        name to its position in *categories* (the last position wins when a
        name is repeated); ``id_to_name`` is the inverse of ``name_to_id``.
    """
    name_to_id = {}
    for position, label in enumerate(categories):
        name_to_id[label] = position

    # Invert the finished mapping so both tables always agree with each other.
    id_to_name = dict(zip(name_to_id.values(), name_to_id.keys()))
    return name_to_id, id_to_name

# Build the name -> index and index -> name lookup tables for the categories.
indexing, invert_indexing = get_indexing(categories)

In [None]:
def extract_feature_one_img(image_path, model, input_shape=(224, 224), preprocess_fn=None):
    """Load one image and return the model's output as a flat feature vector.

    Args:
        image_path: Path of the image file to load.
        model: Object exposing ``predict(batch, verbose=0)``; in this file it
            is a Keras application model.
        input_shape: ``(height, width)`` the image is resized to when loaded.
        preprocess_fn: Callable applied to the batched pixel array before
            prediction. When ``None`` the MobileNet ``preprocess_input``
            imported at module level is used, which keeps the original
            behaviour for existing callers.

    Returns:
        The prediction for the single-image batch, flattened to 1-D.
    """
    if preprocess_fn is None:
        preprocess_fn = preprocess_input
    img = image.load_img(image_path, target_size=input_shape)
    x = image.img_to_array(img)
    # The model expects a batch dimension, so wrap the single image in one.
    x = np.expand_dims(x, axis=0)
    x = preprocess_fn(x)
    features = model.predict(x, verbose=0)
    return features.flatten()


def _get_preprocess_fn(model_name):
    """Return the Keras ``preprocess_input`` function matching *model_name*."""
    # Each Keras application family ships its own input preprocessing; feeding
    # MobileNet-scaled pixels to e.g. ResNet or EfficientNet degrades features.
    module_by_model = {
        'MobileNet': 'mobilenet',
        'MobileNetV2': 'mobilenet_v2',
        'MobileNetV3Small': 'mobilenet_v3',
        'MobileNetV3Large': 'mobilenet_v3',
        'ResNet50': 'resnet',
        'ResNet101': 'resnet',
        'ResNet152': 'resnet',
        'VGG16': 'vgg16',
        'VGG19': 'vgg19',
        'EfficientNetB0': 'efficientnet',
        'EfficientNetB1': 'efficientnet',
        'EfficientNetB7': 'efficientnet',
        'InceptionV3': 'inception_v3',
        'Xception': 'xception',
    }
    # Resolve the submodule lazily so only the one actually needed is touched.
    module = getattr(tf.keras.applications, module_by_model[model_name])
    return module.preprocess_input


def extract_features(df=None,
                     base_dir='./',
                     dataset_dir='./',
                     file_csv='CarDataset-Splits-1-Train.csv',
                     model_name='MobileNet',
                     input_shape=(224, 224),
                     partition=False,
                     partition_size=1000,
                     random_state=42,
                     save_result=False,
                     save_name='extracted_features-Splits-1.npz'):
    """Extract a feature vector for every image listed in a dataframe.

    Args:
        df: DataFrame with an ``ImageFullPath`` column of paths relative to
            *base_dir*. When ``None`` it is read from ``dataset_dir/file_csv``.
        base_dir: Folder each ``ImageFullPath`` value is joined onto.
        dataset_dir: Folder containing *file_csv*; also where results are saved.
        file_csv: CSV file name, used only when *df* is ``None``.
        model_name: Key selecting the backbone. An unknown name falls back to
            ``'MobileNet'`` and a notice is printed.
        input_shape: ``(height, width)`` images are resized to.
        partition: When true, process only a random sample of the rows.
        partition_size: Maximum number of rows sampled when *partition* is set.
        random_state: Seed for the row sampling.
        save_result: When true, write the result to ``dataset_dir/save_name``.
        save_name: File name used with ``np.savez``.

    Returns:
        A list of dicts with keys ``'ImagePath'`` and ``'Extracted Features'``,
        one per image that was processed without error. Images that raise are
        reported on stdout and skipped.
    """
    models = {
        'MobileNet': MobileNet,
        'MobileNetV2': MobileNetV2,
        'MobileNetV3Small': MobileNetV3Small,
        'MobileNetV3Large': MobileNetV3Large,
        'ResNet50': ResNet50,
        'ResNet101': ResNet101,
        'ResNet152': ResNet152,
        'VGG16': VGG16,
        'VGG19': VGG19,
        'EfficientNetB0': EfficientNetB0,
        'EfficientNetB1': EfficientNetB1,
        'EfficientNetB7': EfficientNetB7,
        'InceptionV3': InceptionV3,
        'Xception': Xception
    }

    if model_name not in models:
        # Keep the lenient fallback, but make it visible instead of silent.
        print(f"Unknown model_name '{model_name}'; falling back to 'MobileNet'.")
        model_name = 'MobileNet'

    # Pick the preprocessing that belongs to the chosen backbone.
    preprocess_fn = _get_preprocess_fn(model_name)

    device = '/device:GPU:0' if tf.config.list_physical_devices('GPU') else '/device:CPU:0'
    print(f"Using device: {device}")

    with tf.device(device):
        # include_top=False + pooling='avg' drops the classifier head and
        # requests global average pooling on the last feature map.
        model = models[model_name](weights='imagenet', include_top=False, pooling='avg')

    if df is None:
        df = pd.read_csv(os.path.join(dataset_dir, file_csv))

    if partition:
        sampled_df = df.sample(n=min(partition_size, len(df)), random_state=random_state).reset_index(drop=True)
        print(f"Processing {len(sampled_df)} images out of {len(df)} available.")
    else:
        sampled_df = df
    print("Extracting features...")

    result = []
    for image_path in tqdm(sampled_df["ImageFullPath"].values, desc="Extracting Features"):
        full_path = os.path.join(base_dir, image_path)
        try:
            extracted_features = extract_feature_one_img(
                full_path, model, input_shape=input_shape, preprocess_fn=preprocess_fn)
            result.append({'ImagePath': image_path, 'Extracted Features': extracted_features})
        except Exception as e:
            # Best effort: one unreadable image must not abort the whole run.
            print(f"Error processing image {full_path}: {e}. Skipping...")

    print(f"Successfully processed {len(result)} images")

    if save_result:
        save_path = os.path.join(dataset_dir, save_name)
        # The list of dicts is stored as an object array, so reading it back
        # requires np.load(..., allow_pickle=True).
        np.savez(save_path, extracted_features=result)
        print(f"Extracted features saved to {save_path}")

    return result

In [None]:
# Extract MobileNet features for a random sample of up to 1000 rows of the
# training CSV and save them as an .npz file in the dataset folder.
extracted_features = extract_features(base_dir=base_dir,
                                      dataset_dir=dataset_dir,
                                      file_csv='CarDataset-Splits-1-Train.csv',
                                      model_name='MobileNet',
                                      input_shape=(224, 224),
                                      partition=True,
                                      partition_size=1000,
                                      save_result=True,
                                      save_name='extracted_features-Splits-1.npz',
                                      )

Using device: /device:CPU:0


  model = models[model_name](weights='imagenet', include_top=False, pooling='avg')


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf_no_top.h5
[1m17225924/17225924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Processing 1000 images out of 29408 available.
Extracting features...


Extracting Features: 100%|██████████| 1000/1000 [09:03<00:00,  1.84it/s]

Successfully processed 1000 images
Extracted features saved to ./drive/MyDrive/Dataset/extracted_features-Splits-1.npz





In [None]:
def perform_clustering(extracted_features=None,
                       dataset_dir='./',
                       extracted_file='extracted_features-Splits-1.npz',
                       n_clusters=9,
                       random_state=0,
                       save_file=False,
                       save_name='Clustering_Results.csv',
                       ) -> "tuple[pd.DataFrame, KMeans, list, list]":
    """Cluster image feature vectors with K-Means and tabulate the assignments.

    Args:
        extracted_features: Sequence of dicts with keys ``'ImagePath'`` and
            ``'Extracted Features'``. When ``None`` it is loaded from
            ``dataset_dir/extracted_file``.
        dataset_dir: Folder holding *extracted_file*; also where the CSV is saved.
        extracted_file: ``.npz`` file with an ``extracted_features`` entry.
        n_clusters: Number of K-Means clusters.
        random_state: Seed passed to ``KMeans``.
        save_file: When true, write the result table to ``dataset_dir/save_name``.
        save_name: CSV file name for the saved table.

    Returns:
        A 4-tuple ``(df, clustering, features_list, image_path_list)`` where
        ``df`` has columns ``ImageFullPath`` and ``ClusterID``, ``clustering``
        is the fitted ``KMeans`` object, and the two lists hold the feature
        vectors and image paths in the same order as the rows of ``df``.
    """
    # Identity check: `== None` on a NumPy array compares element-wise and
    # makes the `if` raise "truth value of an array is ambiguous".
    if extracted_features is None:
        # NOTE: allow_pickle=True executes pickled data; only load files that
        # were produced by this notebook.
        with np.load(os.path.join(dataset_dir, extracted_file), allow_pickle=True) as data:
            extracted_features = data['extracted_features']

    features_list = [x['Extracted Features'] for x in extracted_features]
    image_path_list = [x['ImagePath'] for x in extracted_features]

    clustering = KMeans(n_clusters=n_clusters, random_state=random_state).fit(features_list)

    df = pd.DataFrame({'ImageFullPath': image_path_list, 'ClusterID': clustering.labels_})
    if save_file:
        save_path = os.path.join(dataset_dir, save_name)
        df.to_csv(save_path, index=False)
        print(f"Saved to {save_path}")

    return df, clustering, features_list, image_path_list

In [None]:
# No in-memory features are passed, so they are loaded from the saved .npz;
# the cluster assignments are written to a CSV in the dataset folder.
df, clustering, features_list, image_path_list = perform_clustering(dataset_dir=dataset_dir,
                                                                    extracted_file='extracted_features-Splits-1.npz',
                                                                    save_file=True,
                                                                    save_name='Clustering_results.csv',
                                                                    )

Saved to ./drive/MyDrive/Dataset/Clustering_results.csv


In [None]:
# Preview the first 10 rows of the clustering result table.
df.head(10)

Unnamed: 0,ImageFullPath,ClusterID
0,Hyundai/22521492-22521599.Hyundai.15.jpg,5
1,Suzuki/22520459-22520507-22520862.Suzuki.820.jpg,4
2,Toyota/22520477-22520490.Toyota.90.jpg,5
3,Mazda/22521027-22520195-22521060.Mazda.961.jpg,5
4,KIA/22520394-22520395.KIA.109.jpg,3
5,Toyota/22521027-22520195-22521060.Toyota.1845.jpg,2
6,Others/22521070-22520211.Others.18.jpg,4
7,Toyota/22521070-22520211.Toyota.208.jpg,1
8,Toyota/22521027-22520195-22521060.Toyota.0443.jpg,5
9,Suzuki/22520459-22520507-22520862.Suzuki.735.jpg,4
