In [1]:
import numpy as np
import pandas as pd
import os
from glob import glob

## Install `imagededup`

In [2]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel (a plain `!pip` goes through whichever pip is first on PATH).
# Consider pinning a version (imagededup==<version>) for reproducible runs.
%pip install imagededup



## Average Hashing

In [3]:
# AHash is imagededup's average-hash method; `hasher` is the single instance
# used below for both encoding the images and searching for duplicates.
from imagededup.methods import AHash
hasher = AHash()

In [4]:
# Compute an average hash for every image found in the album-cover directory.
# NOTE(review): per the imagededup docs the result maps file name -> hash
# string — confirm against the installed version.
covers_dir = '/kaggle/input/mumu-dance-music-album-covers/mumu-image-annotations/album_covers_dance_music'
encodings = hasher.encode_images(image_dir=covers_dir)

2024-01-17 06:54:22,715: INFO Start: Calculating hashes...
100%|██████████| 506/506 [00:00<00:00, 930.29it/s]
2024-01-17 06:54:23,324: INFO End: Calculating hashes!


## Find "duplicates" to use as the Val Split

In [5]:
# Ask imagededup for the files it considers redundant copies of another image.
# No distance threshold is passed, so the library default applies.
# The returned collection is reused below both as the exclusion list for the
# train/test pool and (copied) as the validation split.
duplicates = hasher.find_duplicates_to_remove(encoding_map=encodings)

2024-01-17 06:54:23,332: INFO Start: Evaluating hamming distances for getting duplicates
2024-01-17 06:54:23,333: INFO Start: Retrieving duplicates using Cython Brute force algorithm
100%|██████████| 506/506 [00:00<00:00, 8302.73it/s]
2024-01-17 06:54:23,436: INFO End: Retrieving duplicates using Cython Brute force algorithm
2024-01-17 06:54:23,437: INFO End: Evaluating hamming distances for getting duplicates


## Split the "non-duplicates" into random Train/Test Splits

In [6]:
# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# gives the seeded random.sample below the same population order on every
# machine, so the train/test split is actually reproducible.
all_images = sorted(os.listdir('/kaggle/input/mumu-dance-music-album-covers/mumu-image-annotations/album_covers_dance_music'))

# Keep only the files that were not flagged as duplicates. A set lookup avoids
# scanning the whole duplicates list once per file.
duplicate_names = set(duplicates)
unique_images = [f for f in all_images if f not in duplicate_names]

In [7]:
import random

# Fix the global RNG state so the random train/test sampling is repeatable.
SEED = 42
random.seed(SEED)

In [8]:
# NOTE: sampling draws from the global `random` state, so re-run the seeding
# cell first if this cell is ever re-executed on its own.
TRAIN_SIZE = 256  # number of non-duplicate images assigned to the train split
train_set = random.sample(unique_images, TRAIN_SIZE)

# Everything not drawn for training becomes the test split. Membership is
# checked against a set rather than by scanning the train list for every file.
train_names = set(train_set)
test_set = [f for f in unique_images if f not in train_names]

# The flagged duplicates form the validation split (copied so that later edits
# to val_set do not alter `duplicates`).
val_set = duplicates.copy()

## Save the details of the splits into individual files

In [9]:
# Report how many files landed in each split.
split_report = f"Details of the split \n Size of train set: {len(train_set)} \n Size of test set: {len(test_set)} \n Size of val set: {len(val_set)}"
print(split_report)

Details of the split 
 Size of train set: 256 
 Size of test set: 172 
 Size of val set: 78


In [10]:
# Sanity check: the three splits must be pairwise disjoint.
assert set(train_set).isdisjoint(test_set)
assert set(train_set).isdisjoint(val_set)
assert set(test_set).isdisjoint(val_set)

In [11]:
# Write the train split to disk, one file name per line.
with open("/kaggle/working/mumu_a_hash_train.txt", "w") as fh:
    fh.writelines(f"{name}\n" for name in train_set)

In [12]:
# Write the test split to disk, one file name per line.
with open("/kaggle/working/mumu_a_hash_test.txt", "w") as fh:
    fh.writelines(f"{name}\n" for name in test_set)

In [13]:
# Write the validation split to disk, one file name per line.
with open("/kaggle/working/mumu_a_hash_val.txt", "w") as fh:
    fh.writelines(f"{name}\n" for name in val_set)