### Import libraries

In [2]:
# Fix randomness and hide warnings
seed = 42

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(seed)

import logging

import random
random.seed(seed)

In [3]:
# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

2.14.0


In [4]:
# Import other libraries
import cv2
from tensorflow.keras.applications.mobilenet import preprocess_input
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import seaborn as sns

In [5]:
import numpy as np
from PIL import Image
import imagehash

#### Load and clean the dataset

In [6]:
# Conditional check for unzipping
unzip = True

# Unzip the 'public_data.zip' file if the 'unzip' flag is True
if unzip:
    !unzip public_data.zip


Archive:  public_data.zip
replace public_data.npz? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [10]:
# Load the data from the file
file_path = 'public_data.npz'
loaded_data = np.load(file_path, allow_pickle=True)

In [11]:
# Access the 'data' and 'labels' arrays
data = loaded_data['data']
labels = loaded_data['labels']

In [20]:
# 58 -> Shrek - 338 -> Trololo
to_extract = [data[58], data[338]]
outliers = []

for i, image in enumerate(to_extract):
    pil_image = Image.fromarray(image.astype('uint8'))
    img_hash = imagehash.average_hash(pil_image)

    outliers.append(img_hash)
    print(img_hash)

081c1e1e1e3f3f1b
02393d3e3c3c1c00


In [22]:
# Access the 'data' and 'labels' arrays
data = loaded_data['data']
labels = loaded_data['labels']

# Create a list to store unique hashes and their corresponding indices
hashes = []
unique_indices = []
raw_images = []
duplicates = []
indices_duplicates = []

# Iterate through the images and calculate their perceptual hashes
for i, image in enumerate(data):
    pil_image = Image.fromarray(image.astype('uint8'))
    img_hash = imagehash.average_hash(pil_image)

    pil_image.info['label'] = labels[i]
    raw_images.append(pil_image)

    # Check if the hash is unique
    if img_hash not in hashes and img_hash not in outliers:
        hashes.append(img_hash)
        unique_indices.append(i)
        print
    else:
        duplicates.append(pil_image)
        indices_duplicates.append(i)

# 

# Create a new numpy array with only the unique images
unique_data = data[unique_indices]

# Create a new numpy array with only the unique labels
unique_labels = labels[unique_indices]

duplicates_labels = labels[indices_duplicates]

# Save the cleaned dataset
np.savez('cleaned_data.npz', data=unique_data, labels=unique_labels)
np.savez('duplicates.npz', data=duplicates, labels=duplicates_labels)

081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
3880cccfcfc78f1f
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
3d3f8ca677f3f018
20001f7dfffefe80
e3f92f83101c91fb
02393d3e3c3c1c00
02393d3e3c3c1c00
02393d3e3c3c1c00
e3f92f83101c91fb
02393d3e3c3c1c00
081c1e1e1e3f3f1b
0f6f200d47f7f4e1
02393d3e3c3c1c00
270383c3c3dbb9f0
081c1e1e1e3f3f1b
be0ccbcedcf0e1c8
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
02393d3e3c3c1c00
b7070f1cbcad8931
02393d3e3c3c1c00
02393d3e3c3c1c00
02393d3e3c3c1c00
081c1e1e1e3f3f1b
02393d3e3c3c1c00
8cc6e3f339091b7c
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
f3f3f0472f7ef8e4
02393d3e3c3c1c00
081c1e1e1e3f3f1b
02393d3e3c3c1c00
02393d3e3c3c1c00
081c1e1e1e3f3f1b
0030194f0f071f5f
02393d3e3c3c1c00
24262f3f3fcc892f
081c1e1e1e3f3f1b
02393d3e3c3c1c00
02393d3e3c3c1c00
02393d3e3c3c1c00
02393d3e3c3c1c00
ff8f07040287676f
02393d3e3c3c1c00
02393d3e3c3c1c00
02393d3e3c3c1c00
02393d3e3c3c1c00
3e3c0d9df9010101
081c1e1e1e3f3f1b
081c1e1e1e3f3f1b
081c1e1e1e3f3f

In [24]:
# Load the data from the file
file_path = 'cleaned_data.npz'
loaded_data_cleaned = np.load(file_path, allow_pickle=True)

In [23]:
# Load the data from the file
file_path = 'duplicates.npz'
duplicates_file = np.load(file_path, allow_pickle=True)

duplicates_data = duplicates_file['data']
duplicates_labels = duplicates_file['labels']

# Create a folder to store the images
output_folder = 'duplicate_images'
os.makedirs(output_folder, exist_ok=True)

# Iterate through the images and save them to the folder
for i, image in enumerate(duplicates_data):
    image_filename = os.path.join(output_folder, f'{i}_{duplicates_labels[i]}.png')
    
    # Convert the numpy array to a PIL image
    pil_image = Image.fromarray(image.astype('uint8'))

    # Save the PIL image to a file
    pil_image.save(image_filename)

In [9]:
data = loaded_data['data']

# Create a folder to store the images
output_folder = 'output_images'
os.makedirs(output_folder, exist_ok=True)

# Iterate through the images and save them to the folder
for i, image in enumerate(data):
    image_filename = os.path.join(output_folder, f'image_{i}.png')
    
    # Convert the numpy array to a PIL image
    pil_image = Image.fromarray(image.astype('uint8'))

    # Save the PIL image to a file
    pil_image.save(image_filename)

In [25]:
cleaned_data = loaded_data_cleaned['data']

# Create a folder to store the images
output_folder = 'output_images_cleaned'
os.makedirs(output_folder, exist_ok=True)

# Iterate through the images and save them to the folder
for i, image in enumerate(cleaned_data):
    image_filename = os.path.join(output_folder, f'image_{i}.png')
    
    # Convert the numpy array to a PIL image
    pil_image = Image.fromarray(image.astype('uint8'))

    # Save the PIL image to a file
    pil_image.save(image_filename)