In [16]:
import os

TRAIN_PATH = "./train"
RELABELS_PATH = "relabels.csv"

os.mkdir("{}/{}".format(TRAIN_PATH, "revise"))

with open(RELABELS_PATH) as f:
    for line in f:
        cols = line.split()
        src = "{}/{}/{}.jpg".format(TRAIN_PATH, cols[1], cols[0])
        dst = "{}/{}/{}.jpg".format(TRAIN_PATH, cols[2], cols[0])

        try:
            os.rename(src, dst)

        except FileNotFoundError:
            print("{} not found".format(src))

In [17]:
import sys
import json
import glob
import numpy as np
from PIL import Image
import os
from shutil import copyfile, copytree
import time
import gc 
from scipy.misc import imread
import cv2
from sklearn import cluster
import re
from utils import *

In [24]:
TRAIN_FOLDER = "./train/"

### First, we need to make a backup of the train folder

In [25]:
copytree(TRAIN_FOLDER, TRAIN_FOLDER[:-1] + "_clustered/")

'./train_clustered/'

In [26]:
TRAIN_FOLDER = TRAIN_FOLDER[:-1] + "_clustered/"

In [27]:
paths = glob.glob(TRAIN_FOLDER + "*/*.jpg")

### Let's first do a classification with the images sizes.

This takes ~3min

In [28]:
# Create a directory if it doesn't exist
def mk(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

In [29]:
for path in paths:
    
    folder = path[:-13]
    temp_img = imread(path)
    destination = "{}{}_{}/".format(folder, temp_img.shape[0], temp_img.shape[1])
    mk(destination)
    
    os.rename(path, destination + path[-13:])

IndexError: tuple index out of range

### Now we'll group images using dbscan.

A lot of code is taken from this notebook: https://www.kaggle.com/anokas/the-nature-conservancy-fisheries-monitoring/finding-boatids

The next cells take ~20min to compute (computing the distance matrix in n-square complex)

In [None]:
def compare(img, img2):
    return np.mean(np.abs(img - img2))

In [None]:
def compare_matrices(mat1, mat2):
    result = np.mean(np.abs(mat2 - mat1),axis=2)  
    return result

In [None]:
# We loop over the folder containing the images.
folders_img_sizes = glob.glob(TRAIN_FOLDER + "*/*/")


print("Nb of folders to do:", len(folders_img_sizes))

for folder in glob.glob(TRAIN_FOLDER + "*/*/"):
    train_files = glob.glob(folder + "*.jpg")
    train = np.array([imread(img) for img in train_files])


    # Resize the images to speed it up.
    train = [cv2.resize(img, (224, 224), cv2.INTER_LINEAR) for img in train]

    custom_train = []
    for img in train:
        im = (img - img.mean()) / img.std()
        custom_train.append(im.reshape((224*224*3,)))

    train = np.array(custom_train)

    l = len(train)
    
    print(l, "images in", folder,  "to create clusters.")
    
    if l <=4:
        continue
    
    distances = np.zeros((l,l))
    
    # We compute the distance matrix
    for i in range(l):
        for j in range(l):
            distances[i,j] = compare(train[i], train[j])
    
    # We compute the clusters
    cls = cluster.DBSCAN(metric='precomputed', min_samples=3, eps=0.1)
    y = cls.fit_predict(distances)
    
    # We move the images in the good folder.
    for path, cluster_idx in zip(train_files, y.tolist()):
        dest_folder = folder + str(cluster_idx) + "/"
        mk(dest_folder)
        dest_path = dest_folder + path[-13:]
        os.rename(path, dest_path)

### Creating the JSON file for easy sharing.

In [18]:
COPIES_FOLDER = "./copies/"

images_kept = os.listdir(COPIES_FOLDER)

images_removed = {}

for img_kept in images_kept:
    path_folder = COPIES_FOLDER + img_kept + "/"
    sequence_removed = os.listdir(path_folder)

    images_removed[img_kept] = sequence_removed

In [19]:
with open('duplicates.json', 'w') as outfile:
    outfile.write(json.dumps(images_removed, indent=4, sort_keys=True))

### Using the JSON to clean the dataset

In [20]:
TRAIN_FOLDER = "./train/"
JSON_PATH = "duplicates.json"

In [21]:
with open(JSON_PATH) as data_file:    
    duplicate_dict = json.load(data_file)

In [22]:
# We need to be able to find the path of a file:

path_dictionary = {}

for class_folder in os.listdir(TRAIN_FOLDER):
    files = os.listdir(TRAIN_FOLDER + class_folder)
    
    for file in files:
        path_dictionary[file] = TRAIN_FOLDER + class_folder + "/" + file

In [23]:
duplicate_folder = TRAIN_FOLDER + "duplicates/"
mk(duplicate_folder)

for file, duplicates in duplicate_dict.items():
    try:
        path = path_dictionary[file]
    except KeyError:
        print("Couldn't find", file, "in the train folder.")
        continue
    mk(duplicate_folder + path.split("/")[-2])    

    dest_folder = duplicate_folder + path.split("/")[-2] + "/" + file
    mk(dest_folder)

    for duplicate in duplicates:
        
        try:
            path = path_dictionary[duplicate]
        except KeyError:
            print("Couldn't find", file, "in the train folder.")
            continue
    
        new_path = dest_folder + "/" + duplicate

        os.rename(path, new_path)

In [None]:
f = "./test1/"

In [20]:
paths = glob.glob(f + "*.jpg")

In [21]:
for path in paths:
    
    folder = path[:-13]
    temp_img = imread(path)
    destination = "{}{}_{}/".format(folder, temp_img.shape[0], temp_img.shape[1])
    mk(destination)
    
    os.rename(path, destination + path[-13:])

In [22]:
# We loop over the folder containing the images.
folders_img_sizes = glob.glob(f + "*/")


print("Nb of folders to do:", len(folders_img_sizes))

for folder in glob.glob(f + "*/"):
    train_files = glob.glob(folder + "*.jpg")
    train = np.array([imread(img) for img in train_files])


    # Resize the images to speed it up.
    train = [cv2.resize(img, (224, 224), cv2.INTER_LINEAR) for img in train]

    custom_train = []
    for img in train:
        im = (img - img.mean()) / img.std()
        custom_train.append(im.reshape((224*224*3,)))

    train = np.array(custom_train)

    l = len(train)
    
    print(l, "images in", folder,  "to create clusters.")
    
    if l <=4:
        continue
    
    distances = np.zeros((l,l))
    
    # We compute the distance matrix
    for i in range(l):
        for j in range(l):
            distances[i,j] = compare(train[i], train[j])
    
    # We compute the clusters
    cls = cluster.DBSCAN(metric='precomputed', min_samples=3, eps=0.2)
    y = cls.fit_predict(distances)
    
    # We move the images in the good folder.
    for path, cluster_idx in zip(train_files, y.tolist()):
        dest_folder = folder + str(cluster_idx) + "/"
        mk(dest_folder)
        dest_path = dest_folder + path[-13:]
        os.rename(path, dest_path)

Nb of folders to do: 10
45 images in ./test1\670_1192\ to create clusters.
4 images in ./test1\700_1244\ to create clusters.
57 images in ./test1\718_1276\ to create clusters.
606 images in ./test1\720_1280\ to create clusters.
148 images in ./test1\750_1280\ to create clusters.
11 images in ./test1\750_1334\ to create clusters.
21 images in ./test1\854_1518\ to create clusters.
7 images in ./test1\924_1280\ to create clusters.
98 images in ./test1\974_1280\ to create clusters.
3 images in ./test1\974_1732\ to create clusters.
