# Analysing & Preparing an Image Classification Dataset for Training

## Installation & Setup

In [None]:
!pip install pip -U
!pip install fastdup
!pip install pandas
!pip install wurlitzer
%load_ext wurlitzer

## Download Data

In [None]:
# download and extract imagenette
!wget https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz
!tar -xf imagenette2-160.tgz

## Load and Format Annotations

In [1]:
import pandas as pd

In [2]:
# Root of the extracted dataset (trailing slash kept) and the annotation CSV
# that lives directly inside it.
data_dir = 'imagenette2-160/'
csv_path = data_dir + 'noisy_imagenette.csv'

In [3]:
# Lookup from label id (the directory name that appears in each image path)
# to a human-readable class name. Insertion order matches the original table.
label_map = dict(
    n02979186='cassette_player',
    n03417042='garbage_truck',
    n01440764='tench',
    n02102040='English_springer',
    n03028079='church',
    n03888257='parachute',
    n03394916='French_horn',
    n03000684='chain_saw',
    n03445777='golf_ball',
    n03425413='gas_pump',
)

In [4]:
# read the raw annotation CSV into a DataFrame, then preview its first three rows
df_annot = pd.read_csv(filepath_or_buffer=csv_path)
df_annot.head(n=3)

Unnamed: 0,path,noisy_labels_0,noisy_labels_1,noisy_labels_5,noisy_labels_25,noisy_labels_50,is_valid
0,train/n02979186/n02979186_9036.JPEG,n02979186,n02979186,n02979186,n02979186,n02979186,False
1,train/n02979186/n02979186_11957.JPEG,n02979186,n02979186,n02979186,n02979186,n03000684,False
2,train/n02979186/n02979186_9715.JPEG,n02979186,n02979186,n02979186,n03417042,n03000684,False


In [5]:
# change raw format to fastdup's format
#
# NOTE: this cell rebinds `df_annot`. Re-running it without first re-running
# the cell that loads the CSV raises a KeyError, because 'path' and
# 'noisy_labels_0' have already been renamed by the previous run.
df_annot = (
    # take relevant columns
    df_annot[['path', 'noisy_labels_0']]
    # rename columns to fastdup's column names
    .rename(columns={'noisy_labels_0': 'label', 'path': 'img_filename'})
    .assign(
        # create split column from the first path component (e.g. 'train', 'val')
        split=lambda frame: frame['img_filename'].str.split('/').str[0],
        # map label ids to regular labels
        label=lambda frame: frame['label'].map(label_map),
    )
)

# Series.map() silently yields NaN for any id missing from label_map;
# fail loudly here instead of passing unlabeled rows on to fastdup.
assert df_annot['label'].notna().all(), "found label ids that are missing from label_map"

# show formatted annotations
df_annot

Unnamed: 0,img_filename,label,split
0,train/n02979186/n02979186_9036.JPEG,cassette_player,train
1,train/n02979186/n02979186_11957.JPEG,cassette_player,train
2,train/n02979186/n02979186_9715.JPEG,cassette_player,train
3,train/n02979186/n02979186_21736.JPEG,cassette_player,train
4,train/n02979186/ILSVRC2012_val_00046953.JPEG,cassette_player,train
...,...,...,...
13389,val/n03425413/n03425413_17521.JPEG,gas_pump,val
13390,val/n03425413/n03425413_20711.JPEG,gas_pump,val
13391,val/n03425413/n03425413_19050.JPEG,gas_pump,val
13392,val/n03425413/n03425413_13831.JPEG,gas_pump,val


## Import & Run fastdup

In [6]:
import fastdup

# directory that fastdup writes its artifacts into (galleries end up under it)
work_dir = 'fastdup_imagenette'

# build a fastdup instance over the image directory, then run it with the
# formatted annotations
fd = fastdup.create(
    input_dir=data_dir,
    work_dir=work_dir,
)
fd.run(
    annotations=df_annot,
    threshold=0.8,
    ccthreshold=0.9,
)



## Outliers

In [7]:
# visualize outliers
# Renders a gallery of outlier images with their distance, path and label;
# per the saved output, the HTML is stored under <work_dir>/galleries/outliers.html.
fd.vis.outliers_gallery()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 26222.59it/s]

Stored outliers visual view in  fastdup_imagenette/galleries/outliers.html





Info,Unnamed: 1
Distance,0.476124
Path,train/n02979186/n02979186_3967.JPEG
label,cassette_player

Info,Unnamed: 1
Distance,0.514679
Path,train/n03445777/n03445777_5218.JPEG
label,golf_ball

Info,Unnamed: 1
Distance,0.515087
Path,train/n02979186/n02979186_10186.JPEG
label,cassette_player

Info,Unnamed: 1
Distance,0.544796
Path,train/n03888257/n03888257_34639.JPEG
label,parachute

Info,Unnamed: 1
Distance,0.55351
Path,train/n02979186/n02979186_11265.JPEG
label,cassette_player

Info,Unnamed: 1
Distance,0.555266
Path,train/n03445777/n03445777_3254.JPEG
label,golf_ball

Info,Unnamed: 1
Distance,0.569853
Path,train/n03445777/n03445777_13576.JPEG
label,golf_ball

Info,Unnamed: 1
Distance,0.579928
Path,val/n02102040/n02102040_7670.JPEG
label,English_springer

Info,Unnamed: 1
Distance,0.583889
Path,val/n03445777/n03445777_5932.JPEG
label,golf_ball

Info,Unnamed: 1
Distance,0.590159
Path,train/n03888257/n03888257_79145.JPEG
label,parachute

Info,Unnamed: 1
Distance,0.607759
Path,train/n03394916/n03394916_37544.JPEG
label,French_horn

Info,Unnamed: 1
Distance,0.608525
Path,train/n03394916/n03394916_33663.JPEG
label,French_horn

Info,Unnamed: 1
Distance,0.609526
Path,train/n03888257/n03888257_7793.JPEG
label,parachute

Info,Unnamed: 1
Distance,0.609977
Path,train/n03028079/n03028079_33913.JPEG
label,church

Info,Unnamed: 1
Distance,0.610503
Path,val/n03445777/n03445777_11122.JPEG
label,golf_ball

Info,Unnamed: 1
Distance,0.610714
Path,val/n03445777/n03445777_9292.JPEG
label,golf_ball

Info,Unnamed: 1
Distance,0.611143
Path,val/n01440764/n01440764_4962.JPEG
label,tench

Info,Unnamed: 1
Distance,0.611296
Path,train/n03888257/n03888257_16223.JPEG
label,parachute

Info,Unnamed: 1
Distance,0.61373
Path,train/n03445777/n03445777_6033.JPEG
label,golf_ball

Info,Unnamed: 1
Distance,0.6138
Path,train/n03888257/n03888257_74946.JPEG
label,parachute


In [8]:
# tabular outlier data: preview the first five rows
# (each row pairs an outlier image with its nearest image and their distance)
fd.outliers().head(n=5)

Unnamed: 0,index,outlier,nearest,distance,img_filename_outlier,label_outlier,split_outlier,error_code_outlier,is_valid_outlier,img_filename_nearest,label_nearest,split_nearest,error_code_nearest,is_valid_nearest
0,1338,2664,9763,0.476124,train/n02979186/n02979186_3967.JPEG,cassette_player,train,VALID,True,val/n01440764/n01440764_710.JPEG,tench,val,VALID,True
1,1336,8150,7831,0.514679,train/n03445777/n03445777_5218.JPEG,golf_ball,train,VALID,True,train/n03445777/n03445777_18756.JPEG,golf_ball,train,VALID,True
2,1335,1970,1513,0.515087,train/n02979186/n02979186_10186.JPEG,cassette_player,train,VALID,True,train/n02102040/n02102040_4835.JPEG,English_springer,train,VALID,True
3,1332,9087,8628,0.544796,train/n03888257/n03888257_34639.JPEG,parachute,train,VALID,True,train/n03888257/n03888257_12053.JPEG,parachute,train,VALID,True
4,1331,2036,10033,0.55351,train/n02979186/n02979186_11265.JPEG,cassette_player,train,VALID,True,val/n02102040/n02102040_3691.JPEG,English_springer,val,VALID,True


In [9]:
# tabular similarity data: preview the first five rows
# (each row links a 'from' image to a 'to' image with their distance)
fd.similarity().head(n=5)

Unnamed: 0,from,to,distance,img_filename_from,label_from,split_from,error_code_from,is_valid_from,img_filename_to,label_to,split_to,error_code_to,is_valid_to
0,11521,5390,0.968786,val/n03394916/n03394916_30631.JPEG,French_horn,val,VALID,True,train/n03394916/n03394916_44127.JPEG,French_horn,train,VALID,True
1,5390,11521,0.968786,train/n03394916/n03394916_44127.JPEG,French_horn,train,VALID,True,val/n03394916/n03394916_30631.JPEG,French_horn,val,VALID,True
2,12914,7715,0.962459,val/n03445777/n03445777_6882.JPEG,golf_ball,val,VALID,True,train/n03445777/n03445777_13918.JPEG,golf_ball,train,VALID,True
3,7715,12914,0.962459,train/n03445777/n03445777_13918.JPEG,golf_ball,train,VALID,True,val/n03445777/n03445777_6882.JPEG,golf_ball,val,VALID,True
4,1117,1404,0.953837,train/n02102040/n02102040_1564.JPEG,English_springer,train,VALID,True,train/n02102040/n02102040_3837.JPEG,English_springer,train,VALID,True


In [10]:
# visualize clusters
# Renders a gallery of components (clusters of similar images) with their
# image count, mean distance and label breakdown; per the saved output, the
# HTML is stored under <work_dir>/galleries/components.html.
fd.vis.component_gallery()

tench


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 95.86it/s]


Finished OK. Components are stored as image files fastdup_imagenette/galleries/components_[index].jpg
Stored components visual view in  fastdup_imagenette/galleries/components.html
Execution time in seconds 2.3


Info,Unnamed: 1
component,36.0
num_images,24.0
mean_distance,0.9003

Label,Unnamed: 1
tench,24

Info,Unnamed: 1
component,7331.0
num_images,22.0
mean_distance,0.9011

Label,Unnamed: 1
golf_ball,22

Info,Unnamed: 1
component,143.0
num_images,16.0
mean_distance,0.9003

Label,Unnamed: 1
tench,16

Info,Unnamed: 1
component,6.0
num_images,13.0
mean_distance,0.9023

Label,Unnamed: 1
tench,13

Info,Unnamed: 1
component,10.0
num_images,11.0
mean_distance,0.9065

Label,Unnamed: 1
tench,11

Info,Unnamed: 1
component,4589.0
num_images,11.0
mean_distance,0.9005

Label,Unnamed: 1
French_horn,11

Info,Unnamed: 1
component,900.0
num_images,10.0
mean_distance,0.9018

Label,Unnamed: 1
English_springer,10

Info,Unnamed: 1
component,5491.0
num_images,10.0
mean_distance,0.9001

Label,Unnamed: 1
garbage_truck,10

Info,Unnamed: 1
component,150.0
num_images,10.0
mean_distance,0.9032

Label,Unnamed: 1
tench,10

Info,Unnamed: 1
component,7340.0
num_images,9.0
mean_distance,0.9112

Label,Unnamed: 1
golf_ball,9

Info,Unnamed: 1
component,7354.0
num_images,8.0
mean_distance,0.9057

Label,Unnamed: 1
golf_ball,8

Info,Unnamed: 1
component,5478.0
num_images,8.0
mean_distance,0.9025

Label,Unnamed: 1
garbage_truck,8

Info,Unnamed: 1
component,151.0
num_images,7.0
mean_distance,0.9006

Label,Unnamed: 1
tench,7

Info,Unnamed: 1
component,902.0
num_images,7.0
mean_distance,0.9044

Label,Unnamed: 1
English_springer,7

Info,Unnamed: 1
component,4571.0
num_images,6.0
mean_distance,0.9038

Label,Unnamed: 1
French_horn,6

Info,Unnamed: 1
component,41.0
num_images,6.0
mean_distance,0.9007

Label,Unnamed: 1
tench,6

Info,Unnamed: 1
component,5718.0
num_images,6.0
mean_distance,0.9043

Label,Unnamed: 1
garbage_truck,6

Info,Unnamed: 1
component,917.0
num_images,5.0
mean_distance,0.9037

Label,Unnamed: 1
English_springer,5

Info,Unnamed: 1
component,8447.0
num_images,5.0
mean_distance,0.9004

Label,Unnamed: 1
parachute,5

Info,Unnamed: 1
component,218.0
num_images,5.0
mean_distance,0.9

Label,Unnamed: 1
tench,5


In [11]:
# visualize clusters with specific labels
# `slice` restricts the gallery to one label; with 'chain_saw' the saved
# output lists only chain_saw components.
fd.vis.component_gallery(slice='chain_saw')

chain_saw


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 442.39it/s]

Finished OK. Components are stored as image files fastdup_imagenette/galleries/components_[index].jpg
Stored components visual view in  fastdup_imagenette/galleries/components.html
Execution time in seconds 1.3





Info,Unnamed: 1
component,2953.0
num_images,3.0
mean_distance,0.9064

Label,Unnamed: 1
chain_saw,3

Info,Unnamed: 1
component,2875.0
num_images,2.0
mean_distance,0.9029

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,2891.0
num_images,2.0
mean_distance,0.9208

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,2939.0
num_images,2.0
mean_distance,0.9222

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,3065.0
num_images,2.0
mean_distance,0.9139

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,3068.0
num_images,2.0
mean_distance,0.9198

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,3077.0
num_images,2.0
mean_distance,0.9073

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,3078.0
num_images,2.0
mean_distance,0.9192

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,3153.0
num_images,2.0
mean_distance,0.9355

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,3381.0
num_images,2.0
mean_distance,0.9345

Label,Unnamed: 1
chain_saw,2

Info,Unnamed: 1
component,10339.0
num_images,2.0
mean_distance,0.9039

Label,Unnamed: 1
chain_saw,2


In [12]:
# connected components: keep the per-image table (first returned item, the
# second is discarded) and show the five rows with the largest 'count'
cc_df, _ = fd.connected_components()
(
    cc_df
    .sort_values(by='count', ascending=False)
    .head(n=5)
)

Unnamed: 0,fastdup_id,component_id,sum,count,mean_distance,min_distance,max_distance,img_filename,label,split,error_code,is_valid
12991,12991,7331,36.6734,40.0,0.9168,0.9011,0.9328,val/n03445777/n03445777_9552.JPEG,golf_ball,val,VALID,True
8419,8419,7331,36.6734,40.0,0.9168,0.9011,0.9328,train/n03445777/n03445777_8677.JPEG,golf_ball,train,VALID,True
682,682,36,36.5815,40.0,0.9145,0.9003,0.9339,train/n01440764/n01440764_6159.JPEG,tench,train,VALID,True
954,954,36,36.5815,40.0,0.9145,0.9003,0.9339,train/n01440764/n01440764_9885.JPEG,tench,train,VALID,True
9706,9706,36,36.5815,40.0,0.9145,0.9003,0.9339,val/n01440764/n01440764_4980.JPEG,tench,val,VALID,True


In [13]:
# getting metadata for individual images using their 'fastdup_id' available in fd.annotations()
# Indexing the fastdup object with an id returns that image's metadata as a
# dict (img_filename, label, split, fastdup_id, error_code, is_valid).
fd[349]

{'img_filename': 'train/n01440764/n01440764_17789.JPEG',
 'label': 'tench',
 'split': 'train',
 'fastdup_id': 349,
 'error_code': 'VALID',
 'is_valid': True}