# Preparing a labeled image dataset for training using fastdup V1.0

In [None]:
# download fastdup
!pip install pip -U
!pip install fastdup
!pip install pandas

In [3]:
import fastdup
import pandas as pd

# Download Data

In [1]:
# download and extract imagenette
!wget https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz
!tar -xf imagenette2-160.tgz

--2023-03-01 06:49:52--  https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 3.5.1.143, 52.217.106.214, 52.216.34.176, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|3.5.1.143|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 99003388 (94M) [application/x-tar]
Saving to: ‘imagenette2-160.tgz’


2023-03-01 06:49:56 (27.4 MB/s) - ‘imagenette2-160.tgz’ saved [99003388/99003388]



In [4]:
# root folder of the extracted dataset (trailing slash kept on purpose)
data_dir = 'imagenette2-160/'
# annotation file that lives inside the dataset folder
csv_path = f'{data_dir}noisy_imagenette.csv'

# Load and Format Annotations

In [5]:
# label id (as it appears in the annotation file) -> human-readable class name
label_map = dict([
    ('n02979186', 'cassette_player'),
    ('n03417042', 'garbage_truck'),
    ('n01440764', 'tench'),
    ('n02102040', 'English_springer'),
    ('n03028079', 'church'),
    ('n03888257', 'parachute'),
    ('n03394916', 'French_horn'),
    ('n03000684', 'chain_saw'),
    ('n03445777', 'golf_ball'),
    ('n03425413', 'gas_pump'),
])

In [6]:
# read the raw annotation CSV into a DataFrame
df_annot = pd.read_csv(filepath_or_buffer=csv_path)

# preview the first three rows
df_annot.head(n=3)

Unnamed: 0,path,noisy_labels_0,noisy_labels_1,noisy_labels_5,noisy_labels_25,noisy_labels_50,is_valid
0,train/n02979186/n02979186_9036.JPEG,n02979186,n02979186,n02979186,n02979186,n02979186,False
1,train/n02979186/n02979186_11957.JPEG,n02979186,n02979186,n02979186,n02979186,n03000684,False
2,train/n02979186/n02979186_9715.JPEG,n02979186,n02979186,n02979186,n03417042,n03000684,False


In [7]:
# change raw format to fastdup's format

# fail loudly if the annotations contain a label id that label_map does not
# cover; Series.map would otherwise silently turn those labels into NaN
unmapped_ids = set(df_annot['noisy_labels_0'].unique()) - set(label_map)
if unmapped_ids:
    raise ValueError(
        f"label ids missing from label_map: {sorted(unmapped_ids, key=str)}"
    )

df_annot = (
    df_annot
    # take relevant columns
    .loc[:, ['path', 'noisy_labels_0']]
    # rename columns to fastdup's column names
    .rename(columns={'path': 'img_filename', 'noisy_labels_0': 'label'})
    .assign(
        # create split column: the first path component ('train' / 'val' in the rows shown)
        split=lambda df: df['img_filename'].str.split('/').str[0],
        # map label ids to regular labels
        label=lambda df: df['label'].map(label_map),
    )
)

# show formatted annotations
df_annot

Unnamed: 0,img_filename,label,split
0,train/n02979186/n02979186_9036.JPEG,cassette_player,train
1,train/n02979186/n02979186_11957.JPEG,cassette_player,train
2,train/n02979186/n02979186_9715.JPEG,cassette_player,train
3,train/n02979186/n02979186_21736.JPEG,cassette_player,train
4,train/n02979186/ILSVRC2012_val_00046953.JPEG,cassette_player,train
...,...,...,...
13389,val/n03425413/n03425413_17521.JPEG,gas_pump,val
13390,val/n03425413/n03425413_20711.JPEG,gas_pump,val
13391,val/n03425413/n03425413_19050.JPEG,gas_pump,val
13392,val/n03425413/n03425413_13831.JPEG,gas_pump,val


# Run fastdup

In [8]:
# folder name handed to fastdup as its work_dir
work_dir = 'fastdup_imagenette'

# create the fastdup object over the image folder, then run it with the
# formatted annotations and the two similarity thresholds
# NOTE(review): the output recorded under this cell shows the run failing with
# "read_features_parallel allowed values 1..64" -- presumably environment
# dependent; confirm the run succeeds before relying on the cells that follow
fd = fastdup.create(
    input_dir=data_dir,
    work_dir=work_dir,
)
fd.run(
    annotations=df_annot,
    threshold=0.8,
    ccthreshold=0.9,
)

FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
fastdup C++ error received:  read_features_parallel allowed values 1..64
 



NoneType: None
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/fastdup/sentry.py", line 114, in inner_function
    ret = func(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/fastdup/fastdup_controller.py", line 300, in run
    self._create_img_mapping()
  File "/usr/local/lib/python3.8/dist-packages/fastdup/fastdup_controller.py", line 617, in _create_img_mapping
    df_mapping = self._fetch_df(FD.MAPPING_CSV).reset_index()
AttributeError: 'NoneType' object has no attribute 'reset_index'


AttributeError: ignored

In [None]:
# tabular outlier data: first five rows
fd.outliers().head(n=5)

Unnamed: 0,index,outlier,nearest,distance,img_filename_outlier,label_outlier,split_outlier,error_code_outlier,is_valid_outlier,img_filename_nearest,label_nearest,split_nearest,error_code_nearest,is_valid_nearest
0,1338,12072,1045,0.467178,val/n03417042/n03417042_5110.JPEG,garbage_truck,val,VALID,True,train/n02102040/n02102040_1156.JPEG,English_springer,train,VALID,True
1,1337,2664,9763,0.476124,train/n02979186/n02979186_3967.JPEG,cassette_player,train,VALID,True,val/n01440764/n01440764_710.JPEG,tench,val,VALID,True
2,1334,8150,7831,0.51468,train/n03445777/n03445777_5218.JPEG,golf_ball,train,VALID,True,train/n03445777/n03445777_18756.JPEG,golf_ball,train,VALID,True
3,1332,12076,956,0.539275,val/n03417042/n03417042_5301.JPEG,garbage_truck,val,VALID,True,train/n01440764/n01440764_9898.JPEG,tench,train,VALID,True
4,1330,5872,1758,0.544365,train/n03417042/n03417042_14525.JPEG,garbage_truck,train,VALID,True,train/n02102040/n02102040_7256.JPEG,English_springer,train,VALID,True


In [None]:
# visualize the outliers as a gallery, using the `fd` object the run cell created
fd.vis.outliers_gallery()

In [None]:
# similar image pairs: first five rows
fd.similarity().head(n=5)

In [None]:
# visualize clusters as a component gallery
# max_width=800 is passed straight to fastdup (presumably a pixel width -- confirm in the fastdup docs)
fd.vis.component_gallery(max_width=800)

In [None]:
# same cluster gallery, restricted to one label via `slice`
fd.vis.component_gallery(
    slice='chain_saw',
    max_width=800,
)

In [None]:
# connected components: keep the first returned item, discard the second
cc_df, _ = fd.connected_components()

# five rows with the highest 'count' values
cc_df.sort_values(by='count', ascending=False).head(n=5)

Unnamed: 0,fastdup_id,component_id,sum,count,mean_distance,min_distance,max_distance,img_filename,label,split,error_code,is_valid
349,349,36,36.5815,40.0,0.9145,0.9003,0.9339,train/n01440764/n01440764_1778.JPEG,tench,train,VALID,True
602,602,36,36.5815,40.0,0.9145,0.9003,0.9339,train/n01440764/n01440764_4906.JPEG,tench,train,VALID,True
269,269,36,36.5815,40.0,0.9145,0.9003,0.9339,train/n01440764/n01440764_13895.JPEG,tench,train,VALID,True
8472,8472,7332,36.6734,40.0,0.9168,0.9011,0.9328,train/n03445777/n03445777_9297.JPEG,golf_ball,train,VALID,True
8056,8056,7332,36.6734,40.0,0.9168,0.9011,0.9328,train/n03445777/n03445777_4104.JPEG,golf_ball,train,VALID,True


In [None]:
# look up the metadata of a single image by indexing the fastdup object with
# its 'fastdup_id' (the ids are available in fd.annotations());
# 349 is one of the ids listed in the connected-components table above
fd[349]

{'img_filename': 'train/n01440764/n01440764_1778.JPEG',
 'label': 'tench',
 'split': 'train',
 'fastdup_id': 349,
 'error_code': 'VALID',
 'is_valid': True}