# Getting Started with FastDup

This is a walkthrough on how to install and run FastDup to find
duplicate images, near-duplicate images, and outliers in an image dataset.

# 1. Import FastDup

In [1]:
import os

import fastdup
from IPython.display import HTML

# 2. Running FastDup on the Food-101 Dataset (101K images!)

In [2]:
# Working directory for fastdup: the faiss index is stored here, and the
# similarity.csv / outliers.csv files used by the later cells are read from it.
results_dir = '../../results/food-101'
# Root folder of the Food-101 images that fastdup loops over.
input_dir = '../../data/food-101'

fastdup.run(work_dir=results_dir, input_dir=input_dir)

Going to loop over dir ../../data/food-101
Found total 101000 images to run on
Wrote  total of 101000 features, found 0 bad images] 100% Estimated: 0 Minutes
Found total 101000 images to run on
0) Going to train faiss
0) Finished train() faiss
9) Finished add() faiss
10) Finished write_index() faiss
Stored faiss index file ../../results/food-101/faiss.index
12) Finished search() faiss[1;32m1653078875 : INFO:     (add_vertices:460): Num vertices for group 0: 101000
[0m[1;32m1653078875 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)
[0m[1;32m1653078875 : INFO:     (commit_edge_buffer:680): Shuffling edges ...
[0m[1;32m1653078875 : INFO:     (commit_edge_buffer:688): Done shuffling edges in 0.006417 secs
[0m[1;32m1653078875 : INFO:     (commit_edge_buffer:692): Aggregating unique vertices...
[0m[1;32m1653078875 : INFO:     (commit_edge_buffer:705): Done aggregating unique vertex in 0.001556 secs
[0m[1;32m1653078875 : INFO:     (commit_edge_buffer:713): Combi

# 3. Detecting Top-15 Duplicated Image Pairs

In [3]:
# Build an HTML gallery from the similarity.csv found in results_dir.
# descending=True lists the highest-scoring pairs first (the pairs shown in
# the output table all have distance 1.0); num_images=15 caps the gallery at
# fifteen pairs.
save_path = os.path.join(results_dir, 'identical-gallery-top')
fastdup.create_duplicates_gallery(os.path.join(results_dir, 'similarity.csv'),
                                  save_path, num_images=15, descending=True)

# The gallery is written as similarity.html inside save_path; render it inline.
gallery_file_name = os.path.join(save_path, 'similarity.html')
HTML(filename=gallery_file_name)

100%|██████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 93.95it/s]


Stored similarity visual view in  ../../results/food-101/identical-gallery-top/similarity.html


Unnamed: 0,image,distance,from,to
0,,1.0,../../data/food-101/images/apple_pie/1461580.jpg,../../data/food-101/images/apple_pie/1469191.jpg
1,,1.0,../../data/food-101/images/gnocchi/1706892.jpg,../../data/food-101/images/gnocchi/1706909.jpg
2,,1.0,../../data/food-101/images/beef_tartare/50036.jpg,../../data/food-101/images/beef_tartare/50022.jpg
3,,1.0,../../data/food-101/images/strawberry_shortcake/587263.jpg,../../data/food-101/images/strawberry_shortcake/587260.jpg
4,,1.0,../../data/food-101/images/foie_gras/1721369.jpg,../../data/food-101/images/foie_gras/1721540.jpg
5,,1.0,../../data/food-101/images/fried_rice/2820757.jpg,../../data/food-101/images/fried_rice/2899815.jpg
6,,1.0,../../data/food-101/images/churros/2547960.jpg,../../data/food-101/images/churros/2617186.jpg
7,,1.0,../../data/food-101/images/fried_calamari/835953.jpg,../../data/food-101/images/fried_calamari/881518.jpg
8,,1.0,../../data/food-101/images/chicken_quesadilla/1590716.jpg,../../data/food-101/images/chicken_quesadilla/1579819.jpg
9,,1.0,../../data/food-101/images/chocolate_cake/51717.jpg,../../data/food-101/images/chocolate_cake/55122.jpg


# 4. Searching for Semantically Similar Images 

In [4]:
# Same similarity.csv as the previous section, but sorted the other way:
# descending=False starts from the lowest-scoring pairs in the file
# (distance of about 0.90 in the output table), still capped at fifteen pairs.
save_path = os.path.join(results_dir, 'similar-gallery-top')
fastdup.create_duplicates_gallery(
    os.path.join(results_dir, 'similarity.csv'),
    save_path,
    descending=False,
    num_images=15,
)

# Render the generated similarity.html inline as the cell's output.
gallery_file_name = os.path.join(save_path, 'similarity.html')
HTML(filename=gallery_file_name)

100%|██████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 95.19it/s]


Stored similarity visual view in  ../../results/food-101/similar-gallery-top/similarity.html


Unnamed: 0,image,distance,from,to
2627,,0.900062,../../data/food-101/images/caprese_salad/1048582.jpg,../../data/food-101/images/caprese_salad/992553.jpg
2628,,0.900056,../../data/food-101/images/fried_calamari/1509987.jpg,../../data/food-101/images/fried_calamari/3921697.jpg
2629,,0.900055,../../data/food-101/images/spaghetti_bolognese/2954321.jpg,../../data/food-101/images/spaghetti_bolognese/473975.jpg
2630,,0.900054,../../data/food-101/images/beignets/3873758.jpg,../../data/food-101/images/beignets/1275052.jpg
2631,,0.900052,../../data/food-101/images/ceviche/1205283.jpg,../../data/food-101/images/breakfast_burrito/2404380.jpg
2632,,0.900043,../../data/food-101/images/clam_chowder/2027156.jpg,../../data/food-101/images/clam_chowder/1223949.jpg
2633,,0.900027,../../data/food-101/images/pork_chop/3725022.jpg,../../data/food-101/images/gnocchi/3791682.jpg
2634,,0.900024,../../data/food-101/images/hot_and_sour_soup/2828157.jpg,../../data/food-101/images/pho/3775540.jpg
2635,,0.900016,../../data/food-101/images/lobster_bisque/2005142.jpg,../../data/food-101/images/lobster_bisque/533134.jpg
2636,,0.900014,../../data/food-101/images/french_fries/557390.jpg,../../data/food-101/images/french_fries/309651.jpg


# 5. Detecting Outliers

In [5]:
# Build an HTML gallery from the outliers.csv found in results_dir, capped at
# ten images (num_images=10).
save_path = os.path.join(results_dir, 'outliers-gallery')
fastdup.create_outliers_gallery(os.path.join(results_dir, 'outliers.csv'),
                                save_path, num_images=10)

# The gallery is written as outliers.html inside save_path; render it inline.
gallery_file_name = os.path.join(save_path, 'outliers.html')
HTML(filename=gallery_file_name)

100%|█████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 383.74it/s]

Stored outliers visual view in  ../../results/food-101/outliers-gallery/outliers.html





Unnamed: 0,image,distance,path
10099,,0.378475,../../data/food-101/images/macarons/2117640.jpg
10098,,0.379365,../../data/food-101/images/breakfast_burrito/462294.jpg
10096,,0.457944,../../data/food-101/images/macarons/2117640.jpg
10097,,0.457944,../../data/food-101/images/breakfast_burrito/462294.jpg
10095,,0.515787,../../data/food-101/images/tacos/1505262.jpg
10094,,0.528563,../../data/food-101/images/shrimp_and_grits/1047420.jpg
10093,,0.546918,../../data/food-101/images/sushi/3100962.jpg
10092,,0.54816,../../data/food-101/images/shrimp_and_grits/1047420.jpg
10091,,0.556056,../../data/food-101/images/sushi/3100962.jpg
10090,,0.573438,../../data/food-101/images/pho/2399877.jpg
