Skip to content
Merged
32 changes: 25 additions & 7 deletions examples/analysing-image-classification-dataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,7 @@
"outputs": [],
"source": [
"!pip install pip -U\n",
"!pip install fastdup\n",
"!pip install pandas\n",
"!pip install wurlitzer\n",
"%load_ext wurlitzer"
"!pip install fastdup"
]
},
{
Expand Down Expand Up @@ -429,10 +426,10 @@
"df_annot = df_annot[['path', 'noisy_labels_0']]\n",
"\n",
"# rename columns to fastdup's column names\n",
"df_annot = df_annot.rename({'noisy_labels_0': 'label', 'path': 'img_filename'}, axis='columns')\n",
"df_annot = df_annot.rename({'noisy_labels_0': 'label', 'path': 'filename'}, axis='columns')\n",
"\n",
"# create split column\n",
"df_annot['split'] = df_annot['img_filename'].apply(lambda x: x.split(\"/\")[0])\n",
"df_annot['split'] = df_annot['filename'].apply(lambda x: x.split(\"/\")[0])\n",
"\n",
"# map label ids to regular labels\n",
"df_annot['label'] = df_annot['label'].map(label_map)\n",
Expand All @@ -454,6 +451,28 @@
"In this example we run fastdup by providing the annotations."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7f69d8b2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'0.918'"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import fastdup\n",
"fastdup.__version__"
]
},
{
"cell_type": "code",
"execution_count": 6,
Expand Down Expand Up @@ -496,7 +515,6 @@
}
],
"source": [
"import fastdup\n",
"work_dir = 'fastdup_imagenette'\n",
"\n",
"fd = fastdup.create(work_dir=work_dir, input_dir=data_dir) \n",
Expand Down
2 changes: 1 addition & 1 deletion examples/analyzing-object-detection-dataset.ipynb

Large diffs are not rendered by default.

15 changes: 4 additions & 11 deletions examples/blip_laion_captions.ipynb
Original file line number Diff line number Diff line change
@@ -1,21 +1,14 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "8ecd1bd3-0374-4aa0-b14e-0a9a556a5bdc",
"metadata": {},
"source": [
"# Investigating BLIP model performance with fastdup"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ee9dccb8-1457-40d4-b443-71007d30862d",
"metadata": {},
"outputs": [],
"source": [
"# Score images with BLIP"
"# Investigating BLIP model performance with fastdup\n",
"\n",
"Score images with BLIP"
]
},
{
Expand Down
26 changes: 14 additions & 12 deletions examples/cleaning-image-dataset.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,7 @@
"outputs": [],
"source": [
"!pip install pip -U\n",
"!pip install fastdup\n",
"!pip install pandas\n",
"!pip install matplotlib\n",
"!pip install wurlitzer\n",
"%load_ext wurlitzer"
"!pip install fastdup matplotlib"
]
},
{
Expand All @@ -45,6 +41,12 @@
"## Download food-101 Dataset"
]
},
{
"cell_type": "markdown",
"id": "abb0f91a",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -299,7 +301,7 @@
}
],
"source": [
"list_of_broken_images = broken_images['img_filename'].to_list()\n",
"list_of_broken_images = broken_images['filename'].to_list()\n",
"list_of_broken_images"
]
},
Expand Down Expand Up @@ -1862,7 +1864,7 @@
"# a function to group connected components\n",
"def get_clusters(df, sort_by='count', min_count=2, ascending=False):\n",
" # columns to aggregate\n",
" agg_dict = {'img_filename': list, 'mean_distance': max, 'count': len}\n",
" agg_dict = {'filename': list, 'mean_distance': max, 'count': len}\n",
"\n",
" if 'label' in df.columns:\n",
" agg_dict['label'] = list\n",
Expand Down Expand Up @@ -2017,7 +2019,7 @@
"cluster_images_to_keep = []\n",
"list_of_duplicates = []\n",
"\n",
"for cluster_file_list in clusters_df.img_filename:\n",
"for cluster_file_list in clusters_df.filename:\n",
" # keep first file, discard rest\n",
" keep = cluster_file_list[0]\n",
" discard = cluster_file_list[1:]\n",
Expand Down Expand Up @@ -3625,7 +3627,7 @@
}
],
"source": [
"list_of_outliers = outlier_df[outlier_df.distance < 0.68].img_filename_outlier.tolist()\n",
"list_of_outliers = outlier_df[outlier_df.distance < 0.68].filename_outlier.tolist()\n",
"list_of_outliers"
]
},
Expand Down Expand Up @@ -4472,7 +4474,7 @@
}
],
"source": [
"list_of_dark_images = dark_images['img_filename'].to_list()\n",
"list_of_dark_images = dark_images['filename'].to_list()\n",
"list_of_dark_images"
]
},
Expand Down Expand Up @@ -5451,7 +5453,7 @@
}
],
"source": [
"list_of_bright_images = bright_images['img_filename'].to_list()\n",
"list_of_bright_images = bright_images['filename'].to_list()\n",
"list_of_bright_images"
]
},
Expand Down Expand Up @@ -6334,7 +6336,7 @@
}
],
"source": [
"list_of_blurry_images = blurry_images['img_filename'].to_list()\n",
"list_of_blurry_images = blurry_images['filename'].to_list()\n",
"list_of_blurry_images"
]
},
Expand Down
Loading