visual-layer · amiralush · Apr 25, 2023 · Apr 24, 2023 · Apr 24, 2023 · Apr 24, 2023
diff --git a/examples/analysing-image-classification-dataset.ipynb b/examples/analysing-image-classification-dataset.ipynb
@@ -32,10 +32,7 @@
    "outputs": [],
    "source": [
     "!pip install pip -U\n",
-    "!pip install fastdup\n",
-    "!pip install pandas\n",
-    "!pip install wurlitzer\n",
-    "%load_ext wurlitzer"
+    "!pip install fastdup"
    ]
   },
   {
@@ -429,10 +426,10 @@
     "df_annot = df_annot[['path', 'noisy_labels_0']]\n",
     "\n",
     "# rename columns to fastdup's column names\n",
-    "df_annot = df_annot.rename({'noisy_labels_0': 'label', 'path': 'img_filename'}, axis='columns')\n",
+    "df_annot = df_annot.rename({'noisy_labels_0': 'label', 'path': 'filename'}, axis='columns')\n",
     "\n",
     "# create split column\n",
-    "df_annot['split'] = df_annot['img_filename'].apply(lambda x: x.split(\"/\")[0])\n",
+    "df_annot['split'] = df_annot['filename'].apply(lambda x: x.split(\"/\")[0])\n",
     "\n",
     "# map label ids to regular labels\n",
     "df_annot['label'] = df_annot['label'].map(label_map)\n",
@@ -454,6 +451,28 @@
     "In this example we run fastdup by providing the annotations."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7f69d8b2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'0.918'"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import fastdup\n",
+    "fastdup.__version__"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 6,
@@ -496,7 +515,6 @@
     }
    ],
    "source": [
-    "import fastdup\n",
     "work_dir = 'fastdup_imagenette'\n",
     "\n",
     "fd = fastdup.create(work_dir=work_dir, input_dir=data_dir) \n",

diff --git a/examples/analyzing-object-detection-dataset.ipynb b/examples/analyzing-object-detection-dataset.ipynb
diff --git a/examples/blip_laion_captions.ipynb b/examples/blip_laion_captions.ipynb
@@ -1,21 +1,14 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "8ecd1bd3-0374-4aa0-b14e-0a9a556a5bdc",
    "metadata": {},
    "source": [
-    "# Investigating BLIP model performance with fastdup"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ee9dccb8-1457-40d4-b443-71007d30862d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Score images with BLIP"
+    "# Investigating BLIP model performance with fastdup\n",
+    "\n",
+    "## Score images with BLIP"
    ]
   },
   {

diff --git a/examples/cleaning-image-dataset.ipynb b/examples/cleaning-image-dataset.ipynb
@@ -28,11 +28,7 @@
    "outputs": [],
    "source": [
     "!pip install pip -U\n",
-    "!pip install fastdup\n",
-    "!pip install pandas\n",
-    "!pip install matplotlib\n",
-    "!pip install wurlitzer\n",
-    "%load_ext wurlitzer"
+    "!pip install fastdup matplotlib"
    ]
   },
   {
@@ -45,6 +41,12 @@
     "## Download food-101 Dataset"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "abb0f91a",
+   "metadata": {},
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -299,7 +301,7 @@
     }
    ],
    "source": [
-    "list_of_broken_images = broken_images['img_filename'].to_list()\n",
+    "list_of_broken_images = broken_images['filename'].to_list()\n",
     "list_of_broken_images"
    ]
   },
@@ -1862,7 +1864,7 @@
     "# a function to group connected components\n",
     "def get_clusters(df, sort_by='count', min_count=2, ascending=False):\n",
     "    # columns to aggregate\n",
-    "    agg_dict = {'img_filename': list, 'mean_distance': max, 'count': len}\n",
+    "    agg_dict = {'filename': list, 'mean_distance': max, 'count': len}\n",
     "\n",
     "    if 'label' in df.columns:\n",
     "        agg_dict['label'] = list\n",
@@ -2017,7 +2019,7 @@
     "cluster_images_to_keep = []\n",
     "list_of_duplicates = []\n",
     "\n",
-    "for cluster_file_list in clusters_df.img_filename:\n",
+    "for cluster_file_list in clusters_df.filename:\n",
     "    # keep first file, discard rest\n",
     "    keep = cluster_file_list[0]\n",
     "    discard = cluster_file_list[1:]\n",
@@ -3625,7 +3627,7 @@
     }
    ],
    "source": [
-    "list_of_outliers = outlier_df[outlier_df.distance < 0.68].img_filename_outlier.tolist()\n",
+    "list_of_outliers = outlier_df[outlier_df.distance < 0.68].filename_outlier.tolist()\n",
     "list_of_outliers"
    ]
   },
@@ -4472,7 +4474,7 @@
     }
    ],
    "source": [
-    "list_of_dark_images = dark_images['img_filename'].to_list()\n",
+    "list_of_dark_images = dark_images['filename'].to_list()\n",
     "list_of_dark_images"
    ]
   },
@@ -5451,7 +5453,7 @@
     }
    ],
    "source": [
-    "list_of_bright_images = bright_images['img_filename'].to_list()\n",
+    "list_of_bright_images = bright_images['filename'].to_list()\n",
     "list_of_bright_images"
    ]
   },
@@ -6334,7 +6336,7 @@
     }
    ],
    "source": [
-    "list_of_blurry_images = blurry_images['img_filename'].to_list()\n",
+    "list_of_blurry_images = blurry_images['filename'].to_list()\n",
     "list_of_blurry_images"
    ]
   },