# Find issues in the image dataset

- https://github.com/woctezuma/steam-DINOv2#filtering-optional

## Install packages

In [None]:
%pip install --quiet cleanvision

## Download the image dataset

In [None]:
# All downloads and extractions below happen inside /content.
%cd /content

# range(1, 3) covers the two archives img_1.zip and img_2.zip.
for i in range(1, 3):
  fname = f"img_{i}.zip"

  # IPython substitutes {fname} before handing each line to the shell.
  # curl: -O keeps the remote file name, -L follows redirects; unzip -qq extracts quietly.
  !curl -OL https://github.com/woctezuma/discord-members-metadata/releases/download/img/{fname}
  !unzip -qq {fname}

## Apply `cleanvision`

- https://github.com/cleanlab/cleanvision

In [None]:
from cleanvision import Imagelab

# Folder of images analysed by cleanvision (passed to Imagelab as `data_path`).
# NOTE(review): presumably the folder created by unzipping the img_*.zip archives -- confirm.
DATASET_PATH = "img/"
# Folder where the cleanvision results are saved to, and later loaded from.
SAVE_PATH = "results"

### Find issues

In [None]:
# Switch between recomputing the issues (True) and reusing saved results (False).
# When False, this cell computes nothing; the next cell downloads an archive instead.
find_issues_from_scratch = False

if find_issues_from_scratch:
  imagelab = Imagelab(data_path=DATASET_PATH)
  imagelab.find_issues()
  # Persist the results so that they can be zipped and reloaded with Imagelab.load().
  imagelab.save(SAVE_PATH)

In [None]:
# Name of the archive used to store, or to fetch, the cleanvision results.
fname = "cleanvision.zip"

%cd /content

if find_issues_from_scratch:
  # Results were computed in this session: pack the SAVE_PATH folder into the archive.
  !zip -qq -r {fname} {SAVE_PATH}
else:
  # Nothing was computed: download the archive from the release page and extract it.
  # NOTE(review): presumably the archive unpacks into the SAVE_PATH folder -- confirm.
  !curl -OL https://github.com/woctezuma/discord-members-metadata/releases/download/img/{fname}
  !unzip -qq {fname}

### Report

In [None]:
# Rebuild the Imagelab object from the results saved under SAVE_PATH,
# pointing it at the images in DATASET_PATH, then print its report.
imagelab = Imagelab.load(SAVE_PATH, DATASET_PATH)
imagelab.report()

## Analyze results

### Define utility functions

In [None]:
import json

from pathlib import Path

def save_to_json(data, fname):
  """Serialize ``data`` as JSON into the file ``fname``.

  The file is created, or overwritten, and written as UTF-8 text. Nested
  levels are indented by one space per level.

  Args:
    data: Any JSON-serializable object.
    fname: Destination path (string or path-like).

  Raises:
    TypeError: If ``data`` contains an object that JSON cannot represent.
    OSError: If the file cannot be opened for writing.
  """
  with Path(fname).open('w', encoding='utf-8') as f:
    # An explicit integer replaces `indent=True`, which only worked because
    # True is treated as the integer 1; the output is unchanged.
    json.dump(data, f, indent=1)

def load_from_json(fname):
  """Read the file ``fname`` and return the parsed JSON content.

  The file is decoded as UTF-8 instead of the platform's locale encoding,
  so that the result does not depend on the machine reading the file.

  Args:
    fname: Path of the JSON file (string or path-like).

  Returns:
    The Python object described by the JSON document.

  Raises:
    OSError: If the file cannot be opened for reading.
    json.JSONDecodeError: If the content is not valid JSON.
  """
  with Path(fname).open(encoding='utf-8') as f:
    return json.load(f)

In [None]:
# File extension stripped from image file names by default.
IMAGE_SUFFIX = ".jpg"


def to_image_index(image_name, image_suffix=IMAGE_SUFFIX):
    """Return the integer encoded in the file name of an image.

    Only the last path component is considered. `image_suffix` is removed
    from its end when present, and the remainder is parsed as an integer.

    Args:
        image_name: Path or file name of the image, e.g. "img/1/123.jpg".
        image_suffix: Extension to strip before parsing.

    Returns:
        The image index as an int.

    Raises:
        ValueError: If the remaining text is not a valid integer.
    """
    file_name = Path(image_name).name
    digits = file_name.removesuffix(image_suffix)
    return int(digits)

def convert_list_to_image_indices(l, image_suffix=IMAGE_SUFFIX):
  """Convert image file names to their integer indices, in ascending order.

  Args:
    l: Iterable of image paths or file names.
    image_suffix: Extension stripped from each file name before parsing.

  Returns:
    A new sorted list with one integer per input file name.
  """
  indices = [to_image_index(image_name, image_suffix) for image_name in l]
  indices.sort()
  return indices

def convert_nested_lists_to_image_indices(nested_lists,
                                          image_suffix=IMAGE_SUFFIX):
  """Convert each inner list of image file names to sorted integer indices.

  The order of the outer list is kept; only the content of every inner list
  is converted and sorted in ascending order.

  Args:
    nested_lists: Iterable of iterables of image paths or file names.
    image_suffix: Extension stripped from each file name before parsing.

  Returns:
    A list of sorted integer lists, one per inner list.
  """
  return [
      sorted(to_image_index(image_name, image_suffix) for image_name in image_names)
      for image_names in nested_lists
  ]

### Summary

> Dataframe with **global summary** of all issue types detected in your dataset and the overall prevalence of each type.
>
> Reference:
> - https://cleanvision.readthedocs.io/en/latest/tutorials/tutorial.html#imagelab.issue_summary

In [None]:
# Bare expression at the end of the cell: its value is shown as the cell output.
imagelab.issue_summary

### Table

> DataFrame **assessing each image** in your dataset, reporting which issues each image exhibits and a score (between 0 and 1):
> - a boolean column per issue type shows whether each image exhibits this issue type.
> - a numeric column assesses the issue severity in each image, where lower values indicate more severe cases.
>
> Reference:
> - https://cleanvision.readthedocs.io/en/latest/tutorials/tutorial.html#imagelab.issues

#### List the images affected by each issue type (duplicates excluded)

In [None]:
BOOL_PREFIX = "is_"
SUFFIX_TO_SKIP = "_duplicates_issue"

# Keep the boolean flag columns, except the ones about duplicates.
# Iterating over a DataFrame yields its column labels.
# Reference: https://stackoverflow.com/a/19483025/376454
bool_headers = [
    column for column in imagelab.issues
    if column.startswith(BOOL_PREFIX) and not column.endswith(SUFFIX_TO_SKIP)
    ]

print('Boolean columns:')
print('- ' + '\n- '.join(bool_headers))

# Map each flag column to the sorted indices of the images it flags.
detailed_data = {}
for header in bool_headers:
  is_flagged = imagelab.issues[header]
  # The index of the table holds the image file names.
  flagged_files = imagelab.issues.loc[is_flagged].index.tolist()
  detailed_data[header] = convert_list_to_image_indices(flagged_files)

#### Export to JSON

In [None]:
# `detailed_data` maps each boolean issue column to the sorted indices of the flagged images.
fname = 'cleanvision_detailed_issues.json'
save_to_json(detailed_data, fname)

### Duplicate sets

> `imagelab.info` can be used to retrieve **which images** are near or exact duplicates of each other.
>
> References:
> - https://cleanvision.readthedocs.io/en/latest/tutorials/tutorial.html#imagelab.info
> - https://cleanvision.readthedocs.io/en/latest/tutorials/tutorial.html#Duplicate-sets

#### Count duplicate sets

In [None]:
# For each duplicate category, imagelab.info stores the duplicate sets as lists
# of image file names; they are converted here to sorted integer indices.
exact_sets = imagelab.info['exact_duplicates']['sets']
exact_duplicates = convert_nested_lists_to_image_indices(exact_sets)
print(f"[exact duplicates] #sets = {len(exact_duplicates)}")

near_sets = imagelab.info['near_duplicates']['sets']
near_duplicates = convert_nested_lists_to_image_indices(near_sets)
print(f"[near duplicates] #sets = {len(near_duplicates)}")

#### Export to JSON

In [None]:
# Write one JSON file per duplicate category, each holding the duplicate sets
# as lists of sorted image indices.
for field in ('exact_duplicates', 'near_duplicates'):
  duplicate_sets = imagelab.info[field]['sets']
  data = convert_nested_lists_to_image_indices(duplicate_sets)
  fname = f'cleanvision_{field}.json'
  save_to_json(data, fname)

### Show a representative of each set

In [None]:
%pip install --quiet mediapy

In [None]:
import mediapy as media

# Default minimal size of a set for its representative to be displayed.
DISPLAY_THRESHOLD = 2
# Default size passed to media.resize_image() before display.
IMG_SIZE = (128, 128)

def show_representatives(
    list_of_sets,
    display_threshold=DISPLAY_THRESHOLD,
    img_size=IMG_SIZE,
    dataset_path="img/",
    image_suffix=".jpg",
):
  """Display one image for every sufficiently large set of images.

  Sets are visited from the largest to the smallest. For each one, the first
  element is used as the representative: its image is read, resized, and
  shown below a line stating its id and the size of the set.

  Args:
    list_of_sets: Iterable of sequences of image indices.
    display_threshold: Sets with fewer elements than this are not displayed.
    img_size: Size passed to `media.resize_image()`.
    dataset_path: Root folder of the image dataset.
    image_suffix: File extension of the images.

  Returns:
    None. The function only prints and displays images.
  """
  # Largest sets first, so iteration can stop at the first set that is too small.
  for group in sorted(list_of_sets, key=len, reverse=True):
    num_duplicates = len(group)

    # An empty set has no representative: `group[0]` would raise IndexError.
    # Every remaining set is at most as large, hence `break` rather than `continue`.
    if num_duplicates < display_threshold or not group:
      break

    member_id = str(group[0])
    # Images are stored in sub-folders named after the first character of their id.
    image_path = Path(dataset_path) / member_id[0] / f"{member_id}{image_suffix}"
    image = media.read_image(image_path)

    image = media.resize_image(image, img_size)

    print(f"{member_id} (#images={num_duplicates})")
    media.show_image(image)

In [None]:
# One image per set of exact duplicates, largest sets first; sets with fewer than 2 images are not shown.
show_representatives(exact_duplicates, display_threshold=2)

In [None]:
# One image per set of near duplicates, largest sets first; sets with fewer than 2 images are not shown.
show_representatives(near_duplicates, display_threshold=2)