# Filter inappropriate content

- Data source: https://github.com/woctezuma/discord-members-metadata

## Install packages

In [None]:
# Install the third-party packages imported further down in this notebook:
# `transformers` (classification pipeline) and `mediapy` (image reading/display).
%pip install --quiet transformers mediapy

## Download the image dataset

In [None]:
%cd /content

for i in range(1, 3):
  fname = f"img_{i}.zip"

  !curl -OL https://github.com/woctezuma/discord-members-metadata/releases/download/img/{fname}
  !unzip -qq {fname}

## Download the text datasets

In [None]:
%cd /content

!curl -OL https://github.com/woctezuma/discord-members-metadata/releases/download/bio/bios.json
!curl -OL https://github.com/woctezuma/discord-members-metadata/releases/download/metadata/members.json

## Define utils

In [None]:
import json

from pathlib import Path

def save_to_json(data, fname):
  """Serialize `data` as JSON into the file `fname`, overwriting it.

  Args:
    data: Any object that `json.dump` can serialize.
    fname: Destination path (string or path-like).
  """
  destination = Path(fname)
  with destination.open(mode='w') as handle:
    # One space per nesting level (what `indent=True` amounts to).
    json.dump(data, handle, indent=1)

def load_from_json(fname):
  """Read the JSON document stored in `fname` and return the decoded object.

  Args:
    fname: Path of the JSON file (string or path-like).

  Returns:
    The Python object produced by `json.load`.

  Raises:
    FileNotFoundError: If `fname` does not exist.
    json.JSONDecodeError: If the file content is not valid JSON.
  """
  # JSON text is UTF-8; decoding explicitly avoids depending on the locale's
  # default encoding, which can mis-decode non-ASCII characters.
  with Path(fname).open(encoding='utf-8') as f:
    return json.load(f)

def safe_load_from_json(fname):
  """Load JSON from `fname`, returning an empty dict when the file is missing.

  Only `FileNotFoundError` is handled; any other error raised by
  `load_from_json` propagates to the caller.
  """
  try:
    return load_from_json(fname)
  except FileNotFoundError:
    # A missing file yields an empty dict instead of an error.
    return {}

In [None]:
from pathlib import Path

def get_member_id(image_path):
  """Return the member ID for an image: its file name without the directory
  part and without the final extension."""
  image_file = Path(image_path)
  return image_file.stem

In [None]:
def get_output_fname(pipe):
  """Build the JSON file name used to store the results of `pipe`.

  The name is the pipeline model's `name_or_path` with every '/' replaced by
  '_', followed by '.json'.
  """
  model_name = pipe.model.name_or_path
  flat_name = model_name.replace('/', '_')
  return f"{flat_name}.json"

## Classify images

Dataset

In [None]:
from glob import glob
from torch.utils.data import Dataset

# Sorted so that images are processed in the same order on every run; glob()
# itself returns its matches in arbitrary order.
FILE_PATHS = sorted(glob('img/*/*.jpg'))

class MyDataset(Dataset):
    """Map-style dataset whose items are image file paths (plain strings).

    Args:
        file_paths: Optional sequence of paths to serve. When omitted, the
            module-level `FILE_PATHS` list is used, so `MyDataset()` keeps
            working as before.
    """

    def __init__(self, file_paths=None):
        # The default is resolved at construction time, not at definition time.
        self.file_paths = FILE_PATHS if file_paths is None else list(file_paths)

    def __len__(self):
        """Return the number of image paths."""
        return len(self.file_paths)

    def __getitem__(self, i):
        """Return the path stored at index `i`."""
        return self.file_paths[i]

dataset = MyDataset()

Data loader

In [None]:
from torch.utils.data import DataLoader

# Batching is done here, by the data loader, rather than through the
# pipeline's own batch size, because the latter raised a ValueError.
# The value below was tuned for speed under the earlier pipeline-side batching,
# so it may no longer be the optimum.
# Reference: https://huggingface.co/docs/transformers/main_classes/pipelines
loader = DataLoader(dataset=dataset, batch_size=8)

Pipeline

In [None]:
from transformers import pipeline

import torch

# https://huggingface.co/Falconsai/nsfw_image_detection
# Run on the GPU when one is available and fall back to the CPU otherwise, so
# that this cell does not fail on a runtime without CUDA.
pipe = pipeline("image-classification",
                model="Falconsai/nsfw_image_detection",
                device="cuda" if torch.cuda.is_available() else "cpu")

Apply the workflow

In [None]:
from contextlib import suppress
from tqdm.auto import tqdm

# Parallel lists: safety_scores[k] is the score of processed_image_paths[k].
processed_image_paths = []
safety_scores = []

# Labels whose score is recorded as the "safety score" of an image.
_SAFE_LABELS = ("safe", "normal")

def _classify(paths):
  """Run `pipe` on a list of image paths; return None if it raises ValueError."""
  try:
    return pipe(paths)
  except ValueError:
    return None

# For my use case, this cell required ~ 25 minutes.

for image_paths in tqdm(loader):
  image_paths = list(image_paths)
  out = _classify(image_paths)

  if out is None:
    # The batch failed as a whole. Retry one image at a time so that only the
    # images that actually fail are skipped, not their batch neighbours.
    singles = [_classify([image_path]) for image_path in image_paths]
    out = [single[0] if single else None for single in singles]

  for image_path, dd in zip(image_paths, out):
    if not dd:
      continue

    matching = [ d["score"] for d in dd if d["label"] in _SAFE_LABELS ]

    # Record exactly one score per image so that both lists stay aligned:
    # an image without a matching label is skipped, and if several labels
    # match only the first one is kept.
    if matching:
      processed_image_paths.append(image_path)
      safety_scores.append(matching[0])

Collate the IDs with the scores. At the same time, display the worst offenders.

In [None]:
import mediapy as media

# Images whose safety score is below this value are printed and displayed.
safety_score_threshold = 0.005
# Target size passed to media.resize_image for the displayed images.
img_size = (128, 128)

# zip() stops at the shorter input, so a length mismatch would silently pair
# paths with the wrong scores. Fail loudly instead.
if len(processed_image_paths) != len(safety_scores):
  raise ValueError(
      f"{len(processed_image_paths)} image paths but "
      f"{len(safety_scores)} safety scores: the lists are misaligned")

# Maps member ID -> safety score, filled in order of increasing score.
aggregate = {}
for image_path, safety_score in sorted(
    zip(processed_image_paths, safety_scores),
    key=lambda x: x[1]):
  member_id = get_member_id(image_path)

  aggregate[member_id] = safety_score

  if safety_score < safety_score_threshold:
    image = media.read_image(image_path)
    image = media.resize_image(image, img_size)

    # ".2" formats the score with two significant digits.
    print(f"{member_id} {safety_score:.2}")
    media.show_image(image)

# The output file is named after the model used by the pipeline.
save_to_json(aggregate,
             get_output_fname(pipe))

## Classify texts

TODO: classify the text datasets downloaded above (`bios.json` and `members.json`).