# Download Discord avatars

References:
- https://github.com/rom1504/img2dataset
- https://github.com/woctezuma/DiscordScraper
- https://github.com/woctezuma/discord-members-metadata

## Installation

In [None]:
# Install mediapy, which is imported further down as `media` to read, resize and show images.
%pip install -q mediapy

In [None]:
# Install clip-retrieval, which provides the `clip-retrieval` command run in the analysis cells.
# NOTE(review): the reason for --ignore-installed is not visible here — presumably it avoids clashes with preinstalled packages; confirm.
%pip install -qq --ignore-installed clip-retrieval

## Constants

In [None]:
# Size in pixels, used both for the "?size=" URL suffix and for img2dataset's --image_size.
IMAGE_SIZE = 512
# Extension of the image files looked up on disk.
FILE_EXT = ".jpg"
IMAGE_FOLDER = "images/"

# URL lists: the snapshot that is processed, and a more recent list used to find members to skip.
URL_LIST_FNAME = "avatars.txt"
URL_LIST_FNAME_LATEST = "avatars_latest.txt"
EMBEDDING_FOLDER = "embeddings/"
INDEX_FOLDER = "indices/"
RESULT_FOLDER = "results/"

# Archive name of each folder. Only the trailing slash is dropped, so a nested
# folder such as "data/embeddings/" keeps its inner separators instead of having
# every "/" rewritten to ".zip".
EMBEDDING_FNAME = EMBEDDING_FOLDER.removesuffix("/") + ".zip"
INDEX_FNAME = INDEX_FOLDER.removesuffix("/") + ".zip"
RESULT_FNAME = RESULT_FOLDER.removesuffix("/") + ".zip"
# Base URL (with trailing slash) of the release that hosts the URL lists and archives.
GITHUB_RELEASE_URL = "https://github.com/woctezuma/discord-members-metadata/releases/download/imgv2/"

In [None]:
def load_normalized_urls(fname=URL_LIST_FNAME):
  """Read a URL list file and return one normalized URL per line.

  Each line is stripped of surrounding whitespace and cut at its first "?",
  so any query string (such as a size suffix) is dropped.
  """
  normalized = []
  with open(fname) as url_file:
    for raw_line in url_file:
      base_url, _, _ = raw_line.strip().partition("?")
      normalized.append(base_url)
  return normalized

def get_url_suffix(image_size):
  """Return the "?size=..." query string for the given image size."""
  return "?size={}".format(image_size)

In [None]:
def to_member_id(url):
  """Return the second-to-last "/"-separated segment of url.

  NOTE(review): presumably the URLs end in ".../<member id>/<avatar file>",
  which makes that segment the member ID — confirm against the URL lists.
  """
  # Only the last two separators matter, so split from the right.
  tail_segments = url.rsplit("/", 2)
  return tail_segments[-2]

def to_member_ids(urls):
  """Return the list of member IDs for urls, in the same order."""
  return list(map(to_member_id, urls))

In [None]:
# Size suffix appended to the URL that get_member_id prints.
MAX_SIZE = 1024
upscale_suffix = get_url_suffix(MAX_SIZE)

def get_member_id(url_index, verbose=True):
  """Look up the member ID found at a given position of the default URL list.

  url_index is converted with int(), so a zero-padded string is accepted.
  When verbose is true, one line is printed with the index, the member ID and
  the URL followed by upscale_suffix. Returns the member ID.
  """
  # The URL list is read again on every call.
  normalized_urls = load_normalized_urls()

  position = int(url_index)
  avatar_url = normalized_urls[position]
  member_id = to_member_id(avatar_url)

  if verbose:
    print(f"URL index: {position:>5} -> member ID: {member_id} -> {avatar_url}{upscale_suffix}")

  return member_id

In [None]:
import mediapy as media

# Shape every image is resized to before being shown.
MIN_SIZE = 128
DISPLAY_SHAPE = (MIN_SIZE, MIN_SIZE)

def display_image(fname):
  """Read the image file fname, resize it to DISPLAY_SHAPE and show it."""
  image = media.read_image(fname)
  thumbnail = media.resize_image(image, shape=DISPLAY_SHAPE)
  media.show_image(thumbnail)

## Import text data

In [None]:
!curl -OL {GITHUB_RELEASE_URL}{URL_LIST_FNAME}
!curl -OL {GITHUB_RELEASE_URL}{URL_LIST_FNAME_LATEST}

In [None]:
# Rewrite the URL list in place so that every URL carries the "?size=" suffix for IMAGE_SIZE.
# Loading strips any existing query string first, so re-running this cell does not stack suffixes.
suffix = get_url_suffix(IMAGE_SIZE)
lines = load_normalized_urls(URL_LIST_FNAME)
edited_lines = [url + suffix for url in lines]

with open(URL_LIST_FNAME, "w") as f:
  # One URL per line, without a trailing newline.
  f.write("\n".join(edited_lines))

## Download images from scratch

In [None]:
# Set to True to install and run img2dataset on the URL list and to re-create the image archives.
# The cells guarded by this flag do nothing otherwise.
download_images_from_scratch = False

In [None]:
# img2dataset is only installed when the images are downloaded from scratch.
if download_images_from_scratch:
  %pip install -qq img2dataset

In [None]:
# Download every URL of the list with img2dataset, using IMAGE_SIZE and the keep_ratio resize mode.
# NOTE(review): no output folder is passed, while later cells read from IMAGE_FOLDER — confirm that this matches img2dataset's default output folder.
# NOTE(review): the empty list given to --disallowed_header_directives presumably turns off img2dataset's header-based filtering — confirm.
if download_images_from_scratch:
  !img2dataset \
  --url_list={URL_LIST_FNAME} \
  --image_size={IMAGE_SIZE} \
  --resize_mode=keep_ratio \
  --disallowed_header_directives '[]'

# Report the disk usage of the image folder (runs regardless of the flag above).
!du -sh {IMAGE_FOLDER}

In [None]:
# Package the downloaded data into four archives: the numbered sub-folders of IMAGE_FOLDER
# two at a time, then the .parquet and .json files stored directly in IMAGE_FOLDER.
# NOTE(review): the sub-folder names 00000 to 00005 are hard-coded — presumably the shards written by img2dataset; confirm the count if the URL list changes.
if download_images_from_scratch:
  !zip -q -r img_1.zip {IMAGE_FOLDER}00000 {IMAGE_FOLDER}00001
  !zip -q -r img_2.zip {IMAGE_FOLDER}00002 {IMAGE_FOLDER}00003
  !zip -q -r img_3.zip {IMAGE_FOLDER}00004 {IMAGE_FOLDER}00005
  !zip -q img_stats.zip {IMAGE_FOLDER}*.parquet {IMAGE_FOLDER}*.json

# Report the size of every image archive (runs regardless of the flag above).
!du -sh img_*.zip

## Import image data

In [None]:
# Download and extract each image archive of the GitHub release:
# img_1.zip, img_2.zip, img_3.zip and img_stats.zip.
for i in [1, 2, 3, "stats"]:
  fname = f"img_{i}.zip"

  # -O keeps the remote file name, -L follows redirects.
  !curl -OL {GITHUB_RELEASE_URL}{fname}
  !unzip -qq {fname}

## Analyze images

Reference:
- https://github.com/rom1504/clip-retrieval

### Prepare once

In [None]:
# Set to True to compute the embeddings and the index locally with clip-retrieval;
# when False, the corresponding archives are downloaded from the GitHub release instead.
compute_from_scratch = False

In [None]:
# Obtain the embeddings: either compute them from the images and archive them,
# or download and extract the archive published in the GitHub release.
if compute_from_scratch:
  # Run clip-retrieval inference on IMAGE_FOLDER, writing to EMBEDDING_FOLDER.
  !clip-retrieval inference \
  --input_dataset {IMAGE_FOLDER} \
  --output_folder {EMBEDDING_FOLDER}

  # Archive the embedding folder recursively.
  !zip -q -r {EMBEDDING_FNAME} {EMBEDDING_FOLDER}
else:
  !curl -OL {GITHUB_RELEASE_URL}{EMBEDDING_FNAME}
  !unzip -qq {EMBEDDING_FNAME}

In [None]:
# Obtain the index: either build it from the embeddings and archive it,
# or download and extract the archive published in the GitHub release.
if compute_from_scratch:
  # Run clip-retrieval index on EMBEDDING_FOLDER, writing to INDEX_FOLDER.
  !clip-retrieval index \
  --embeddings_folder {EMBEDDING_FOLDER} \
  --index_folder {INDEX_FOLDER}

  # Archive the index folder recursively.
  !zip -q -r {INDEX_FNAME} {INDEX_FOLDER}
else:
  !curl -OL {GITHUB_RELEASE_URL}{INDEX_FNAME}
  !unzip -qq {INDEX_FNAME}

### Process a text query

In [None]:
# Members found in the snapshot list but absent from the latest list;
# the display cell prints "Skipped" for them instead of showing their image.
snapshot_urls = load_normalized_urls(URL_LIST_FNAME)
latest_urls = load_normalized_urls(URL_LIST_FNAME_LATEST)

skipped_member_ids = set(to_member_ids(snapshot_urls)) - set(to_member_ids(latest_urls))

In [None]:
import glob

# Choose the query. Both assignments below run in order, so the text query wins
# as written; comment out the one that is not wanted.

# Either a local image:
# The file is looked up by its URL index, zero-padded to 9 digits, in any sub-folder of IMAGE_FOLDER.
# NOTE(review): the [0] raises IndexError when no such file exists, even if only the text query is wanted.
url_index = 9695
QUERY = glob.glob(f"{IMAGE_FOLDER}/*/{url_index:0>9}{FILE_EXT}")[0]
# Or a text:
QUERY = "sexist"

# Number of results requested from clip-retrieval.
NUM_RESULTS = 25

# Remove the result folder left by a previous query.
%rm -rf {RESULT_FOLDER}

# Run the clip-retrieval filter command on the index, writing the results to RESULT_FOLDER.
!clip-retrieval filter \
 --indice_folder {INDEX_FOLDER} \
 --query "{QUERY}" \
 --num_results {NUM_RESULTS} \
 --output_folder {RESULT_FOLDER}

In [None]:
## Display

# Walk the result images in sorted file-name order. The file name without folder
# and extension is used as the URL index to look up the member.
for fname in sorted(glob.glob(f"{RESULT_FOLDER}/*{FILE_EXT}")):
  url_index = fname.removeprefix(RESULT_FOLDER).removesuffix(FILE_EXT)
  member_id = get_member_id(url_index)

  if member_id not in skipped_member_ids:
    display_image(fname)
    continue

  # Member of the snapshot list that is missing from the latest list: do not show the image.
  print("Skipped")

# Restart the notebook if you encounter the following error:
# UnidentifiedImageError: cannot identify image file <_io.BytesIO object>