# Download Discord avatars

References:
- https://github.com/rom1504/img2dataset
- https://github.com/woctezuma/DiscordScraper
- https://github.com/woctezuma/discord-members-metadata

## Installation

In [None]:
# mediapy is imported later in this notebook to read, resize and show images.
%pip install -q mediapy

In [None]:
# Provides the `clip-retrieval` command-line tool invoked in the "Analyze images" section.
# NOTE(review): --ignore-installed makes pip overwrite packages that are already
# present -- presumably to sidestep conflicts with preinstalled versions; confirm.
%pip install -qq --ignore-installed clip-retrieval

## Constants

In [None]:
# Value used for the "?size=" URL suffix and for img2dataset's --image_size.
IMAGE_SIZE = 512
# Extension of the image files matched when listing query results.
FILE_EXT = ".jpg"
# Folder holding the images (trailing slash included).
IMAGE_FOLDER = "images/"

# Text file with one avatar URL per line.
URL_LIST_FNAME = "avatars.txt"
# Working folders of the clip-retrieval steps (trailing slash included).
EMBEDDING_FOLDER = "embeddings/"
INDEX_FOLDER = "indices/"
RESULT_FOLDER = "results/"

# Archive names: only the trailing ".txt" or "/" is swapped for ".zip".
# str.replace() would rewrite every occurrence, so a nested folder such as
# "a/b/" would turn into "a.zipb.zip".
ARCHIVE_FNAME = URL_LIST_FNAME.removesuffix(".txt") + ".zip"
EMBEDDING_FNAME = EMBEDDING_FOLDER.removesuffix("/") + ".zip"
INDEX_FNAME = INDEX_FOLDER.removesuffix("/") + ".zip"
RESULT_FNAME = RESULT_FOLDER.removesuffix("/") + ".zip"

In [None]:
def load_normalized_urls(fname=None):
  """Return the URLs listed in a text file, without their query strings.

  Args:
    fname: Path of the URL list, one URL per line. Defaults to
      URL_LIST_FNAME when omitted.

  Returns:
    A list with one entry per line of the file, in file order. Each entry is
    the line stripped of surrounding whitespace and cut at the first "?".
  """
  if fname is None:
    fname = URL_LIST_FNAME

  # Explicit encoding so that decoding does not depend on the platform default.
  with open(fname, encoding="utf-8") as f:
    # Blank lines are kept (as empty strings) so that list positions stay
    # aligned with the line numbers of the file.
    return [line.strip().partition("?")[0] for line in f]

In [None]:
import mediapy as media

# Shape handed to media.resize_image() when previewing an image.
DISPLAY_SIZE = (128, 128)

def get_member_id(url_index, fname=None, verbose=True):
  """Look up the member ID behind an entry of the URL list.

  Args:
    url_index: Position of the URL in the list returned by
      load_normalized_urls(); anything accepted by int() works.
    fname: Optional path of an image to display, resized to DISPLAY_SIZE.
    verbose: When true, print the index, the member ID and the URL.

  Returns:
    The second-to-last "/"-separated component of the URL.
  """
  urls = load_normalized_urls()

  url_index = int(url_index)
  url = urls[url_index]
  # Only the last two separators matter: <...>/<member ID>/<last component>.
  member_id = url.rsplit("/", 2)[-2]

  if verbose:
    print(f"URL index: {url_index:>5} -> member ID: {member_id} -> {url}")

  if fname:
    thumbnail = media.resize_image(media.read_image(fname), shape=DISPLAY_SIZE)
    media.show_image(thumbnail)

  return member_id

## Import text data

In [None]:
# Fetch the avatar URL list from the `imgv2` release into the working directory
# (-O keeps the remote file name, -L follows redirects).
!curl -OL https://github.com/woctezuma/discord-members-metadata/releases/download/imgv2/{URL_LIST_FNAME}

In [None]:
# Pin every avatar URL to the requested size: load_normalized_urls() drops any
# existing query string, then "?size=<IMAGE_SIZE>" is appended to each URL.
suffix = f"?size={IMAGE_SIZE}"
lines = load_normalized_urls()
edited_lines = [line + suffix for line in lines]

# Overwrite the URL list in place, one URL per line, without a trailing newline.
with open(URL_LIST_FNAME, "w") as f:
  f.write("\n".join(edited_lines))

## Download images from scratch

In [None]:
# Set to True to re-download the avatars with img2dataset and rebuild the
# archives. When False, the install/download/zip commands guarded by this
# flag in the following cells are skipped.
download_images_from_scratch = False

In [None]:
# img2dataset is only installed when the avatars are downloaded from scratch.
if download_images_from_scratch:
  %pip install -qq img2dataset

In [None]:
# Download every URL of the list with img2dataset, using IMAGE_SIZE as the
# target size and the keep_ratio resize mode.
# NOTE(review): no --output_folder is passed, so the images land in
# img2dataset's default output folder -- presumably the same as IMAGE_FOLDER; confirm.
# NOTE(review): --disallowed_header_directives '[]' passes an empty list,
# presumably so that no image is skipped because of HTTP header directives;
# confirm against the img2dataset documentation.
if download_images_from_scratch:
  !img2dataset \
  --url_list={URL_LIST_FNAME} \
  --image_size={IMAGE_SIZE} \
  --resize_mode=keep_ratio \
  --disallowed_header_directives '[]'

# Runs regardless of the flag: report the disk usage of the image folder.
!du -sh {IMAGE_FOLDER}

In [None]:
# Bundle the sub-folders of the image folder two at a time into three archives,
# plus one archive for the *.parquet and *.json files found directly in it.
# NOTE(review): the sub-folder names 00000-00005 are hard-coded; a download
# that produces a different set of sub-folders would need this list updated.
if download_images_from_scratch:
  !zip -q -r img_1.zip {IMAGE_FOLDER}00000 {IMAGE_FOLDER}00001
  !zip -q -r img_2.zip {IMAGE_FOLDER}00002 {IMAGE_FOLDER}00003
  !zip -q -r img_3.zip {IMAGE_FOLDER}00004 {IMAGE_FOLDER}00005
  !zip -q img_stats.zip {IMAGE_FOLDER}*.parquet {IMAGE_FOLDER}*.json

# Runs regardless of the flag: report the size of each image archive.
!du -sh img_*.zip

## Import image data

In [None]:
# Fetch the image archives (img_1.zip, img_2.zip, img_3.zip and img_stats.zip)
# from the `imgv2` release and unpack each one quietly into the working directory.
for i in [1, 2, 3, "stats"]:
  fname = f"img_{i}.zip"

  !curl -OL https://github.com/woctezuma/discord-members-metadata/releases/download/imgv2/{fname}
  !unzip -qq {fname}

## Analyze images

Reference:
- https://github.com/rom1504/clip-retrieval

### Prepare once

In [None]:
# Run the `inference` step of clip-retrieval over the image folder and write
# its output to EMBEDDING_FOLDER.
!clip-retrieval inference \
 --input_dataset {IMAGE_FOLDER} \
 --output_folder {EMBEDDING_FOLDER}

# Archive the output folder (recursively, quietly) as EMBEDDING_FNAME.
!zip -q -r {EMBEDDING_FNAME} {EMBEDDING_FOLDER}

In [None]:
# Run the `index` step of clip-retrieval on the content of EMBEDDING_FOLDER
# and write its output to INDEX_FOLDER.
!clip-retrieval index \
 --embeddings_folder {EMBEDDING_FOLDER} \
 --index_folder {INDEX_FOLDER}

# Archive the index folder (recursively, quietly) as INDEX_FNAME.
!zip -q -r {INDEX_FNAME} {INDEX_FOLDER}

### Process a text query

In [None]:
# Either a local image:
QUERY = "images/00000/000009695.jpg"
# Or a text:
QUERY = "sexist"

%rm -rf {RESULT_FOLDER}

!clip-retrieval filter \
 --indice_folder {INDEX_FOLDER} \
 --query {QUERY} \
 --output_folder {RESULT_FOLDER}

In [None]:
import glob

# Map each result image back to a Discord member: the file name, minus the
# result folder prefix and the extension, is used as the index into the URL list.
result_pattern = f"{RESULT_FOLDER}/*{FILE_EXT}"
result_fnames = glob.glob(result_pattern)
result_fnames.sort()

for fname in result_fnames:
  url_index = fname.removeprefix(RESULT_FOLDER).removesuffix(FILE_EXT)
  member_id = get_member_id(url_index, fname)