# Match Steam Images

- https://github.com/woctezuma/steam-DINOv2

## Install packages

In [None]:
# Third-party packages used below: faiss (similarity search), mediapy (image
# reading) and steamspypi (SteamSpy client). xformers is not imported by this
# notebook; presumably it is an optional dependency of DINOv2 -- TODO confirm.
%pip install --quiet faiss-gpu xformers mediapy steamspypi

## Download the image dataset

In [None]:
# Work from /content so that every download below lands in that folder.
%cd /content

# The image archive is published as two release assets.
!curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/input/images_partA.tar.gz
!curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/input/images_partB.tar.gz

# Unpack both archives (presumably into /content/images, the --data_dir given
# to the feature extractor -- TODO confirm).
!tar xzf images_partA.tar.gz
!tar xzf images_partB.tar.gz

# apps.json and filtered_indices.json are read later by load_apps() and load_indices().
!curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/input/apps.json
!curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/input/filtered_indices.json

## Pick a DINOv2 model

- https://github.com/facebookresearch/dinov2#pretrained-models

In [None]:
# Names of the DINOv2 checkpoints that can be requested from torch.hub.
all_model_names = [
    "dinov2_vits14",
    "dinov2_vitb14",
    "dinov2_vitl14",
]

# The first entry is used by default; change the index to pick another checkpoint.
model_name = all_model_names[0]

## Extract features

- https://github.com/woctezuma/feature-extractor

In [None]:
# Set to True to recompute the features with the feature-extractor repository.
# Otherwise, precomputed features are downloaded from the release page.
extract_features_from_scratch = False

if extract_features_from_scratch:
  # Clone the extractor and install its requirements.
  %cd /content
  !git clone https://github.com/woctezuma/feature-extractor.git
  %cd feature-extractor
  %pip install --quiet -r requirements.txt

  # {model_name} is interpolated by IPython from the Python variable.
  # Resize/crop settings here must match the preprocessing of query images.
  !python extract_fts.py \
  --data_dir /content/images --batch_size 256 \
  --resize_size 224 --keep_ratio --crop_size 224 \
  --model_repo "facebookresearch/dinov2" --model_name {model_name} \
  --torch_features fts_{model_name}.pth \
  --numpy_features fts_{model_name}.npy

else:
  # Download the features into the folder that the rest of the notebook reads
  # from (/content/feature-extractor/features/).
  %mkdir -p /content/feature-extractor/features/
  %cd /content/feature-extractor/features/

  # img_list.json presumably lists the images in feature-row order -- TODO confirm.
  !curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/features/fts_{model_name}.npy
  !curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/features/img_list.json

## Match features

- https://github.com/woctezuma/feature-matcher

In [None]:
# Set to True to recompute the matches with the feature-matcher repository.
# Otherwise, precomputed matches and scores are downloaded from the release page.
match_features_from_scratch = False

if match_features_from_scratch:
  # Clone the matcher and install its requirements.
  %cd /content
  !git clone https://github.com/woctezuma/feature-matcher.git
  %cd feature-matcher
  %pip install --quiet -r requirements.txt

  # Reads the features stored by the previous section and requests
  # 10 neighbors per image; {model_name} is interpolated by IPython.
  !python match_fts.py \
  --input_dir /content/feature-extractor/features \
  --feature_filename fts_{model_name}.npy \
  --numpy_matches matches_{model_name}.npy \
  --numpy_similarity_scores scores_{model_name}.npy \
  --num_neighbors 10

else:
  # Download the precomputed matches and similarity scores for the chosen model.
  %mkdir -p /content/feature-matcher/matches/
  %cd /content/feature-matcher/matches/

  !curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/matches/matches_{model_name}.npy
  !curl -OL https://github.com/woctezuma/steam-DINOv2/releases/download/matches/scores_{model_name}.npy

## Process a query image

### Define functions

In [None]:
# Base URL of the Steam CDN and file name of the 600x900 library image.
CDN_URL = "https://cdn.cloudflare.steamstatic.com/steam/apps"
IMAGE_NAME = "library_600x900.jpg"

def get_image_url(app_id):
  """Return the CDN URL of the library image for the given Steam app id."""
  return "{}/{}/{}".format(CDN_URL, app_id, IMAGE_NAME)

In [None]:
import mediapy as media

def get_image(app_id):
  """Read the library image of a Steam app with mediapy.

  The URL comes from `get_image_url`; the result is whatever
  `media.read_image` returns for that URL.
  """
  image_url = get_image_url(app_id)
  return media.read_image(image_url)

In [None]:
from torchvision import transforms

# Reference: https://raw.githubusercontent.com/woctezuma/feature-extractor/minimal/src/transform_utils.py

IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)


def get_target_image_size(resize_size=256, keep_ratio=True):
    """Return the size argument to hand to `transforms.Resize`.

    Args:
        resize_size: Target size in pixels.
        keep_ratio: If truthy, return `resize_size` unchanged (a single value);
            otherwise return the square `(resize_size, resize_size)` pair.
    """
    if keep_ratio:
        return resize_size
    return (resize_size, resize_size)


def get_transform(
    resize_size=256,
    keep_ratio=True,
    crop_size=224,
    interpolation=transforms.InterpolationMode.BICUBIC,
):
    """Build the torchvision preprocessing pipeline applied to an image.

    The pipeline resizes, center-crops, converts to a tensor and normalizes
    with the ImageNet mean and standard deviation, in that order.

    Args:
        resize_size: Size passed to `get_target_image_size`.
        keep_ratio: Whether the resize target is a single value or a square pair.
        crop_size: Size of the center crop.
        interpolation: Interpolation mode used by the resize step.

    Returns:
        A `transforms.Compose` chaining the four steps.
    """
    target_size = get_target_image_size(resize_size, keep_ratio)

    resize = transforms.Resize(target_size, interpolation=interpolation)
    center_crop = transforms.CenterCrop(crop_size)
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize(
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
    )

    return transforms.Compose([resize, center_crop, to_tensor, normalize])


In [None]:
import numpy as np
from torchvision import transforms

def get_features(app_id, preprocess, model):
  """Compute the embedding of the library image of a Steam app.

  Args:
    app_id: Steam app id whose library image is fetched with `get_image`.
    preprocess: Callable turning a PIL image into a tensor.
    model: Callable mapping a batched tensor to a tensor of features.

  Returns:
    A NumPy array holding the model output for a batch of one image. NumPy is
    returned (rather than a torch tensor) because the faiss search helper
    calls `.astype` on its input, which tensors do not provide.
  """
  # Local import so this cell does not depend on a later cell importing torch.
  import torch

  img = get_image(app_id)

  # The torchvision pipeline expects a PIL image, not a raw array.
  if isinstance(img, np.ndarray):
    img = transforms.ToPILImage()(img)

  # Add the batch dimension expected by the model.
  batch = preprocess(img)[None]

  # Inference only: skip autograd bookkeeping, which also makes the output
  # directly convertible to NumPy.
  with torch.no_grad():
    features = model(batch)

  return features.cpu().numpy()

In [None]:
import faiss

# Reference: https://github.com/woctezuma/feature-matcher/blob/main/src/match_utils.py

def build_faiss_index(embeddings):
    """Build an exact inner-product faiss index over L2-normalized embeddings.

    Args:
        embeddings: 2-D array with one embedding per row.

    Returns:
        A `faiss.IndexFlatIP` holding the normalized float32 rows.
    """
    # astype() returns a new array, so the in-place normalization below does
    # not modify the caller's embeddings.
    vectors = embeddings.astype('float32')

    # With unit-length vectors, the inner product equals the cosine similarity. See:
    # https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances
    faiss.normalize_L2(vectors)

    # Flat index: exact (brute-force) search for the inner product. See:
    # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatIP(dimension)
    flat_index.add(vectors)

    return flat_index


def search_faiss_index(
    index,
    query_vectors,
    num_neighbors=10,
):
    """Search the index for the nearest neighbors of the query vectors.

    Args:
        index: faiss index to search.
        query_vectors: 1-D array (a single query) or 2-D array (one query per row).
        num_neighbors: Number of neighbors requested per query.

    Returns:
        Whatever `index.search` returns for the normalized float32 queries.
    """
    # A lone query vector is promoted to a batch of one.
    if query_vectors.ndim == 1:
        query_vectors = query_vectors.reshape(1, -1)

    # astype() copies, so the caller's array is left untouched by the
    # in-place normalization.
    queries = query_vectors.astype('float32')

    # Unit-length queries make the inner product a cosine similarity.
    faiss.normalize_L2(queries)

    return index.search(queries, num_neighbors)


In [None]:
import json

FOLDER_NAME = "/content"
APP_LIST_FNAME = f"{FOLDER_NAME}/apps.json"
FILTERED_INDEX_FNAME = f"{FOLDER_NAME}/filtered_indices.json"

def load_data(fname):
  """Load and return the content of a JSON file.

  Args:
    fname: Path to the JSON file.

  Returns:
    The decoded JSON value.
  """
  # JSON is read as UTF-8 explicitly, rather than with the platform default
  # encoding, so that non-ASCII content decodes the same way everywhere.
  with open(fname, encoding="utf-8") as f:
    return json.load(f)

def load_apps():
  """Load the app list stored at `APP_LIST_FNAME`."""
  apps = load_data(APP_LIST_FNAME)
  return apps

def load_indices():
  """Load the filtered indices stored at `FILTERED_INDEX_FNAME`."""
  filtered_indices = load_data(FILTERED_INDEX_FNAME)
  return filtered_indices

In [None]:
def convert_faiss_output_to_app_id(i, base_apps, base_indices):
  """Map a row number of the faiss index to an entry of `base_apps`.

  The lookup is done in two steps: `base_indices[i]` gives a position in
  `base_apps`, and the entry at that position is returned.
  """
  app_position = base_indices[i]
  return base_apps[app_position]

def find_similar_app_ids(query_app_id, preprocess, model, index, base_apps, base_indices, num_neighbors=10):
  """Return the apps whose images are the most similar to the query app's image.

  Args:
    query_app_id: Steam app id of the query.
    preprocess: Image preprocessing callable, forwarded to `get_features`.
    model: Feature extraction model, forwarded to `get_features`.
    index: faiss index built from the reference embeddings.
    base_apps: Sequence of apps, indexed by the values of `base_indices`.
    base_indices: Sequence mapping a row of the faiss index to a position in `base_apps`.
    num_neighbors: Number of neighbors requested from the index.

  Returns:
    A list with at most `num_neighbors` entries of `base_apps`, in the order
    returned by the index search.
  """
  query_vector = get_features(query_app_id, preprocess, model)

  # The search helper needs a NumPy array; convert if a torch tensor came back.
  if hasattr(query_vector, "detach"):
    query_vector = query_vector.detach().cpu().numpy()

  _scores, indices = search_faiss_index(index, query_vector, num_neighbors)

  # The search output is 2-D, with one row per query. There is a single query
  # here, so the neighbors are the entries of the first row (iterating over
  # `indices` itself would yield whole rows instead of row numbers).
  neighbor_rows = indices[0]

  # faiss pads with -1 when fewer than `num_neighbors` results exist; such
  # labels are skipped because -1 would silently index the last element.
  return [
    convert_faiss_output_to_app_id(int(i), base_apps, base_indices)
    for i in neighbor_rows
    if i >= 0
  ]

### Run

In [None]:
# Same settings as the feature extraction command above
# (--resize_size 224 --keep_ratio --crop_size 224), so that query images are
# preprocessed like the reference images.
preprocess = get_transform(resize_size=224, keep_ratio=True, crop_size=224)

In [None]:
import torch

# Load the selected DINOv2 checkpoint through torch.hub.
# The model is only used for inference, so it is switched to evaluation mode,
# which disables any train-only behavior (e.g. dropout) in its layers.
model = torch.hub.load('facebookresearch/dinov2', model_name).eval()

In [None]:
import numpy as np

# Load the features of the reference images (downloaded in the
# "Extract features" section) and index them for similarity search.
embeddings = np.load(f'/content/feature-extractor/features/fts_{model_name}.npy')
index = build_faiss_index(embeddings)

In [None]:
# Metadata used to map a row of the faiss index back to an app:
# row -> base_indices[row] -> base_apps[...].
base_apps = load_apps()
base_indices = load_indices()

In [None]:
# Example query: Steam app id whose library image is matched against the index.
app_id = 271590
# Number of nearest neighbors requested from the index.
num_neighbors = 10

similar_app_ids = find_similar_app_ids(app_id, preprocess, model, index, base_apps, base_indices, num_neighbors)

## Export Top 100 to Markdown

### Define functions

In [None]:
import steamspypi

def get_top_100(target='top100in2weeks'):
  """Return the keys of the SteamSpy response for the given request.

  Args:
    target: Value sent as the 'request' field to `steamspypi.download`.

  Returns:
    A list of the keys of the downloaded data (presumably app ids as strings
    -- TODO confirm against the SteamSpy API).
  """
  request = {'request': target}
  response = steamspypi.download(request)

  return list(response.keys())

### Run

In [None]:
# TODO