# Filter out Steam banners

## Utility functions

In [1]:
def get_input_file_name():
  """Return the file name of the tar archive that holds the original banners."""
  archive_name = 'original_vertical_steam_banners.tar'
  return archive_name

def get_output_resized_folder():
  """Return the folder-name prefix for resized banners.

  Callers append '_<resolution>' to this prefix.
  """
  folder_prefix = 'resized_vertical_steam_banners'
  return folder_prefix

def get_gdrive_data_folder():
  """Return the data folder, relative to the home folder of Google Drive."""
  relative_folder = 'data/'
  return relative_folder

def get_local_data_folder():
  """Return the relative path to the local folder of original banners.

  The path ends with a slash.
  """
  relative_folder = 'data/original_vertical_steam_banners/'
  return relative_folder

def get_image_root_name():
  """Return the base name (without extension) of a banner in download URLs."""
  root_name = 'library_600x900'
  return root_name

def get_image_extension():
  """Return the file extension of banner images, including the leading dot."""
  extension = '.jpg'
  return extension

## Import data

In [2]:
%pip install google-colab-transfer

Collecting google-colab-transfer
  Downloading https://files.pythonhosted.org/packages/a0/90/76fc38bcad442018977ed0e4e663473ef56a4d15395b2aa09055e8c49185/Google_Colab_Transfer-0.1.6-py3-none-any.whl
Installing collected packages: google-colab-transfer
Successfully installed google-colab-transfer-0.1.6


In [3]:
import colab_transfer as ct

# Make Google Drive reachable from this runtime (the recorded output shows it
# mounted at /content/drive/).
ct.mount_google_drive()

# Name of the archive to fetch; reused by the extraction cell below.
file_name = get_input_file_name()

# Home folders on Google Drive and on the local machine. Both are reused at the
# end of the notebook to copy the resized archives back to Google Drive.
gd = ct.get_path_to_home_of_google_drive()
lm = ct.get_path_to_home_of_local_machine()

# Copy the archive from the Google Drive data folder to the local home folder.
ct.copy_file(file_name,
             source = gd + get_gdrive_data_folder(),
             destination = lm)

Mounted at /content/drive/
Copying /content/drive/My Drive/data/original_vertical_steam_banners.tar to /content/original_vertical_steam_banners.tar


In [4]:
from pathlib import Path

# Extract the archive only when the local data folder is absent, so that
# re-running this cell does not unpack the archive a second time.
# NOTE(review): the folder path is relative, so this check depends on the
# current working directory -- presumably /content/; confirm.
if not Path(get_local_data_folder()).exists():
  %cd /content/
  # Echo the command first, so that it shows up in the cell output.
  !echo tar -xf {file_name}
  !tar -xf {file_name}

/content
tar -xf original_vertical_steam_banners.tar


## Boilerplate functions

### Disk utils

In [5]:
import glob

def list_file_names(image_dir = None):
  """List the paths of the banner images stored directly in a folder.

  Args:
    image_dir: folder to scan, with or without a trailing slash. Defaults to
      the local data folder when None.

  Returns:
    The list returned by glob.glob() for '<image_dir>/*<image extension>'.
  """
  folder = get_local_data_folder() if image_dir is None else image_dir

  # The glob pattern below expects the folder to end with a slash.
  if not folder.endswith('/'):
    folder = folder + '/'

  pattern = folder + '*' + get_image_extension()

  return glob.glob(pattern)

In [6]:
def save_app_ids_to_file(output_file_name,
                         image_dir = None):
  """Write the appIDs of the images found in a folder to a text file.

  The appIDs are derived from the image file names, sorted by numeric value,
  and written separated by newlines. The number of appIDs is printed.

  Args:
    output_file_name: path to the text file to write.
    image_dir: folder to scan, forwarded to list_file_names().
  """
  image_paths = list_file_names(image_dir)

  # Sort numerically, so that e.g. '90' comes before '100'.
  sorted_ids = sorted((str(get_app_id(path)) for path in image_paths), key=int)

  print('#appIDs = {}'.format(len(sorted_ids)))

  with open(output_file_name, 'w') as out:
    out.write('\n'.join(sorted_ids))

  return

### Parsing utils

In [7]:
from pathlib import Path

def get_app_id(file_name):
  """Return the appID encoded in the name of an image file.

  The appID is the base name of the file, stripped of the trailing image
  extension. Only a trailing extension is removed: an occurrence of the
  extension elsewhere in the name is kept.

  Args:
    file_name: path to an image file, e.g. 'some/folder/346110.jpg'.

  Returns:
    The appID as a string, e.g. '346110'. The base name is returned unchanged
    if it does not end with the image extension.
  """
  base_name = Path(file_name).name
  extension = get_image_extension()

  # Guard against an empty extension: base_name[:-0] would be an empty string.
  if extension and base_name.endswith(extension):
    return base_name[:-len(extension)]

  return base_name

def get_fname(app_id):
  """Return the path to the image of an appID inside the local data folder."""
  parts = [get_local_data_folder(), str(app_id), get_image_extension()]
  return ''.join(parts)

In [8]:
def get_steamdb_url(file_name=None,
                    app_id=None):
  """Return the SteamDB URL of an app.

  Args:
    file_name: path to the image of the app; only used when app_id is None, in
      which case the appID is derived from it with get_app_id().
    app_id: appID of the app; takes precedence over file_name.
  """
  resolved_app_id = get_app_id(file_name) if app_id is None else app_id

  return 'https://steamdb.info/app/{}'.format(resolved_app_id)

def get_url(file_name=None,
            app_id=None):
  """Return the URL of the vertical banner of an app on the Steam CDN.

  Args:
    file_name: path to the image of the app; only used when app_id is None, in
      which case the appID is derived from it with get_app_id().
    app_id: appID of the app; takes precedence over file_name.
  """
  resolved_app_id = get_app_id(file_name) if app_id is None else app_id

  url_template = 'https://cdn.cloudflare.steamstatic.com/steam/apps/{}/{}{}'

  return url_template.format(resolved_app_id,
                             get_image_root_name(),
                             get_image_extension())

### Detection utils

In [181]:
from PIL import Image

def detect_issues_with_images(file_names,
                              check_blank = False,
                              check_image_size = True,
                              check_channels = True,
                              verbose=True):
  """Group the appIDs of images by potential issue: blank, size, bands.

  Nothing is removed here: the function only reports what it finds.

  Caveat: if (check_blank == True), then code is very slow! Plus, in practice
          here, it is useless as it overlaps with (check_channels == True).

  Args:
    file_names: paths to the image files to inspect.
    check_blank: whether to look for uniform images (a single gray level).
    check_image_size: whether to group appIDs by image size.
    check_channels: whether to group appIDs by image bands.
    verbose: whether to print the progress every 1000 images.

  Returns:
    A tuple (app_ids_with_blank_image, d, dd), where:
      - app_ids_with_blank_image is a list of appIDs,
      - d maps bands (e.g. 'RGB') to a list of appIDs,
      - dd maps an image size, as given by PIL, to a list of appIDs.
    A container is left empty when the corresponding check is disabled.
  """
  app_ids_with_blank_image = [] # list of appIDs
  d = dict() # bands -> list of appIDs
  dd = dict() # image size -> list of appIDs

  for i, f in enumerate(file_names, start=1):
    app_id = get_app_id(f)

    # The context manager closes the underlying file as soon as the image has
    # been inspected, instead of leaving one open handle per image behind.
    with Image.open(f) as img:

      ## Detect blank banners

      if check_blank:
        # Reference: https://stackoverflow.com/a/38749333
        extrema = img.convert("L").getextrema()

        # Equal minimum and maximum gray levels: the image is uniform.
        if extrema[0] == extrema[1]:
          app_ids_with_blank_image.append(app_id)

      ## Detect banners with uncommon resolution (anything but 300x450)

      if check_image_size:
        dd.setdefault(img.size, []).append(app_id)

      ## Detect banners with uncommon bands (anything but RGB)

      if check_channels:
        channels_as_str = ''.join(img.getbands())
        d.setdefault(channels_as_str, []).append(app_id)

    if verbose and i % 1000 == 0:
      print('{}/{}'.format(i, len(file_names)))

  return app_ids_with_blank_image, d, dd

### Removal utils

In [10]:
from pathlib import Path
import shutil

def remove_app_ids(file_names,
                   folder_other,
                   app_ids_to_remove):
  """Move the images of the given appIDs to another folder, then log them.

  After the move, the appIDs of all the images found in folder_other are
  written to the text file '<folder_other>.txt'.

  Args:
    file_names: paths to the candidate image files.
    folder_other: destination folder, without a trailing slash. It is created
      if it does not exist.
    app_ids_to_remove: appIDs, as returned by get_app_id(), of the images to
      move.
  """
  # A set makes each membership test O(1); a list would be scanned in full for
  # every file.
  app_ids_to_remove = set(app_ids_to_remove)

  # shutil.move() fails if the destination folder is missing.
  Path(folder_other).mkdir(parents=True, exist_ok=True)

  for f in file_names:
    app_id = get_app_id(f)

    if app_id not in app_ids_to_remove:
      continue

    output_file_name = folder_other + '/' + str(app_id) + get_image_extension()

    # The file may already have been moved by a previous call.
    if Path(f).exists():
      shutil.move(f, output_file_name)

  fname = folder_other + '.txt'

  save_app_ids_to_file(fname,
                      image_dir=folder_other)

  return

## Detect

In [11]:
# Reuse the helper rather than repeating the path to the local data folder.
image_dir = get_local_data_folder()

file_names = list_file_names(image_dir=image_dir)

# Recommendation: use check_blank == False to go fast.
#                 Plus, here, this check overlaps with 'check_channels' anyway.
# Results: a list of appIDs with a blank image (empty when check_blank is
# False), then d (bands -> appIDs) and dd (image size -> appIDs).
app_ids_with_blank_image, d, dd = detect_issues_with_images(file_names,
                                                            check_blank=False)

1000/29982
2000/29982
3000/29982
4000/29982
5000/29982
6000/29982
7000/29982
8000/29982
9000/29982
10000/29982
11000/29982
12000/29982
13000/29982
14000/29982
15000/29982
16000/29982
17000/29982
18000/29982
19000/29982
20000/29982
21000/29982
22000/29982
23000/29982
24000/29982
25000/29982
26000/29982
27000/29982
28000/29982
29000/29982


In [12]:
print(app_ids_with_blank_image)

# Number of appIDs for each band layout, then for each image size.
for groups in (d, dd):
  for k, v in groups.items():
    print(k, len(v))

[]
RGB 29642
L 306
CMYK 34
(300, 450) 29981
(600, 900) 1


## Count

In [14]:
# appIDs detected as blank; the list stays empty when detection was run with
# check_blank=False.
print(app_ids_with_blank_image)

[]


In [15]:
dd_count = dict() # image size -> number of appIDs

# NOTE: this loop runs at notebook scope, so `sz` stays bound (to the last
# image size of `dd`) after the cell has run.
for sz in dd:
  dd_count[sz] = len(dd[sz])

print(dd_count)

{(300, 450): 29981, (600, 900): 1}


In [16]:
# bands -> number of appIDs
d_count = {bands: len(app_ids) for bands, app_ids in d.items()}

print(d_count)

{'RGB': 29642, 'L': 306, 'CMYK': 34}


## Visualize

In [17]:
# Number of banners detected as blank.
len(app_ids_with_blank_image)

0

In [18]:
import cv2
from google.colab.patches import cv2_imshow

# For each banner detected as blank: print its appID, its SteamDB page and its
# download URL, then display the local copy of the image.
for app_id in app_ids_with_blank_image:
  print('Uniform image for appID = {}'.format(app_id))

  print(get_steamdb_url(app_id=app_id))

  url = get_url(app_id=app_id)
  print(url)

  cv2_imshow(cv2.imread(get_fname(app_id)))

In [19]:
# Image sizes, from the most common to the least common.
l_size = sorted(dd_count.keys(), key=lambda x: dd_count[x], reverse=True)

# appIDs of the banners whose size is not the most common one.
# The clause over the sizes must come first: clauses are nested from left to
# right, so `sz` has to be bound before `dd[sz]` is evaluated. With the clauses
# swapped, `dd[sz]` would silently read a stale `sz` left at notebook scope by
# an earlier cell.
app_ids_with_wrong_size = [
                           app_id
                           for sz in l_size[1:]
                           for app_id in dd[sz]
                           ]

len(app_ids_with_wrong_size)

1

In [20]:
import cv2
from google.colab.patches import cv2_imshow

# For each image size, from the most common to the least common: print the
# aspect ratio and the number of apps, then the links of one example app.
for i, sz in enumerate(l_size, start=1):
  ratio = sz[0]/sz[1]
  print('{}) {} (ratio {:.2f}): #apps = {}'.format(i, sz, ratio, dd_count[sz]))
  
  # The example is the last appID listed for this size.
  examplar_index = -1
  examplar_app_id = dd[sz][examplar_index]

  print(get_steamdb_url(app_id=examplar_app_id))

  url = get_url(app_id=examplar_app_id)
  print(url)

  # Path to the local copy of the example; only needed by the display call
  # below, which is disabled.
  fname = get_fname(examplar_app_id)
  # cv2_imshow(cv2.imread(fname))

1) (300, 450) (ratio 0.67): #apps = 29981
https://steamdb.info/app/1217440
https://cdn.cloudflare.steamstatic.com/steam/apps/1217440/library_600x900.jpg
2) (600, 900) (ratio 0.67): #apps = 1
https://steamdb.info/app/346110
https://cdn.cloudflare.steamstatic.com/steam/apps/346110/library_600x900.jpg


In [21]:
# Band layouts, from the most common to the least common.
l_channels = sorted(d_count, key=d_count.get, reverse=True)

# appIDs of the banners whose bands are not the most common ones.
app_ids_with_wrong_channels = []
for bands in l_channels[1:]:
  app_ids_with_wrong_channels.extend(d[bands])

len(app_ids_with_wrong_channels)

340

In [22]:
import PIL
import cv2
from google.colab.patches import cv2_imshow

# For each band layout, from the most common to the least common: print the
# number of apps, then the links of one example app.
for i, bands in enumerate(l_channels, start=1):
  print('{}) {}: #apps = {}'.format(i, bands, d_count[bands]))
  
  # The example is the last appID listed for these bands.
  examplar_index = -1
  examplar_app_id = d[bands][examplar_index]

  print(get_steamdb_url(app_id=examplar_app_id))

  url = get_url(app_id=examplar_app_id)
  print(url)

  # Path to the local copy of the example; only needed by the display call
  # below, which is disabled.
  fname = get_fname(examplar_app_id)
  # cv2_imshow(cv2.imread(fname))

1) RGB: #apps = 29642
https://steamdb.info/app/1217440
https://cdn.cloudflare.steamstatic.com/steam/apps/1217440/library_600x900.jpg
2) L: #apps = 306
https://steamdb.info/app/1409110
https://cdn.cloudflare.steamstatic.com/steam/apps/1409110/library_600x900.jpg
3) CMYK: #apps = 34
https://steamdb.info/app/17470
https://cdn.cloudflare.steamstatic.com/steam/apps/17470/library_600x900.jpg


## Filter out original images

In [None]:
%cd /content/

for folder_other in ['blank_image', 'wrong_size', 'wrong_channels']:
  %mkdir -p {folder_other}

  if folder_other == 'blank_image':
    app_ids_to_remove = app_ids_with_blank_image
  elif folder_other == 'wrong_size':
    app_ids_to_remove = app_ids_with_wrong_size
  else:
    app_ids_to_remove = app_ids_with_wrong_channels

  remove_app_ids(file_names = list_file_names(),
                 folder_other = folder_other,
                 app_ids_to_remove = app_ids_to_remove)                 

In [None]:
# This is why we have 338 images instead of 340 for 'wrong_channels':
# a banner which is both blank and with wrong channels is moved to 'blank_image'
# first, so it cannot be moved to 'wrong_channels' afterwards.
# NOTE(review): in the run recorded in this notebook, detection used
# check_blank=False, so app_ids_with_blank_image is empty and so is this
# intersection. The figure 338 presumably comes from an earlier run with
# check_blank=True -- confirm.
set(app_ids_with_wrong_channels).intersection(app_ids_with_blank_image)

## Resize images to 224x224

In [24]:
# Installation of ImageMagick, which provides the `mogrify` command used below
# to resize the images. Disabled here; uncomment if `mogrify` is unavailable.
# !apt-get update > /dev/null
# !apt-get install imagemagick > /dev/null

In [131]:
# Resolution of the images used as input for resizing. None means that the
# original banners are the input and that the output resolution is 224.
input_resolution = None

# Extension of the image files matched by the resizing command.
file_extension = get_image_extension()

In [135]:
# Default to 224; otherwise halve the input resolution (truncated to an int).
output_resolution = 224 if input_resolution is None else int(input_resolution/2)

print('Output resolution: {}'.format(output_resolution))

# A resolution of 0 (or below) cannot be used to resize images.
assert(output_resolution > 0)

Output resolution: 224


In [136]:
# Start from the original banners when no input resolution is set or when it
# exceeds 256; otherwise start from the banners already resized to the input
# resolution.
starts_from_original_banners = input_resolution is None or input_resolution > 256

if starts_from_original_banners:
  local_data_folder = get_local_data_folder()
else:
  local_data_folder = 'data/' + get_output_resized_folder() + '_{}/'.format(input_resolution)

print('From: {}'.format(local_data_folder))

From: data/original_vertical_steam_banners/


In [137]:
# Name of the folder which receives the resized banners: the common prefix
# followed by the output resolution.
output_resized_folder = get_output_resized_folder() + '_{}'.format(output_resolution)

# Create the folder (and its parents) if needed.
%mkdir -p /content/data/{output_resized_folder}

print('To: {}'.format(output_resized_folder))

To: resized_vertical_steam_banners_224


In [138]:
# Geometry argument given to `-resize` in the mogrify command of a later cell.
# The outputs recorded further down show square images (e.g. 224x224) obtained
# from 300x450 banners, i.e. the aspect ratio is not preserved.
# NOTE(review): the variable name is a typo of "mogrify"; it is kept because
# the mogrify command refers to it.
mogrifty_str = f'{output_resolution}x{output_resolution}!'

print('ImageMagick argument: {}'.format(mogrifty_str))

ImageMagick argument: 224x224!


In [139]:
# Images about to be resized. This rebinds the notebook-level `file_names`.
file_names = list_file_names(local_data_folder)

# Number of input images, to compare with the number of resized images.
len(file_names)

29982

In [141]:
# Resize every image of the input folder with ImageMagick. The `-path` option
# points to the folder of resized banners created above.
!mogrify \
 -resize {mogrifty_str} \
 -path /content/data/{output_resized_folder} \
 /content/{local_data_folder}*{file_extension}

In [142]:
# Images found in the folder of resized banners. This rebinds the
# notebook-level `file_names`.
file_names = list_file_names('data/' + output_resized_folder)

# Number of resized images; expected to match the number of input images.
len(file_names)

29982

In [173]:
# Resolutions to process: 224, plus the powers of two from 1 to 256, in
# increasing order.
resolutions = sorted([224] + [2 ** exponent for exponent in range(9)])

print(len(resolutions))

## Filter out resized images

In [182]:
%cd /content/

for resolution in resolutions:
  print('\nResolution: {}'.format(resolution))

  output_resized_folder = get_output_resized_folder() + '_{}'.format(resolution)
  image_dir = 'data/' + output_resized_folder

  folder_other = 'wrong_channels_after_resizing_to_{}'.format(resolution)
  %mkdir -p {folder_other}

  # Recommendation: use check_blank == False to go fast.
  #                 Plus, here, this check overlaps with 'check_channels' anyway.
  app_ids_with_blank_image, d, dd = detect_issues_with_images(list_file_names(image_dir = image_dir),
                                                              check_blank=False,
                                                              check_image_size=True,
                                                              check_channels=True,
                                                              verbose=False)

  print(app_ids_with_blank_image)
  for k, v in d.items():
    print(k, len(v))
  for k, v in dd.items():
    print(k, len(v))

  most_common = max(d, key=lambda x: len(d[x]))
  print('Most common: {}'.format(most_common))

  app_ids_to_remove = []
  for k in d:
    if k != most_common:
      app_ids_to_remove += d[k]
  print('#apps to remove = {}'.format(len(app_ids_to_remove)))  

  remove_app_ids(file_names = list_file_names(image_dir = image_dir),
                 folder_other = folder_other,
                 app_ids_to_remove = app_ids_to_remove)

/content

Resolution: 1
[]
RGB 29196
L 752
CMYK 34
(1, 1) 29982
Most common: RGB
#apps to remove = 786
#appIDs = 786

Resolution: 2
[]
RGB 29308
L 640
CMYK 34
(2, 2) 29982
Most common: RGB
#apps to remove = 674
#appIDs = 674

Resolution: 4
[]
RGB 29362
L 586
CMYK 34
(4, 4) 29982
Most common: RGB
#apps to remove = 620
#appIDs = 620

Resolution: 8
[]
RGB 29448
L 500
CMYK 34
(8, 8) 29982
Most common: RGB
#apps to remove = 534
#appIDs = 534

Resolution: 16
[]
RGB 29486
L 462
CMYK 34
(16, 16) 29982
Most common: RGB
#apps to remove = 496
#appIDs = 496

Resolution: 32
[]
RGB 29523
L 425
CMYK 34
(32, 32) 29982
Most common: RGB
#apps to remove = 459
#appIDs = 459

Resolution: 64
[]
RGB 29555
L 393
CMYK 34
(64, 64) 29982
Most common: RGB
#apps to remove = 427
#appIDs = 427

Resolution: 128
[]
RGB 29576
L 372
CMYK 34
(128, 128) 29982
Most common: RGB
#apps to remove = 406
#appIDs = 406

Resolution: 224
[]
RGB 29592
L 356
CMYK 34
(224, 224) 29982
Most common: RGB
#apps to remove = 390
#appIDs = 39

In [184]:
# Folder on Google Drive which receives the lists of removed appIDs.
# NOTE(review): this path spells the Drive folder 'MyDrive', whereas the
# recorded output of the copy at the top of the notebook shows 'My Drive' --
# confirm that both refer to the same location.
log_folder = '/content/drive/MyDrive/data/filter_after_resizing/'

%mkdir -p {log_folder}
# Copy the text files written by remove_app_ids(), one per resolution.
%cp wrong_channels_after_resizing_to_*.txt {log_folder}

## Export a `.tar` archive of filtered and resized images

In [179]:
from pathlib import Path

# For each resolution: archive the folder of filtered and resized banners.
for resolution in resolutions:
  output_resized_folder = get_output_resized_folder() + f'_{resolution}'
  output_archive_name = output_resized_folder + '.tar'
  output_data_folder = 'data/' + output_resized_folder

  # Number of images left in the folder for this resolution.
  file_names = list_file_names(output_data_folder)
  print('#files = {}'.format(len(file_names)))

  if Path(output_data_folder).exists():
    # An existing archive is kept as is: it is NOT refreshed, even if the
    # content of the folder changed after the archive was created.
    if not Path(output_archive_name).exists():
      !echo tar cf {output_archive_name} {output_data_folder}
      !tar cf {output_archive_name} {output_data_folder}
    
    # Print the size of the archive.
    !du -sh {output_archive_name}  

#files = 29982
30M	resized_vertical_steam_banners_1.tar
#files = 29982
30M	resized_vertical_steam_banners_2.tar
#files = 29982
30M	resized_vertical_steam_banners_4.tar
#files = 29982
30M	resized_vertical_steam_banners_8.tar
#files = 29982
39M	resized_vertical_steam_banners_16.tar
#files = 29982
54M	resized_vertical_steam_banners_32.tar
#files = 29982
109M	resized_vertical_steam_banners_64.tar
#files = 29982
286M	resized_vertical_steam_banners_128.tar
#files = 29982
677M	resized_vertical_steam_banners_224.tar
#files = 29982
835M	resized_vertical_steam_banners_256.tar


In [None]:
# Copy the archive of each resolution from the local home folder to the data
# folder on Google Drive. `ct`, `lm` and `gd` come from the import cell at the
# top of the notebook.
for resolution in resolutions:
  output_resized_folder = get_output_resized_folder() + f'_{resolution}'
  output_archive_name = output_resized_folder + '.tar'

  ct.copy_file(output_archive_name,
               source = lm,
               destination = gd + get_gdrive_data_folder())

Done.