# Dataset

This dataset is based on a Wikidata query for video games from 1990 or earlier that have a Mobygames ID (https://w.wiki/AZeT).

## Setup

In [None]:
# folder setup
import os

# All working directories sit directly below the directory the notebook runs in.
working_dir = os.path.abspath(os.getcwd())

path_mobygames = working_dir + '/mobygames/'      # httrack mirrors, one folder per game
path_screenshots = working_dir + '/screenshots/'  # screenshots after de-duplication
path_samples = working_dir + '/samples/'

# Create whichever of the three directories is still missing.
for folder in (path_mobygames, path_screenshots, path_samples):
    if os.path.exists(folder):
        continue
    os.makedirs(folder)

# essentials ...
import sys, math
def progress_bar(count_value, total):
    """Draw a 100-character text progress bar on the current stdout line.

    Args:
        count_value: Number of items processed so far.
        total: Number of items expected overall.

    The output ends with a carriage return instead of a newline, so the next
    call overwrites the previous bar in place.
    """
    bar_width = 100
    if total > 0:
        # The +1 makes the very first item already show one filled segment.
        filled_up_Length = int(math.floor(bar_width * count_value / total)) + 1
    else:
        # Nothing to process: avoid dividing by zero and show a full bar.
        filled_up_Length = bar_width
    # Clamp so the bar is always exactly bar_width characters wide; without
    # this the last step (count_value == total) produced 101 '=' characters.
    filled_up_Length = max(0, min(bar_width, filled_up_Length))
    bar = '=' * filled_up_Length + '-' * (bar_width - filled_up_Length)
    sys.stdout.write('[%s]\r' %(bar))
    sys.stdout.flush()

def done():
    """Print the closing "Done" banner, surrounded by blank lines."""
    banner = '\n\nDone 👍🏻\n'
    print(banner)

## Reformat Mobygames Platforms JSON

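All Mobygames platforms can be accessed via their API at https://api.mobygames.com/v1/platforms?api_key={YOUR_API_KEY}. The cell below reads that response from `platforms.json` and writes an adjusted version to `platforms_reformatted.json`, in which every platform can be looked up by its slugified label.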

In [None]:
!pip install python-slugify

In [None]:
import json, slugify

# Read the platform list as delivered by the API.
with open('platforms.json') as source_file:
    api_response = json.load(source_file)

# 'platforms' keeps the list of all slugs; in addition every slug becomes a
# top-level key that maps to the untouched API entry of that platform.
platforms = {'platforms': []}

for api_entry in api_response['platforms']:
    slug = slugify.slugify(api_entry['platform_name'], replacements=[['/', '']])
    platforms['platforms'].append(slug)
    platforms[slug] = api_entry

# Store the slug-indexed structure for the later cells.
with open('platforms_reformatted.json', 'w') as target_file:
    json.dump(platforms, target_file, indent=4)

## Mobygames

### Scrape

In [None]:
!sudo apt install httrack

In [None]:
import csv, io, subprocess

# Mirror the Mobygames page of every game in query.csv with httrack.
#
# Columns used from query.csv:
#   row[0]  last '/'-separated segment becomes the folder suffix
#           (presumably the Wikidata entity URI - confirm against the query)
#   row[1], row[2]  combined into the folder name as 'row[1] (row[2])'
#   row[5]  comma-separated Mobygames IDs; rows with an empty value are skipped

total = 0
count = 1

# httrack arguments that are identical for every download. They are passed as
# a list (no shell), so no quoting is needed and characters such as '"' or '$'
# in a game title cannot break or alter the command.
# In httrack's filter syntax patterns starting with '-' exclude and patterns
# starting with '+' include matching URLs.
httrack_options = [
    '-n', '-%P', '-N0', '-s2', '-p7', '-D', '-a', '-K0', '-c10', '-%k', '-A25000',
    '-F', 'Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)',
    '-*.webp', '-*.css', '-*.js', '-*.xml', '-*.txt', '-*.pdf',
    '-*wikipedia*', '-*wikimedia*', '-*wikicommons*', '-*creativecommons*',
    '+*.png', '+*.gif', '+*.jpg', '+*.jpeg',
    '-%s', '-%u', '-%e0', '-d',
]

# Read the file once; the same rows serve the total count and the download loop.
# newline='' is what the csv module expects for files it parses itself.
with open('query.csv', newline='') as query_results:

    reader = csv.reader(query_results, delimiter=',', quotechar='"')

    # skip the headers
    next(reader, None)

    # Short rows (e.g. blank lines) are skipped instead of raising IndexError.
    rows = [row for row in reader if len(row) > 5 and row[5] != '']

total = sum(len(row[5].split(',')) for row in rows)

for row in rows:

    progress_bar(count, total)

    # The folder name only depends on the row, not on the individual entry.
    base_folder = str(row[1]+' ('+row[2]+') '+row[0].split('/')[-1])
    base_folder = base_folder.replace(',', '_')
    base_folder = base_folder.replace('/', '_')
    base_folder = path_mobygames+base_folder

    for entry_count, entry in enumerate(row[5].split(','), start=1):

        url = 'https://www.mobygames.com/game/'+entry+'/'

        # Additional Mobygames IDs of the same game get a numbered folder.
        folder = base_folder
        if entry_count > 1:
            folder = folder + ' ' + str(entry_count)

        command = ['httrack', '--continue', '-q', '-%i', '-w', url, '-O', folder] + httrack_options

        # print(command)
        # stdout is discarded via DEVNULL: a PIPE that nobody reads can fill up
        # and block httrack forever. The exit status is ignored on purpose so
        # one failed game does not stop the run; a missing httrack binary now
        # raises FileNotFoundError immediately instead of failing silently.
        subprocess.run(command, stdout=subprocess.DEVNULL)

        count += 1

done()

### Remove Similar Screenshots

In [None]:
# install similar-images-remover
!git clone https://github.com/shashankag14/similar-images-remover.git
!pip install -r similar-images-remover/requirements.txt
!pip install numpy==1.24.2

In [None]:
import json, os, re, shutil
from distutils.dir_util import copy_tree

# Copy the screenshots of every scraped game into path_screenshots and run
# similar-images-remover on the copy.
import subprocess

# setup
threshold = 0.8 # default is 0.85

# The platform list is not needed for the de-duplication itself; it is still
# loaded so that `platforms` is defined after this cell, as before.
with open('platforms_reformatted.json') as json_data:
    platforms = json.load(json_data)

remover_script = os.path.join(os.getcwd(), 'similar-images-remover', 'similar_images_remover.py')

for folder in os.listdir(path_mobygames):

    path_game_screenshots = path_mobygames+folder+'/cdn.mobygames.com/screenshots'

    # Games without a mirrored screenshot directory are skipped.
    if not os.path.isdir(path_game_screenshots):
        continue

    target_dir = path_screenshots+folder

    # Always start from a clean copy.
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)

    # shutil.copytree creates target_dir itself. It replaces
    # distutils.dir_util.copy_tree, which remembers the directories it created
    # and can therefore fail on nested folders when this cell is re-run in the
    # same kernel after the rmtree above; distutils is also gone in Python 3.12.
    shutil.copytree(path_game_screenshots, target_dir)

    # Arguments are passed as a list, so spaces or quotes in the working
    # directory or in the game folder name cannot break the command line.
    subprocess.run([
        'python', remover_script,
        '--folder_path', target_dir,
        '--threshold', str(threshold),
    ])

### Cleanup Similar Remover Process Files

In [None]:
!rm preprocessed_*.png
!rm similar_images_*.txt
!rm -rf removed_images

### Rename and Copy

In [None]:
import json, os, re, shutil

# Flatten the per-game screenshot folders into path_screenshots, renaming every
# file to '<wikidata id>_<year>_<title part>_<platform>_<rest>'.

def _pick_platform(filename, slugs):
    """Return the platform slug contained in a screenshot filename, or ''.

    A slug only counts when it appears as a complete '-slug-' token. If several
    slugs match, the one starting furthest to the right wins. Slugs starting at
    the same position are resolved in favour of the longest one, so a longer
    slug is never cut in half by a shorter slug it starts with.
    """
    best_key, best_slug = None, ''
    for slug in slugs:
        position = filename.find('-' + slug + '-')
        if position == -1:
            continue
        key = (position, len(slug))
        if best_key is None or key >= best_key:
            best_key, best_slug = key, slug
    return best_slug


with open('platforms_reformatted.json') as json_data:
    platforms = json.load(json_data)

for folder in os.listdir(path_screenshots):

    path_game_screenshots = path_screenshots+folder+'/'

    # Renamed screenshots of an earlier run are stored next to the game
    # folders; only directories are game folders.
    if not os.path.isdir(path_game_screenshots):
        continue

    # Folder names end in '(<year>) <wikidata id>[ <n>]'. Both parts are read
    # from the right so that parentheses inside the game title do no harm.
    name_head, separator, name_tail = folder.rpartition(') Q')
    if not separator:
        print('Skipping folder with unexpected name: ' + folder)
        continue

    wikidata_id = 'Q' + name_tail.split(' ')[0]
    year = name_head.rpartition('(')[2].replace('_', '-')

    for path, folders, files in os.walk(path_game_screenshots):
        for filename in files:
            platform = _pick_platform(filename, platforms['platforms'])

            # Files without a recognisable platform are not copied.
            if platform == '':
                continue

            # Drop the leading numeric prefix, then mark the platform token
            # with underscores.
            old_filename = re.sub(r'^\d+-', '', filename)
            new_filename = old_filename.replace('-'+platform+'-', '_'+platform+'_')
            new_filename = wikidata_id + '_' + year + '_' + new_filename

            # NOTE(review): files that differ only in the stripped numeric
            # prefix map to the same target name and overwrite each other -
            # confirm this is acceptable.
            # os.walk may descend into sub-folders, so the source path has to
            # be built from the directory that is currently being walked.
            shutil.copy(os.path.join(path, filename), path_screenshots+new_filename)

    # The per-game folder is no longer needed once its files are flattened.
    shutil.rmtree(path_game_screenshots)