# Source

Based on a Wikidata query for video games from 1990 or earlier that have a MobyGames ID (https://w.wiki/AZeT).

## Setup

In [None]:
# folder setup
import os

# Working folders, both located directly below the current working directory.
# path_mobygames holds the raw scraped pages, path_screenshots the renamed screenshots.
path_mobygames = os.path.abspath(os.getcwd()) + '/mobygames/'
path_screenshots = os.path.abspath(os.getcwd()) + '/screenshots/'

# Only folders that are actually defined in this cell are created. The original
# list also named path_reduced and path_samples, which are never assigned and
# made this cell fail with a NameError on a fresh kernel.
for folder in [path_mobygames, path_screenshots]:
    # exist_ok makes the cell safe to re-run
    os.makedirs(folder, exist_ok=True)

## Reformat Mobygames Platforms JSON

All of MobyGames' platforms can be accessed via their API at https://api.mobygames.com/v1/platforms?api_key={YOUR_API_KEY}. I adjusted the JSON to make it searchable by slugified platform label.

In [None]:
!pip install python-slugify

In [None]:
import json, slugify

# Result layout: the 'platforms' key lists every slug, and each slug is
# additionally a top-level key pointing at the original platform record.
platforms = {'platforms': []}

# read the platform list as delivered by the API
with open('platforms.json') as source_file:
    api_records = json.load(source_file)['platforms']

for record in api_records:
    # slashes are dropped before slugifying the platform name
    slug = slugify.slugify(record['platform_name'], replacements=[['/', '']])
    platforms['platforms'].append(slug)
    platforms[slug] = record

# write the slug-indexed version next to the original file
with open('platforms_reformatted.json', 'w') as target_file:
    json.dump(platforms, target_file, indent=4)

## Mobygames

### Scrape

In [None]:
!sudo apt install httrack

In [None]:
import csv, subprocess

# Scrape the Mobygames page of every game in the Wikidata query result.
#
# Columns read from query.csv: row[0] is an URL whose last path segment is used
# as id, row[1] and row[2] are used as '<row[1]> (<row[2]>)' in the folder name,
# and row[5] is a comma-separated list of Mobygames IDs.
# One folder below path_mobygames is written per Mobygames ID.

# newline='' is the documented way to open files for the csv module
with open('query.csv', newline='') as query_results:

    reader = csv.reader(query_results, delimiter=',', quotechar='"')

    # skip the headers
    next(reader, None)

    # keep only rows that carry at least one Mobygames ID
    rows = [row for row in reader if len(row) > 5 and row[5] != '']

total = len(rows)

for count, row in enumerate(rows, start=1):

    # simple in-place progress output (the cell previously relied on an
    # undefined progress_bar helper and undefined count/total variables)
    print('\rscraping game {} of {}'.format(count, total), end='', flush=True)

    # folder name: '<row[1]> (<row[2]>) <id>', with ',' and '/' made filesystem-safe
    folder_base = str(row[1]+' ('+row[2]+') '+row[0].split('/')[-1])
    folder_base = folder_base.replace(',', '_')
    folder_base = folder_base.replace('/', '_')
    folder_base = path_mobygames+folder_base

    for entry_count, entry in enumerate(row[5].split(','), start=1):

        url = 'https://www.mobygames.com/game/'+entry.strip()+'/'

        # additional Mobygames IDs of the same game get a numbered folder
        folder = folder_base
        if entry_count > 1:
            folder = folder + ' ' + str(entry_count)

        # Same options as the original command line, but passed as an argument
        # list so that no shell is involved: quotes, '$' or backticks in a game
        # title can no longer break (or inject into) the command.
        command = [
            'httrack', '--continue', '-q', '-%i', '-w', url, '-O', folder,
            '-n', '-%P', '-N0', '-s2', '-p7', '-D', '-a', '-K0', '-c10', '-%k', '-A25000',
            '-F', 'Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)',
            '-*.webp', '-*.css', '-*.js', '-*.xml', '-*.txt', '-*.pdf',
            '-*wikipedia*', '-*wikimedia*', '-*wikicommons*', '-*creativecommons*',
            '+*.png', '+*.gif', '+*.jpg', '+*.jpeg',
            '-%s', '-%u', '-%e0', '-d',
        ]

        # print(command)
        # stdout is discarded instead of piped: an unread pipe combined with
        # wait() can block forever once the pipe buffer is full
        subprocess.run(command, stdout=subprocess.DEVNULL)

# finish the progress line
print()

### Rename and Copy

In [None]:
# install similar-images-remover
!git clone https://github.com/shashankag14/similar-images-remover.git
!pip install -r similar-images-remover/requirements.txt
!pip install numpy==1.24.2

In [None]:
import json, os, re, shutil, subprocess

remove_similar = False
threshold = 0.75 # default is 0.85

# Scraped folders are named '<title> (<year>) <wikidata id>[ <n>]'. The pattern
# is anchored at the end of the name so that parentheses inside the title are
# not mistaken for the year.
_folder_pattern = re.compile(r'\((?P<year>[^()]*)\) (?P<wikidata_id>Q\S*)(?: .*)?$')


def _detect_platform(filename, platform_slugs):
    """Return the platform slug contained in a screenshot filename.

    A slug only counts when it appears as '-<slug>-'. If several slugs match,
    the one starting latest in the filename wins; slugs starting at the same
    position (e.g. a slug that is a prefix of a longer one) are resolved in
    favour of the longer, more specific slug. Returns '' if nothing matches.
    """
    best_slug, best_position = '', -1
    for slug in platform_slugs:
        position = filename.find('-' + slug + '-')
        if position == -1:
            continue
        if (position, len(slug)) > (best_position, len(best_slug)):
            best_slug, best_position = slug, position
    return best_slug


# open mobygames platforms json first
with open('platforms_reformatted.json') as json_data:
    platforms = json.load(json_data)

# Go through the scraped entries. Only the direct children of path_mobygames
# are entries, so the scraped trees below them are not walked.
for _folder in sorted(os.listdir(path_mobygames)):

    path_game_screenshots = os.path.join(path_mobygames, _folder, 'cdn.mobygames.com', 'screenshots')

    if not os.path.isdir(path_game_screenshots):
        continue

    # extract wikidata id and years from the main folder
    _match = _folder_pattern.search(_folder)
    if _match is None:
        print('skipping folder with unexpected name: ' + _folder)
        continue

    wikidata_id = _match.group('wikidata_id')
    year = _match.group('year').replace('_', '-')

    # copy screenshots folder to new (temporary) location
    path_target = path_screenshots+_folder+'/'

    if os.path.exists(path_target):
        shutil.rmtree(path_target)

    shutil.copytree(path_game_screenshots, path_target)

    try:
        # remove similar images, if needed
        if remove_similar:
            script = os.path.join(os.getcwd(), 'similar-images-remover/similar_images_remover.py')
            # argument list instead of a shell string: folder names containing
            # quotes or other shell metacharacters are passed through unchanged
            subprocess.run(['python', script, '--folder_path', path_target, '--threshold', str(threshold)])

        # rename the screenshots' filenames to persist information on that level
        for path, folders, files in os.walk(path_target):
            for filename in files:

                platform = _detect_platform(filename, platforms['platforms'])

                # only copy renamed screenshot to final destination if we have a platform
                if platform == '':
                    continue

                # drop the leading numeric prefix, then mark the platform with underscores
                old_filename = re.sub(r'^\d+-', '', filename)
                new_filename = old_filename.replace('-'+platform+'-', '_'+platform+'_')
                new_filename = wikidata_id + '_' + year + '_' + new_filename

                # use the walked directory so files in subfolders are found as well
                shutil.copy(os.path.join(path, filename), path_screenshots+new_filename)
    finally:
        # remove temporary copied folder, also when a step above failed
        shutil.rmtree(path_target, ignore_errors=True)

### Cleanup Similar Remover Process Files

In [None]:
!rm preprocessed_*.png
!rm similar_images_*.txt
!rm -rf removed_images