# Dataset

Based on a Wikidata query for video games released in or before 1990 that have a MobyGames ID (https://w.wiki/AZeT).

## Setup

In [None]:
# folder setup
import os

# Absolute working directories. Each path keeps its trailing '/' because the
# other cells build file paths from them by plain string concatenation.
path_youtube = os.path.abspath(os.getcwd()) + '/youtube/'
path_screenshots = os.path.abspath(os.getcwd()) + '/screenshots/'
path_samples = os.path.abspath(os.getcwd()) + '/samples/'

# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the folder and creating it.
for folder in [path_youtube, path_screenshots, path_samples]:
    os.makedirs(folder, exist_ok=True)

# essentials: progress bar and completion message helpers
import sys, math
def progress_bar(count_value, total):
    """Redraw a 100-character text progress bar on the current stdout line.

    The bar is written as ``[====----]`` followed by a carriage return and
    no newline, so successive calls overwrite each other in place.

    Args:
        count_value: Number of items processed so far.
        total: Number of items overall. A total of zero or less is drawn as
            a full bar, since there is nothing left to do.
    """
    width = 100
    if total > 0:
        filled = int(math.floor(width * count_value / total))
        # Clamp so the bar is always exactly `width` characters, even when
        # count_value is negative or runs past total.
        filled = max(0, min(width, filled))
    else:
        filled = width
    bar = '=' * filled + '-' * (width - filled)
    sys.stdout.write('[%s]\r' % (bar))
    sys.stdout.flush()

def done():
    """Print the completion message shown at the end of a long-running cell."""
    banner = '\n\nDone 👍🏻\n'
    print(banner)

## YouTube

### Scrape

In [None]:
!pip install yt-dlp

In [None]:
import csv, yt_dlp

# Base options shared by every download.
ydl_opts = {
    # The Python API spells this option 'cookiesfrombrowser' and expects a
    # tuple whose first element is the browser name. The CLI-style key
    # 'cookies-from-browser' is not an API option, so cookies were never loaded.
    'cookiesfrombrowser': ('firefox',),
}

# Read query.csv once and collect one (wikidata_id, entry) pair per video
# reference. Column 0 holds a '/'-separated item reference whose last segment
# is used as the ID; column 6 holds a comma-separated list of entries that
# are handed to yt-dlp (it may be empty).
downloads = []

with open('query.csv', newline='') as query_results:

    reader = csv.reader(query_results, delimiter=',', quotechar='"')

    # skip the headers
    next(reader, None)

    for row in reader:
        # Ignore short/blank rows and rows without any video reference.
        if len(row) <= 6 or row[6] == '':
            continue

        wikidata_id = row[0].split('/')[-1]

        for entry in row[6].split(','):
            downloads.append((wikidata_id, entry))

total = len(downloads)

for count, (wikidata_id, entry) in enumerate(downloads, start=1):
    progress_bar(count, total)

    # Build a fresh options dict per download instead of mutating the shared
    # one; the output name is '<wikidata id> <video title>'.
    entry_opts = dict(ydl_opts)
    entry_opts['outtmpl'] = path_youtube + wikidata_id + ' ' + '%(title)s'

    with yt_dlp.YoutubeDL(entry_opts) as ydl:
        ydl.download([entry])

done()

### Rename Videos

In [None]:
import csv, os, re, shutil, slugify

# Keyword -> platform label, tested against the lower-cased file name in this
# order. When several keywords occur in one name, the last matching entry wins.
_PLATFORM_KEYWORDS = [
    ('amiga', 'amiga'),
    ('atari', 'atari-st'),
    ('dos', 'dos'),
    # NOTE(review): this keyword used to map to 'dos', which looks like a
    # copy-paste slip; confirm 'pc-engine' is the label wanted downstream.
    ('pc engine', 'pc-engine'),
    ('spectrum', 'zx-spectrum'),
    ('c64', 'commodore-64'),
]


def _detect_platform(filename):
    """Guess the platform label from keywords in a video file name.

    Args:
        filename: File name of a downloaded video.

    Returns:
        The platform label, or '' when no keyword matches.
    """
    name = filename.lower()
    platform = ''
    for keyword, label in _PLATFORM_KEYWORDS:
        if keyword in name:
            platform = label
    # A bare 'commodore' means the C64 and overrides the keywords above,
    # except in names that also mention the Amiga ("Commodore Amiga"), which
    # would otherwise be mislabelled.
    if 'commodore' in name and 'amiga' not in name:
        platform = 'commodore-64'
    return platform


# Index the query results by Wikidata ID once instead of re-reading the CSV
# for every video file. An ID can appear on several rows; all are kept so
# each matching row still produces its own folder.
rows_by_id = {}

with open('query.csv', newline='') as query_results:

    reader = csv.reader(query_results, delimiter=',', quotechar='"')
    for row in reader:
        # Rows need the item reference, title and year columns.
        if len(row) < 3:
            continue
        rows_by_id.setdefault(row[0].split('/')[-1], []).append(row)

# Only the files directly inside path_youtube are downloads; the per-game
# folders created below (or by an earlier run) must not be revisited.
for filename in os.listdir(path_youtube):
    if not os.path.isfile(path_youtube+filename):
        continue

    # Downloads are named '<wikidata id> <video title>'.
    wikidata_id = filename.split(' ')[0]
    platform = _detect_platform(filename)

    for row in rows_by_id.get(wikidata_id, []):
        folder = str(row[1]+' ('+row[2]+') ['+platform+'] '+wikidata_id)
        folder = folder.replace(',', '_')
        folder = folder.replace('/', '_')
        folder = path_youtube+folder

        os.makedirs(folder, exist_ok=True)

        # copy and rename file: wikidataID_year-year_title_system_rest
        # (the original download is left in place)
        new_filename = wikidata_id+'_'+row[2].replace(',', '-')+'_'+slugify.slugify(row[1])+'_'+platform+'_'+filename.replace(wikidata_id+' ', '')
        shutil.copy(path_youtube+filename, folder+'/'+new_filename)

                    

### Generate Stills

In [None]:
!sudo apt install ffmpeg

In [None]:
import os
import subprocess

# Extract one still every 5 seconds from each video in the per-game folders,
# then delete the video copy once its stills have been written.
for folder in os.listdir(path_youtube):
    for path, folders, files in os.walk(path_youtube+folder):
        for filename in files:
            # Stills from a previous run are not videos; leave them alone.
            if filename.lower().endswith('.png'):
                continue

            video = os.path.join(path, filename)
            # splitext drops only a trailing extension, so titles containing
            # dots are not truncated into colliding output names.
            stills = os.path.join(path, os.path.splitext(filename)[0]+'-%04d.png')

            # An argument list (no shell) keeps quotes, '$' and backticks in
            # video titles from breaking or injecting into the command.
            # -nostdin is a global option and must come before the files.
            result = subprocess.run(['ffmpeg', '-nostdin', '-i', video, '-vf', 'fps=1/5', stills])

            if result.returncode == 0:
                os.remove(video)
            else:
                # Keep the video so the extraction can be retried.
                print('ffmpeg failed (exit code '+str(result.returncode)+') for '+video)

### Remove Similar Stills

In [None]:
# install similar-images-remover
!git clone https://github.com/shashankag14/similar-images-remover.git
!pip install -r similar-images-remover/requirements.txt
!pip install numpy==1.24.2

In [None]:
import os
import subprocess

threshold = 0.75 # default is 0.85

# Script from the similar-images-remover checkout in the working directory.
remover_script = os.path.join(os.getcwd(), 'similar-images-remover/similar_images_remover.py')

# Run the remover once per game folder directly below path_youtube.
for folder in os.listdir(path_youtube):
    folder_path = path_youtube+folder
    if not os.path.isdir(folder_path):
        continue

    # An argument list (no shell) keeps spaces and quotes in folder names or
    # in the working directory from splitting or breaking the command.
    result = subprocess.run(['python', remover_script, '--folder_path', folder_path, '--threshold', str(threshold)])

    if result.returncode != 0:
        print('similar-images-remover failed (exit code '+str(result.returncode)+') for '+folder_path)

### Copy Stills

In [None]:
import os, shutil

# Copy every still from the per-game folders into the flat screenshots folder.
# Only directories directly below path_youtube are game folders; the loose
# downloads next to them are skipped.
for folder in os.listdir(path_youtube):
    folder_path = path_youtube+folder
    if not os.path.isdir(folder_path):
        continue

    for file in os.listdir(folder_path):
        source = folder_path+'/'+file
        # shutil.copy cannot copy a directory, so skip anything that is not
        # a regular file.
        if os.path.isfile(source):
            shutil.copy(source, path_screenshots+file)


### Clean Up Similar-Images-Remover Process Files

In [None]:
!rm preprocessed_*.png
!rm similar_images_*.txt
!rm -rf removed_images

In [None]:
import os, shutil

# Delete the per-game folders directly below path_youtube; loose files in
# path_youtube are kept. os.listdir returns a snapshot, so nothing is removed
# out from under an ongoing directory traversal.
for folder in os.listdir(path_youtube):
    folder_path = path_youtube+folder
    if os.path.isdir(folder_path):
        shutil.rmtree(folder_path)