In [1]:
headers = {
    'User-Agent': 'VetleBot/0.1'
}

import re
PATTERN_PAGE_LINK = re.compile(r'^(?:<ul>)?<li><a href="\/wiki\/[^"]+" title="[^"]+">([^<]+)<\/a><\/li>', re.MULTILINE)
PATTERN_NEXT_PAGE = re.compile(r'<a href="([^"]+)"[^>]*>Next page')

In [None]:
# Get the first "File:" page
import requests
r = requests.get('https://pcgamingwiki.com/w/index.php?title=Special%3APrefixIndex&prefix=&namespace=6', headers=headers)
done_processing = False
all_pages = []

import html
while done_processing is False:
    # Extract pages from prefix page
    pages = re.findall(PATTERN_PAGE_LINK, r.text)

    # Clean HTML entities
    pages = [html.unescape(page) for page in pages]
    
    # Append pages found to list
    all_pages += pages
    done_processing = True

    # Get the next prefix page, if it exists
    m = re.search(PATTERN_NEXT_PAGE, r.text)
    if m is not None:
        # Clean it
        url = html.unescape(m.group(1))
        
        # Add prefix, if needed
        url = 'https://pcgamingwiki.com' + url if url[0] == '/' else url
        
        # Make request
        r = requests.get(url, headers=headers)
        done_processing = False

In [4]:
len(all_pages)

49819

In [None]:
import pickle
pickle.dump(all_pages, open('../data_pcgw/all_files.p', 'wb'))

In [6]:
# Make unique
all_pages = list(set(all_pages))
all_pages.sort() # for predictability

In [7]:
len(all_pages)
all_images = {}

49819

In [119]:
# for every 50, get size
import math
batch_size = 50
batches = math.ceil(len(all_pages) / batch_size)

export_dir = '../data_pcgw'

import os
if not os.path.exists(export_dir):
    os.makedirs(export_dir)

import json
for i in range(batches):
    #if(i <= 324):
    #    continue
    pages = all_pages[i * batch_size : i * batch_size + batch_size]
    pages_comb = '|'.join(('File:' + p for p in pages))
    params = {
        'action': 'query',
        'titles': pages_comb,
        'prop': 'imageinfo',
        'iiprop': 'size|url',
        'format': 'json',
    }
    r = requests.get('https://pcgamingwiki.com/w/api.php', headers=headers, params=params)
    
    # Parse the JSON response
    data = json.loads(r.text)
    for x in data['query']['pages'].values():
        try:
            imi = x['imageinfo'][0]
            all_images[x['title']] = {'size': imi['size'], 'width': imi['width'], 'height': imi['height'], 'url': imi['url']}
        except:
            pass # Bad
    print(f'{i + 1}/{batches}', end='\r')

997/997

In [120]:
len(all_images)

49817

In [121]:
import pickle
pickle.dump(all_images, open('../data_pcgw/imageinfo.p', 'wb'))

In [147]:
bad_images = {}

In [156]:
i = 0
total = len(all_images)
for title, info in all_images.items():
    #if i < 39621:
    #    i += 1
    #    continue
    
    # TODO: handle timeouts
    r = requests.head(info['url'], headers=headers)
    
    actual_size = int(r.headers.get('Content-Length', '-1'))
    expected_size = info['size']
    
    if r.status_code != 200:
        bad_images.setdefault('missing', []).append(title)
    elif actual_size < expected_size:
        bad_images.setdefault('actually_smaller', []).append(title)
    elif actual_size > expected_size:
        bad_images.setdefault('actually_larger', []).append(title)        
    print(f'{i + 1}/{total} (smaller: %s, larger: %s, missing: %s)' % (len(bad_images.get('actually_smaller', [])), len(bad_images.get('actually_larger', [])), len(bad_images.get('missing', []))), end='\r')
    i += 1

49817/49817 (smaller: 15930, larger: 3, missing: 324)

In [157]:
import pickle
pickle.dump(bad_images, open('../data_pcgw/bad_images.p', 'wb'))

In [165]:
with open('../data_pcgw/images_actually_smaller.txt', 'w', encoding='utf-8') as f:
    for item in bad_images['actually_smaller']:
        f.write('%s\n' % item)
with open('../data_pcgw/images_actually_larger.txt', 'w', encoding='utf-8') as f:
    for item in bad_images['actually_larger']:
        f.write('%s\n' % item)
with open('../data_pcgw/images_missing.txt', 'w', encoding='utf-8') as f:
    for item in bad_images['missing']:
        f.write('%s\n' % item)