In [1]:
import re

# Sent with every HTTP request made by this notebook so the bot is identifiable.
headers = {'User-Agent': 'VetleBot/0.1'}

# Matches one category-listing entry at the start of a line (optionally
# preceded by the opening <ul>) and captures the link text, i.e. the page title.
PATTERN_PAGE_LINK = re.compile(
    r'^(?:<ul>)?<li><a href="\/wiki\/[^"]+" title="[^"]+">([^<]+)<\/a><\/li>',
    flags=re.MULTILINE,
)

# Matches the "(next page)" pagination link and captures its (still
# HTML-escaped) href.
PATTERN_NEXT_PAGE = re.compile(
    r'\(<a href="([^"]+)" title="[^"]+">next page<\/a>\)'
)

In [2]:
# Walk every page of Category:Games and collect the titles of all listed pages.
import html
import requests

START_URL = 'https://pcgamingwiki.com/wiki/Category:Games'
# requests has no default timeout; without one a stalled connection would
# hang this cell forever.
REQUEST_TIMEOUT = 60  # seconds

# Get the first category page
r = requests.get(START_URL, headers=headers, timeout=REQUEST_TIMEOUT)
all_pages = []
# URLs already fetched, so a "next page" link that points backwards cannot
# trap the crawl in an endless loop.
seen_urls = {START_URL}
done_processing = False

while not done_processing:
    # Fail loudly on HTTP errors instead of parsing an error page, which
    # would silently end the crawl with a partial list.
    r.raise_for_status()

    # Extract page titles from the category page and decode HTML entities
    all_pages += [html.unescape(page) for page in PATTERN_PAGE_LINK.findall(r.text)]

    # Work out the next category page, if there is one we have not visited
    next_url = None
    m = PATTERN_NEXT_PAGE.search(r.text)
    if m is not None:
        # The href is HTML-escaped in the page source (e.g. "&amp;")
        url = html.unescape(m.group(1))

        # Add host prefix to site-relative links
        url = 'https://pcgamingwiki.com' + url if url.startswith('/') else url

        # A link back to a visited page means every page on that cycle has
        # already been harvested, so stopping loses nothing.
        if url not in seen_urls:
            next_url = url

    if next_url is None:
        done_processing = True
    else:
        seen_urls.add(next_url)
        r = requests.get(next_url, headers=headers, timeout=REQUEST_TIMEOUT)

In [3]:
# Drop duplicate titles; sorting keeps the resulting order (and therefore the
# batch contents) the same from run to run.
all_pages = sorted(set(all_pages))

In [4]:
# Export the collected pages through Special:Export, BATCH_SIZE titles per
# request, writing one XML file per batch into export_dir.
import glob
import math
import os

BATCH_SIZE = 5000
# requests has no default timeout; this bounds how long a single connect or
# socket read may stall. Kept generous because large exports can be slow.
EXPORT_TIMEOUT = 600  # seconds

batches = math.ceil(len(all_pages) / BATCH_SIZE)
url = 'https://pcgamingwiki.com/wiki/Special:Export'
export_dir = '../data_pcgw'

# exist_ok avoids the check-then-create race of a separate exists() test
os.makedirs(export_dir, exist_ok=True)

# Remove old exported files (directories are skipped: os.remove cannot delete
# them and would abort the cell)
for file in glob.glob(os.path.join(export_dir, '*')):
    if os.path.isfile(file) or os.path.islink(file):
        os.remove(file)

for i in range(batches):
    pages = all_pages[i * BATCH_SIZE : (i + 1) * BATCH_SIZE]
    # One title per line, CRLF-separated, as the export form's textarea is submitted
    pages_crnl = '\r\n'.join(pages)

    data = {
        'catname': '',
        'pages': pages_crnl,
        'curonly': '1',
        'wpDownload': '1',
        'wpEditToken': '+\\',
        'title': 'Special:Export',
    }

    filename = 'pcgw_games_%d.xml' % (i + 1)

    # The context manager releases the streamed connection even if writing
    # fails part-way through.
    with requests.post(url, data=data, headers=headers, stream=True,
                       timeout=EXPORT_TIMEOUT) as r:
        # Do not save an HTTP error page to disk as if it were an XML export
        r.raise_for_status()

        with open(os.path.join(export_dir, filename), 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)