## Common Helpers

In [1]:
import requests
from bs4 import BeautifulSoup
import pathlib
import os

In [27]:
def load_soup(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``'html.parser'`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    page = requests.get(url, timeout=timeout)
    # Fail fast on an error status instead of handing an error page to the
    # parser, where the problem would only show up later as a lookup failure.
    page.raise_for_status()
    return BeautifulSoup(page.content, 'html.parser')

def get_formatted_tilename(url):
    """Return the local ``.gif`` filename for the tile image at ``url``.

    The last path segment of ``url`` is taken, everything from its first
    ``.`` onward is dropped, any ``a`` / ``x`` characters are removed, and
    the remainder is parsed as an integer. The result is that integer
    left-padded with zeros to at least three characters plus ``.gif``.

    Raises:
        ValueError: If the remaining text is not a valid integer.
    """
    basename = url.rsplit('/', 1)[-1]
    stem = basename.partition('.')[0]
    number = int(stem.replace('a', '').replace('x', ''))
    # '0>3d' right-aligns with zero fill so short numbers keep leading zeros.
    return format(number, '0>3d') + '.gif'

def download_tile(url, set_dir, verbose=False, timeout=30):
    """Download one tile image and store it inside ``set_dir``.

    Args:
        url: Absolute address of the image to fetch.
        set_dir: Directory the image is written into; the file name is
            derived from ``url`` by ``get_formatted_tilename``.
        verbose: Forwarded to ``save_image``; when true the saved path is
            printed.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    A response whose status code is not 200 is reported on stdout and
    nothing is written.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print('Bad response code:', response.status_code, 'for', url)
        return

    filename = set_dir + '/' + get_formatted_tilename(url)
    save_image(response.content, filename, verbose)
        
def save_image(content, filename, verbose=False):
    """Write the bytes ``content`` to ``filename``, replacing any old file.

    Args:
        content: Raw bytes to store.
        filename: Destination path, opened in binary write mode.
        verbose: When true, print ``Saved <filename>`` after writing.
    """
    with open(filename, mode='wb') as out:
        out.write(content)
    if verbose:
        print('Saved', filename)
            
def fixRelativeUrl(url):
    """Return ``url`` with every ``..`` replaced by the site's base address.

    HACK: the image paths handled here point one level up from the page,
    so the ``..`` segment is swapped for the fixed parent URL rather than
    being resolved against the page address.
    """
    base = 'http://cr31.co.uk/stagecast'
    # Splitting on '..' and re-joining with the base substitutes every
    # occurrence, left to right.
    return base.join(url.split('..'))

# Yields the absolute address of every <img> found under an element.
def get_image_addresses(element):
    """Lazily yield the fixed-up ``src`` of each ``img`` below ``element``.

    Each ``src`` attribute is passed through ``fixRelativeUrl`` before it is
    yielded. An ``img`` without a ``src`` attribute raises ``KeyError`` when
    the generator reaches it.
    """
    for tag in element.find_all('img'):
        source = tag.attrs['src']
        yield fixRelativeUrl(source)
        
def get_caption_name(element):
    """Return the caption text of ``element``, trimmed and without ``*``.

    The text of the first ``caption`` found under ``element`` is stripped of
    surrounding whitespace and then has every asterisk removed.
    """
    caption = element.find('caption')
    trimmed = caption.text.strip()
    return trimmed.replace('*', '')

In [7]:
# Two-Corner
def get_two_corner_tilesets(soup):
    """Yield, one at a time, each tileset table found in ``soup``.

    The tables come from ``get_two_corner_tileset_tables``; the lookup only
    happens once the generator is first advanced.
    """
    yield from get_two_corner_tileset_tables(soup)
        
def get_two_corner_tileset_tables(soup):
    """Return the tables nested inside the page's fourth ``table``.

    The fourth ``table`` in ``soup`` (index 3) is located, its first nested
    ``table`` is taken, and every ``table`` below that one is returned.

    Raises:
        IndexError: If ``soup`` holds fewer than four tables.
    """
    all_tables = soup.find_all('table')
    container = all_tables[3].find('table')
    return container.find_all('table')

# Block
def get_block_tilesets(soup):
    """Yield every tileset row from the block-tile tables in ``soup``.

    The tables are those nested inside the first nested ``table`` of the
    page's fifth ``table`` (index 4). Each row of such a table is one
    tileset, with the set name in its first cell; the first row of every
    table is skipped.
    """
    container = soup.find_all('table')[4].find('table')
    for block_table in container.find_all('table'):
        # Everything after the first row is a tileset.
        yield from block_table.find_all('tr')[1:]



In [22]:
def run_two_corner_scraper(parent_dir, key, url):
    """Download every tileset listed on the page at ``url``.

    Args:
        parent_dir: Directory under which one sub-directory per tileset is
            created, named after the tileset's caption.
        key: Label used only in the final progress message.
        url: Address of the page listing the tilesets.
    """
    page = load_soup(url)

    for table in get_two_corner_tilesets(page):
        set_name = get_caption_name(table)
        set_dir = '/'.join((parent_dir, set_name))
        target = pathlib.Path(set_dir)
        target.mkdir(parents=True, exist_ok=True)

        # Fetch every image referenced by this tileset's table.
        for address in get_image_addresses(table):
            download_tile(address, set_dir, verbose=False)
        print('Done scraping', set_name)

    print('Done scraping', key, 'tiles...')

def run_block_scraper(parent_dir, url):
    """Download every block tileset listed on the page at ``url``.

    Each row yielded by ``get_block_tilesets`` is one tileset: the first
    ``td`` holds the set name and each following ``td`` contributes its
    first image.

    Args:
        parent_dir: Directory under which one sub-directory per tileset is
            created, named after the tileset.
        url: Address of the page listing the block tilesets.
    """
    soup = load_soup(url)

    for tileset in get_block_tilesets(soup):
        cells = tileset.find_all('td')
        if not cells:
            # A row without <td> cells has neither a name nor tiles; skip it
            # rather than failing on cells[0].
            continue

        set_name = cells[0].text.strip().replace('*', '').replace('<p>', '').replace('</p>', '')
        set_dir = parent_dir + '/' + set_name
        pathlib.Path(set_dir).mkdir(parents=True, exist_ok=True)

        for cell in cells[1:]:
            # Only the first image of a cell is wanted, so take it straight
            # from the generator instead of building a throwaway list.
            img_src = next(get_image_addresses(cell), None)
            if img_src is None:
                # Cell without an image: nothing to download.
                continue
            download_tile(img_src, set_dir, verbose=False)
        print('Done scraping', set_name)
    print('Done scraping block tiles...')

In [23]:
def run_scrapers(targets):
    """Run the matching scraper for every entry of ``targets``.

    Args:
        targets: Mapping of tile-type key to the URL of the page listing
            that type. ``'two-corner'`` and ``'two-edge'`` pages are handled
            by ``run_two_corner_scraper``, ``'block'`` pages by
            ``run_block_scraper``. Any other key is reported on stdout and
            skipped.

    Downloads are stored under ``./wang/<key>``; the ``./wang`` directory is
    created if it does not exist.
    """
    tile_dir = './wang'
    pathlib.Path(tile_dir).mkdir(parents=True, exist_ok=True)

    for key, url in targets.items():
        # Build the per-type directory from tile_dir so the root path is
        # defined in exactly one place.
        parent_dir = tile_dir + '/' + key
        if key in ('two-corner', 'two-edge'):
            run_two_corner_scraper(parent_dir, key, url)
        elif key == 'block':
            run_block_scraper(parent_dir, url)
        else:
            # Report unrecognised keys instead of silently ignoring them.
            print('Skipping unknown target:', key)

    print('Scraping complete!')
    


In [28]:
# Pages to scrape, keyed by tile type. run_scrapers uses each key both to
# choose the scraper and to name the output directory under ./wang.
targets =  { 
    'two-corner':'http://cr31.co.uk/stagecast/wang/tiles_c.html',
    'block':'http://cr31.co.uk/stagecast/wang/block_g.html',
    'two-edge':'http://cr31.co.uk/stagecast/wang/tiles_e.html',
}
# Kick off the scrape; this performs network requests and writes files.
run_scrapers(targets)

Saved ./wang/two-edge/Wang-2edge/004.gif
Saved ./wang/two-edge/Wang-2edge/006.gif
Saved ./wang/two-edge/Wang-2edge/014.gif
Saved ./wang/two-edge/Wang-2edge/012.gif
Saved ./wang/two-edge/Wang-2edge/005.gif
Saved ./wang/two-edge/Wang-2edge/007.gif
Saved ./wang/two-edge/Wang-2edge/015.gif
Saved ./wang/two-edge/Wang-2edge/013.gif
Saved ./wang/two-edge/Wang-2edge/001.gif
Saved ./wang/two-edge/Wang-2edge/003.gif
Saved ./wang/two-edge/Wang-2edge/011.gif
Saved ./wang/two-edge/Wang-2edge/009.gif
Saved ./wang/two-edge/Wang-2edge/000.gif
Saved ./wang/two-edge/Wang-2edge/002.gif
Saved ./wang/two-edge/Wang-2edge/010.gif
Saved ./wang/two-edge/Wang-2edge/008.gif
Done scraping Wang-2edge
Saved ./wang/two-edge/Path/004.gif
Saved ./wang/two-edge/Path/006.gif
Saved ./wang/two-edge/Path/014.gif
Saved ./wang/two-edge/Path/012.gif
Saved ./wang/two-edge/Path/005.gif
Saved ./wang/two-edge/Path/007.gif
Saved ./wang/two-edge/Path/015.gif
Saved ./wang/two-edge/Path/013.gif
Saved ./wang/two-edge/Path/001.gif
Save

Saved ./wang/two-edge/Greek/015.gif
Saved ./wang/two-edge/Greek/013.gif
Saved ./wang/two-edge/Greek/001.gif
Saved ./wang/two-edge/Greek/003.gif
Saved ./wang/two-edge/Greek/011.gif
Saved ./wang/two-edge/Greek/009.gif
Saved ./wang/two-edge/Greek/000.gif
Saved ./wang/two-edge/Greek/002.gif
Saved ./wang/two-edge/Greek/010.gif
Saved ./wang/two-edge/Greek/008.gif
Done scraping Greek
Saved ./wang/two-edge/Road/004.gif
Saved ./wang/two-edge/Road/006.gif
Saved ./wang/two-edge/Road/014.gif
Saved ./wang/two-edge/Road/012.gif
Saved ./wang/two-edge/Road/005.gif
Saved ./wang/two-edge/Road/007.gif
Saved ./wang/two-edge/Road/015.gif
Saved ./wang/two-edge/Road/013.gif
Saved ./wang/two-edge/Road/001.gif
Saved ./wang/two-edge/Road/003.gif
Saved ./wang/two-edge/Road/011.gif
Saved ./wang/two-edge/Road/009.gif
Saved ./wang/two-edge/Road/000.gif
Saved ./wang/two-edge/Road/002.gif
Saved ./wang/two-edge/Road/010.gif
Saved ./wang/two-edge/Road/008.gif
Done scraping Road
Saved ./wang/two-edge/Pipe wide/004.gif
S

Saved ./wang/two-edge/Laser/011.gif
Saved ./wang/two-edge/Laser/009.gif
Saved ./wang/two-edge/Laser/000.gif
Saved ./wang/two-edge/Laser/002.gif
Saved ./wang/two-edge/Laser/010.gif
Saved ./wang/two-edge/Laser/008.gif
Done scraping Laser
Saved ./wang/two-edge/Border/004.gif
Saved ./wang/two-edge/Border/006.gif
Saved ./wang/two-edge/Border/014.gif
Saved ./wang/two-edge/Border/012.gif
Saved ./wang/two-edge/Border/005.gif
Saved ./wang/two-edge/Border/007.gif
Saved ./wang/two-edge/Border/015.gif
Saved ./wang/two-edge/Border/013.gif
Saved ./wang/two-edge/Border/001.gif
Saved ./wang/two-edge/Border/003.gif
Saved ./wang/two-edge/Border/011.gif
Saved ./wang/two-edge/Border/009.gif
Saved ./wang/two-edge/Border/000.gif
Saved ./wang/two-edge/Border/002.gif
Saved ./wang/two-edge/Border/010.gif
Saved ./wang/two-edge/Border/008.gif
Done scraping Border
Saved ./wang/two-edge/Tube/004.gif
Saved ./wang/two-edge/Tube/006.gif
Saved ./wang/two-edge/Tube/014.gif
Saved ./wang/two-edge/Tube/012.gif
Saved ./wan