In [None]:
# Install (or upgrade to the latest) dataflows package by shelling out to pip.
# NOTE(review): this runs whatever `python3` is on PATH, which may not be the kernel's interpreter - confirm.
!{'python3 -m pip install -U dataflows'}

## Scrape site

In [None]:
from dataflows import Flow, load, printer, checkpoint, add_field
from requests_html import HTMLSession
from retrying import retry
import datetime
from datapackage import Package
import os
from requests_html import HTML
import urllib.parse
import traceback
from google.cloud import storage
import json
from time import sleep

import sys
# Re-point Python's stdout/stderr at the process-level /dev/stdout and /dev/stderr
# devices, so output goes to the console. These paths exist on POSIX systems only.
print('redirecting stdout and stderr to console')
sys.stdout = open('/dev/stdout', 'w')
sys.stderr = open('/dev/stderr', 'w')

# Upper bound on the number of objects scrape_site will collect; 0 disables the limit
# (see the MAX_OBJECTS checks inside scrape_site).
MAX_OBJECTS = 0

# HTTP session shared by every request made through retry_session_get.
session = HTMLSession()
# Cloud storage bucket that scraped pages (and optionally images) are uploaded to.
# NOTE(review): storage.Client() is created without arguments, so it presumably relies
# on ambient Google Cloud credentials - confirm.
bucket = storage.Client().get_bucket("wmil-1946")
    
# Exponential back-off between attempts (multiplier 4s, each wait capped at 32s);
# give up once 5 minutes have passed since the first attempt.
# All `retry` arguments are expressed in milliseconds.
@retry(wait_exponential_multiplier=4000, wait_exponential_max=32000, stop_max_delay=300000)
def retry_session_get(*args, **kwargs):
    """GET a URL through the shared HTML session, retrying on any exception.

    All positional and keyword arguments are forwarded unchanged to
    ``session.get``. After a successful request the function sleeps 0.2s before
    returning, which throttles consecutive requests.

    Returns:
        The response object returned by ``session.get``.

    Raises:
        Whatever ``session.get`` raised; the exception is logged and re-raised
        so that the ``retry`` decorator can decide whether to try again.
    """
    print('session_get: ' + str(args) + ' ' + str(kwargs))
    try:
        r = session.get(*args, **kwargs)
        sleep(.2)
        return r
    except Exception as e:
        print(f"{e}, retrying up to 5m...")
        raise

def scrape_site(protocol, domain, start_path, extra_scrape_link_callback=None):
    """Crawl a site depth-first and lazily yield one dict per discovered object.

    The crawl starts at ``{protocol}://{domain}{start_path}``. A URL is followed
    when it contains ``//{domain}`` or when ``extra_scrape_link_callback(url)``
    returns a truthy value. Every fetched page is uploaded to the module-level
    ``bucket``; its anchors, absolute links and images are then processed.

    Args:
        protocol: URL scheme of the start page, e.g. ``'http'``.
        domain: host name used both for the start URL and for the in-site check.
        start_path: path of the start page, e.g. ``'/'``.
        extra_scrape_link_callback: optional callable ``url -> bool`` that lets
            additional URLs be treated as in-site.

    Returns:
        A generator of dicts with the keys ``type`` (``'root'``, ``'link'``,
        ``'image'`` or ``'error'``), ``referrer``, ``value``, ``alt`` and
        ``title``. Each ``type:value`` pair is yielded at most once.

    The module-level ``MAX_OBJECTS`` caps the number of collected objects;
    a value below 1 means no cap for following links.
    """
    scraped_object_ids = set()
    stats = {'num_scraped': 0}
    # URLs with these endings are recorded as links but never fetched or parsed.
    skipped_suffixes = ('.jpg', '.jpeg', '.gif', '.png', '.svg', '.txt', '.xml', '.ico')

    def _get_scraped_object(_type, _referrer, _value, alt=None, title=None):
        """Return a new row for (_type, _value), or None if it was seen before."""
        object_id = '{}:{}'.format(_type, _value)
        if object_id in scraped_object_ids:
            return None
        scraped_object_ids.add(object_id)
        return {'type': _type, 'referrer': _referrer, 'value': _value, 'alt': alt or '', 'title': title or ''}

    def _blob_name(url):
        """Map a URL to its blob path by dropping the scheme marker."""
        return 'btm/scraped_objects/{}'.format(url.replace('https://', '').replace('http://', ''))

    def _save_scraped_object(obj, content):
        """Upload a page body as '<name>.content' and its row as '<name>.json'."""
        assert obj['type'] == 'link'
        blob_name = _blob_name(obj['value'])
        bucket.blob('{}.content'.format(blob_name)).upload_from_string(content)
        bucket.blob('{}.json'.format(blob_name)).upload_from_string(json.dumps(obj))

    def _save_image_object(obj):
        """Download an image and upload it; currently not called by the crawl."""
        assert obj['type'] == 'image' and obj['value'] and obj['value'].startswith('http')
        r = retry_session_get(obj['value'])
        if r.status_code == 200:
            bucket.blob(_blob_name(obj['value'])).upload_from_string(r.content)

    def _report_progress():
        """Print the object count whenever it has grown; return the current count."""
        num_scraped = len(scraped_object_ids)
        if num_scraped > stats['num_scraped']:
            stats['num_scraped'] = num_scraped
            print('{} / ?'.format(num_scraped))
        return num_scraped

    def _check_num_scraped():
        """Report progress and tell whether more objects may still be collected."""
        num_scraped = _report_progress()
        return not MAX_OBJECTS or num_scraped < MAX_OBJECTS

    def _scrape(_url, _referrer, **obj_kwargs):
        """Yield the rows for _url and, recursively, for everything it links to."""
        in_site = f'//{domain}' in _url or (extra_scrape_link_callback and extra_scrape_link_callback(_url))
        if not in_site:
            return
        # The count is taken before the root row is registered, as the limit check below expects.
        num_scraped = _report_progress()
        if _referrer == 'root':
            yield _get_scraped_object('root', '', _url)
        if not (MAX_OBJECTS < 1 or num_scraped < MAX_OBJECTS):
            return
        obj = _get_scraped_object('link', _referrer, _url, **obj_kwargs)
        if not obj:
            # Already visited: neither yield nor fetch it again.
            return
        yield obj
        if _url.strip().lower().endswith(skipped_suffixes):
            return
        page_referrer = 'link:{}'.format(_url)
        try:
            r = retry_session_get(_url)
            _save_scraped_object(obj, r.content)
            for a in r.html.find('a'):
                href = a.attrs.get('href')
                if href:
                    yield from _scrape(urllib.parse.unquote(href), page_referrer,
                                       alt=a.attrs.get('alt'),
                                       title=a.attrs.get('title'))
            # absolute_links carries URLs only, so no alt/title is attached here;
            # anchors that were already followed above are de-duplicated by URL.
            for link in r.html.absolute_links:
                yield from _scrape(urllib.parse.unquote(link), page_referrer)
            for img in r.html.find('img'):
                src = img.attrs.get('src')
                if src:
                    img_obj = _get_scraped_object('image', page_referrer, urllib.parse.unquote(src),
                                                  alt=img.attrs.get('alt'),
                                                  title=img.attrs.get('title'))
                    if img_obj:
                        # Image download is disabled; enable with _save_image_object(img_obj).
                        yield img_obj
                if not _check_num_scraped():
                    break
        except Exception as e:
            # Best effort: log the failure and emit an error row instead of aborting the crawl.
            print('{} ({})'.format(_url, _referrer))
            traceback.print_exc()
            yield {'type': 'error', 'referrer': page_referrer, 'value': str(e), 'alt': '', 'title': ''}

    return _scrape(f'{protocol}://{domain}{start_path}', 'root')

# Delete the stored 'scraped-site' checkpoint directory before running the flow below.
# NOTE(review): presumably this forces a fresh scrape instead of reusing cached rows - confirm.
!{'rm -rf .checkpoints/scraped-site'}

# Crawl http://www.bitmuna.com/ (also following any link whose URL contains
# 'bitmuna.com'), pass the rows through the 'scraped-site' checkpoint and print
# a one-row HTML preview.
# NOTE(review): [1] presumably selects the stats element of process()'s result - confirm.
Flow(
    scrape_site('http', 'www.bitmuna.com', '/', lambda link: 'bitmuna.com' in link),
    checkpoint('scraped-site'),
    printer(tablefmt='html', num_rows=1),
).process()[1]

## Text-based filter by years

In [None]:
from dataflows import Flow, checkpoint, printer

def filter_years(rows):
    """Yield only the rows whose 'alt' or 'title' text mentions a year from 1910 to 1946.

    A row matches when the four-digit year appears anywhere inside either field;
    missing or empty fields are treated as empty text.
    """
    wanted_years = ['19' + str(suffix) for suffix in range(10, 47)]
    for row in rows:
        texts = [row.get(name) or '' for name in ('alt', 'title')]
        if any(year in text for year in wanted_years for text in texts):
            yield row

# Start from the 'scraped-site' checkpoint (the same name the scraping flow writes),
# keep only rows that mention a year from 1910 to 1946, print a one-row HTML preview
# and pass the result through the 'scraped-site-filtered-years' checkpoint.
# NOTE(review): with no step before it, the first checkpoint presumably replays cached rows - confirm.
Flow(
    checkpoint('scraped-site'),
    filter_years,
    printer(tablefmt='html', num_rows=1),
    checkpoint('scraped-site-filtered-years')
).process()[1]

## Parse album titles and clean up

In [None]:
from dataflows import delete_fields, add_field, Flow

def bitmuna_albums():
    """Build a sub-flow that turns scraped rows into album image rows.

    The referrer of each row is split on '/'. Rows whose referrer is on
    www.bitmuna.com and whose value ends with '.jpg' are kept and annotated:

    * five segments: segment 3 is the album title;
    * eight segments with 'nggallery' as segment 4: segment 3 is the album
      title, segments 5 and 6 are the part title and subtitle.

    Every other row is dropped. The fields 'type', 'referrer', 'value' and
    'alt' are removed from the output, and 'image' carries the original value.
    """

    def _bitmuna_albums(rows):
        for row in rows:
            segments = row['referrer'].split('/')
            if len(segments) <= 4 or segments[2] != 'www.bitmuna.com':
                continue
            if len(segments) == 5:
                album, part, subpart = segments[3], '', ''
            elif len(segments) == 8 and segments[4] == 'nggallery':
                album, part, subpart = segments[3], segments[5], segments[6]
            else:
                # Any other referrer shape is not an album page.
                continue
            if not album or not row['value'].lower().endswith('.jpg'):
                continue
            album_row = dict(row)
            album_row.update({
                'album_title': album,
                'album_part_title': part,
                'album_part_subtitle': subpart,
                'image': row['value'],
            })
            yield album_row

    new_fields = ('album_title', 'album_part_title', 'album_part_subtitle', 'image')
    schema_steps = [add_field(field_name, 'string') for field_name in new_fields]
    return Flow(
        *schema_steps,
        _bitmuna_albums,
        delete_fields(['type', 'referrer', 'value', 'alt']),
    )
                
# Start from the year-filtered checkpoint, derive album fields for the .jpg rows and
# pass the result through the 'scraped-site-filtered-years-album-images' checkpoint.
Flow(
    checkpoint('scraped-site-filtered-years'),
    bitmuna_albums(),
    checkpoint('scraped-site-filtered-years-album-images'),
).process()[1]

## Render HTML Preview of the images

In [None]:
# Page skeleton shared by the index page and every album page;
# {content} receives the HTML placed inside the right-to-left <body>.
INDEX_TEMPLATE = """
<!DOCTYPE html>
<head>
    <meta charset="utf-8">
</head>
<body dir="rtl">
{content}
</body>
"""

# Markup for one image: the picture itself followed by its title in small print.
IMAGE_TEMPLATE = """
<div style="border:1px solid black;">
    <img src="{image}"/><br/>
    <small>{title}</small>
</div>
"""

import html
import os

def render_images_preview(out_path):
    """Return a rows processor that writes a static HTML preview of the albums.

    The directory ``out_path`` is created immediately if it does not exist.
    The returned processor groups the incoming rows by ``album_title``, writes
    one ``album-<n>.html`` page per album (numbered from 1 in order of first
    appearance) and, once all albums are written, an ``index.html`` linking to
    them with the image count of each album.

    Files are written as UTF-8, matching the charset declared in the template.
    Scraped values are HTML-escaped before being inserted into the markup, and
    a missing (None) value renders as empty text.

    Args:
        out_path: directory that receives the generated HTML files.

    Returns:
        A generator function taking ``rows`` and yielding one
        ``{'album_title': ...}`` dict per album.
    """
    os.makedirs(out_path, exist_ok=True)

    def _escape(value):
        """Return value as HTML-safe text; None becomes an empty string."""
        return '' if value is None else html.escape(str(value))

    def _render(rows):
        # Group rows per album; dict insertion order keeps first-appearance order.
        albums = {}
        for row in rows:
            albums.setdefault(row['album_title'], []).append(row)

        index_entries = []
        for album_num, (album_title, album_rows) in enumerate(albums.items(), start=1):
            safe_title = _escape(album_title)
            page_parts = ['<h1>{}</h1>'.format(safe_title)]
            page_parts.extend(
                IMAGE_TEMPLATE.format(image=_escape(row['image']), title=_escape(row.get('title')))
                for row in album_rows
            )
            index_entries.append(
                '<p><a href="album-{}.html">{} ({})</a></p>'.format(album_num, safe_title, len(album_rows))
            )
            album_file = os.path.join(out_path, 'album-{}.html'.format(album_num))
            with open(album_file, 'w', encoding='utf-8') as f:
                f.write(INDEX_TEMPLATE.format(content=''.join(page_parts)))
            yield {'album_title': album_title}

        # The index is written last, once every album page exists.
        with open(os.path.join(out_path, 'index.html'), 'w', encoding='utf-8') as f:
            f.write(INDEX_TEMPLATE.format(content=''.join(index_entries)))

    return _render

# Start from the album-images checkpoint and write the HTML preview pages
# (index.html plus one page per album) under ../data/btm-site-filtered-years-albums-preview.
Flow(
    checkpoint('scraped-site-filtered-years-album-images'),
    render_images_preview(
        '../data/btm-site-filtered-years-albums-preview',
    ),
).process()[1]