# Zionist archive

Watermarks were removed from the photos, and the resulting files are available on Google Storage under data/zio/remove_watermarks/

Locally, files that were not suitable (e.g. containing text or borders) were deleted.

### Create the details_removed_watermark package

This package is committed under final-data/zio/details_removed_watermark, so you don't need to recreate it.

In [4]:
import os
from glob import glob
from collections import defaultdict
from dataflows import Flow, load, dump_to_path
from datapackage import Package

# Directory the resulting package is dumped to (and checked for below).
DETAILS_REMOVED_WATERMARK_FINAL_DATA_PATH = 'final-data/zio/details_removed_watermark'
# Local directory scanned for watermark-free files named rownum_<N>.<ext>.
# NOTE(review): the notes above mention 'data/zio/remove_watermarks/' (plural)
# on Google Storage - confirm this local path is the intended one.
REMOVED_WATERMARK_FILES_PATH = 'data/zio/remove_watermark/files'
# Prepended to each row's relative details_url.
DETAILS_URL_PREFIX = 'http://www.zionistarchives.org.il/Pages/'
# Input package loaded by the flow.
SCRAPE_DETAILS_DATAPACKAGE_JSON = 'final-data/zio/scrape_details/datapackage.json'


def rownum_from_path(path):
    """Return the integer N encoded in a ``rownum_<N>.<ext>`` file path.

    Only the file name is inspected, so the result does not depend on how
    many underscores the directory part of the path contains.

    Raises:
        ValueError: if the part after ``rownum_`` is not an integer.
        IndexError: if the file name contains no underscore.
    """
    filename = os.path.basename(path)
    return int(filename.split('_')[1].split('.')[0])


if os.path.exists(f'{DETAILS_REMOVED_WATERMARK_FINAL_DATA_PATH}/datapackage.json'):
    print(f'data already exists, delete {DETAILS_REMOVED_WATERMARK_FINAL_DATA_PATH}/datapackage.json to recreate')
else:
    # A set keeps the per-row `rownum in valid_rownums` check O(1); with a
    # list every row triggered a linear scan over all file numbers.
    # Filtering on the basename (not the full path prefix) keeps the match
    # independent of the path separator glob returns.
    valid_rownums = {rownum_from_path(f) for f in glob(f'{REMOVED_WATERMARK_FILES_PATH}/*')
                     if os.path.basename(f).startswith('rownum_')}
    # Counters filled in while rows stream through the flow.
    stats = defaultdict(int)

    def process_rows(rows):
        """Filter and enrich the rows of the 'zio_details' resource.

        Rows of any other resource are passed through untouched. For
        'zio_details', a row is kept only when its 0-based position in the
        resource is in ``valid_rownums``; kept rows get a ``rownum`` value, an
        absolute ``details_url`` and a ``removed_watermark_file`` path.
        Kept and dropped rows are counted in ``stats``.

        NOTE(review): dataflows appears to select the processor kind from the
        parameter name, so ``rows`` should not be renamed - confirm.
        """
        if rows.res.name != 'zio_details':
            # Not the resource we care about: forward every row as-is.
            for record in rows:
                yield record
            return

        for position, record in enumerate(rows):
            if position not in valid_rownums:
                # No watermark-free file for this position: drop the row.
                stats['rows_with_invalid_photo'] += 1
                continue
            record['rownum'] = position
            relative_url = record['details_url']
            record['details_url'] = f'{DETAILS_URL_PREFIX}{relative_url}'
            record['removed_watermark_file'] = f'{REMOVED_WATERMARK_FILES_PATH}/rownum_{position}.png'
            stats['rows_with_valid_photo'] += 1
            yield record


    def update_descriptor(package):
        """Declare the ``removed_watermark_file`` field on 'zio_details'.

        Appends a string field to the schema of every resource named
        'zio_details', yields a ``Package`` built from the modified
        descriptor, then yields everything from the incoming ``package``.

        NOTE(review): process_rows also sets ``rownum`` on each row, but no
        ``rownum`` field is added here - confirm the source schema already
        declares it.
        """
        descriptor = package.pkg.descriptor
        details_resources = [
            resource for resource in descriptor['resources']
            if resource['name'] == 'zio_details'
        ]
        for resource in details_resources:
            fields = resource['schema']['fields']
            fields.append({'name': 'removed_watermark_file', 'type': 'string'})
        yield Package(descriptor)
        yield from package


    # Load the scraped details, filter/enrich the rows, extend the schema and
    # dump the result to the final-data directory.
    pipeline = Flow(
        load(SCRAPE_DETAILS_DATAPACKAGE_JSON),
        process_rows,
        update_descriptor,
        dump_to_path(DETAILS_REMOVED_WATERMARK_FINAL_DATA_PATH),
    )
    outcome = pipeline.process()
    # Print the second element of what process() returned, then our counters.
    print(outcome[1])
    print(dict(stats))

data already exists, delete final-data/zio/details_removed_watermark/datapackage.json to recreate
