From 056c3d074f7709999ee0c58fb1b1ad2c4712b017 Mon Sep 17 00:00:00 2001 From: macbre Date: Mon, 16 Jan 2023 17:06:14 +0000 Subject: [PATCH 1/2] harvest.py: allow to offset the list of resources --- harvest.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/harvest.py b/harvest.py index d0fad02..81c74ca 100644 --- a/harvest.py +++ b/harvest.py @@ -22,6 +22,8 @@ OAI_SET_NAME = 'MDL:CD:Warwilustrpras' +START_FROM_ITEM = 5005 + @dataclass class RecordMeta: """ @@ -243,6 +245,10 @@ def main(): logger.info('pywikibot: %r', commons) for idx, record in enumerate(get_set(harvester, OAI_SET_NAME)): + if idx < START_FROM_ITEM: + logger.info('Skipping record #%d due to START_FROM_ITEM', idx) + continue + logger.info('---') logger.info('Record #%d found: %r', idx + 1, record) # logger.info('Metadata: %r', record.metadata) From c55472f1687a587199d0d626bf99b4cef008e0cd Mon Sep 17 00:00:00 2001 From: macbre Date: Mon, 16 Jan 2023 17:24:21 +0000 Subject: [PATCH 2/2] Upload an external resource via a temporary file --- harvest.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/harvest.py b/harvest.py index 81c74ca..458e406 100644 --- a/harvest.py +++ b/harvest.py @@ -2,6 +2,7 @@ List MBC sets """ import logging +import tempfile from dataclasses import dataclass import pywikibot @@ -224,12 +225,28 @@ def upload_to_commons(site: pywikibot.Site, record: RecordMeta) -> bool: logger.info('%r exists, skipping an upload', file_page) return False - return file_page.upload( - source=record.content_url, - text=file_description, - comment=UPLOAD_COMMENT, - report_success=True, - ) + # now fetch the resource to a local temporary file + with tempfile.NamedTemporaryFile(prefix='mbc-harvest-') as temp_upload: + logger.info('Fetching <%s> into %s temporary file', record.content_url, temp_upload.name) + + response = requests.get(record.content_url) + + response_size = int(response.headers['content-length'] or 0) / 1024 / 1024 + logger.info('HTTP %d (%.2f MB)', response.status_code, response_size) + + # write the response to a temporary file + temp_upload.write(response.content) + + # and upload from the file + ret = file_page.upload( + source=temp_upload.name, + text=file_description, + comment=UPLOAD_COMMENT, + report_success=True, + ignore_warnings=False, + ) + + return ret def main():