/
HarvesterCommand.php
503 lines (482 loc) · 15.8 KB
/
HarvesterCommand.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
<?php
/**
* OAI-PMH Harvest Tool (Symfony Console Command)
*
* PHP version 7
*
* Copyright (c) Demian Katz 2016.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* @category VuFind
* @package Harvest_Tools
* @author Demian Katz <demian.katz@villanova.edu>
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License
* @link https://vufind.org/wiki/indexing:oai-pmh Wiki
*/
namespace VuFindHarvest\OaiPmh;
use Laminas\Http\Client;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputArgument;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use VuFindHarvest\ConsoleOutput\ConsoleWriter;
use VuFindHarvest\ConsoleOutput\WriterAwareTrait;
use VuFindHarvest\Exception\OaiException;
/**
* OAI-PMH Harvest Tool (Symfony Console Command)
*
* @category VuFind
* @package Harvest_Tools
* @author Demian Katz <demian.katz@villanova.edu>
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License
* @link https://vufind.org/wiki/indexing:oai-pmh Wiki
*/
#[AsCommand(
name: 'harvest/harvest_oai',
description: 'OAI-PMH harvester'
)]
class HarvesterCommand extends Command
{
use WriterAwareTrait;
/**
* The name of the command (the same value is also declared in the
* #[AsCommand] attribute on this class).
*
* @var string
*/
protected static $defaultName = 'harvest/harvest_oai';
/**
* HTTP client handed to each harvester that this command creates.
*
* @var Client
*/
protected $client;
/**
* Root directory for harvesting (the constructor falls back to the current
* working directory when none is supplied).
*
* @var string
*/
protected $harvestRoot;
/**
* Harvester factory used to build one harvester per target.
*
* @var HarvesterFactory
*/
protected $factory;
/**
* Silent mode: when true, execute() does not attach a console output writer
* to this command.
*
* @var bool
*/
protected $silent;
/**
 * Constructor
 *
 * Every dependency is optional; any falsy argument is replaced by a default.
 *
 * @param Client|null           $client      HTTP client (omit for a new
 * default Client)
 * @param string|null           $harvestRoot Root directory for harvesting (omit
 * to use the current working directory)
 * @param HarvesterFactory|null $factory     Harvester factory (omit for a new
 * default HarvesterFactory)
 * @param bool                  $silent      Should we suppress output?
 * @param string|null           $name        The name of the command; passing
 * null means it must be set in configure()
 */
public function __construct(
    $client = null,
    $harvestRoot = null,
    ?HarvesterFactory $factory = null,
    $silent = false,
    $name = null
) {
    // The nullable type on $factory is written explicitly: relying on a
    // "= null" default to make a typed parameter nullable is deprecated
    // as of PHP 8.4.
    $this->client = $client ?: new Client();
    $this->harvestRoot = $harvestRoot ?: getcwd();
    $this->factory = $factory ?: new HarvesterFactory();
    $this->silent = $silent;
    parent::__construct($name);
}
/**
 * Configure the command: set the help text and declare the optional
 * "target" argument plus every supported command-line option.
 *
 * Options declared with VALUE_REQUIRED expect a value on the command line;
 * options declared with VALUE_NONE are simple on/off switches.
 *
 * @return void
 *
 * @SuppressWarnings(PHPMD.ExcessiveMethodLength)
 */
protected function configure()
{
    $this
        ->setHelp('Harvests metadata using the OAI-PMH protocol.')
        ->addArgument(
            'target',
            InputArgument::OPTIONAL,
            'the name of a section of the configuration specified by the ini '
            . "option,\nor a directory to harvest into if no .ini file is used. "
            . "If <target> is\nomitted, all .ini sections will be processed."
        )->addOption(
            'from',
            null,
            InputOption::VALUE_REQUIRED,
            'Harvest start date'
        )->addOption(
            'until',
            null,
            InputOption::VALUE_REQUIRED,
            'Harvest end date'
        )->addOption(
            'ini',
            null,
            InputOption::VALUE_REQUIRED,
            '.ini file to load; if you set other more specific options, they'
            . " will\noverride equivalent settings loaded from the .ini file."
        )->addOption(
            'url',
            null,
            InputOption::VALUE_REQUIRED,
            'Base URL of OAI-PMH server'
        )->addOption(
            'httpUser',
            null,
            InputOption::VALUE_REQUIRED,
            'Username to access url'
        )->addOption(
            'httpPass',
            null,
            InputOption::VALUE_REQUIRED,
            'Password to access url'
        )->addOption(
            'set',
            null,
            InputOption::VALUE_REQUIRED,
            'Set name to harvest'
        )->addOption(
            'metadataPrefix',
            null,
            InputOption::VALUE_REQUIRED,
            'Metadata prefix to harvest'
        )->addOption(
            'timeout',
            null,
            InputOption::VALUE_REQUIRED,
            'HTTP timeout (in seconds)'
        )->addOption(
            'combineRecords',
            null,
            InputOption::VALUE_NONE,
            'Turn off "one record per file" mode'
        )->addOption(
            'combineRecordsTag',
            null,
            InputOption::VALUE_REQUIRED,
            'Specify the XML tag wrapped around multiple records in '
            . "combineRecords\nmode (default = <collection> if this "
            . 'option is omitted)'
        )->addOption(
            'globalSearch',
            null,
            InputOption::VALUE_REQUIRED,
            'Regular expression to replace in raw XML'
        )->addOption(
            'globalReplace',
            null,
            InputOption::VALUE_REQUIRED,
            'String to replace globalSearch regex matches'
        )->addOption(
            'injectDate',
            null,
            InputOption::VALUE_REQUIRED,
            'Inject date from header into specified tag'
        )->addOption(
            'injectId',
            null,
            InputOption::VALUE_REQUIRED,
            'Inject ID from header into specified tag'
        )->addOption(
            'injectSetName',
            null,
            InputOption::VALUE_REQUIRED,
            'Inject setName from header into specified tag'
        )->addOption(
            'injectSetSpec',
            null,
            InputOption::VALUE_REQUIRED,
            'Inject setSpec from header into specified tag'
        )->addOption(
            'idSearch',
            null,
            InputOption::VALUE_REQUIRED,
            'Regular expression to replace in ID'
            . ' (only relevant when injectId is on)'
        )->addOption(
            'idReplace',
            null,
            InputOption::VALUE_REQUIRED,
            'String to replace idSearch regex matches'
        )->addOption(
            'dateGranularity',
            null,
            InputOption::VALUE_REQUIRED,
            '"YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or "auto" (default)'
        )->addOption(
            'harvestedIdLog',
            null,
            InputOption::VALUE_REQUIRED,
            'Filename (relative to harvest directory)'
            . ' to store log of harvested IDs.'
        )->addOption(
            'autosslca',
            null,
            InputOption::VALUE_NONE,
            'Attempt to autodetect SSL certificate file/path'
        )->addOption(
            'sslcapath',
            null,
            InputOption::VALUE_REQUIRED,
            'Path to SSL certificate authority directory'
        )->addOption(
            'sslcafile',
            null,
            InputOption::VALUE_REQUIRED,
            'Path to SSL certificate authority file'
        )->addOption(
            'nosslverifypeer',
            null,
            InputOption::VALUE_NONE,
            'Disable SSL verification'
        )->addOption(
            'sanitize',
            null,
            InputOption::VALUE_NONE,
            'Strip illegal characters from XML'
        )->addOption(
            'sanitizeRegex',
            null,
            InputOption::VALUE_REQUIRED,
            'Optional regular expression defining XML characters to remove'
        )->addOption(
            'badXMLLog',
            null,
            InputOption::VALUE_REQUIRED,
            'Filename (relative to harvest directory) to log'
            . ' XML fixed by sanitize setting'
        )->addOption(
            'stopAfter',
            null,
            // The description speaks of "the first n records", so this option
            // has to accept a value; a VALUE_NONE switch would reject
            // "--stopAfter=100" outright.
            InputOption::VALUE_REQUIRED,
            'an option to stop harvesting after the first n records of each set.'
        );
}
/**
 * Use command-line switches to add/override settings found in the .ini
 * file, if necessary.
 *
 * Value-bearing options are copied into the settings under their own name
 * whenever a value was supplied. On/off switches are translated to a
 * (setting name, value) pair, which allows a switch such as
 * "nosslverifypeer" to turn the "sslverifypeer" setting off.
 *
 * @param InputInterface $input    Input object
 * @param array          $settings Incoming settings
 *
 * @return array The settings with all supplied console options applied
 */
protected function updateSettingsWithConsoleOptions(
    InputInterface $input,
    $settings
) {
    // Options whose console name is identical to the setting key. Every
    // value-bearing option declared in configure() (other than ini, from and
    // until, which are handled elsewhere in this class) belongs here;
    // otherwise it would be accepted on the command line but never applied.
    $directMapSettings = [
        'url', 'set', 'metadataPrefix', 'timeout', 'combineRecordsTag',
        'globalSearch', 'globalReplace',
        'injectDate', 'injectId', 'injectSetName', 'injectSetSpec',
        'idSearch', 'idReplace', 'dateGranularity', 'harvestedIdLog',
        'badXMLLog', 'httpUser', 'httpPass', 'sslcapath', 'sslcafile',
        'sanitizeRegex', 'stopAfter',
    ];
    foreach ($directMapSettings as $setting) {
        $value = $input->getOption($setting);
        // null (or false, for a switch-style option) means "not supplied".
        // Anything else is honoured, including '' and '0', so that e.g.
        // --idReplace="" can be used to strip idSearch matches.
        if ($value !== null && $value !== false) {
            $settings[$setting] = $value;
        }
    }

    // On/off switches: console option => [setting key, value when present].
    $flagSettings = [
        'combineRecords' => ['combineRecords', true],
        'verbose' => ['verbose', true],
        'autosslca' => ['autosslca', true],
        'nosslverifypeer' => ['sslverifypeer', false],
        'sanitize' => ['sanitize', true],
    ];
    foreach ($flagSettings as $in => $details) {
        // hasOption() guards against switches that configure() does not
        // declare itself (such as "verbose").
        if ($input->hasOption($in) && $input->getOption($in)) {
            [$out, $val] = $details;
            $settings[$out] = $val;
        }
    }
    return $settings;
}
/**
 * Run the command: load the settings, harvest every configured target and
 * report a summary.
 *
 * @param InputInterface  $input  Input object
 * @param OutputInterface $output Output object
 *
 * @return int 0 for success, 1 if settings could not be loaded, no usable
 * target was found, or at least one harvest failed
 */
protected function execute(InputInterface $input, OutputInterface $output)
{
    // Only set up output writer if not in silent mode:
    if (!$this->silent) {
        $this->setOutputWriter(new ConsoleWriter($output));
    }
    if (!$allSettings = $this->getSettings($input)) {
        return 1;
    }

    // Loop through all the settings and perform harvests:
    $processed = $skipped = $errors = 0;
    // Remember whether ANY processed target used stopAfter; checking only the
    // settings left over from the final loop iteration would miss earlier
    // targets.
    $stopAfterUsed = false;
    foreach ($allSettings as $target => $baseSettings) {
        $settings = $this->updateSettingsWithConsoleOptions(
            $input,
            $baseSettings
        );
        if (empty($target) || empty($settings)) {
            $skipped++;
            continue;
        }
        $stopAfterUsed = $stopAfterUsed || isset($settings['stopAfter']);
        $this->writeLine("Processing {$target}...");
        try {
            $this->harvestSingleRepository($input, $output, $target, $settings);
        } catch (\Exception $e) {
            // An OAI "noRecordsMatch" response just means there is nothing
            // new to harvest; it is reported but not counted as an error.
            if (
                $e instanceof OaiException
                && strtolower($e->getOaiCode()) === 'norecordsmatch'
            ) {
                $this->writeLine('No new records found.');
            } else {
                $this->writeLine($e->getMessage());
                $errors++;
            }
        }
        $processed++;
    }

    // All done.
    if ($stopAfterUsed) {
        $this->writeLine(
            'stopAfter option set; '
            . 'all sources may not have been fully harvested.'
        );
    }
    if ($processed === 0 && $skipped > 0) {
        $this->writeLine(
            'No valid settings found; '
            . 'please set url and metadataPrefix at minimum.'
        );
        return 1;
    }
    if ($errors > 0) {
        $this->writeLine(
            "Completed with {$errors} error(s) -- "
            . "{$processed} source(s) processed."
        );
        return 1;
    }
    $this->writeLine(
        "Completed without errors -- {$processed} source(s) processed."
    );
    return 0;
}
/**
 * Accessor for the root directory under which harvested files are written.
 *
 * @return string The harvest root stored by the constructor
 */
protected function getHarvestRoot()
{
    $root = $this->harvestRoot;
    return $root;
}
/**
 * Accessor for the HTTP client used when harvesting.
 *
 * @return Client The client stored by the constructor
 */
protected function getHttpClient()
{
    $httpClient = $this->client;
    return $httpClient;
}
/**
 * Read harvest configuration from an .ini file, one entry per section.
 *
 * A message is written and false is returned when the file yields no
 * settings, or when a specific section is requested but not present.
 *
 * @param string      $ini     Configuration file to load
 * @param string|bool $section Section of .ini to load (or false for all)
 *
 * @return array|bool Section name => settings, or false on error
 */
protected function getSettingsFromIni($ini, $section)
{
    // Parse with sections enabled; warnings are suppressed because any
    // failure is reported through the message below.
    $parsed = @parse_ini_file($ini, true);
    if (empty($parsed)) {
        $this->writeLine("Please add OAI-PMH settings to {$ini}.");
        return false;
    }

    // No particular section requested: hand back everything.
    if (!$section) {
        return $parsed;
    }

    // Narrow the result down to the single requested section.
    if (isset($parsed[$section])) {
        return [$section => $parsed[$section]];
    }

    $this->writeLine("$section not found in $ini.");
    return false;
}
/**
 * Work out which targets to harvest and with which base settings.
 *
 * With an --ini option the settings come from that file (restricted to the
 * "target" section if one was given). Without it, the target argument alone
 * defines a single target with empty base settings. If neither is supplied,
 * a usage hint is written and false is returned.
 *
 * @param InputInterface $input Input object
 *
 * @return array|bool Target name => settings, or false on error
 */
protected function getSettings(InputInterface $input)
{
    $iniFile = $input->getOption('ini');
    $target = $input->getArgument('target');

    if ($iniFile) {
        return $this->getSettingsFromIni($iniFile, $target);
    }
    if ($target) {
        return [$target => []];
    }

    $this->writeLine(
        'Please specify an .ini file with the --ini flag'
        . ' or a target directory with the first parameter.'
    );
    return false;
}
/**
 * Build a harvester for one repository and launch it.
 *
 * The "from" and "until" settings are always taken from the console options
 * (overwriting any value already present in $settings), and "silent" is
 * forced to false before the settings are handed to the factory.
 *
 * @param InputInterface  $input    Input object
 * @param OutputInterface $output   Output object
 * @param string          $target   Name of repo (used for target directory)
 * @param array           $settings Settings for the harvester.
 *
 * @return void
 * @throws \Exception
 */
protected function harvestSingleRepository(
    InputInterface $input,
    OutputInterface $output,
    $target,
    $settings
) {
    // Date boundaries come straight from the command line.
    foreach (['from', 'until'] as $bound) {
        $settings[$bound] = $input->getOption($bound);
    }
    // NOTE(review): this is hard-coded and does not consult $this->silent;
    // confirm whether that is intentional.
    $settings['silent'] = false;

    $harvester = $this->factory->getHarvester(
        $target,
        $this->getHarvestRoot(),
        $this->getHttpClient(),
        $settings,
        $output
    );
    $harvester->launch();
}
}