-
Notifications
You must be signed in to change notification settings - Fork 8
/
HarvesterConsoleRunner.php
345 lines (325 loc) · 11.3 KB
/
HarvesterConsoleRunner.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
<?php
/**
* OAI-PMH Harvest Tool (Console Wrapper)
*
* PHP version 7
*
* Copyright (c) Demian Katz 2016.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* @category VuFind
* @package Harvest_Tools
* @author Demian Katz <demian.katz@villanova.edu>
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License
* @link https://vufind.org/wiki/indexing:oai-pmh Wiki
*/
namespace VuFindHarvest\OaiPmh;
use Laminas\Console\Getopt;
use Laminas\Http\Client;
use VuFindHarvest\ConsoleOutput\ConsoleWriter;
use VuFindHarvest\ConsoleOutput\WriterAwareTrait;
/**
* OAI Class
*
* OAI-PMH Harvest Tool (Console Wrapper)
*
* @category VuFind
* @package Harvest_Tools
* @author Demian Katz <demian.katz@villanova.edu>
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License
* @link https://vufind.org/wiki/indexing:oai-pmh Wiki
*/
class HarvesterConsoleRunner
{
    use WriterAwareTrait;

    /**
     * Console options
     *
     * @var Getopt
     */
    protected $opts;

    /**
     * HTTP client
     *
     * @var Client
     */
    protected $client;

    /**
     * Root directory for harvesting
     *
     * @var string
     */
    protected $harvestRoot;

    /**
     * Harvester factory
     *
     * @var HarvesterFactory
     */
    protected $factory;

    /**
     * Constructor
     *
     * Every dependency is optional; a falsy argument is replaced by a default
     * (default CLI options, a new HTTP client, the current working directory
     * and a new harvester factory respectively).
     *
     * @param Getopt           $opts        CLI options (omit for defaults)
     * @param Client           $client      HTTP client (omit for default)
     * @param string           $harvestRoot Root directory for harvesting (omit for
     * default)
     * @param HarvesterFactory $factory     Harvester factory (omit for default)
     * @param bool             $silent      Should we suppress output?
     */
    public function __construct($opts = null, $client = null, $harvestRoot = null,
        HarvesterFactory $factory = null, $silent = false
    ) {
        $this->opts = $opts ?: static::getDefaultOptions();
        $this->client = $client ?: new Client();
        $this->harvestRoot = $harvestRoot ?: getcwd();
        $this->factory = $factory ?: new HarvesterFactory();
        // Output is only produced when a writer has been attached, so silent
        // mode is implemented by simply not attaching one.
        if (!$silent) {
            $this->setOutputWriter(new ConsoleWriter());
        }
    }

    /**
     * Get the default Options object.
     *
     * The array keys are Getopt rules (option name plus an optional parameter
     * suffix); the values are the descriptions shown in the usage message.
     *
     * @return Getopt
     */
    public static function getDefaultOptions()
    {
        return new Getopt(
            [
                'help|h' => 'Display usage message',
                'verbose|v' => 'Display verbose output',
                'from-s' => 'Harvest start date',
                'until-s' => 'Harvest end date',
                'ini-s' => '.ini file to load',
                'url-s' => 'Base URL of OAI-PMH server',
                'httpUser-s' => 'Username to access url (optional)',
                'httpPass-s' => 'Password to access url (optional)',
                'set-s' => 'Set name to harvest',
                'metadataPrefix-s' => 'Metadata prefix to harvest',
                'timeout-i' => 'HTTP timeout (in seconds)',
                'combineRecords' => 'Turn off "one record per file" mode',
                'combineRecordsTag-s' => 'Specify the XML tag wrapped around'
                    . ' multiple records in combineRecords mode'
                    . ' (default = <collection>)',
                'globalSearch-s' => 'Regular expression to replace in raw XML',
                'globalReplace-s' => 'String to replace globalSearch regex matches',
                'injectDate-s' => 'Inject date from header into specified tag',
                'injectId-s' => 'Inject ID from header into specified tag',
                'injectSetName-s' => 'Inject setName from header into specified tag',
                'injectSetSpec-s' => 'Inject setSpec from header into specified tag',
                'idSearch-s' => 'Regular expression to replace in ID'
                    . ' (only relevant when injectId is on)',
                'idReplace-s' => 'String to replace idSearch regex matches',
                'dateGranularity-s' => '"YYYY-MM-DDThh:mm:ssZ," "YYYY-MM-DD" or '
                    . '"auto" (default)',
                'harvestedIdLog-s' => 'Filename (relative to harvest directory)'
                    . ' to store log of harvested IDs.',
                'autosslca' => 'Attempt to autodetect SSL certificate file/path',
                'sslcapath-s' => 'Path to SSL certificate authority directory',
                'sslcafile-s' => 'Path to SSL certificate authority file',
                'nosslverifypeer' => 'Disable SSL verification',
                'sanitize' => 'Strip illegal characters from XML',
                'sanitizeRegex-s' =>
                    'Optional regular expression defining XML characters to remove',
                'badXMLLog-s' => 'Filename (relative to harvest directory) to log'
                    . ' XML fixed by sanitize setting'
            ]
        );
    }

    /**
     * Use command-line switches to add/override settings found in the .ini
     * file, if necessary.
     *
     * Value-carrying options are copied under the same key whenever they were
     * supplied with a non-empty value; this includes values such as 0 or "0",
     * which PHP treats as falsy. Flag options set a fixed value on a (possibly
     * differently named) setting when present.
     *
     * @param array $settings Incoming settings
     *
     * @return array
     */
    protected function updateSettingsWithConsoleOptions($settings)
    {
        $directMapSettings = [
            'url', 'set', 'metadataPrefix', 'timeout', 'combineRecordsTag',
            'injectDate', 'injectId', 'injectSetName', 'injectSetSpec',
            'idSearch', 'idReplace', 'dateGranularity', 'harvestedIdLog',
            'badXMLLog', 'httpUser', 'httpPass', 'sslcapath', 'sslcafile',
            'sanitizeRegex',
        ];
        foreach ($directMapSettings as $setting) {
            $value = $this->opts->getOption($setting);
            // A plain truthiness test would silently discard legitimate values
            // like --timeout=0 or --set=0. Only skip values that mean "nothing
            // was supplied": null/false (option absent) and the empty string
            // (which was ignored before and stays ignored for compatibility).
            if ($value !== null && $value !== false && $value !== '') {
                $settings[$setting] = $value;
            }
        }

        // Map of CLI flag => [setting name, value to assign when flag is set].
        // Note that nosslverifypeer is inverted into sslverifypeer = false.
        $flagSettings = [
            'combineRecords' => ['combineRecords', true],
            'v' => ['verbose', true],
            'autosslca' => ['autosslca', true],
            'nosslverifypeer' => ['sslverifypeer', false],
            'sanitize' => ['sanitize', true],
        ];
        foreach ($flagSettings as $in => $details) {
            if ($this->opts->getOption($in)) {
                list($out, $val) = $details;
                $settings[$out] = $val;
            }
        }
        return $settings;
    }

    /**
     * Render help message.
     *
     * Despite the name, nothing is returned: the usage message generated from
     * the options is amended with an explanation of the [ target ] argument
     * and written to the output.
     *
     * @return void
     */
    public function getHelp()
    {
        $msg = $this->opts->getUsageMessage();
        // Amend the auto-generated help message:
        $options = "[ options ] [ target ]\n"
            . "Where [ target ] is the name of a section of the configuration\n"
            . "specified by the ini option, or a directory to harvest into if\n"
            . "no .ini file is used. If [ target ] is omitted, all .ini sections\n"
            . "will be processed. [ options ] may be selected from those below,\n"
            . "and will override .ini settings where applicable.";
        $this->write(str_replace('[ options ]', $options, $msg));
    }

    /**
     * Run the task and return true on success.
     *
     * Shows the help text when -h is given; otherwise loads the settings and
     * harvests every target in turn. Processing stops at the first target that
     * throws an exception.
     *
     * @return bool
     */
    public function run()
    {
        // Support help message:
        if ($this->opts->getOption('h')) {
            $this->getHelp();
            return true;
        }

        // getSettings() has already reported the problem when it fails.
        if (!$allSettings = $this->getSettings()) {
            return false;
        }

        // Loop through all the settings and perform harvests:
        $processed = $skipped = 0;
        foreach ($allSettings as $target => $baseSettings) {
            $settings = $this->updateSettingsWithConsoleOptions($baseSettings);
            // A target without a name or without any settings (from the .ini
            // file or the command line) cannot be harvested.
            if (empty($target) || empty($settings)) {
                $skipped++;
                continue;
            }
            $this->writeLine("Processing {$target}...");
            try {
                $this->harvestSingleRepository($target, $settings);
            } catch (\Exception $e) {
                // Report the failure and abort; remaining targets are not run.
                $this->writeLine($e->getMessage());
                return false;
            }
            $processed++;
        }

        // All done.
        if ($processed === 0 && $skipped > 0) {
            $this->writeLine(
                'No valid settings found; '
                . 'please set url and metadataPrefix at minimum.'
            );
            return false;
        }
        $this->writeLine(
            "Completed without errors -- {$processed} source(s) processed."
        );
        return true;
    }

    /**
     * Get the target directory for writing harvested files.
     *
     * @return string
     */
    protected function getHarvestRoot()
    {
        return $this->harvestRoot;
    }

    /**
     * Get an HTTP client.
     *
     * @return Client
     */
    protected function getHttpClient()
    {
        return $this->client;
    }

    /**
     * Load configuration from an .ini file (or return false on error)
     *
     * On error a message is written to the output before false is returned.
     * When a section is requested, the result contains only that section.
     *
     * @param string      $ini     Configuration file to load
     * @param string|bool $section Section of .ini to load (or false for all)
     *
     * @return array|bool
     */
    protected function getSettingsFromIni($ini, $section)
    {
        // Warnings are suppressed on purpose: an unreadable or unparsable file
        // yields false, which is reported by the empty() check below together
        // with the "file has no sections" case.
        $oaiSettings = @parse_ini_file($ini, true);
        if (empty($oaiSettings)) {
            $this->writeLine("Please add OAI-PMH settings to {$ini}.");
            return false;
        }
        if ($section) {
            if (!isset($oaiSettings[$section])) {
                $this->writeLine("$section not found in $ini.");
                return false;
            }
            // Keep the section => settings shape the caller iterates over.
            $oaiSettings = [$section => $oaiSettings[$section]];
        }
        return $oaiSettings;
    }

    /**
     * Load the harvest settings. Return false on error.
     *
     * The first non-option argument is the target: an .ini section name when
     * --ini is given, otherwise the directory to harvest into (in which case
     * the target starts with empty settings, to be filled from the command
     * line).
     *
     * @return array|bool
     */
    protected function getSettings()
    {
        $ini = $this->opts->getOption('ini');
        $argv = $this->opts->getRemainingArgs();
        $section = $argv[0] ?? false;
        if (!$ini && !$section) {
            $this->writeLine(
                'Please specify an .ini file with the --ini flag'
                . ' or a target directory with the first parameter.'
            );
            return false;
        }
        return $ini
            ? $this->getSettingsFromIni($ini, $section)
            : [$section => []];
    }

    /**
     * Harvest a single repository.
     *
     * @param string $target   Name of repo (used for target directory)
     * @param array  $settings Settings for the harvester.
     *
     * @return void
     * @throws \Exception
     */
    protected function harvestSingleRepository($target, $settings)
    {
        // The date range always comes from the command line; any from/until
        // values in the incoming settings are overwritten, even when the
        // options were not supplied.
        $settings['from'] = $this->opts->getOption('from');
        $settings['until'] = $this->opts->getOption('until');
        // NOTE(review): this is always false, even when the runner itself was
        // constructed with $silent = true -- confirm that this is intended.
        $settings['silent'] = false;
        $harvest = $this->factory->getHarvester(
            $target,
            $this->getHarvestRoot(),
            $this->getHttpClient(),
            $settings
        );
        $harvest->launch();
    }
}