-
Notifications
You must be signed in to change notification settings - Fork 8
/
SimpleXmlResponseProcessor.php
162 lines (147 loc) · 4.57 KB
/
SimpleXmlResponseProcessor.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
<?php
/**
* Class for processing API responses into SimpleXML objects.
*
* PHP version 7
*
* Copyright (c) Demian Katz 2016.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* @category VuFind
* @package Harvest_Tools
* @author Demian Katz <demian.katz@villanova.edu>
* @license http://opensource.org/licenses/gpl-2.0.php GNU General Public License
* @link https://vufind.org/wiki/indexing:oai-pmh Wiki
*/
namespace VuFindHarvest\ResponseProcessor;
/**
 * Class for processing API responses into SimpleXML objects.
 *
 * Optionally sanitizes the raw response before parsing (replacing anything
 * matched by the configured regular expressions with a space) and can append
 * responses that were changed by sanitization to a log file.
 *
 * @category VuFind
 * @package  Harvest_Tools
 * @author   Demian Katz <demian.katz@villanova.edu>
 * @license  http://opensource.org/licenses/gpl-2.0.php GNU General Public License
 * @link     https://vufind.org/wiki/indexing:oai-pmh Wiki
 */
class SimpleXmlResponseProcessor implements ResponseProcessorInterface
{
    /**
     * Should we sanitize XML?
     *
     * @var bool
     */
    protected $sanitize = false;

    /**
     * Filename for logging bad XML responses (false for none)
     *
     * @var string|bool
     */
    protected $badXmlLog = false;

    /**
     * An array of regex strings used to sanitize XML
     *
     * @var array
     */
    protected $sanitizeRegex = [];

    /**
     * Constructor
     *
     * Recognized $settings keys:
     * - sanitize: enable sanitization in process() (default false).
     * - badXMLLog: log filename; it is concatenated directly onto $basePath,
     *   so $basePath must already end with a directory separator.
     * - sanitizeRegex: pattern or list of patterns to replace with a space;
     *   defaults to one pattern matching runs of characters outside tab, LF,
     *   CR, U+0020-U+D7FF and U+E000-U+FFFD.
     *
     * @param string $basePath Base path to harvest directory.
     * @param array  $settings OAI-PMH settings from oai.ini.
     */
    public function __construct($basePath, $settings = [])
    {
        $this->sanitize = $settings['sanitize'] ?? false;
        $this->badXmlLog = isset($settings['badXMLLog'])
            ? $basePath . $settings['badXMLLog'] : false;
        // Normalize to an array so that a single pattern supplied as a plain
        // string still matches the documented property type.
        $this->sanitizeRegex = (array)(
            $settings['sanitizeRegex']
            ?? ['/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}]+/u']
        );
    }

    /**
     * Log a bad XML response by appending it (followed by a blank line) to the
     * configured log file.
     *
     * @param string $xml Bad XML
     *
     * @return void
     *
     * @throws \Exception If the log file cannot be opened or written.
     */
    protected function logBadXML($xml)
    {
        // The warning is suppressed because failure is reported through the
        // exception below instead.
        $handle = @fopen($this->badXmlLog, 'a');
        if (!$handle) {
            throw new \Exception("Problem opening {$this->badXmlLog}.");
        }
        try {
            if (fwrite($handle, $xml . "\n\n") === false) {
                throw new \Exception("Problem writing to {$this->badXmlLog}.");
            }
        } finally {
            // Always release the handle, even when the write fails.
            fclose($handle);
        }
    }

    /**
     * Sanitize XML.
     *
     * Re-encodes the input as UTF-8, replaces every match of the configured
     * regular expressions with a single space and trims the result. If the
     * result differs from the raw input and a log file is configured, the raw
     * input is logged.
     *
     * @param string $rawXml XML to sanitize
     *
     * @return string
     *
     * @throws \Exception If a regular expression fails or logging fails.
     */
    protected function sanitizeXml($rawXml)
    {
        // Make sure the encoding is correct before applying regular expressions:
        $utf8xml = mb_convert_encoding($rawXml, 'UTF-8', 'UTF-8');

        // preg_replace() returns null on failure (e.g. an invalid configured
        // pattern or an exhausted backtrack limit). Without this check the
        // whole document would silently collapse to an empty string.
        $replaced = preg_replace($this->sanitizeRegex, ' ', $utf8xml);
        if ($replaced === null) {
            $detail = function_exists('preg_last_error_msg')
                ? preg_last_error_msg()
                : 'PCRE error code ' . preg_last_error();
            throw new \Exception('Problem sanitizing XML: ' . $detail);
        }
        $newXml = trim($replaced);

        // The comparison is against the untouched input, so a change caused
        // only by re-encoding or trimming is logged as well.
        if ($rawXml !== $newXml && $this->badXmlLog) {
            $this->logBadXML($rawXml);
        }
        return $newXml;
    }

    /**
     * Collect LibXML errors into a single string.
     *
     * @return string Trimmed messages from libxml_get_errors(), joined by '; '
     * (empty string when there are none).
     */
    protected function collectXmlErrors()
    {
        $messages = [];
        foreach (libxml_get_errors() as $error) {
            $messages[] = trim($error->message);
        }
        return implode('; ', $messages);
    }

    /**
     * Process an OAI-PMH response into a SimpleXML object. Throw an exception if
     * an error is detected.
     *
     * @param string $xml Raw XML to process
     *
     * @return mixed The parsed \SimpleXMLElement
     *
     * @throws \Exception If sanitization fails or the XML cannot be parsed.
     */
    public function process($xml)
    {
        // Sanitize if necessary:
        if ($this->sanitize) {
            $xml = $this->sanitizeXml($xml);
        }

        // Parse the XML (newer versions of LibXML require a special flag for
        // large documents, and responses may be quite large):
        $flags = LIBXML_VERSION >= 20900 ? LIBXML_PARSEHUGE : 0;
        $oldSetting = libxml_use_internal_errors(true);
        try {
            // Start from an empty buffer so the reported errors belong to this
            // document only (the buffer persists when the caller already had
            // internal error handling switched on).
            libxml_clear_errors();
            $result = simplexml_load_string($xml, \SimpleXMLElement::class, $flags);
            $errors = $this->collectXmlErrors();
        } finally {
            // Do not let collected errors pile up across responses, and always
            // hand the caller back its original error-handling mode.
            libxml_clear_errors();
            libxml_use_internal_errors($oldSetting);
        }

        // Compare strictly: a SimpleXMLElement for an empty root element with no
        // attributes evaluates to false in a boolean context even though the
        // parse succeeded.
        if ($result === false) {
            throw new \Exception('Problem loading XML: ' . $errors);
        }

        // If we got this far, we have a valid response:
        return $result;
    }
}