/
EmptyEscapeParser.php
227 lines (199 loc) · 6.33 KB
/
EmptyEscapeParser.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
<?php
/**
* League.Csv (https://csv.thephpleague.com).
*
* @author Ignace Nyamagana Butera <nyamsprod@gmail.com>
* @license https://github.com/thephpleague/csv/blob/master/LICENSE (MIT License)
* @version 9.2.0
* @link https://github.com/thephpleague/csv
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
declare(strict_types=1);
namespace League\Csv\Polyfill;
use Generator;
use League\Csv\Stream;
use SplFileObject;
use TypeError;
use function explode;
use function get_class;
use function gettype;
use function in_array;
use function is_object;
use function ltrim;
use function rtrim;
use function sprintf;
use function str_replace;
use function substr;
/**
* A Polyfill to PHP's SplFileObject behavior when reading a CSV document
* with the SplFileObject::READ_CSV and SplFileObject::SKIP_EMPTY flags on
* and the empty string as the escape parameter.
*
* <code>
* $file = new SplFileObject('/path/to/file.csv', 'r');
* $file->setFlags(SplFileObject::READ_CSV | SplFileObject::READ_AHEAD | SplFileObject::SKIP_EMPTY);
* $file->setCsvControl($delimiter, $enclosure, ''); //this does not currently work in any PHP stable release
* </code>
*
* instead you can do this
*
* <code>
* $file = new SplFileObject('/path/to/file.csv', 'r');
* $file->setFlags(SplFileObject::READ_CSV | SplFileObject::READ_AHEAD | SplFileObject::SKIP_EMPTY);
* $file->setCsvControl($delimiter, $enclosure, $escape);
* EmptyEscapeParser::parse($file); //parsing will be done while ignoring the escape character value.
* </code>
*
* @see https://php.net/manual/en/function.fgetcsv.php
* @see https://php.net/manual/en/function.fgets.php
* @see https://tools.ietf.org/html/rfc4180
* @see http://edoceo.com/utilitas/csv-file-format
*
* @internal used internally to parse a CSV document without using the escape character
*/
final class EmptyEscapeParser
{
    /**
     * Values of the line buffer that mark the end of the current record:
     * a failed read, an empty read, or a bare line break.
     *
     * @internal
     */
    const FIELD_BREAKS = [false, '', "\r\n", "\n", "\r"];

    /**
     * The document currently being parsed.
     *
     * @var SplFileObject|Stream
     */
    private static $document;

    /**
     * The field delimiter read from the document CSV controls.
     *
     * @var string
     */
    private static $delimiter;

    /**
     * The field enclosure read from the document CSV controls.
     *
     * @var string
     */
    private static $enclosure;

    /**
     * Characters stripped from the start of a field when looking for an
     * opening enclosure (whitespace minus the delimiter and the enclosure).
     *
     * @var string
     */
    private static $trim_mask;

    /**
     * The unparsed remainder of the current line; false once the current
     * record is complete or no more data could be read.
     *
     * @var string|bool
     */
    private static $line;

    /**
     * Converts the document into a CSV record iterator.
     *
     * Each record array contains strings elements. Lines that hold no field
     * at all are skipped. The document flags are reset to 0 and the document
     * is rewound before parsing starts.
     *
     * The parsing state lives in static properties, so a new call to parse()
     * overwrites the state used by a generator that is still being consumed.
     *
     * @param SplFileObject|Stream $document
     *
     * @throws TypeError if the document is neither a Stream nor an SplFileObject
     *
     * @return Generator|array[]
     */
    public static function parse($document): Generator
    {
        self::$document = self::filterDocument($document);
        list(self::$delimiter, self::$enclosure, ) = self::$document->getCsvControl();
        self::$trim_mask = str_replace([self::$delimiter, self::$enclosure], '', " \t\0\x0B");
        // Lines are read raw with fgets(); the READ_CSV/SKIP_EMPTY handling is done by this class.
        self::$document->setFlags(0);
        self::$document->rewind();
        while (self::$document->valid()) {
            $record = self::extractRecord();
            // [null] is what an empty line (or a failed read) produces: skip it.
            if ([null] !== $record) {
                yield $record;
            }
        }
    }

    /**
     * Filters the submitted document.
     *
     * @param SplFileObject|Stream $document
     *
     * @throws TypeError if the document is neither a Stream nor an SplFileObject
     *
     * @return SplFileObject|Stream
     */
    private static function filterDocument($document)
    {
        if ($document instanceof Stream || $document instanceof SplFileObject) {
            return $document;
        }

        throw new TypeError(sprintf(
            'Expected a %s or an SplFileObject object, %s given',
            Stream::class,
            is_object($document) ? get_class($document) : gettype($document)
        ));
    }

    /**
     * Extracts a record from the CSV document.
     *
     * Reads one line and consumes it field by field until the line buffer
     * is exhausted. Enclosed fields may pull additional lines from the document.
     */
    private static function extractRecord(): array
    {
        $record = [];
        self::$line = self::$document->fgets();
        do {
            // A failed read leaves false in the buffer; ltrim() rejects it under strict typing.
            $buffer = false === self::$line ? '' : ltrim(self::$line, self::$trim_mask);
            $is_field_enclosed = ($buffer[0] ?? '') === self::$enclosure;
            if ($is_field_enclosed) {
                // Whitespace before the opening enclosure is dropped.
                self::$line = $buffer;
            }

            $record[] = $is_field_enclosed
                ? self::extractEnclosedFieldContent()
                : self::extractFieldContent();
        } while (false !== self::$line);

        return $record;
    }

    /**
     * Extracts the content from a field without enclosure.
     *
     * - The content is returned as is: surrounding whitespace is preserved.
     * - Trailing line-breaks are removed from the last field of the line.
     * - An empty or line-break-only buffer ends the record and yields null.
     *
     * @return null|string
     */
    private static function extractFieldContent()
    {
        if (in_array(self::$line, self::FIELD_BREAKS, true)) {
            self::$line = false;

            return null;
        }

        // When no delimiter is left, the "+ [1 => false]" default marks the end of the record.
        list($content, self::$line) = explode(self::$delimiter, self::$line, 2) + [1 => false];
        if (false === self::$line) {
            return rtrim($content, "\r\n");
        }

        return $content;
    }

    /**
     * Extracts the content from a field with enclosure.
     *
     * - Field content can spread on multiple document lines.
     * - Double enclosure sequence must be replaced by single enclosure character.
     * - Trailing line break must be removed if they are not part of the field content.
     * - Invalid field content are treated as per fgetcsv behavior: an enclosure
     *   that is never closed extends the field up to the end of the document.
     */
    private static function extractEnclosedFieldContent(): string
    {
        if ((self::$line[0] ?? '') === self::$enclosure) {
            self::$line = substr(self::$line, 1);
        }

        $content = '';
        while (false !== self::$line) {
            list($buffer, $remainder) = explode(self::$enclosure, self::$line, 2) + [1 => false];
            $content .= $buffer;
            if (false !== $remainder) {
                self::$line = $remainder;
                break;
            }

            // No closing enclosure on this line: continue with the next one, but never
            // read past the end of the document (SplFileObject::fgets() throws there).
            self::$line = self::$document->valid() ? self::$document->fgets() : false;
        }

        if (in_array(self::$line, self::FIELD_BREAKS, true)) {
            self::$line = false;

            // NOTE(review): this also strips line breaks located right before the
            // closing enclosure - confirm this matches the expected fgetcsv output.
            return rtrim($content, "\r\n");
        }

        $char = self::$line[0] ?? '';
        if (self::$delimiter === $char) {
            self::$line = substr(self::$line, 1);

            return $content;
        }

        if (self::$enclosure === $char) {
            // Doubled enclosure: keep one enclosure character and resume the enclosed field.
            return $content.self::$enclosure.self::extractEnclosedFieldContent();
        }

        // Stray characters after the closing enclosure are appended as unenclosed content.
        return $content.self::extractFieldContent();
    }
}