/
ArticleInfoApi.php
480 lines (415 loc) · 15.1 KB
/
ArticleInfoApi.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
<?php
declare(strict_types = 1);
namespace App\Model;
use App\Repository\ArticleInfoRepository;
use DateTime;
use Doctrine\DBAL\Driver\ResultStatement;
use Symfony\Component\DependencyInjection\ContainerInterface;
use Symfony\Component\DomCrawler\Crawler;
use Symfony\Component\HttpKernel\Exception\HttpException;
use Symfony\Component\HttpKernel\Exception\ServiceUnavailableHttpException;
/**
* An ArticleInfoApi is standalone logic for the Article Info tool. These methods perform SQL queries
* or make API requests and can be called directly, without any knowledge of the child ArticleInfo class.
* It does require that the ArticleInfoRepository be set, however.
* @see ArticleInfo
*/
class ArticleInfoApi extends Model
{
/** @var ContainerInterface The application's DI container; used to read parameters and to wire up repositories. */
protected $container;
/** @var int Number of revisions that belong to the page. Unset until getNumRevisions() is first called. */
protected $numRevisions;
/** @var int Maximum number of revisions to process, as configured. Unset until getMaxRevisions() is first called. */
protected $maxRevisions;
/**
 * @var mixed[] Prose stats, with keys 'characters', 'words', 'references', 'unique_references', 'sections'.
 * Unset until getProseStats() is first called.
 */
protected $proseStats;
/** @var array Number of categories, templates and files on the page (keys 'categories', 'templates', 'files'). */
protected $transclusionData;
/** @var mixed[] Various statistics about bots that edited the page, keyed by username (see getBots()). */
protected $bots;
/** @var int Number of edits made to the page by bots. Unset until getBotRevisionCount() is first called. */
protected $botRevisionCount;
/**
 * @var int[] Number of in and outgoing links and redirects to the page, with keys
 * 'links_ext_count', 'links_in_count', 'links_out_count' and 'redirects_count'.
 */
protected $linksAndRedirects;
/** @var string[] Assessments of the page (see Page::getAssessments). */
protected $assessments;
/** @var string[] List of Wikidata and Checkwiki errors (see Page::getErrors). */
protected $bugs;
/**
 * Create an ArticleInfoApi for a page, optionally restricted to a date range.
 * @param Page $page The page to process.
 * @param ContainerInterface $container The DI container.
 * @param false|int $start Start date as Unix timestamp (false by default).
 * @param false|int $end End date as Unix timestamp (false by default).
 */
public function __construct(Page $page, ContainerInterface $container, $start = false, $end = false)
{
    // Date range first, then the collaborators; the assignments are independent of one another.
    $this->start = $start;
    $this->end = $end;

    $this->page = $page;
    $this->container = $container;
}
/**
 * Number of revisions of the page within the configured start/end range.
 * The value is fetched from the Page once and then reused.
 * @return int
 */
public function getNumRevisions(): int
{
    if (isset($this->numRevisions)) {
        return $this->numRevisions;
    }

    $revisionCount = $this->page->getNumRevisions(null, $this->start, $this->end);
    $this->numRevisions = $revisionCount;

    return $this->numRevisions;
}
/**
 * Whether the page has more revisions than the configured processing limit.
 * A limit of zero (or less) means "no limit", in which case this is always false.
 * @return bool
 */
public function tooManyRevisions(): bool
{
    $maxAllowed = $this->getMaxRevisions();

    if ($maxAllowed <= 0) {
        // No limit configured; the revision count is not even looked up.
        return false;
    }

    return $this->getNumRevisions() > $maxAllowed;
}
/**
 * The maximum number of revisions to process, read once from the
 * 'app.max_page_revisions' container parameter and then reused.
 * @return int
 */
public function getMaxRevisions(): int
{
    if (isset($this->maxRevisions)) {
        return $this->maxRevisions;
    }

    $configured = $this->container->getParameter('app.max_page_revisions');
    $this->maxRevisions = (int) $configured;

    return $this->maxRevisions;
}
/**
 * Fetch basic editing info for the page from the repository: number of revisions, unique authors,
 * the initial author and that author's edit count, all in a single query for performance.
 * The result is deliberately not cached: when used via the gadget this is hit for a different
 * page almost every time, so a cache would rarely help.
 * @return string[]|false false if the page was not found.
 */
public function getBasicEditingInfo()
{
    $repository = $this->getRepository();

    return $repository->getBasicEditingInfo($this->page);
}
/** @var array[] Top-editor lists already built by this instance, keyed by "<effective limit>|<noBots flag>". */
private $topEditorsCache = [];

/**
 * Get the top editors to the page by edit count.
 *
 * Results are memoized per instance and per argument combination. A function-level static
 * variable is not used for this because it is shared by every instance of the class and
 * knows nothing about the arguments, so a later call for another page, limit or bot filter
 * would silently receive the first call's result.
 * @param int $limit Default 20, maximum 1,000.
 * @param bool $noBots Set to true to exclude bots from the result.
 * @return array[] One entry per editor with keys 'rank', 'username', 'count', 'minor',
 *   'first_edit' and 'latest_edit' (the latter two each holding 'id' and 'timestamp').
 */
public function getTopEditorsByEditCount(int $limit = 20, bool $noBots = false): array
{
    $limit = min($limit, 1000);
    $cacheKey = $limit.'|'.($noBots ? '1' : '0');

    if (isset($this->topEditorsCache[$cacheKey])) {
        return $this->topEditorsCache[$cacheKey];
    }

    $rows = $this->getRepository()->getTopEditorsByEditCount(
        $this->page,
        $this->start,
        $this->end,
        $limit,
        $noBots
    );

    $topEditors = [];
    $rank = 0;
    foreach ($rows as $row) {
        $topEditors[] = [
            // Ranks are 1-based and follow the order in which the repository returned the rows.
            'rank' => ++$rank,
            'username' => $row['username'],
            'count' => $row['count'],
            'minor' => $row['minor'],
            'first_edit' => [
                'id' => $row['first_revid'],
                'timestamp' => $row['first_timestamp'],
            ],
            'latest_edit' => [
                'id' => $row['latest_revid'],
                'timestamp' => $row['latest_timestamp'],
            ],
        ];
    }

    $this->topEditorsCache[$cacheKey] = $topEditors;

    return $topEditors;
}
/**
 * Get prose and reference information, computed from the page's rendered HTML.
 * When the end date is an integer timestamp it is handed to Page::getHTMLContent() as a DateTime;
 * otherwise null is passed. The result is computed once and then reused.
 * @return array With keys 'characters', 'words', 'references', 'unique_references', 'sections'
 */
public function getProseStats(): array
{
    if (!isset($this->proseStats)) {
        $asOf = is_int($this->end) ? new DateTime("@$this->end") : null;
        $dom = new Crawler($this->page->getHTMLContent($asOf));

        [$charCount, $wordCount] = $this->countCharsAndWords($dom, '#mw-content-text p');

        // Crawler::each() returns the callback's return values, i.e. the text of every reference node.
        $refNodes = $dom->filter('#mw-content-text .reference');
        $refTexts = $refNodes->each(function ($refNode) {
            return $refNode->text();
        });

        $headlines = $dom->filter('#mw-content-text .mw-headline');

        $this->proseStats = [
            'characters' => $charCount,
            'words' => $wordCount,
            'references' => $refNodes->count(),
            'unique_references' => count(array_unique($refTexts)),
            'sections' => count($headlines),
        ];
    }

    return $this->proseStats;
}
/**
 * Count the number of characters and words of the plain text within the DOM elements matched
 * by the given selector.
 *
 * Characters are counted as UTF-8 characters rather than bytes, so non-ASCII text is not
 * over-counted. Words are the non-empty, space-separated pieces of the text, so an empty
 * element contributes zero words and doubled spaces do not create phantom words.
 * @param Crawler $crawler
 * @param string $selector HTML selector.
 * @return int[] [num chars, num words]
 */
private function countCharsAndWords(Crawler $crawler, string $selector): array
{
    $totalChars = 0;
    $totalWords = 0;

    $crawler->filter($selector)->each(function ($node) use (&$totalChars, &$totalWords): void {
        /** @var Crawler $node */
        // Whitespace-normalized text of the node, with footnote markers such as "[12]" removed.
        $text = preg_replace('/\[\d+]/', '', trim($node->text(null, true)));

        $totalChars += mb_strlen($text, 'UTF-8');

        // Removing a marker can leave two adjacent spaces; skip the empty pieces that produces.
        foreach (explode(' ', $text) as $word) {
            if ('' !== $word) {
                $totalWords++;
            }
        }
    });

    return [$totalChars, $totalWords];
}
/**
 * Get the page assessments of the page.
 *
 * The result is fetched once and reused. The cache check compares against null rather than
 * using is_array(), so that a `false` ("unsupported") result is remembered as well instead of
 * triggering a fresh lookup on every call.
 * @see https://www.mediawiki.org/wiki/Extension:PageAssessments
 * @return string[]|false False if unsupported.
 * @codeCoverageIgnore
 */
public function getAssessments()
{
    if (null === $this->assessments) {
        $this->assessments = $this->page
            ->getProject()
            ->getPageAssessments()
            ->getAssessments($this->page);
    }

    return $this->assessments;
}
/**
 * Get the page's Wikidata and Checkwiki errors, fetching them from the Page on first use.
 * @see Page::getErrors()
 * @return string[]
 */
public function getBugs(): array
{
    if (is_array($this->bugs)) {
        return $this->bugs;
    }

    $this->bugs = $this->page->getErrors();

    return $this->bugs;
}
/**
 * Get the number of Wikidata and CheckWiki errors.
 * @return int
 */
public function numBugs(): int
{
    $bugs = $this->getBugs();

    return count($bugs);
}
/**
 * Generate the data structure that will be used in the ArticleInfo API response.
 *
 * The response always contains 'project', 'page', 'watchers', 'pageviews' and 'pageviews_offset'.
 * If the revision data can be fetched, the revision-based keys ('revisions', 'editors', 'author',
 * 'created_at', 'modified_at', etc.) are merged in; otherwise an 'error' message is set when the
 * failure was an HTTP exception. A missing or unparsable creation or modification timestamp
 * yields null for the fields derived from it rather than aborting the whole response.
 * @param Project $project
 * @param Page $page
 * @return array
 * @codeCoverageIgnore
 */
public function getArticleInfoApiData(Project $project, Page $page): array
{
    /** @var int $pageviewsOffset Number of days to query for pageviews */
    $pageviewsOffset = 30;

    $data = [
        'project' => $project->getDomain(),
        'page' => $page->getTitle(),
        'watchers' => (int) $page->getWatchers(),
        'pageviews' => $page->getLastPageviews($pageviewsOffset),
        'pageviews_offset' => $pageviewsOffset,
    ];

    $info = false;

    try {
        $articleInfoRepo = new ArticleInfoRepository();
        $articleInfoRepo->setContainer($this->container);
        $info = $articleInfoRepo->getBasicEditingInfo($page);
    } catch (ServiceUnavailableHttpException $e) {
        // No more open database connections.
        $data['error'] = 'Unable to fetch revision data. Please try again later.';
    } catch (HttpException $e) {
        /**
         * The query most likely exceeded the maximum query time,
         * so we'll abort and give only info retrieved by the API.
         */
        $data['error'] = 'Unable to fetch revision data. The query may have timed out.';
    }

    if (false === $info) {
        return $data;
    }

    // Some wikis (such as foundation.wikimedia.org) may be missing the creation date. Casting to
    // string turns a missing/null value into '', which createFromFormat() rejects by returning
    // false instead of raising a TypeError under strict_types. The same is done for the
    // modification date so that a bad value cannot cause a method call on false.
    $creationDateTime = DateTime::createFromFormat('YmdHis', (string) ($info['created_at'] ?? ''));
    $modifiedDateTime = DateTime::createFromFormat('YmdHis', (string) ($info['modified_at'] ?? ''));

    $createdAt = false === $creationDateTime
        ? null
        : $creationDateTime->format('Y-m-d');
    $modifiedAt = false === $modifiedDateTime
        ? null
        : $modifiedDateTime->format('Y-m-d H:i');
    $secsSinceLastEdit = false === $modifiedDateTime
        ? null
        : ((new DateTime())->getTimestamp() - $modifiedDateTime->getTimestamp());

    $assessment = $page->getProject()
        ->getPageAssessments()
        ->getAssessment($page);

    return array_merge($data, [
        'revisions' => (int) $info['num_edits'],
        'editors' => (int) $info['num_editors'],
        'minor_edits' => (int) $info['minor_edits'],
        'author' => $info['author'],
        'author_editcount' => null === $info['author_editcount'] ? null : (int) $info['author_editcount'],
        'created_at' => $createdAt,
        'created_rev_id' => $info['created_rev_id'],
        'modified_at' => $modifiedAt,
        'secs_since_last_edit' => $secsSinceLastEdit,
        'last_edit_id' => (int) $info['modified_rev_id'],
        'assessment' => $assessment,
    ]);
}
/************************ Link statistics ************************/
/**
 * Number of external links on the page ('links_ext_count' of the link statistics).
 * @return int
 */
public function linksExtCount(): int
{
    $linkStats = $this->getLinksAndRedirects();

    return $linkStats['links_ext_count'];
}
/**
 * Number of incoming links to the page ('links_in_count' of the link statistics).
 * @return int
 */
public function linksInCount(): int
{
    $linkStats = $this->getLinksAndRedirects();

    return $linkStats['links_in_count'];
}
/**
 * Number of outgoing links from the page ('links_out_count' of the link statistics).
 * @return int
 */
public function linksOutCount(): int
{
    $linkStats = $this->getLinksAndRedirects();

    return $linkStats['links_out_count'];
}
/**
 * Number of redirects to the page ('redirects_count' of the link statistics).
 * @return int
 */
public function redirectsCount(): int
{
    $linkStats = $this->getLinksAndRedirects();

    return $linkStats['redirects_count'];
}
/**
 * Get the counts of external, incoming and outgoing links together with the number of redirects
 * to the page. The counts are fetched from the Page on first use and then reused.
 * @return int[]
 * @codeCoverageIgnore
 */
private function getLinksAndRedirects(): array
{
    if (is_array($this->linksAndRedirects)) {
        return $this->linksAndRedirects;
    }

    $this->linksAndRedirects = $this->page->countLinksAndRedirects();

    return $this->linksAndRedirects;
}
/**
 * Fetch transclusion data (categories, templates and files) for the page.
 * The repository is queried on first use and the result is then reused.
 * @return array With keys 'categories', 'templates' and 'files'.
 */
public function getTransclusionData(): array
{
    if (is_array($this->transclusionData)) {
        return $this->transclusionData;
    }

    $repository = $this->getRepository();
    $this->transclusionData = $repository->getTransclusionData($this->page);

    return $this->transclusionData;
}
/**
 * Number of categories on the page ('categories' of the transclusion data).
 * @return int
 */
public function getNumCategories(): int
{
    $transclusions = $this->getTransclusionData();

    return $transclusions['categories'];
}
/**
 * Number of templates on the page ('templates' of the transclusion data).
 * @return int
 */
public function getNumTemplates(): int
{
    $transclusions = $this->getTransclusionData();

    return $transclusions['templates'];
}
/**
 * Number of files on the page ('files' of the transclusion data).
 * @return int
 */
public function getNumFiles(): int
{
    $transclusions = $this->getTransclusionData();

    return $transclusions['files'];
}
/************************ Bot statistics ************************/
/**
 * Number of edits made to the page by current or former bots, i.e. the sum of the 'count'
 * entries of the bot data. The total is computed once and then reused, regardless of the
 * argument passed on later calls.
 * @param array[]|null $bots Used only in unit tests, where we supply mock data for the bots that will
 *   get processed. When null, the data from getBots() is used.
 * @return int
 */
public function getBotRevisionCount(?array $bots = null): int
{
    if (isset($this->botRevisionCount)) {
        return $this->botRevisionCount;
    }

    $botStats = null === $bots ? $this->getBots() : $bots;

    $total = 0;
    foreach ($botStats as $stats) {
        $total += $stats['count'];
    }

    $this->botRevisionCount = $total;

    return $total;
}
/**
 * Get and set $this->bots about bots that edited the page. This is done separately from the main query because
 * we use this information when computing the top 10 editors in ArticleInfo, where we don't want to include bots.
 *
 * The list is only stored on the instance once it has been fully built, so an exception thrown
 * while querying does not leave an empty list cached for later calls.
 * @return mixed[] Keyed by username, each entry holding 'count' (int) and 'current' (bool),
 *   sorted by 'count' in descending order.
 */
public function getBots(): array
{
    if (isset($this->bots)) {
        return $this->bots;
    }

    // When the page exceeds the revision limit, the limit is passed on to the repository query.
    $limit = $this->tooManyRevisions() ? $this->getMaxRevisions() : null;

    /** @var ResultStatement $botData */
    $botData = $this->getRepository()->getBotData($this->page, $this->start, $this->end, $limit);

    // Parse the bot edits.
    $bots = [];
    while ($bot = $botData->fetchAssociative()) {
        $bots[$bot['username']] = [
            'count' => (int) $bot['count'],
            // The flag may arrive as the string '1' or as a native integer 1, depending on the
            // database driver and PHP version; normalize before comparing.
            'current' => 1 === (int) $bot['current'],
        ];
    }

    // Sort by edit count, highest first, keeping the username keys.
    uasort($bots, function (array $a, array $b): int {
        return $b['count'] <=> $a['count'];
    });

    $this->bots = $bots;

    return $this->bots;
}
/**
 * Number of distinct bots that edited the page (the size of the getBots() list).
 * @return int
 */
public function getNumBots(): int
{
    $bots = $this->getBots();

    return count($bots);
}
}