/
import_mediawiki_dumps.pp
41 lines (37 loc) · 2.06 KB
/
import_mediawiki_dumps.pp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# SPDX-License-Identifier: Apache-2.0
# == Class profile::analytics::refinery::job::import_mediawiki_dumps
# Schedules daily imports of siteinfo-namespaces jsondumps and of
# pages-meta-history and pages-meta-current xmldumps into HDFS.
#
# NOTE: This class assumes the xmldatadumps folder under which public dumps
# can be found is mounted under /mnt/data
#
class profile::analytics::refinery::job::import_mediawiki_dumps (
    # Applied as the 'ensure' value of every import job declared in this class.
    # Resolved through lookup(), falling back to 'present' when the key is unset.
    Wmflib::Ensure $ensure_timers = lookup(
        'profile::analytics::refinery::job::import_mediawiki_dumps::ensure_timers',
        { 'default_value' => 'present' }
    ),
) {
    # One import job per dump type. The jobs differ only in dump type, log file,
    # schedule and (for pages-meta-history) the list of wikis to skip.
    profile::analytics::refinery::job::import_mediawiki_dumps_config {
        # Attributes shared by every job declared in this expression.
        default:
            ensure => $ensure_timers,
        ;

        # Import siteinfo-namespaces
        'refinery-import-siteinfo-dumps':
            dump_type         => 'siteinfo-namespaces',
            timer_interval    => '*-*-* 02:00:00',
            timer_description => 'Schedules daily an incremental import of the current month of siteinfo-namespaces jsondumps into HDFS',
            log_file_name     => 'import_siteinfo_dumps.log',
        ;

        # Import pages-meta-history
        # Note: Skip wikidatawiki to speed-up data availability (T357859)
        'refinery-import-page-history-dumps':
            dump_type         => 'pages-meta-history',
            skip_list         => 'wikidatawiki',
            timer_interval    => '*-*-* 03:00:00',
            timer_description => 'Schedules daily an incremental import of the current month of pages-meta-history xmldumps into HDFS',
            log_file_name     => 'import_pages_history_dumps.log',
        ;

        # Import pages-meta-current
        'refinery-import-page-current-dumps':
            dump_type         => 'pages-meta-current',
            timer_interval    => '*-*-* 05:00:00',
            timer_description => 'Schedules daily an incremental import of the current month of pages-meta-current xmldumps into HDFS',
            log_file_name     => 'import_pages_current_dumps.log',
        ;
    }
}