In [8]:
import sys
import time

In [9]:
# The time range for which to update, given as YYYYMM strings.
# START is included, END is excluded.
# Both values are substituted into the editor_month query and compared as strings
# against rev_timestamp / ar_timestamp (>= START and < END), so "201801" to "201802"
# selects exactly the timestamps that begin with "201801".
START = "201801"
END = "201802"

In [10]:
def print_err(*args, **kwargs):
    """Write a message to standard error.

    Takes the same positional and keyword arguments as the built-in ``print``
    (``sep``, ``end``, ``flush``); ``file`` is supplied here and must not be
    passed by the caller.
    """
    # Resolved on every call, so a later redirection of sys.stderr is honoured.
    err_stream = sys.stderr
    print(*args, file=err_stream, **kwargs)

In [11]:
# Database names (site_global_key) of every wiki in the listed project groups,
# in ascending order. Built in a single expression so that `all_wikis` is only ever
# bound to the final list of names, never to the raw query result.
# NOTE(review): assumes fmt="raw" yields rows that can be indexed by column name.
all_wikis = [
    site["site_global_key"]
    for site in run_mariadb(
"""select site_global_key
from enwiki.sites
where site_group in
('commons', 'incubator', 'foundation', 'mediawiki', 'meta', 'sources', 
'species','wikibooks', 'wikidata', 'wikinews', 'wikipedia', 'wikiquote',
'wikisource', 'wikiversity', 'wikivoyage', 'wiktionary') 
order by site_global_key asc""",
        fmt="raw",
    )
]

In [12]:
# Statement that switches the session's transaction isolation level to READ UNCOMMITTED.
# NOTE(review): presumably used so the long scans below do not wait on or take locks — confirm.
isolate = "set session transaction isolation level read uncommitted;"

# Template for selecting the wiki database to work in; `{db}` is filled in via str.format.
# The editor_month query uses unqualified table names and database(), so it relies on
# the database selected by this statement.
use_db = "use {db};"

# SQL template that appends one row per (month, local user) to staging.editor_month
# for the currently selected wiki database.
#
# Placeholders (filled in with str.format):
#   {start} - inclusive lower bound, compared as a string against rev_timestamp / ar_timestamp
#   {end}   - exclusive upper bound, compared the same way
#
# How the query is put together:
#   * The inner query is a UNION ALL of two per-(month, user) aggregates: one over
#     `revision` (deleted = 0) and one over `archive` (deleted = 1).
#   * The outer query sums both halves, so `edits` and the other counts include
#     archived revisions, while `deleted_edits` (sum(edits * deleted)) counts only those.
#   * content_edits: revisions in namespace 0, or in a namespace listed for this wiki
#     (database() = wiki) in datasets.content_namespaces.
#   * mobile_web_edits: tagged "mobile edit" and either tagged "mobile web edit" or
#     not tagged "mobile app edit"; mobile_app_edits: tagged "mobile app edit".
#   * visual_edits: tagged "visualeditor" but not "visualeditor-wikitext";
#     ve_source_edits: tagged "visualeditor-wikitext".
#   * bot_flag: 1 when the user has a "bot" row in user_groups or user_former_groups.
#   * month is the first day of the month, parsed from the first six characters of the timestamp.
#
# NOTE(review): this is a plain INSERT, so re-running it for the same range presumably
# appends duplicate rows unless staging.editor_month enforces a unique key — confirm.
# NOTE(review): rows are grouped by rev_user / ar_user; if edits without an account are
# stored under a single user id, they end up in one row per month — verify this is intended.
editor_month = """
insert into staging.editor_month
select
  database() as wiki,
  str_to_date(concat(rev_month, "01"), "%Y%m%d") as month,
  local_user_id,
  ifnull(user_name, "") as user_name,
  ifnull(sum(edits), 0) as edits,
  ifnull(sum(content_edits), 0) as content_edits,
  ifnull(sum(edits * deleted), 0) as deleted_edits,
  ifnull(sum(mobile_web_edits), 0) as mobile_web_edits,
  ifnull(sum(mobile_app_edits), 0) as mobile_app_edits,
  ifnull(sum(visual_edits), 0) as visual_edits,
  ifnull(sum(ve_source_edits), 0) as ve_source_edits,
  if(ug_group = "bot" or ufg_group = "bot", 1, 0) as bot_flag,
  str_to_date(user_registration, "%Y%m%d%H%i%S") as user_registration
from
(
select
  left(rev_timestamp, 6) as `rev_month`,
  rev_user as `local_user_id`,
  count(*) as `edits`,
  sum(page_namespace = 0 or cn.namespace is not null) as content_edits,
  sum(
    ts_tags like "%mobile edit%" and
    (ts_tags like "%mobile web edit%" or ts_tags not like "%mobile app edit%")
  ) as mobile_web_edits,
  sum(ts_tags like "%mobile app edit%") as mobile_app_edits,
  sum(ts_tags like "%visualeditor%" and ts_tags not like "%visualeditor-wikitext%") as visual_edits,
  sum(ts_tags like "%visualeditor-wikitext%") as ve_source_edits,
  0 as `deleted`
from revision
left join page on rev_page = page_id
left join tag_summary on rev_id = ts_rev_id
left join datasets.content_namespaces cn on database() = wiki and page_namespace = namespace
where rev_timestamp >= "{start}" and rev_timestamp < "{end}"
group by left(rev_timestamp, 6), rev_user

union all

select
  left(ar_timestamp, 6) as `rev_month`,
  ar_user as `local_user_id`,
  count(*) as `edits`,
  sum(ar_namespace = 0 or cn.namespace is not null) as content_edits,
  sum(
    ts_tags like "%mobile edit%" and
    (ts_tags like "%mobile web edit%" or ts_tags not like "%mobile app edit%")
  ) as mobile_web_edits,
  sum(ts_tags like "%mobile app edit%") as mobile_app_edits,
  sum(ts_tags like "%visualeditor%" and ts_tags not like "%visualeditor-wikitext%") as visual_edits,
  sum(ts_tags like "%visualeditor-wikitext%") as ve_source_edits,
  1 as `deleted`
from archive
left join tag_summary on ar_rev_id = ts_rev_id
left join datasets.content_namespaces cn on database() = wiki and ar_namespace = namespace
where ar_timestamp >= "{start}" and ar_timestamp < "{end}"
group by left(ar_timestamp, 6), ar_user
) revs
left join user on local_user_id = user_id
left join user_groups on local_user_id = ug_user and ug_group = "bot"
left join user_former_groups on local_user_id = ufg_user and ufg_group = "bot"
group by month, local_user_id;
"""

In [None]:
# Fill staging.editor_month one wiki at a time. Each wiki gets the same three
# statements (isolation level, database selection, insert) in a single run_mariadb
# call, and its wall-clock duration is reported on stderr.
for wiki in all_wikis:
    started_at = time.perf_counter()

    statements = (
        isolate,
        use_db.format(db=wiki),
        editor_month.format(start=START, end=END),
    )
    run_mariadb(*statements)

    seconds_taken = time.perf_counter() - started_at
    print_err("{} completed in {:0.0f} s".format(wiki, seconds_taken))

aawiki completed in 0 s
aawikibooks completed in 0 s
aawiktionary completed in 0 s
abwiki completed in 1 s
abwiktionary completed in 0 s
acewiki completed in 0 s
adywiki completed in 0 s
afwiki completed in 1 s
afwikibooks completed in 0 s
afwikiquote completed in 0 s
afwiktionary completed in 0 s
akwiki completed in 0 s
akwikibooks completed in 0 s
akwiktionary completed in 1 s
alswiki completed in 0 s
alswikibooks completed in 1 s
alswikiquote completed in 0 s
alswiktionary completed in 0 s
amwiki completed in 0 s
amwikiquote completed in 0 s
amwiktionary completed in 0 s
angwiki completed in 0 s
angwikibooks completed in 0 s
angwikiquote completed in 0 s
angwikisource completed in 0 s
angwiktionary completed in 0 s
anwiki completed in 1 s
anwiktionary completed in 0 s
arcwiki completed in 1 s


# Quality checks

In [10]:
# Quality check: number of editor-month rows per month since January 2017.
# The table is schema-qualified so the check always reads the table the update
# wrote to (staging.editor_month), whatever the connection's default database is.
run_mariadb("""
    select month, count(*)
    from staging.editor_month
    where month >= "2017-01-01"
    group by month
""")

Unnamed: 0,month,count(*)
0,2017-01-01,393795
1,2017-02-01,380416
2,2017-03-01,407496
3,2017-04-01,383681
4,2017-05-01,389033
5,2017-06-01,362528
6,2017-07-01,354429
7,2017-08-01,357186
8,2017-09-01,362564
9,2017-10-01,380788


In [11]:
# Quality check: number of editor-month rows with at least 100 content edits,
# per month since January 2017.
# The table is schema-qualified so the check always reads the table the update
# wrote to (staging.editor_month), whatever the connection's default database is.
run_mariadb("""
    select month, count(*)
    from staging.editor_month
    where month >= "2017-01-01" and content_edits >= 100
    group by month
""")

Unnamed: 0,month,count(*)
0,2017-01-01,15738
1,2017-02-01,14796
2,2017-03-01,15646
3,2017-04-01,15022
4,2017-05-01,15111
5,2017-06-01,14462
6,2017-07-01,15132
7,2017-08-01,15139
8,2017-09-01,14917
9,2017-10-01,15247
