In [8]:
import numpy as np
import pandas as pd
import requests
import re
import json
import datetime

import wmfdata as wmf
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pct_str, pd_display_all

import xlsxwriter as xl

In [9]:
query_vars = dict(
    snapshot = "2019-09",
    start= "2017-06-01", 
    end="2018-06-01",
    
    pv_start = "201706",
    pv_end = "201806",
    
    ner_start = "2017-04",
    ner_end = "2018-04"
)

# List of wikis

In [17]:
# Gather all content wikis
wikis = wmf.hive.run("""
SELECT
  database_code,
  database_group AS project_code,
  language_code,
  CONCAT("https://", domain_name) AS domain_name,
  language_name,
  english_name as wiki_name
FROM canonical_data.wikis
WHERE
  database_group in (
    "commons", "incubator", "foundation", "mediawiki", "meta", "sources",
    "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
    "wikisource", "wikiversity", "wikivoyage", "wiktionary"
  ) AND
  status = "open" AND
  visibility = "public" AND
  editability = "public"
""")

In [18]:
wikis.sample(10)

Unnamed: 0,database_code,project_code,language_code,domain_name,language_name,wiki_name
370,lawiktionary,wiktionary,la,https://la.wiktionary.org,Latin,Latin Wiktionary
513,ptwikiquote,wikiquote,pt,https://pt.wikiquote.org,Portuguese,Portuguese Wikiquote
33,azwiki,wikipedia,az,https://az.wikipedia.org,Azerbaijani,Azerbaijani Wikipedia
314,jawikiversity,wikiversity,ja,https://ja.wikiversity.org,Japanese,Japanese Wikiversity
350,krcwiki,wikipedia,krc,https://krc.wikipedia.org,Karachay-Balkar,Karachay-Balkar Wikipedia
58,bmwiki,wikipedia,bm,https://bm.wikipedia.org,Bambara,Bambara Wikipedia
359,kwwiki,wikipedia,kw,https://kw.wikipedia.org,Cornish,Cornish Wikipedia
208,frwiktionary,wiktionary,fr,https://fr.wiktionary.org,French,French Wiktionary
410,miwiktionary,wiktionary,mi,https://mi.wiktionary.org,Maori,Maori Wiktionary
651,towiki,wikipedia,to,https://to.wikipedia.org,Tongan,Tongan Wikipedia


# Data

In [19]:
def merge_in(df, on="database_code"):
    global wikis
    wikis = pd.merge(wikis, df, how="left", on=on).fillna(0)
    
def top_10(df, col):
    return df.sort_values(col, ascending=False).head(10)
  
def rename_df(df):
    return df.rename({"wiki": "database_code", "domain": "domain_name"}, axis=1)

In [20]:
wikis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 6 columns):
database_code    732 non-null object
project_code     732 non-null object
language_code    732 non-null object
domain_name      732 non-null object
language_name    732 non-null object
wiki_name        732 non-null object
dtypes: object(6)
memory usage: 34.4+ KB


## Monthly active editors

In [21]:
mae = wmf.hive.run("""
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_active_editors,
    SUM(
        CAST(TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') AS INT)
        )/ 12 AS monthly_new_active_editors
FROM cchen.editor_month
WHERE
    content_edits >= 5 
    AND month >= "{start}" 
    AND month < "{end}" 
    AND user_id != 0 
    --AND user_id IS NOT NULL
    AND bot_by_group = FALSE 
    AND (
        user_name not regexp "bot\\b" or
        user_name in ("Paucabot", "Niabot", "Marbot")    
    )    
GROUP BY wiki
""".format(**query_vars))

In [22]:
merge_in(mae)

## Monthly unique devices

In [17]:
mud = wmf.hive.run("""
SELECT
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') AS domain_name,
  SUM(uniques_estimate) / 12 AS monthly_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year, month) >= "{pv_start}" and
    CONCAT(year, month) < "{pv_end}"
GROUP BY    
    regexp_replace(
        regexp_replace(
            regexp_replace(domain, "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""".format(**query_vars))

mud["domain_name"] = "https://" + mud["domain_name"]

top_10(mud, "monthly_unique_devices")

Unnamed: 0,domain_name,monthly_unique_devices
194,https://en.wikipedia.org,696456600.0
208,https://es.wikipedia.org,139841100.0
164,https://de.wikipedia.org,92964350.0
366,https://ja.wikipedia.org,84374000.0
629,https://ru.wikipedia.org,82848950.0
249,https://fr.wikipedia.org,76052840.0
598,https://pt.wikipedia.org,49912060.0
356,https://it.wikipedia.org,47468420.0
853,https://zh.wikipedia.org,33209570.0
32,https://ar.wikipedia.org,28593390.0


In [18]:
merge_in(mud, on="domain_name")

## Overall SIZE rank

In [19]:
SIZE = np.sqrt(wikis["monthly_unique_devices"] * wikis["monthly_active_editors"])
rank = SIZE.rank(method="min", na_option="bottom", ascending=False)
wikis["overall_SIZE_rank"] = rank

## New editor retention

In [20]:
with open("queries/new_editor_retention.hql") as f:
    q = f.read()

ner = wmf.hive.run(
    q.format(start = "{ner_start}", end = "{ner_end}").format(**query_vars))

In [21]:
top_10(ner, "new_editor_retention")

Unnamed: 0,database_code,new_editor_retention
134,stqwiki,1.0
497,zeawiki,1.0
35,emlwiki,1.0
277,nrmwiki,1.0
353,csbwiki,1.0
514,bmwiki,1.0
283,piwiki,1.0
45,frpwiki,1.0
303,szlwiki,1.0
581,liwikiquote,1.0


In [22]:
merge_in(ner)

## Mobile editing proportion

In [23]:
mep = wmf.hive.run(
"""
SELECT 
    wiki AS database_code,
    SUM(mobile_web_edits + mobile_app_edits) / SUM(edits) AS mobile_editing_proportion
FROM cchen.editor_month
WHERE
    month >= "{start}" 
    AND month < "{end}"
    -- A user is a bot if they have a matching name or have the bot flag on *any* wiki
    -- See https://meta.wikimedia.org/wiki/Research:Active_editor and https://meta.wikimedia.org/wiki/Research:Bot_user
    AND user_id != 0
    AND NOT bot_by_group 
    AND (user_name not regexp "bot\\b" OR user_name in ("Paucabot", "Niabot", "Marbot"))
GROUP BY wiki
""".format(**query_vars))

In [24]:
top_10(mep, "mobile_editing_proportion")

Unnamed: 0,database_code,mobile_editing_proportion
120,lawikiquote,0.826698
176,rmywiki,0.697836
72,gawiktionary,0.617493
423,sdwiki,0.500936
405,pswiki,0.478328
406,pswiktionary,0.434783
127,lmowiki,0.428491
620,mrwikiquote,0.406593
605,lowiki,0.404348
726,xmfwiki,0.400645


In [25]:
merge_in(mep)

## Bot editing proportion

In [26]:
bep = wmf.hive.run("""
SELECT
   database_code,
   SUM(CASE WHEN user_is_bot THEN edit_count END)/ SUM(edit_count) AS bot_editing_proportion
FROM wmf.edit_hourly
INNER JOIN canonical_data.wikis ON CONCAT(project,".org") = domain_name
    AND database_group in 
    (
        "commons", "incubator", "foundation", "mediawiki", "meta", "sources", 
        "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
        "wikisource", "wikiversity", "wikivoyage", "wiktionary"
    )
WHERE
    ts  >= "{start}" 
   AND ts  < "{end}"
   AND snapshot = "{snapshot}"
GROUP BY database_code
""".format(**query_vars))

top_10(bep, "bot_editing_proportion")

Unnamed: 0,database_code,bot_editing_proportion
227,mgwiktionary,0.998347
331,cebwiki,0.989099
257,cywiki,0.955131
2,bswikinews,0.906534
296,cawikinews,0.867612
694,frwikinews,0.849761
403,srwiki,0.847158
725,zh_min_nanwiki,0.838015
521,ruwiktionary,0.831689
371,cawiktionary,0.815883


In [27]:
merge_in(bep)

## Anonymous editing proportion

In [28]:
aep = wmf.hive.run("""
SELECT
   database_code,
   SUM(IF(user_is_anonymous, edit_count, 0)) / SUM(edit_count)  AS anonymous_editing_proportion
FROM wmf.edit_hourly
INNER JOIN canonical_data.wikis ON CONCAT(project,".org") = domain_name
    AND database_group in 
    (
        "commons", "incubator", "foundation", "mediawiki", "meta", "sources", 
        "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
        "wikisource", "wikiversity", "wikivoyage", "wiktionary"
    )
WHERE
    ts >= "{start}" 
   AND ts < "{end}"
   AND NOT user_is_bot
   AND snapshot = "{snapshot}"
GROUP BY database_code
""".format(**query_vars))

top_10(aep, "anonymous_editing_proportion")

Unnamed: 0,database_code,anonymous_editing_proportion
465,kywiktionary,0.930286
285,tlwikibooks,0.912766
146,viwikibooks,0.893918
414,cywikibooks,0.829787
534,zhwikiversity,0.813
11,huwiktionary,0.72722
588,mtwiktionary,0.719414
177,mgwiki,0.684651
479,sqwiktionary,0.65327
404,srwikiquote,0.652605


In [29]:
merge_in(aep)

## Majority-mobile editors proportion

In [30]:
mmep = wmf.hive.run("""
SELECT 
   wiki AS database_code,
   SUM(CASE WHEN mobile_editing_proportion > 0.5 THEN 1 END) / COUNT(*) as majority_mobile_editors_proportion
FROM 
(
    SELECT 
        wiki,
        SUM(mobile_web_edits + mobile_app_edits) / SUM(edits) AS mobile_editing_proportion
    FROM cchen.editor_month
    WHERE
        month >= "{start}" 
        AND month < "{end}" 
        AND user_id != 0
        AND NOT bot_by_group 
        AND (user_name not regexp "bot\\b" OR user_name in ("Paucabot", "Niabot", "Marbot"))
    GROUP BY wiki, user_name
) user_edits
GROUP BY wiki
""".format(**query_vars))
top_10(mmep, "majority_mobile_editors_proportion")

Unnamed: 0,database_code,majority_mobile_editors_proportion
10,arwikisource,0.541971
8,arwikinews,0.501259
475,arwikibooks,0.497159
22,bnwiki,0.489153
323,hiwiki,0.473695
562,hiwikibooks,0.467742
7,arwiki,0.46036
9,arwikiquote,0.446064
476,arwikiversity,0.442553
62,fawiktionary,0.43804


In [31]:
merge_in(mmep)

## Revert rate

In [32]:
rr = wmf.hive.run("""
    SELECT
        wiki_db AS database_code,
        SUM(IF(revision_is_identity_reverted, 1, 0)) / COUNT(*) AS revert_rate
        --SUM(CAST(revision_is_identity_reverted AS INT)) / COUNT(*) AS revert_rate
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}" 
        AND SIZE(event_user_is_bot_by_historical) = 0 
    GROUP BY wiki_db
""".format(**query_vars))

top_10(rr, "revert_rate")

Unnamed: 0,database_code,revert_rate
523,xalwiki,0.483597
579,tlwikibooks,0.414582
557,jvwiktionary,0.405941
780,viwikiquote,0.370804
378,dzwiki,0.366853
152,bgwikisource,0.361786
456,mgwikibooks,0.355556
339,sawiktionary,0.336585
413,kbdwiki,0.323583
391,ltwikisource,0.285714


In [33]:
merge_in(rr)

## Monthly pageviews

In [34]:
pageviews = hive.run("""
SELECT CONCAT("https://", project, ".org") AS domain_name, 
       SUM(view_count)/12 AS monthly_average_pageviews
FROM wmf.projectview_hourly
WHERE
    agent_type = "user" 
    AND CONCAT(year, month) >= "{pv_start}" 
    AND CONCAT(year, month) < "{pv_end}"
GROUP BY CONCAT("https://", project, ".org")
""".format(**query_vars))
top_10(pageviews, "monthly_average_pageviews")

Unnamed: 0,domain_name,monthly_average_pageviews
653,https://en.wikipedia.org,7617446000.0
272,https://es.wikipedia.org,1095616000.0
422,https://ja.wikipedia.org,1055091000.0
20,https://de.wikipedia.org,975220300.0
99,https://ru.wikipedia.org,918834600.0
531,https://fr.wikipedia.org,686521900.0
50,https://it.wikipedia.org,513289700.0
497,https://zh.wikipedia.org,368315900.0
585,https://pt.wikipedia.org,337922200.0
206,https://pl.wikipedia.org,247648100.0


In [35]:
pageviews = pageviews.replace("https://wikidata.org", "https://www.wikidata.org")

In [36]:
merge_in(pageviews, on="domain_name")

## Mobile pageviews proportion

In [37]:
mpp = wmf.hive.run("""
SELECT 
    CONCAT("https://", project, ".org") AS domain_name,
    SUM(if(access_method = "mobile web", view_COUNT, 0)) / SUM(view_COUNT) AS mobile_web_pageviews_proportion,
    SUM(if(access_method = "mobile app", view_COUNT, 0)) / SUM(view_COUNT) AS mobile_app_pageviews_proportion
FROM wmf.projectview_hourly
WHERE
    agent_type = "user" 
    AND CONCAT(year, month) >= "{pv_start}" 
    AND CONCAT(year, month) < "{pv_end}" 
GROUP BY CONCAT("https://", project, ".org")
""".format(**query_vars))

In [38]:
mpp = mpp.replace("https://wikidata.org", "https://www.wikidata.org")

In [39]:
top_10(mpp, "mobile_web_pageviews_proportion")

Unnamed: 0,domain_name,mobile_web_pageviews_proportion,mobile_app_pageviews_proportion
620,https://wikipedia.org,1.0,0.0
668,https://hi.wikibooks.org,0.904597,0.000247
540,https://hi.wikipedia.org,0.864018,0.011292
550,https://jv.wiktionary.org,0.831347,0.0
291,https://id.wikibooks.org,0.823915,4.6e-05
10,https://bn.wikipedia.org,0.81308,0.013498
44,https://hi.wikiquote.org,0.810344,1.8e-05
176,https://id.wiktionary.org,0.808276,4e-06
49,https://ig.wikipedia.org,0.763,7.8e-05
545,https://id.wikiquote.org,0.751988,0.000398


In [40]:
merge_in(mpp, on="domain_name")

## Monthly active administrators

In [41]:
maa = hive.run("""
SELECT
    wiki as database_code,
    sum(monthly_active_administrators) / 12 as monthly_active_administrators
FROM (
    SELECT
        wiki_db as wiki,
        substr(log_timestamp, 1, 6) as month,
        count(distinct log_actor) as monthly_active_administrators
    from wmf_raw.mediawiki_logging
    WHERE
        log_type in ("block", "delete", "protect", "rights")
        -- Omit the "delete_redir", "move_prot", and "autopromote" actions, which can be done by regular users
        AND log_action not in ("autopromote", "delete_redir", "move_prot")
        AND log_timestamp >= "{start}" 
        AND log_timestamp < "{end}" 
        AND snapshot = "{snapshot}"
    GROUP BY wiki_db, substr(log_timestamp, 1, 6)
) mae
GROUP BY wiki
""".format(**query_vars))

top_10(maa, "monthly_active_administrators")

Unnamed: 0,database_code,monthly_active_administrators
96,enwiki,428.833333
127,commonswiki,166.833333
660,dewiki,137.75
410,frwiki,106.083333
741,ruwiki,103.916667
675,itwiki,98.166667
180,ptwiki,85.166667
460,plwiki,74.166667
130,metawiki,55.75
29,eswiki,52.833333


In [42]:
merge_in(maa)

## Monthly non-bot edits

In [43]:
mnbe = wmf.hive.run("""
SELECT
   database_code,
   SUM(edit_count) /12 AS monthly_nonbot_edits
FROM wmf.edit_hourly
INNER JOIN canonical_data.wikis ON CONCAT(project,".org") = domain_name
    AND database_group in 
    (
        "commons", "incubator", "foundation", "mediawiki", "meta", "sources", 
        "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
        "wikisource", "wikiversity", "wikivoyage", "wiktionary"
    )
WHERE
    ts >= "{start}" 
   AND ts < "{end}" 
   AND NOT user_is_bot
   AND snapshot = "{snapshot}" 
GROUP BY database_code
""".format(**query_vars))

top_10(mnbe, "monthly_nonbot_edits")

Unnamed: 0,database_code,monthly_nonbot_edits
604,wikidatawiki,6383017.0
117,enwiki,4204580.0
153,commonswiki,2879248.0
145,viwiki,1153474.0
206,dewiki,811728.6
339,frwiki,709024.9
577,eswiki,565627.8
237,ruwiki,482306.1
53,itwiki,437575.8
40,zhwiki,346479.2


In [44]:
merge_in(mnbe)

## Edits Gini coefficient

In [45]:
user_edits = wmf.hive.run("""
    SELECT
        wiki_db AS wiki,
        COUNT(*) AS user_edits
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}" 
        AND SIZE(event_user_is_bot_by_historical) = 0
        --event_user_is_bot_by_name = false 
        --array_contains(event_user_groups, "bot") = false
        --array_contains(event_user_is_bot_by, "NULL")= false
    GROUP BY event_user_id, wiki_db
""".format(**query_vars))

In [46]:
# FROM https://github.com/oliviaguest/gini
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # FROM:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array = array + 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))

In [47]:
egc = user_edits.groupby("wiki").apply(lambda g: gini(g["user_edits"].values)).reset_index()

In [48]:
egc.columns = ["database_code", "edits_Gini_coefficient"]

In [49]:
merge_in(egc)

## Monthly editors

In [51]:
me = wmf.hive.run("""
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_editors
FROM cchen.editor_month
WHERE
    month >= "{start}" 
    AND month < "{end}" 
    AND user_id != 0 
    AND bot_by_group = FALSE
    AND (user_name not regexp "bot\\b" or user_name in ("Paucabot", "Niabot", "Marbot"))
GROUP BY wiki
""".format(**query_vars))

top_10(me, "monthly_editors")

Unnamed: 0,database_code,monthly_editors
47,enwiki,134582.833333
274,commonswiki,34402.0
514,dewiki,20697.083333
724,wikidatawiki,18779.5
545,frwiki,18163.333333
527,eswiki,17499.833333
347,jawiki,13334.75
656,ruwiki,11571.75
344,itwiki,8859.5
242,zhwiki,7929.083333


In [52]:
merge_in(me)

## Unique devices per editor

In [53]:
wikis["unique_devices_per_editor"] = wikis["monthly_unique_devices"] / wikis["monthly_editors"]

In [54]:
wikis = wikis.replace([np.inf], 0)

## Article COUNT

In [55]:
#wikis_list = wikis["wiki"].tolist()
wikis_list = wikis["database_code"].tolist()

In [63]:
#as of 09/19
wikis_list_not_working = ['alswiktionary', 'alswikibooks', 'alswikiquote', 'mowiki', 'mowiktionary']

In [64]:
wikis_list_clean = [x for x in wikis_list if x not in wikis_list_not_working]

In [67]:
ac = wmf.mariadb.run("""
SELECT
    database() AS database_code,
    ss_good_articles AS article_COUNT
FROM site_stats
""", wikis_list_clean)

In [68]:
top_10(ac, "article_COUNT")

Unnamed: 0,database_code,article_COUNT
701,wikidatawiki,67349786
93,commonswiki,56445649
152,enwiktionary,6157354
145,enwiki,5970543
406,mgwiktionary,5837703
86,cebwiki,5378807
608,svwiki,3745724
210,frwiktionary,3625347
120,dewiki,2365402
203,frwiki,2155372


In [69]:
merge_in(ac)

## Cumulative content edits

In [70]:
cce = wmf.hive.run("""
    SELECT
        wiki_db AS database_code,
        COUNT(*) AS cumulative_content_edits
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND page_namespace_is_content = true 
        AND SIZE(event_user_is_bot_by_historical) = 0 
        AND array_contains(event_user_groups, "bot") = false
    GROUP BY wiki_db
""".format(**query_vars))

In [71]:
top_10(cce, "cumulative_content_edits")

Unnamed: 0,database_code,cumulative_content_edits
658,enwiki,566201296
22,wikidatawiki,372857088
505,commonswiki,198248629
407,dewiki,116621022
586,frwiki,92946573
200,eswiki,79143135
50,ruwiki,61670956
71,jawiki,56818195
534,itwiki,55942300
517,zhwiki,32255056


In [72]:
merge_in(cce)

## Edits per content page

In [73]:
wikis["edits_per_content_page"] = wikis["cumulative_content_edits"] / wikis["article_COUNT"]

## Script direction

In [74]:
rtl_url = "https://noc.wikimedia.org/conf/dblists/rtl.dblist"
rtl_wikis = pd.Series(requests.get(rtl_url).text.split("\n"))
rtl = pd.DataFrame({"database_code": rtl_wikis, "script_direction": "right-to-left"})

merge_in(rtl)
wikis["script_direction"] = wikis["script_direction"].replace([0], "left-to-right")

## Monthly structured discussions messages

In [75]:
msdm = wmf.mariadb.run(
"""
SELECT
    rev_user_wiki AS database_code,
    COUNT(*) / 12 AS monthly_structured_discussions_messages
FROM flowdb.flow_revision
WHERE
    rev_change_type in ("new-post", "reply") 
    AND date_format(FROM_unixtime(
        (conv(substring(hex(rev_id), 1, 12), 16, 10) >> 2) / 1000),
        "%Y-%m-%d %H:%i:%S") >= "{start}" 
    AND date_format(FROM_unixtime(
        (conv(substring(hex(rev_id), 1, 12), 16, 10) >> 2) / 1000),
        "%Y-%m-%d %H:%i:%S") < "{end}"
GROUP BY rev_user_wiki
""".format(**query_vars), "wikishared")

top_10(msdm, "monthly_structured_discussions_messages")

Unnamed: 0,database_code,monthly_structured_discussions_messages
21,mediawikiwiki,3603.5
9,frwiki,3052.9167
37,zhwiki,1699.8333
0,arwiki,1220.5833
36,wikidatawiki,1212.1667
2,cawiki,749.4167
5,elwiki,328.5833
25,plwiki,223.25
16,hewiki,222.6667
6,fawiki,160.5833


In [76]:
merge_in(msdm)

## Visual edits

In [77]:
ve = wmf.hive.run("""
SELECT 
    wiki AS database_code,
    SUM(visual_edits) / SUM(edits) AS visual_edits
FROM cchen.editor_month
WHERE
    month >= "{start}" 
    AND month < "{end}"
    AND user_id != 0
    AND NOT bot_by_group 
    AND (user_name not regexp "bot\\b" OR user_name in ("Paucabot", "Niabot", "Marbot"))
GROUP BY wiki
""".format(**query_vars))

top_10(ve, "visual_edits")

Unnamed: 0,database_code,visual_edits
488,bgwikibooks,0.637594
247,angwiki,0.510024
377,mgwiki,0.4
634,novwiki,0.332147
284,dinwiki,0.328826
593,kswiki,0.290323
356,kiwiki,0.25885
196,suwiki,0.244115
46,elwikivoyage,0.240909
173,ptwikiversity,0.223665


In [78]:
merge_in(ve)

## Mobile unique devices

In [79]:
mob_ud = wmf.hive.run("""
SELECT
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.') AS domain_name,
    SUM(if((domain regexp '^m\\\\.' or  domain regexp '\\\\.m\\\\.'), uniques_estimate, 0)) AS mobile_COUNT,
    SUM(uniques_estimate) AS total_COUNT,
    SUM(
        IF((domain regexp '^m\\\\.' or  domain regexp '\\\\.m\\\\.'), uniques_estimate, 0)
    ) / SUM(uniques_estimate) AS mobile_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year, month) >= "{pv_start}" AND
    CONCAT(year, month) < "{pv_end}"
    
    
    
GROUP BY    
    regexp_replace(
        regexp_replace(
            regexp_replace(regexp_replace(domain, "www\\\\.", ""), "zero\\\\.", ""),
        '^m\\\\.', ''),
    '\\\\.m\\\\.', '.')
""".format(**query_vars))

mob_ud["domain_name"] = "https://" + mob_ud["domain_name"]

In [80]:
mob_ud = mob_ud.replace("https://wikidata.org", "https://www.wikidata.org")

In [81]:
top_10(mob_ud, "mobile_unique_devices")

Unnamed: 0,domain_name,mobile_count,total_count,mobile_unique_devices
340,https://ig.wikipedia.org,1576494,1659716,0.949858
299,https://hi.wikibooks.org,1385981,1495662,0.926667
304,https://hi.wiktionary.org,1184357,1317786,0.898748
300,https://hi.wikipedia.org,80130832,91233823,0.878302
375,https://jv.wiktionary.org,239642,274477,0.873086
642,https://sa.wiktionary.org,336046,385697,0.871269
301,https://hi.wikiquote.org,286321,330944,0.865164
91,https://bn.wikisource.org,527266,613176,0.859893
334,https://id.wikiquote.org,485423,575474,0.843519
332,https://id.wikibooks.org,5630417,6683891,0.842386


In [82]:
merge_in(mob_ud, on="domain_name")

# Readying for spreadsheet

In [83]:
wikis.columns.tolist()

['database_code',
 'project_code',
 'language_code',
 'domain_name',
 'language_name',
 'project_name',
 'wiki_name',
 'monthly_active_editors',
 'monthly_new_active_editors',
 'monthly_unique_devices',
 'overall_SIZE_rank',
 'new_editor_retention',
 'mobile_editing_proportion',
 'bot_editing_proportion',
 'anonymous_editing_proportion',
 'majority_mobile_editors_proportion',
 'revert_rate',
 'monthly_average_pageviews',
 'mobile_web_pageviews_proportion',
 'mobile_app_pageviews_proportion',
 'monthly_active_administrators',
 'monthly_nonbot_edits',
 'edits_Gini_coefficient',
 'monthly_editors',
 'unique_devices_per_editor',
 'article_COUNT',
 'cumulative_content_edits',
 'edits_per_content_page',
 'script_direction',
 'monthly_structured_discussions_messages',
 'visual_edits',
 'mobile_count',
 'total_count',
 'mobile_unique_devices']

In [84]:
wikis = wikis[[
    'overall_SIZE_rank',
    'monthly_unique_devices',
    'mobile_unique_devices',
    'mobile_web_pageviews_proportion',
    'mobile_app_pageviews_proportion',
    'unique_devices_per_editor',
    'monthly_editors',
    'monthly_active_editors',
    'monthly_active_administrators',
    'majority_mobile_editors_proportion',
    'monthly_new_active_editors',
    'new_editor_retention',
    'monthly_nonbot_edits',
    'bot_editing_proportion',
    'mobile_editing_proportion',
    'visual_edits',
    'anonymous_editing_proportion',
    'revert_rate',
    'edits_Gini_coefficient',
    'monthly_structured_discussions_messages',
    'article_COUNT',
    'cumulative_content_edits',
    'edits_per_content_page',
    'script_direction',
    'database_code',
    'project_code',
    'language_code',
    'domain_name',
    'language_name',
    'project_name',
    'wiki_name',
]]

In [85]:
wikis.rename(columns={
    'article_COUNT':'content_pages',
    'anonymous_editing_proportion': 'anonymous_edits',
    'mobile_editing_proportion': 'mobile_edits',
    'bot_editing_proportion':'bot_edits',
    'new_editor_retention':'second_month_editor_retention',
    'majority_mobile_editors_proportion':'majority_mobile_editors',
    'mobile_app_pageviews_proportion':'mobile_app_pageviews',
    'mobile_web_pageviews_proportion':'mobile_web_pageviews',
    'domain_name':'domain',
    'wiki_name': 'wiki'
}, inplace=True)

#drop the underscore in headers
wikis.columns = wikis.columns.str.replace('_', ' ')

#add url column and put at the front
wikis['url'] = wikis['wiki'] + '#' + wikis['domain']
cols = wikis.columns.tolist()
cols = cols[-1:] + cols[:-1]
wikis = wikis[cols]

#sort by size
wikis = wikis.sort_values("overall SIZE rank").fillna(0).reset_index(drop=True)

#format to 2 decimal places
pd.options.display.float_format = '{:.2f}'.format
#format floats to use comma separators https://pandas.pydata.org/pandas-docs/version/0.23.4/options.html
pd.options.display.float_format = '{:,}'.format

In [209]:
def make_hyperlink(val):
    wiki, domain = val.split('#')
    #url = "https://custom.url/{}"
    return '=HYPERLINK("{domain}", "{wiki}")'.format(domain=domain, wiki=wiki)

wikis['url'] = wikis['url'].apply(lambda x: make_hyperlink(x))

In [216]:

def get_col_widths(dataframe):
    # First we find the maximum length of the index column   
    idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
    # Then, we concatenate this to the max of the lengths of column name and its values for each column, left to right
    return [idx_max] + [max([len(str(s)) for s in dataframe[col].values] + [len(col)]) for col in dataframe.columns]

for i, width in enumerate(get_col_widths(wikis)):
    worksheet.set_column(i, i, width)
    
    
#write to xls
#https://xlsxwriter.readthedocs.io/working_with_pandas.html
#https://xlsxwriter.readthedocs.io/example_pandas_column_formats.html
    
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('wikis.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.
wikis.to_excel(writer, sheet_name='2018_updated', float_format = "%0.2f", index=False)

# Get the xlsxwriter workbook and worksheet objects.
workbook  = writer.book
worksheet = writer.sheets['2018_updated']

# create desired xlsxwriter formats
headers = workbook.add_format({'bold': True})
alignment = workbook.add_format({'align': 'left'})

# apply formats to header and index
worksheet.set_row(0, None, headers)
worksheet.set_column(0,0, 18, alignment)
#set_column(first_col, last_col, width, cell_format, options)

# Add some cell formats
percent = workbook.add_format({'num_format': '0%'})
comma = workbook.add_format({'num_format': '#,##0.00'})


# Set the format on the percent columns.
# Set the format but not the column width.
worksheet.set_column('D:F', 18, percent)
worksheet.set_column('K:K', 18, percent)
worksheet.set_column('M:M', 18, percent)
worksheet.set_column('O:S', 18, percent)

worksheet.set_column('C:C', 18, comma)
worksheet.set_column('G:I', 18, comma)
worksheet.set_column('L:L', 18, comma)
worksheet.set_column('N:N', 18, comma)
worksheet.set_column('U:W', 18, comma)


# Add a sample alternative link format.
blue_format = workbook.add_format({
    'font_color': 'blue',
    'bold':       1,
    'underline':  1,
    'font_size':  12,
})

#Set the format and the width
worksheet.set_column('A:A', 50, blue_format)

worksheet.set_column('B:AF', 18)

# Close the Pandas Excel writer and output the Excel file.
writer.save()

In [217]:
wikis.to_csv("wikis.csv", sep=',', encoding = 'utf-8', index=False)