In [107]:
import datetime as dt
import json
import re

import numpy as np
import pandas as pd
import requests
import xlsxwriter

import wmfdata as wmf
from wmfdata.utils import get_dblist

# Parameters

In [2]:
# Data will be generated for the 12 months beginning with START
START = "2021-01"

# An ordered list of wikis for which to output each metric after it is generated. 
# This is useful for spot checking the values generated against the previous snapshot.
WIKIS_TO_CHECK = [
    "enwiki",
    "eswiki",
    "dewiki",
    "jawiki",
    "frwiki",
    "commonswiki"
]

# Date manipulation

In [3]:
start = dt.datetime.strptime(START, "%Y-%m")
end = start.replace(year=start.year + 1)
last_month = dt.date.today().replace(day=1) - dt.timedelta(days=1)
# Used to name the output files
file_stem = (end - dt.timedelta(days=1)).strftime("%b %Y")

# Queries run for time >= start and time < end 
query_vars = dict(
    snapshot = last_month.strftime("%Y-%m"),
    start = start.strftime('%Y-%m-%d'), 
    end = end.strftime('%Y-%m-%d'),
    ym_start = start.strftime("%Y-%m"),
    ym_end = end.strftime("%Y-%m"),
    pv_start = start.strftime("%Y%m"),
    pv_end = end.strftime("%Y%m")
)

# List of wikis

In [4]:
# Gather all content wikis
wikis = wmf.spark.run("""
SELECT
  database_code,
  database_group AS project_code,
  language_code,
  domain_name,
  language_name,
  english_name as wiki_name
FROM canonical_data.wikis
WHERE
  database_group IN (
    "commons", "incubator", "foundation", "mediawiki", "meta", "sources",
    "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
    "wikisource", "wikiversity", "wikivoyage", "wiktionary"
  )
  AND status = "open"
  AND visibility = "public"
  AND editability = "public"
""", session_type="yarn-large")

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


PYSPARK_PYTHON=/usr/lib/anaconda-wmf/bin/python3


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark2/jars/slf4j-log4j12-1.7.16.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/01/15 01:12:29 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/01/15 01:12:30 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
22/01/15 01:12:30 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
22/01/15 01:12:30 

# Data

In [5]:
def merge_in(df, on="database_code"):
    global wikis
    wikis = pd.merge(wikis, df, how="left", on=on).fillna(0)

wikis_to_check = (
    pd.DataFrame({"database_code": WIKIS_TO_CHECK})
    .merge(wikis, how="left", on="database_code")
)

def check(df, index_col="database_code"):
    indexes = wikis_to_check[index_col].to_frame()
    return indexes.merge(df, how="left", on=index_col)

## Monthly active editors

In [6]:
mae = wmf.spark.run("""
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_active_editors,
    SUM(
        CAST(TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') AS INT)
    )/ 12 AS monthly_new_active_editors
FROM neilpquinn.editor_month
WHERE
    content_edits >= 5 
    AND month >= "{start}" 
    AND month < "{end}" 
    AND user_id != 0 
    AND bot_by_group = FALSE 
    AND (
        user_name NOT REGEXP "bot\\b"
        OR user_name IN ("Paucabot", "Niabot", "Marbot")    
    )    
GROUP BY wiki
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
22/01/15 01:14:43 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
                                                                                

In [7]:
check(mae)

Unnamed: 0,database_code,monthly_active_editors,monthly_new_active_editors
0,enwiki,32804.583333,4946.416667
1,eswiki,4644.333333,1036.583333
2,dewiki,5523.833333,434.0
3,jawiki,5389.333333,876.5
4,frwiki,5199.333333,782.0
5,commonswiki,14311.0,3653.916667


In [8]:
merge_in(mae)

## Unique devices

In [9]:
ud = wmf.spark.run("""
SELECT
    -- Strip mobile subdomains so mobile and desktop sites are combined. 
    REGEXP_REPLACE(
        REGEXP_REPLACE(
            -- The canonical domains for Wikidata and MediaWiki.org start with `www`, which 
            -- gets _replaced_ by the mobile subdomain. Combine the two possibilites for each site.
            REGEXP_REPLACE(
                REGEXP_REPLACE(domain, "^m\\\\.wikidata", "www.wikidata"),
            "^m\\\\.mediawiki", "www.mediawiki"),
        "^m\\\\.", ""),
    "\\\\.m\\\\.", ".") AS domain_name,
    SUM(uniques_estimate) / 12 AS monthly_unique_devices,
    SUM(IF(
        (domain REGEXP "^m\\\\." OR  domain REGEXP "\\\\.m\\\\."),
        uniques_estimate,
        0
    )) / SUM(uniques_estimate) AS mobile_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year, LPAD(month, 2, "0")) >= "{pv_start}" 
    AND CONCAT(year, LPAD(month, 2, "0")) < "{pv_end}"  
GROUP BY    
    REGEXP_REPLACE(
        REGEXP_REPLACE(
            -- The canonical domains for Wikidata and MediaWiki.org start with `www`, which 
            -- gets _replaced_ by the mobile subdomain. Combine the two possibilites for each site.
            REGEXP_REPLACE(
                REGEXP_REPLACE(domain, "^m\\\\.wikidata", "www.wikidata"),
            "^m\\\\.mediawiki", "www.mediawiki"),
        "^m\\\\.", ""),
    "\\\\.m\\\\.", ".")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [10]:
check(ud, index_col="domain_name")

Unnamed: 0,domain_name,monthly_unique_devices,mobile_unique_devices
0,en.wikipedia.org,809352600.0,0.696283
1,es.wikipedia.org,157325100.0,0.708869
2,de.wikipedia.org,104721600.0,0.605649
3,ja.wikipedia.org,107537000.0,0.734168
4,fr.wikipedia.org,94240730.0,0.653896
5,commons.wikimedia.org,22147650.0,0.509546


In [11]:
merge_in(ud, on="domain_name")

## Overall size rank

In [12]:
size = np.sqrt(wikis["monthly_unique_devices"] * wikis["monthly_active_editors"])
rank = size.rank(method="min", na_option="bottom", ascending=False)
wikis["overall_size_rank"] = rank

## New editor retention

In [13]:
with open("queries/new_editor_retention.hql") as f:
    q = f.read()

ner = wmf.spark.run(
    q.format(**query_vars)
)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [14]:
check(ner, "database_code")

Unnamed: 0,database_code,second_month_new_editor_retention
0,enwiki,0.074592
1,eswiki,0.051678
2,dewiki,0.072817
3,jawiki,0.1085
4,frwiki,0.057455
5,commonswiki,0.033101


In [15]:
merge_in(ner)

## Mobile editing proportion

In [16]:
mep = wmf.spark.run(
"""
SELECT
    CONCAT(project, ".org") AS domain_name,
    SUM(CASE WHEN ARRAY_CONTAINS(revision_tags, "mobile edit") THEN edit_count END)
        / SUM(edit_count) AS mobile_edits
FROM wmf.edit_hourly
WHERE
    ts >= "{start}" 
    AND ts < "{end}"
    AND snapshot = "{snapshot}"
    AND NOT user_is_bot
GROUP BY CONCAT(project, ".org")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [17]:
check(mep, index_col="domain_name")

Unnamed: 0,domain_name,mobile_edits
0,en.wikipedia.org,0.144092
1,es.wikipedia.org,0.239887
2,de.wikipedia.org,0.050405
3,ja.wikipedia.org,0.232572
4,fr.wikipedia.org,0.098942
5,commons.wikimedia.org,0.020507


In [18]:
merge_in(mep, on="domain_name")

## Bot editing proportion

In [19]:
bep = wmf.spark.run("""
SELECT
   CONCAT(project, ".org") AS domain_name,
   SUM(CASE WHEN user_is_bot THEN edit_count END)
        / SUM(edit_count) AS bot_edits
FROM wmf.edit_hourly
WHERE
    ts  >= "{start}" 
    AND ts  < "{end}"
    AND snapshot = "{snapshot}"
GROUP BY CONCAT(project, ".org")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [20]:
check(bep, index_col="domain_name")

Unnamed: 0,domain_name,bot_edits
0,en.wikipedia.org,0.175525
1,es.wikipedia.org,0.103631
2,de.wikipedia.org,0.12177
3,ja.wikipedia.org,0.115351
4,fr.wikipedia.org,0.206058
5,commons.wikimedia.org,0.462031


In [21]:
merge_in(bep, on="domain_name")

## Anonymous editing proportion

In [22]:
aep = wmf.spark.run("""
SELECT
   CONCAT(project, ".org") AS domain_name,
   SUM(CASE WHEN user_is_anonymous THEN edit_count END) / SUM(edit_count) AS anonymous_edits
FROM wmf.edit_hourly
WHERE
    ts >= "{start}" 
    AND ts < "{end}"
    AND snapshot = "{snapshot}"
    AND NOT user_is_bot
GROUP BY CONCAT(project, ".org")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [23]:
check(aep, index_col="domain_name")

Unnamed: 0,domain_name,anonymous_edits
0,en.wikipedia.org,0.170487
1,es.wikipedia.org,0.270625
2,de.wikipedia.org,0.089402
3,ja.wikipedia.org,0.24158
4,fr.wikipedia.org,0.125333
5,commons.wikimedia.org,0.012536


In [24]:
merge_in(aep, on="domain_name")

## Majority-mobile editors proportion

In [25]:
mmep = wmf.spark.run("""
WITH user AS (
    SELECT 
        wiki_db AS database_code,
        SUM(CAST(ARRAY_CONTAINS(revision_tags, "mobile edit") AS INT))
            / COUNT(*) AS mobile_editing_proportion
    FROM wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}"
        AND NOT event_user_is_anonymous
        AND SIZE(event_user_is_bot_by_historical) = 0
    GROUP BY
        wiki_db,
        event_user_text
)
SELECT
   database_code,
   SUM(CAST(mobile_editing_proportion > 0.5 AS INT))
       / COUNT(*) AS majority_mobile_editors
FROM user
GROUP BY database_code
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
22/01/15 01:16:44 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.
                                                                                

In [26]:
check(mmep)

Unnamed: 0,database_code,majority_mobile_editors
0,enwiki,0.261189
1,eswiki,0.300681
2,dewiki,0.147814
3,jawiki,0.296639
4,frwiki,0.179282
5,commonswiki,0.159998


In [27]:
merge_in(mmep)

## Revert rate

In [28]:
rr = wmf.spark.run("""
    SELECT
        wiki_db AS database_code,
        SUM(CAST(revision_is_identity_reverted AS INT)) / COUNT(*) AS revert_rate
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}" 
        AND SIZE(event_user_is_bot_by_historical) = 0 
    GROUP BY wiki_db
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [29]:
check(rr)

Unnamed: 0,database_code,revert_rate
0,enwiki,0.110091
1,eswiki,0.186721
2,dewiki,0.075174
3,jawiki,0.068036
4,frwiki,0.070056
5,commonswiki,0.013713


In [30]:
merge_in(rr)

## Pageviews

In [138]:
pv = wmf.spark.run("""
SELECT
    IF(
        project IN ("mediawiki", "wikidata"),
        CONCAT("www.", project, ".org"),
        CONCAT(project, ".org")
    ) AS domain_name, 
    SUM(view_count) / 12 AS monthly_pageviews,
    SUM(CASE WHEN access_method = "mobile web" THEN view_count END)
        / SUM(view_count) AS mobile_web_pageviews,
    SUM(CASE WHEN access_method = "mobile app" THEN view_count END)
        / SUM(view_COUNT) AS mobile_app_pageviews
FROM wmf.projectview_hourly
WHERE
    agent_type = "user" 
    AND CONCAT(year, LPAD(month, 2, "0")) >= "{pv_start}" 
    AND CONCAT(year, LPAD(month, 2, "0")) < "{pv_end}"
GROUP BY
    IF(
        project IN ("mediawiki", "wikidata"),
        CONCAT("www.", project, ".org"),
        CONCAT(project, ".org")
    )
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [139]:
check(pv, index_col="domain_name")

Unnamed: 0,domain_name,monthly_pageviews,mobile_web_pageviews,mobile_app_pageviews
0,en.wikipedia.org,7413413000.0,0.605818,0.02147223
1,es.wikipedia.org,953975000.0,0.650961,0.006659381
2,de.wikipedia.org,924580900.0,0.528485,0.04493527
3,ja.wikipedia.org,1103630000.0,0.653886,0.009408045
4,fr.wikipedia.org,716812600.0,0.603533,0.01651341
5,commons.wikimedia.org,98071580.0,0.33334,9.278937e-07


In [144]:
merge_in(pv, on="domain_name")

## Monthly active administrators

In [35]:
maa = wmf.spark.run("""
SELECT
    wiki AS database_code,
    SUM(monthly_active_administrators) / 12 AS monthly_active_administrators
FROM (
    SELECT
        wiki_db AS wiki,
        SUBSTR(log_timestamp, 1, 6) AS month,
        COUNT(DISTINCT log_actor) AS monthly_active_administrators
    FROM wmf_raw.mediawiki_logging
    WHERE
        log_type IN ("block", "delete", "protect", "rights")
        -- Omit the "delete_redir", "move_prot", and "autopromote" actions, which can be done by regular users
        AND log_action NOT IN ("autopromote", "delete_redir", "move_prot")
        AND log_timestamp >= "{pv_start}" 
        AND log_timestamp < "{pv_end}" 
        AND snapshot = "{snapshot}"
    GROUP BY wiki_db, SUBSTR(log_timestamp, 1, 6)
) mae
GROUP BY wiki
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [36]:
check(maa)

Unnamed: 0,database_code,monthly_active_administrators
0,enwiki,423.583333
1,eswiki,50.166667
2,dewiki,129.5
3,jawiki,34.0
4,frwiki,98.416667
5,commonswiki,152.916667


In [37]:
merge_in(maa)

## Monthly non-bot edits

In [38]:
mnbe = wmf.spark.run("""
SELECT
   CONCAT(project, ".org") AS domain_name,
   SUM(edit_count) / 12 AS monthly_nonbot_edits
FROM wmf.edit_hourly
WHERE
    ts >= "{start}"
    AND ts < "{end}"
    AND NOT user_is_bot
    AND snapshot = "{snapshot}"
GROUP BY CONCAT(project, ".org")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [39]:
check(mnbe, index_col="domain_name")

Unnamed: 0,domain_name,monthly_nonbot_edits
0,en.wikipedia.org,4500923.0
1,es.wikipedia.org,640597.8
2,de.wikipedia.org,782159.1
3,ja.wikipedia.org,454635.2
4,fr.wikipedia.org,738161.6
5,commons.wikimedia.org,4256383.0


In [40]:
merge_in(mnbe, on="domain_name")

## Edits Gini coefficient

In [41]:
user_edits = wmf.spark.run("""
SELECT
    wiki_db AS wiki,
    COUNT(*) AS user_edits
FROM
    wmf.mediawiki_history
WHERE
    event_entity = "revision" 
    AND event_type = "create" 
    AND snapshot = "{snapshot}" 
    AND event_timestamp >= "{start}" 
    AND event_timestamp < "{end}" 
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
    wiki_db,
    event_user_id
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [42]:
# FROM https://github.com/oliviaguest/gini
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # FROM:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array = array + 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1,array.shape[0]+1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))

egc = (
    user_edits
    .groupby("wiki")
    .apply(lambda g: gini(g["user_edits"].values))
    .reset_index()
)

egc.columns = ["database_code", "edits_Gini_coefficient"]

In [43]:
check(egc)

Unnamed: 0,database_code,edits_Gini_coefficient
0,enwiki,0.954271
1,eswiki,0.95881
2,dewiki,0.959961
3,jawiki,0.938387
4,frwiki,0.9592
5,commonswiki,0.979744


In [44]:
merge_in(egc)

## Monthly editors

In [45]:
me = wmf.spark.run("""
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_editors
FROM neilpquinn.editor_month
WHERE
    month >= "{start}" 
    AND month < "{end}" 
    AND user_id != 0 
    AND NOT bot_by_group
    AND (
        user_name NOT REGEXP "bot\\b"
        OR user_name IN ("Paucabot", "Niabot", "Marbot")
    )
GROUP BY wiki
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [46]:
check(me)

Unnamed: 0,database_code,monthly_editors
0,enwiki,135505.666667
1,eswiki,17713.916667
2,dewiki,19901.333333
3,jawiki,15551.416667
4,frwiki,20209.416667
5,commonswiki,40649.833333


In [47]:
merge_in(me)

## Unique devices per editor

In [48]:
wikis["unique_devices_per_editor"] = wikis["monthly_unique_devices"] / wikis["monthly_editors"]

In [49]:
wikis = wikis.replace([np.inf], 0)

## Content pages

In [50]:
wikis_list = wikis["database_code"].tolist()

In [51]:
ac = wmf.mariadb.run("""
SELECT
    database() AS database_code,
    ss_good_articles AS content_pages
FROM site_stats
""", wikis_list)

In [52]:
check(ac)

Unnamed: 0,database_code,content_pages
0,enwiki,6438874
1,eswiki,1745095
2,dewiki,2653479
3,jawiki,1309198
4,frwiki,2389363
5,commonswiki,77861123


In [53]:
merge_in(ac)

## Cumulative content edits

In [54]:
cce = wmf.spark.run("""
SELECT
    wiki_db AS database_code,
    COUNT(*) AS cumulative_content_edits
FROM
    wmf.mediawiki_history
WHERE
    event_entity = "revision" 
    AND event_type = "create" 
    AND snapshot = "{snapshot}" 
    AND page_namespace_is_content
    AND SIZE(event_user_is_bot_by_historical) = 0 
GROUP BY wiki_db
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [55]:
check(cce)

Unnamed: 0,database_code,cumulative_content_edits
0,enwiki,655224211
1,eswiki,94772615
2,dewiki,134182420
3,jawiki,66956406
4,frwiki,110040129
5,commonswiki,284536774


In [56]:
merge_in(cce)

## Edits per content page

In [57]:
wikis["edits_per_content_page"] = wikis["cumulative_content_edits"] / wikis["content_pages"]

## Script direction

In [58]:
rtl_wikis = get_dblist("rtl")
rtl = pd.DataFrame({"database_code": rtl_wikis, "script_direction": "right-to-left"})
merge_in(rtl)
wikis["script_direction"] = wikis["script_direction"].replace({0: "left-to-right"})

## Monthly structured discussions messages

In [59]:
msdm = wmf.mariadb.run(
"""
SELECT
    rev_user_wiki AS database_code,
    COUNT(*) / 12 AS monthly_structured_discussions_messages
FROM flowdb.flow_revision
WHERE
    rev_change_type IN ("new-post", "reply") 
    AND date_format(FROM_UNIXTIME(
        (CONV(SUBSTRING(HEX(rev_id), 1, 12), 16, 10) >> 2) / 1000),
        "%Y-%m-%d %H:%i:%S") >= "{start}" 
    AND date_format(FROM_UNIXTIME(
        (CONV(SUBSTRING(HEX(rev_id), 1, 12), 16, 10) >> 2) / 1000),
        "%Y-%m-%d %H:%i:%S") < "{end}"
GROUP BY rev_user_wiki
""".format(**query_vars), "wikishared")

In [60]:
check(msdm)

Unnamed: 0,database_code,monthly_structured_discussions_messages
0,enwiki,
1,eswiki,
2,dewiki,
3,jawiki,
4,frwiki,4110.9167
5,commonswiki,


In [61]:
merge_in(msdm)

## Visual edits

In [62]:
ve = wmf.spark.run(
"""
SELECT
    CONCAT(project, ".org") AS domain_name,
    SUM(CASE WHEN ARRAY_CONTAINS(revision_tags, "visualeditor") THEN edit_count END)
        / SUM(edit_count) AS visual_edits
FROM wmf.edit_hourly
WHERE
    ts >= "{start}" 
    AND ts < "{end}"
    AND snapshot = "{snapshot}"
    AND NOT user_is_bot
GROUP BY CONCAT(project, ".org")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [63]:
check(ve, index_col="domain_name")

Unnamed: 0,domain_name,visual_edits
0,en.wikipedia.org,0.083029
1,es.wikipedia.org,0.145526
2,de.wikipedia.org,0.098849
3,ja.wikipedia.org,0.134256
4,fr.wikipedia.org,0.15761
5,commons.wikimedia.org,0.000622


In [64]:
merge_in(ve, on="domain_name")

# Readying for output

In [145]:
wikis_formatted = (
    wikis[[
        "overall_size_rank",
        "monthly_unique_devices",
        "mobile_unique_devices",
        "monthly_pageviews",
        "mobile_web_pageviews",
        "mobile_app_pageviews",
        "unique_devices_per_editor",
        "monthly_editors",
        "monthly_active_editors",
        "monthly_active_administrators",
        "majority_mobile_editors",
        "monthly_new_active_editors",
        "second_month_new_editor_retention",
        "monthly_nonbot_edits",
        "bot_edits",
        "mobile_edits",
        "visual_edits",
        "anonymous_edits",
        "revert_rate",
        "edits_Gini_coefficient",
        "monthly_structured_discussions_messages",
        "content_pages",
        "cumulative_content_edits",
        "edits_per_content_page",
        "script_direction",
        "database_code",
        "project_code",
        "language_code",
        "language_name",
        "domain_name",
        "wiki_name"
    ]].astype({
        # These columns are not monthly averages and will never have a fractional component
        "overall_size_rank": int,
        "content_pages": int,
        "cumulative_content_edits": int
    })
    .sort_values("overall_size_rank")
)

# Convert underscores to spaces in column names in the final step, to avoid having to
# work with spaces beforehand
wikis_formatted = wikis_formatted.rename(columns=lambda c: c.replace("_", " "))

## Output CSV

In [146]:
wikis_formatted.to_csv(
    f"snapshots/{file_stem}.csv",
    float_format="%0.4f",
    index=False
)

## Output XLSX spreadsheet

In [136]:
# This produces some but not all of our desired formatting. Things still needed:
# * Make the number formats actually work
# * Automatically set appropriate column widths
# * Style hyperlinks blue
# * Set alternating row colors
# * Add a filter
# 
# Continuing to try to make xlsxwriter work does not seem like a good idea. 
# Google Sheets seems to have a more comprehensive and better maintained API, and it
# allows us to go directly to our desired destination. 

xslx_data = wikis_formatted.copy()

count_columns = [
    "overall_size_rank",
    "monthly_unique_devices",
    "monthly_pageviews",
    "unique_devices_per_editor",
    "monthly_editors",
    "monthly_active_editors",
    "monthly_active_administrators",
    "monthly_new_active_editors",
    "monthly_nonbot_edits",
    "monthly_structured_discussions_messages",
    "content_pages",
    "cumulative_content_edits",
    "edits_per_content_page",
]

proportion_columns = [
    "mobile_unique_devices",
    "mobile_web_pageviews",
    "mobile_app_pageviews",
    "majority_mobile_editors",
    "second_month_new_editor_retention",
    "bot_edits",
    "mobile_edits",
    "visual_edits",
    "anonymous_edits",
    "revert_rate",
    "edits_Gini_coefficient",
]

def get_column_positions(column_names):
    return xslx_data.columns.get_indexer_for(column_names)

count_column_positions = get_column_positions(count_columns)
proportion_column_positions = get_column_positions(proportion_columns)

# Helpful documentation:
# https://xlsxwriter.readthedocs.io/working_with_pandas.html
# https://xlsxwriter.readthedocs.io/example_pandas_column_formats.html
with xlsxwriter.Workbook(f'{file_stem}.xlsx') as workbook:
    worksheet = workbook.add_worksheet(file_stem)
    
    basic_settings = {
        "font_name": "Arial",
        "font_size": 10,
        "align": "left"
    }
    basic_format = workbook.add_format(basic_settings)

    def add_derived_format(extra_settings):
        settings = basic_settings.copy()
        settings.update(extra_settings)
        return workbook.add_format(settings)
    
    header_format = add_derived_format({
        "bold": True,
        "text_wrap": True   
    })
    
    count_format = add_derived_format({
        "align": "right",
        "num_format": "#,##0"
    })
    
    percent_format = add_derived_format({
        "align": "right",
        "num_format": "0.0%"
    })

    # Turn the wiki name column into a link formula
    wiki_name = (
        '=HYPERLINK("https://'
        + wikis_formatted["domain name"]
        + '", "'
        + wikis_formatted["wiki name"]
        + '")'
    )

    xlsx_data = wikis_formatted.drop(columns=["wiki name"])
    xlsx_data.insert(0, "wiki name", wiki_name)
    
    for position, name in enumerate(xlsx_data.columns.values):
        worksheet.write(0, position, name, header_format)
        
        if position in count_column_positions:
            column_format = count_format
        elif position in proportion_column_positions:
            column_format = percent_format
        else: 
            column_format = basic_format

        worksheet.write_column(1, position, xlsx_data[name], column_format)
    
    worksheet.freeze_panes(1, 1)
    worksheet.autofilter(0, 0, len(xlsx_data.index), len(xlsx_data.columns) - 1)