In [1]:
import numpy as np
import pandas as pd

import wmfdata as wmf
from wmfdata.utils import get_dblist

# Parameters

In [2]:
# Data will be generated for the 12 months ending with and including SNAPSHOT
# This is also the mediawiki_history snapshot necessary for the calculations.
SNAPSHOT = "2021-12"

# An ordered list of wikis for which to output each metric after it is generated. 
# This is useful for spot checking the values generated against the previous snapshot.
WIKIS_TO_CHECK = [
    "enwiki",
    "eswiki",
    "jawiki",
    "dewiki",
    "frwiki",
    "ruwiki"
]

# Date manipulation

In [3]:
snapshot = pd.Period(SNAPSHOT)
start = (snapshot - 11).start_time
end = (snapshot + 1).start_time
# Used to name the output files
file_stem = snapshot.strftime("%b %Y")

# Start included, end excluded
query_vars = dict(
    snapshot=snapshot.strftime("%Y-%m"),
    start=start.strftime('%Y-%m-%d'), 
    end=end.strftime('%Y-%m-%d'),
    ym_start=start.strftime("%Y-%m"),
    ym_end=end.strftime("%Y-%m"),
    pv_start=start.strftime("%Y%m"),
    pv_end=end.strftime("%Y%m"),
    # New editor retention needs different time boundaries, since we define retention
    # in a given month as the status of the new users who registered two months prior
    ner_cohort_start=(snapshot - 14).strftime("%Y-%m"),
    ner_cohort_end=(snapshot - 1).strftime("%Y-%m")
)

# List of wikis

In [4]:
# The canonical_data.wikis table is not automatically updated,
# so you should first update it manually by running the notebook:
# https://github.com/wikimedia-research/canonical-data/blob/master/generation/wikis.ipynb
#
# This is important since new wikis are regularly opened and old ones are sometimes closed.
wikis = wmf.spark.run("""
SELECT
  database_code,
  database_group AS project_code,
  language_code,
  domain_name,
  language_name,
  english_name as wiki_name
FROM canonical_data.wikis
WHERE
  database_group IN (
    "commons", "incubator", "foundation", "mediawiki", "meta", "sources",
    "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
    "wikisource", "wikiversity", "wikivoyage", "wiktionary"
  )
  AND status = "open"
  AND visibility = "public"
  AND editability = "public"
""", session_type="yarn-large")

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


PYSPARK_PYTHON=/usr/lib/anaconda-wmf/bin/python3


SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark2/jars/slf4j-log4j12-1.7.16.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/04 17:06:29 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/02/04 17:06:29 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
22/02/04 17:06:29 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
22/02/04 17:06:29 

# Check for missing wikis in mediawiki_history

Sometimes, wikis are not added to mediawiki_history after they're created (e.g. [T299548](https://phabricator.wikimedia.org/T299548), [T220456](https://phabricator.wikimedia.org/T220456)). Let's check for that.

In [5]:
mwh_wikis = wmf.spark.run("""
SELECT DISTINCT wiki_db AS database_code
FROM wmf.mediawiki_history
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
22/02/04 17:07:01 WARN Utils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.
22/02/04 17:08:53 WARN SharedInMemoryCache: Evicting cached table partition metadata from memory due to size constraints (spark.sql.hive.filesourcePartitionFileCacheSize = 262144000 bytes). This may impact query planning performance.
org.apache.spark.shuffle.FetchFailedException: java.util.concurrent.TimeoutException: Timeout waiting for task.
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.throwFetchFailedException(ShuffleBlockFetcherIterator.scala:554)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:485)
	at org.apache.spark.storage.ShuffleBlockFetcherIterator.next(ShuffleBlockFetcherIterator.scala:64)
	at scala.collection.Iterator$$anon$12.nextCur(Iterator.scala:435)
	at scala.c

In [6]:
mwh_missing_wikis = set(wikis["database_code"]) - set(mwh_wikis["database_code"])
mwh_missing_wikis

{'altwiki',
 'amiwiki',
 'banwikisource',
 'bclwiktionary',
 'dagwiki',
 'diqwiktionary',
 'jvwikisource',
 'lmowiktionary',
 'madwiki',
 'mniwiki',
 'mniwiktionary',
 'mnwwiktionary',
 'niawiki',
 'niawiktionary',
 'pwnwiki',
 'shiwiki',
 'skrwiki',
 'skrwiktionary',
 'smnwiki',
 'taywiki',
 'trvwiki',
 'wawikisource'}

These missing wikis should be reported to Data Engineering. In the meantime, let's continue generating the data for the remaining wikis.

In [7]:
wikis = wikis.drop(
    index=wikis.query("database_code in @mwh_missing_wikis").index
)

# Data collection

In [8]:
def merge_in(df, on="database_code"):
    global wikis
    wikis = pd.merge(wikis, df, how="left", on=on).fillna(0)

wikis_to_check = (
    pd.DataFrame({"database_code": WIKIS_TO_CHECK})
    .merge(wikis, how="left", on="database_code")
)

def check(df, index_col="database_code"):
    indexes = wikis_to_check[index_col].to_frame()
    return indexes.merge(df, how="left", on=index_col)

## Unique devices

In [9]:
ud = wmf.spark.run("""
SELECT
    -- Strip mobile subdomains so mobile and desktop sites are combined. 
    REGEXP_REPLACE(
        REGEXP_REPLACE(
            -- The canonical domains for Wikidata and MediaWiki.org start with `www`, which 
            -- gets _replaced_ by the mobile subdomain. Combine the two possibilites for each site.
            REGEXP_REPLACE(
                REGEXP_REPLACE(domain, "^m\\\\.wikidata", "www.wikidata"),
            "^m\\\\.mediawiki", "www.mediawiki"),
        "^m\\\\.", ""),
    "\\\\.m\\\\.", ".") AS domain_name,
    SUM(uniques_estimate) / 12 AS monthly_unique_devices,
    SUM(IF(
        (domain REGEXP "^m\\\\." OR  domain REGEXP "\\\\.m\\\\."),
        uniques_estimate,
        0
    )) / SUM(uniques_estimate) AS mobile_unique_devices
FROM wmf.unique_devices_per_domain_monthly
WHERE 
    CONCAT(year, LPAD(month, 2, "0")) >= "{pv_start}" 
    AND CONCAT(year, LPAD(month, 2, "0")) < "{pv_end}"  
GROUP BY    
    REGEXP_REPLACE(
        REGEXP_REPLACE(
            -- The canonical domains for Wikidata and MediaWiki.org start with `www`, which 
            -- gets _replaced_ by the mobile subdomain. Combine the two possibilites for each site.
            REGEXP_REPLACE(
                REGEXP_REPLACE(domain, "^m\\\\.wikidata", "www.wikidata"),
            "^m\\\\.mediawiki", "www.mediawiki"),
        "^m\\\\.", ""),
    "\\\\.m\\\\.", ".")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [10]:
check(ud, index_col="domain_name")

Unnamed: 0,domain_name,monthly_unique_devices,mobile_unique_devices
0,en.wikipedia.org,809352600.0,0.696283
1,es.wikipedia.org,157325100.0,0.708869
2,ja.wikipedia.org,107537000.0,0.734168
3,de.wikipedia.org,104721600.0,0.605649
4,fr.wikipedia.org,94240730.0,0.653896
5,ru.wikipedia.org,99008180.0,0.654718


In [11]:
merge_in(ud, on="domain_name")

## Pageviews

In [12]:
pv = wmf.spark.run("""
SELECT
    IF(
        project IN ("mediawiki", "wikidata"),
        CONCAT("www.", project, ".org"),
        CONCAT(project, ".org")
    ) AS domain_name, 
    SUM(view_count) / 12 AS monthly_pageviews,
    SUM(CASE WHEN access_method = "mobile web" THEN view_count END)
        / SUM(view_count) AS mobile_web_pageviews,
    SUM(CASE WHEN access_method = "mobile app" THEN view_count END)
        / SUM(view_COUNT) AS mobile_app_pageviews
FROM wmf.projectview_hourly
WHERE
    agent_type = "user" 
    AND CONCAT(year, LPAD(month, 2, "0")) >= "{pv_start}" 
    AND CONCAT(year, LPAD(month, 2, "0")) < "{pv_end}"
GROUP BY
    IF(
        project IN ("mediawiki", "wikidata"),
        CONCAT("www.", project, ".org"),
        CONCAT(project, ".org")
    )
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [13]:
check(pv, index_col="domain_name")

Unnamed: 0,domain_name,monthly_pageviews,mobile_web_pageviews,mobile_app_pageviews
0,en.wikipedia.org,7413413000.0,0.605818,0.021472
1,es.wikipedia.org,953975000.0,0.650961,0.006659
2,ja.wikipedia.org,1103630000.0,0.653886,0.009408
3,de.wikipedia.org,924580900.0,0.528485,0.044935
4,fr.wikipedia.org,716812600.0,0.603533,0.016513
5,ru.wikipedia.org,825617600.0,0.571403,0.014359


In [14]:
merge_in(pv, on="domain_name")

## Monthly editors

In [17]:
me = wmf.spark.run("""
SELECT
    wiki AS database_code,
    COUNT(*) / 12 AS monthly_editors,
    SUM(CAST(content_edits >= 5 AS INT)) / 12 AS monthly_active_editors,
    SUM(CAST(
        content_edits >= 5
        AND TRUNC(user_registration, 'MM') = TRUNC(month, 'MM') 
    AS INT)) / 12 AS monthly_new_active_editors
FROM wmf_product.editor_month
WHERE
    month >= "{start}" 
    AND month < "{end}" 
    AND user_id != 0
    -- Despite the name, this field identifies bots using both the name and group strategies
    AND NOT bot_by_group
GROUP BY wiki
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [18]:
check(me)

Unnamed: 0,database_code,monthly_editors,monthly_active_editors,monthly_new_active_editors
0,enwiki,135506.25,32804.666667,4946.416667
1,eswiki,17713.916667,4644.333333,1036.583333
2,jawiki,15551.75,5389.5,876.5
3,dewiki,19901.333333,5523.833333,434.0
4,frwiki,20209.416667,5199.333333,782.0
5,ruwiki,12347.333333,3630.833333,543.916667


In [19]:
merge_in(me)

## Monthly active administrators

In [20]:
maa = wmf.spark.run("""
SELECT
    wiki AS database_code,
    SUM(monthly_active_administrators) / 12 AS monthly_active_administrators
FROM (
    SELECT
        wiki_db AS wiki,
        SUBSTR(log_timestamp, 1, 6) AS month,
        COUNT(DISTINCT log_actor) AS monthly_active_administrators
    FROM wmf_raw.mediawiki_logging
    WHERE
        log_type IN ("block", "delete", "protect", "rights")
        -- Omit the "delete_redir", "move_prot", and "autopromote" actions, which can be done by regular users
        AND log_action NOT IN ("autopromote", "delete_redir", "move_prot")
        AND log_timestamp >= "{pv_start}" 
        AND log_timestamp < "{pv_end}" 
        AND snapshot = "{snapshot}"
    GROUP BY wiki_db, SUBSTR(log_timestamp, 1, 6)
) mae
GROUP BY wiki
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [21]:
check(maa)

Unnamed: 0,database_code,monthly_active_administrators
0,enwiki,423.583333
1,eswiki,50.166667
2,jawiki,34.0
3,dewiki,129.5
4,frwiki,98.416667
5,ruwiki,107.75


In [22]:
merge_in(maa)

## Majority-mobile editors proportion

In [23]:
mmep = wmf.spark.run("""
WITH user AS (
    SELECT 
        wiki_db AS database_code,
        SUM(CAST(ARRAY_CONTAINS(revision_tags, "mobile edit") AS INT))
            / COUNT(*) AS mobile_editing_proportion
    FROM wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}"
        AND NOT event_user_is_anonymous
        AND SIZE(event_user_is_bot_by_historical) = 0
    GROUP BY
        wiki_db,
        event_user_text
)
SELECT
   database_code,
   SUM(CAST(mobile_editing_proportion > 0.5 AS INT))
       / COUNT(*) AS majority_mobile_editors
FROM user
GROUP BY database_code
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [24]:
check(mmep)

Unnamed: 0,database_code,majority_mobile_editors
0,enwiki,0.261189
1,eswiki,0.300681
2,jawiki,0.296639
3,dewiki,0.147814
4,frwiki,0.179282
5,ruwiki,0.296091


In [25]:
merge_in(mmep)

## New editor retention

In [26]:
# As of February 2022, `cchen.new_editors` will soon stop receiving updates.
# Consider using `wmf_product.new_editors` instead.
ner = wmf.spark.run(
"""
SELECT 
    wiki AS database_code,
    SUM(CAST(2nd_month_edits >= 1 AS INT))
        / SUM(CAST(1st_month_edits >= 1 AS INT)) AS second_month_new_editor_retention
FROM cchen.new_editors
WHERE 
    cohort >= "{ner_cohort_start}" and
    cohort < "{ner_cohort_end}"
GROUP BY wiki
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [27]:
check(ner, "database_code")

Unnamed: 0,database_code,second_month_new_editor_retention
0,enwiki,0.076538
1,eswiki,0.050753
2,jawiki,0.106032
3,dewiki,0.074379
4,frwiki,0.063499
5,ruwiki,0.06289


In [28]:
merge_in(ner)

## Monthly non-bot edits

In [29]:
mnbe = wmf.spark.run("""
SELECT
    CONCAT(project, ".org") AS domain_name,
    SUM(edit_count) / 12 AS monthly_nonbot_edits,
    SUM(CASE WHEN ARRAY_CONTAINS(revision_tags, "mobile edit") THEN edit_count END)
        / SUM(edit_count) AS mobile_edits,
    SUM(CASE WHEN ARRAY_CONTAINS(revision_tags, "visualeditor") THEN edit_count END)
        / SUM(edit_count) AS visual_edits,
    SUM(CASE WHEN user_is_anonymous THEN edit_count END) / SUM(edit_count) AS anonymous_edits
FROM wmf.edit_hourly
WHERE
    ts >= "{start}"
    AND ts < "{end}"
    AND NOT user_is_bot
    AND snapshot = "{snapshot}"
GROUP BY CONCAT(project, ".org")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [30]:
check(mnbe, index_col="domain_name")

Unnamed: 0,domain_name,monthly_nonbot_edits,mobile_edits,visual_edits,anonymous_edits
0,en.wikipedia.org,4500923.0,0.144092,0.083029,0.170487
1,es.wikipedia.org,640597.8,0.239887,0.145526,0.270625
2,ja.wikipedia.org,454635.2,0.232572,0.134256,0.24158
3,de.wikipedia.org,782159.1,0.050405,0.098849,0.089402
4,fr.wikipedia.org,738161.6,0.098942,0.15761,0.125333
5,ru.wikipedia.org,506692.3,0.10602,0.150411,0.186179


In [31]:
merge_in(mnbe, on="domain_name")

## Bot editing proportion

In [32]:
bep = wmf.spark.run("""
SELECT
   CONCAT(project, ".org") AS domain_name,
   SUM(CASE WHEN user_is_bot THEN edit_count END)
        / SUM(edit_count) AS bot_edits
FROM wmf.edit_hourly
WHERE
    ts  >= "{start}" 
    AND ts  < "{end}"
    AND snapshot = "{snapshot}"
GROUP BY CONCAT(project, ".org")
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [33]:
check(bep, index_col="domain_name")

Unnamed: 0,domain_name,bot_edits
0,en.wikipedia.org,0.175525
1,es.wikipedia.org,0.103631
2,ja.wikipedia.org,0.115351
3,de.wikipedia.org,0.12177
4,fr.wikipedia.org,0.206058
5,ru.wikipedia.org,0.199857


In [34]:
merge_in(bep, on="domain_name")

## Revert rate

In [35]:
rr = wmf.spark.run("""
    SELECT
        wiki_db AS database_code,
        SUM(CAST(revision_is_identity_reverted AS INT)) / COUNT(*) AS revert_rate
    FROM
        wmf.mediawiki_history
    WHERE
        event_entity = "revision" 
        AND event_type = "create" 
        AND snapshot = "{snapshot}" 
        AND event_timestamp >= "{start}" 
        AND event_timestamp < "{end}" 
        AND SIZE(event_user_is_bot_by_historical) = 0 
    GROUP BY wiki_db
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [36]:
check(rr)

Unnamed: 0,database_code,revert_rate
0,enwiki,0.110091
1,eswiki,0.186721
2,jawiki,0.068036
3,dewiki,0.075174
4,frwiki,0.070056
5,ruwiki,0.104664


In [37]:
merge_in(rr)

## Edits Gini coefficient

In [39]:
user_edits = wmf.spark.run("""
SELECT
    wiki_db AS wiki,
    COUNT(*) AS user_edits
FROM
    wmf.mediawiki_history
WHERE
    event_entity = "revision" 
    AND event_type = "create" 
    AND snapshot = "{snapshot}" 
    AND event_timestamp >= "{start}" 
    AND event_timestamp < "{end}" 
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
    wiki_db,
    event_user_id
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [40]:
# from https://github.com/oliviaguest/gini
# licensed under CC0 (public domain)
def gini(array):
    """Calculate the Gini coefficient of a numpy array."""
    # based on bottom eq:
    # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
    # FROM:
    # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
    # All values are treated equally, arrays must be 1d:
    array = array.flatten()
    if np.amin(array) < 0:
        # Values cannot be negative:
        array -= np.amin(array)
    # Values cannot be 0:
    array = array + 0.0000001
    # Values must be sorted:
    array = np.sort(array)
    # Index per array element:
    index = np.arange(1, array.shape[0] + 1)
    # Number of array elements:
    n = array.shape[0]
    # Gini coefficient:
    return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))

egc = (
    user_edits
    .groupby("wiki")
    .apply(lambda g: gini(g["user_edits"].values))
    .reset_index()
)

egc.columns = ["database_code", "edits_Gini_coefficient"]

In [41]:
check(egc)

Unnamed: 0,database_code,edits_Gini_coefficient
0,enwiki,0.954271
1,eswiki,0.95881
2,jawiki,0.938387
3,dewiki,0.959961
4,frwiki,0.9592
5,ruwiki,0.960198


In [42]:
merge_in(egc)

## Content pages

Note that this query gives the number of content pages _at query time_, not at the end of the snapshot period. Unless the gap between those times is many months, the difference should be pretty small.

Ideally, we would query `mediawiki_history` or the AQS API instead for the count as of the end of the snapshot period. However, this would introduce a new problem: the official [content pages definition](https://www.mediawiki.org/wiki/Manual:Article_count) specifies that pages (in addition to being in a content namespace, not being deleted, and not being a redirect) must also contain one internal link. This information isn't available in `mediawiki_history`, and the article count available through the AQS API probably doesn't take it into account either.


In [43]:
wikis_list = wikis["database_code"].tolist()

ac = wmf.mariadb.run("""
SELECT
    database() AS database_code,
    ss_good_articles AS content_pages
FROM site_stats
""", wikis_list)

In [44]:
check(ac)

Unnamed: 0,database_code,content_pages
0,enwiki,6448340
1,eswiki,1751066
2,jawiki,1311723
3,dewiki,2660793
4,frwiki,2395160
5,ruwiki,1792046


In [45]:
merge_in(ac)

## All-time content edits

In [57]:
cce = wmf.spark.run("""
SELECT
    wiki_db AS database_code,
    COUNT(*) AS all_time_content_edits
FROM
    wmf.mediawiki_history
WHERE
    event_entity = "revision" 
    AND event_type = "create" 
    AND snapshot = "{snapshot}" 
    AND page_namespace_is_content
    AND SIZE(event_user_is_bot_by_historical) = 0 
GROUP BY wiki_db
""".format(**query_vars))

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
22/02/04 21:24:50 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
22/02/04 21:24:50 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
22/02/04 21:24:50 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
22/02/04 21:24:50 WARN Utils: Service 'sparkDriver' could not bind on port 12002. Attempting port 12003.
22/02/04 21:24:50 WARN Utils: Service 'sparkDriver' could not bind on port 12003. Attempting port 12004.
22/02/04 21:24:50 WARN Utils: Service 'sparkDriver' could not bind on port 12004. Attempting port 12005.
22/02/04 21:24:50 WARN Utils: Service 'sparkDriver' could not bind on port 12005. Attempting port 12006.
22/02/04 21:24:50 WARN Utils: Service 'sparkDriver' could not bind on port 12006. Attempting port 12007.
22/02/

In [58]:
check(cce)

Unnamed: 0,database_code,all_time_content_edits
0,enwiki,655224211
1,eswiki,94772615
2,jawiki,66956406
3,dewiki,134182420
4,frwiki,110040129
5,ruwiki,72128426


In [59]:
merge_in(cce)

## Script direction

In [49]:
rtl_wikis = get_dblist("rtl")
rtl = pd.DataFrame({"database_code": rtl_wikis, "script_direction": "right-to-left"})
merge_in(rtl)
wikis["script_direction"] = wikis["script_direction"].replace({0: "left-to-right"})

## Unique devices per editor

In [50]:
wikis["unique_devices_per_editor"] = wikis["monthly_unique_devices"] / wikis["monthly_editors"]

In [51]:
wikis = wikis.replace([np.inf], 0)

## Overall size rank

In [52]:
size = np.sqrt(wikis["monthly_unique_devices"] * wikis["monthly_active_editors"])
rank = size.rank(method="min", na_option="bottom", ascending=False)
wikis["overall_size_rank"] = rank

## Edits per content page

In [62]:
wikis["all_time_edits_per_content_page"] = wikis["all_time_content_edits"] / wikis["content_pages"]

# Readying for output

In [64]:
wikis_formatted = (
    wikis[[
        "overall_size_rank",
        "monthly_unique_devices",
        "mobile_unique_devices",
        "monthly_pageviews",
        "mobile_web_pageviews",
        "mobile_app_pageviews",
        "unique_devices_per_editor",
        "monthly_editors",
        "majority_mobile_editors",
        "monthly_active_editors",
        "monthly_active_administrators",
        "monthly_new_active_editors",
        "second_month_new_editor_retention",
        "bot_edits",
        "monthly_nonbot_edits",
        "mobile_edits",
        "visual_edits",
        "anonymous_edits",
        "revert_rate",
        "edits_Gini_coefficient",
        "content_pages",
        "all_time_content_edits",
        "all_time_edits_per_content_page",
        "script_direction",
        "database_code",
        "project_code",
        "language_code",
        "language_name",
        "domain_name",
        "wiki_name"
    ]].astype({
        # These columns are not monthly averages and will never have a fractional component
        "overall_size_rank": int,
        "content_pages": int,
        "all_time_content_edits": int
    })
    .sort_values("overall_size_rank")
)

# Convert underscores to spaces in column names in the final step, to avoid having to
# work with spaces beforehand
wikis_formatted = wikis_formatted.rename(columns=lambda c: c.replace("_", " "))

## Output CSV

In [65]:
wikis_formatted.to_csv(
    f"snapshots/{file_stem}.csv",
    float_format="%0.4f",
    index=False
)