# Edit Data Gathering

This notebook gathers edit data using MediaWiki history, which allows us to answer questions about our high-level metrics: activation, retention, productivity, and revert proportions.

In [1]:
import datetime as dt
import json
import os

import numpy as np
import pandas as pd

from wmfdata import spark, mariadb

In [2]:
## Configuration variables

## Registration window of the experiment: start and end timestamps of
## user registrations, taken from T286816-user-dataset.ipynb.
## Edit data gathering automatically runs 15 days past the end timestamp.
exp_start_ts = dt.datetime(year = 2021, month = 5, day = 27,
                           hour = 19, minute = 12, second = 3)
exp_end_ts = dt.datetime(year = 2021, month = 10, day = 14)

## Wikis we gather data for (same list as in the notebook referenced above)
wikis = [
    'arwiki',
    'bnwiki',
    'cswiki',
    'viwiki',
    'fawiki',
    'frwiki',
    'huwiki',
    'plwiki',
    'rowiki',
    'ruwiki',
]

## mediawiki_history snapshot to query
mwh_snapshot = '2021-10'

## Table holding the user dataset (from the notebook referenced above)
canonical_user_table = 'nettrom_growth.addalink_exp_users'

## Path of the file the edit count dataset is written to
edit_data_output_filename = 'datasets/add-a-link-edit-data.tsv'

## Grabbing editing data

We gather edit data in accordance with the key metrics for NEWTEA. Those are:

* Editor activation
* Editor retention
* Average number of edits in the first two weeks after registration
* Proportion of constructive edits (i.e. not reverted within 48 hours)

This means we gather edit and revert counts so we can model activation, retention, and productivity with and without reverted edits as we see fit.

Per NEWTEA we'll also separate edits by namespaces:

1. All namespaces.
2. Only Main and Talk (namespaces 0 and 1)
3. All other namespaces.

Since 2 and 3 are mutually exclusive, we'll count them separately and sum them up to get the first.

In [3]:
## Spark SQL template producing per-user edit and revert counts.
##
## Placeholders that must be filled in before the query is run:
## {snapshot}, {wiki_list}, {start_date}, {end_date}, and {exp_user_table}.
##
## The `edits` CTE aggregates revision-create events per (wiki_db, event_user_id),
## split by namespace group (0 & 1 versus all other namespaces) and by the
## number of seconds between event_user_creation_timestamp and the edit:
##   * "24hrs" columns: less than 86400 seconds
##   * "2w" columns:    BETWEEN 86400 AND 15*86400 seconds (both bounds inclusive)
## The two windows therefore do not overlap. The "reverts" columns count the
## subset of those edits that were identity-reverted in under 48 hours.
##
## The final SELECT LEFT JOINs the rows of {exp_user_table} to `edits`, so users
## without any matching edits are kept, with every count COALESCEd to 0.
##
## NOTE(review): the time windows are anchored on mediawiki_history's
## event_user_creation_timestamp, while the output carries the user table's
## user_registration_timestamp -- confirm the two agree for these users.
edit_data_query = '''
WITH edits AS (
    SELECT wiki_db, event_user_id AS user_id,
    -- ns 0 & 1 edits on the first day
    SUM(IF(page_namespace IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_article_edits_24hrs,
    -- ns 0 & 1 edits on the first day that were reverted
    SUM(IF(page_namespace IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_article_reverts_24hrs,
    --  other namespace edits on the first day
    SUM(IF(page_namespace NOT IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_other_edits_24hrs,
    -- other namespace reverts on the first day
    SUM(IF(page_namespace NOT IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) < 86400, 1, 0))
        AS num_other_reverts_24hrs,
    -- ns 0 & 1 edits on days 1–15
    SUM(IF(page_namespace IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_article_edits_2w,
    -- ns 0 & 1 edits on days 1–15 that were reverted
    SUM(IF(page_namespace IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_article_reverts_2w,
    -- other namespace edits on days 1–15
    SUM(IF(page_namespace NOT IN (0, 1)
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_other_edits_2w,
    -- other namespace reverts on days 1–15
    SUM(IF(page_namespace NOT IN (0, 1) AND revision_is_identity_reverted = true AND revision_seconds_to_identity_revert < 60*60*48
        AND unix_timestamp(event_timestamp) - unix_timestamp(event_user_creation_timestamp) BETWEEN 86400 AND 15*86400, 1, 0))
        AS num_other_reverts_2w
    FROM wmf.mediawiki_history
    WHERE snapshot = "{snapshot}"
    AND event_entity = "revision"
    AND event_type = "create"
    AND wiki_db IN ({wiki_list})
    AND event_timestamp > "{start_date}"
    AND event_timestamp < "{end_date}"
    GROUP BY wiki_db, event_user_id
),
users AS (
    SELECT
        wiki_db,
        user_id,
        user_registration_timestamp,
        reg_on_mobile,
        hp_enabled,
        hp_variant
    FROM {exp_user_table}
)
SELECT
    users.wiki_db,
    users.user_id,
    users.user_registration_timestamp,
    users.reg_on_mobile,
    users.hp_enabled,
    users.hp_variant,
    COALESCE(num_article_edits_24hrs, 0) AS num_article_edits_24hrs,
    COALESCE(num_article_reverts_24hrs, 0) AS num_article_reverts_24hrs,
    COALESCE(num_other_edits_24hrs, 0) AS num_other_edits_24hrs,
    COALESCE(num_other_reverts_24hrs, 0) AS num_other_reverts_24hrs,
    COALESCE(num_article_edits_2w, 0) AS num_article_edits_2w,
    COALESCE(num_article_reverts_2w, 0) AS num_article_reverts_2w,
    COALESCE(num_other_edits_2w, 0) AS num_other_edits_2w,
    COALESCE(num_other_reverts_2w, 0) AS num_other_reverts_2w
FROM users
LEFT JOIN edits
ON users.wiki_db = edits.wiki_db
AND users.user_id = edits.user_id
'''

In [4]:
## The query window opens on the experiment start date and closes
## 15 days after the experiment end date. The extension gives users
## who registered during the final 15 days the same amount of time
## to edit as everyone else.

all_users_edit_data = spark.run(
    edit_data_query.format_map({
        'snapshot': mwh_snapshot,
        # Comma-separated list of double-quoted wiki names for the SQL IN clause
        'wiki_list': ','.join('"{}"'.format(wiki) for wiki in wikis),
        # str() of a date is its ISO 8601 form (YYYY-MM-DD)
        'start_date': str(exp_start_ts.date()),
        'end_date': str(exp_end_ts.date() + dt.timedelta(15)),
        'exp_user_table': canonical_user_table,
    })
)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [5]:
## Number of rows in the gathered dataset
all_users_edit_data.shape[0]

130179

In [None]:
## Preview the first few users with at least one ns 0 & 1 edit on their first day
all_users_edit_data[all_users_edit_data['num_article_edits_24hrs'].gt(0)].head()

Write out the canonical edit dataset for importing into R.

In [7]:
## Create the output directory if it is missing, so the write cannot fail
## on a non-existent directory after the Spark query has already been run.
## A filename without a directory part falls back to the current directory.
os.makedirs(os.path.dirname(edit_data_output_filename) or '.', exist_ok = True)

## Tab-separated, with a header row and without the DataFrame index.
all_users_edit_data.to_csv(edit_data_output_filename,
                           header = True, index = False, sep = '\t')