# Edits Automoderator Would Potentially Review in a Day
Task: [T352026](https://phabricator.wikimedia.org/T352026)

**Overview**

How many edits will Automoderator review ([T345092#9339050](https://phabricator.wikimedia.org/T345092#9339050)), given the various constraints that may be put on it to improve the false positive rate?
* How many edits are made in total?
* How many of these edits are to the main namespace?
* How many of those edits are:
    * Page creations
    * Made by bots
    * Made by administrators
    * Self-reverts

## Data-Gathering

### Setup

In [30]:
import wmfdata as wmf
import pandas as pd
import numpy as np
import great_tables as gt

from IPython.display import clear_output
import warnings

# Notebook-wide pandas display settings: never truncate columns, and show up
# to 250 rows before pandas abbreviates a frame.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 250)

In [2]:
# Reuse the Spark session that is already active, if there is one; otherwise
# start a custom YARN session sized for the query further down.
spark_session = wmf.spark.get_active_session()

# `is None` is the idiomatic identity check (PEP 8) and cannot be fooled by
# an object overriding `==`, unlike comparing `type(...)` values.
if spark_session is None:
    spark_session = wmf.spark.create_custom_session(
        master="yarn",
        app_name='automod-reviews-per-day',
        spark_config={
            "spark.driver.memory": "4g",
            "spark.dynamicAllocation.maxExecutors": 64,
            "spark.executor.memory": "16g",
            "spark.executor.cores": 4,
            "spark.sql.shuffle.partitions": 256,
            "spark.driver.maxResultSize": "2g"
        }
    )

# Only surface Spark errors, so warnings/info do not flood the cell outputs.
spark_session.sparkContext.setLogLevel("ERROR")

# Wipe whatever the session start-up printed before showing the session itself.
clear_output()

spark_session

### Query

In [5]:
# Pre-calculated revert risk scores, stored as parquet.
# Produced by the notebook at
# https://gitlab.wikimedia.org/repos/research/knowledge_integrity/-/blob/mnz/examples/examples/notebooks/revertrisk_example.ipynb
rr_scores_path = "/user/paragon/riskobservatory/revertrisk_20212022_anonymous_bot.parquet"

# Load the scores and expose them to Spark SQL under the view name `rr_scores`.
rr_scores = spark_session.read.format("parquet").load(rr_scores_path)
rr_scores.createOrReplaceTempView("rr_scores")

# Print the schema so the available columns are visible in the notebook.
rr_scores.printSchema()

                                                                                

root
 |-- rev_id: long (nullable = true)
 |-- wiki_db: string (nullable = true)
 |-- rev_timestamp: string (nullable = true)
 |-- revision_is_identity_reverted: boolean (nullable = true)
 |-- revision_seconds_to_identity_revert: long (nullable = true)
 |-- page_id: long (nullable = true)
 |-- revision_revert_risk: float (nullable = true)
 |-- user_is_anonymous: boolean (nullable = true)
 |-- user_is_bot: boolean (nullable = true)



In [14]:
# Wiki comparison snapshot (January 2023), tab-separated.
wiki_comp_url = 'https://raw.githubusercontent.com/wikimedia-research/wiki-comparison/main/data-collection/snapshots/Jan_2023.tsv'
wiki_comp = pd.read_csv(wiki_comp_url, sep='\t')

# Database codes of the first 15 Wikipedia rows, in file order.
# NOTE(review): "top 15" presumes the snapshot is already ranked — confirm.
is_wikipedia = wiki_comp['project code'] == 'wikipedia'
top15_wps = wiki_comp.loc[is_wikipedia, 'database code'].head(15).tolist()

# Same list rendered as a SQL tuple, ready to interpolate into an IN clause.
top15_wps_sql = wmf.utils.sql_tuple(top15_wps)

# mediawiki_history snapshot that the query reads from.
mwh_snapshot = '2023-12'

In [37]:
%%time

query = f"""
WITH
    base AS (
        SELECT
            wiki_db,
            DATE(event_timestamp) AS rev_dt,
            revision_id,
            event_user_text,
            page_namespace,
            revision_parent_id,
            page_is_redirect,
            CASE
                WHEN SIZE(event_user_is_bot_by) > 0 THEN TRUE
                ELSE FALSE
            END AS is_bot,
            CASE 
                WHEN ARRAY_CONTAINS(event_user_groups_historical, 'sysop') THEN TRUE
                ELSE FALSE
            END AS is_sysop,
            revision_is_identity_revert AS is_revert,
            FALSE AS is_self_revert
        FROM
            wmf.mediawiki_history
        WHERE
            snapshot = '{mwh_snapshot}'
            AND event_entity = 'revision'
            AND event_type = 'create'
            AND wiki_db IN {top15_wps_sql}
            AND YEAR(event_timestamp) = 2022
    ),
    
    self_reverts AS (
        SELECT
            b.wiki_db,
            b.rev_dt,
            b.revision_id,
            b.event_user_text,
            b.page_namespace,
            b.revision_parent_id,
            b.page_is_redirect,
            b.is_bot,
            b.is_sysop,
            b.is_revert,
            CASE
                WHEN b.event_user_text = mwh.event_user_text THEN TRUE
                ELSE FALSE
            END AS is_self_revert
        FROM
            base b
        JOIN
            wmf.mediawiki_history mwh
            ON mwh.revision_first_identity_reverting_revision_id = b.revision_id
                AND mwh.wiki_db = b.wiki_db
        WHERE
            snapshot = '{mwh_snapshot}'
            AND event_entity = 'revision'
            AND event_type = 'create'
            AND b.is_revert    
    ),
    
    edits AS (
        SELECT
            *
        FROM
            base
        WHERE
            NOT is_revert
        UNION ALL
        SELECT
            *
        FROM 
            self_reverts
    )

SELECT
    wiki_db AS `Wikipedia`,
    rev_dt,
    COUNT(DISTINCT revision_id) AS `All Edits`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 THEN revision_id ELSE NULL END) AS `Main Namespace Edits`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 AND revision_parent_id = 0 THEN revision_id ELSE NULL END) AS `Page Creations`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 AND revision_parent_id = 0 AND page_is_redirect THEN revision_id ELSE NULL END) AS `Redirect Creations`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 AND revision_parent_id = 0 AND NOT page_is_redirect THEN revision_id ELSE NULL END) AS `Non Redirect Creations`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 AND is_bot THEN revision_id ELSE NULL END) AS `Bot Edits`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 AND is_sysop THEN revision_id ELSE NULL END) AS `Sysop Edits`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 AND is_revert THEN revision_id ELSE NULL END) AS `Reverts`,
    COUNT(DISTINCT CASE WHEN page_namespace = 0 AND is_self_revert THEN revision_id ELSE NULL END) AS `Self Reverts`
FROM
    edits
GROUP BY
    wiki_db,
    rev_dt
"""

reviews_info = (
    wmf.spark.run(query)
    .sort_values('All Edits', ascending=False, ignore_index=False)
)



CPU times: user 1e+03 ms, sys: 209 ms, total: 1.21 s
Wall time: 4min 8s


                                                                                

## Results

In [48]:
# Collapse the per-day counts into an "average day" for each Wikipedia:
# mean over the days present, rounded to whole edits, largest wiki first.
#
# The result is bound back to `reviews_info`, so the guard keeps this cell
# re-runnable: once aggregated, `rev_dt` no longer exists and the step is
# skipped instead of failing on a second execution.
if 'rev_dt' in reviews_info.columns:
    # Select the count columns by name rather than by position.
    count_cols = [
        col for col in reviews_info.columns
        if col not in ('Wikipedia', 'rev_dt')
    ]
    reviews_info = (
        reviews_info
        .groupby('Wikipedia')[count_cols]
        .mean()
        .round()
        .astype(int)
        .sort_values('All Edits', ascending=False)
    )

reviews_info

Unnamed: 0_level_0,All Edits,Main Namespace Edits,Page Creations,Redirect Creations,Non Redirect Creations,Bot Edits,Sysop Edits,Reverts,Self Reverts
Wikipedia,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
enwiki,184001,112545,2114,1431,519,8728,7178,7160,934
frwiki,28491,19851,562,177,272,1212,1791,755,103
dewiki,27037,18915,564,200,286,1243,1968,757,73
ruwiki,23251,17690,1656,1291,271,4855,432,946,108
eswiki,20830,17111,454,123,234,1965,498,1942,144
itwiki,17505,14033,280,62,157,2984,1054,884,94
jawiki,15967,13655,295,112,137,1083,174,539,126
zhwiki,15917,12255,491,233,222,1729,162,396,88
arwiki,11058,7368,247,54,135,4808,3292,405,70
plwiki,9155,7196,260,88,133,2549,682,317,29
