# How many edits would Automoderator revert per day at different caution levels?

[TASK: T348869](https://phabricator.wikimedia.org/T348869)

**Purpose**<br>As part of the model testing process, we want to understand how many edits we can expect Automoderator to revert per day, on average.
This will help the community understand the potential impact of Automoderator. For the analysis, [revert risk scores generated by WMF's Research team](https://gitlab.wikimedia.org/repos/research/knowledge_integrity/-/blob/mnz/examples/examples/notebooks/revertrisk_example.ipynb) were used; edits made by admins or bots, self-reverts, and new page creations were excluded.

**Results**<br>Average number of edits Automoderator would potentially revert per day at different thresholds

# Data-Gathering

In [2]:
import pandas as pd
import wmfdata as wmf

pd.options.display.max_columns = None
from IPython.display import clear_output

import warnings

## spark_session

In [3]:
# Stop an already-active Spark session, if there is one, and report when
# there is none.
spark_session = wmf.spark.get_active_session()

# `is not None` is the idiomatic identity check (PEP 8); comparing
# type objects with `!=` is an indirect way of saying the same thing.
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')

no active session


In [4]:
# Spark settings passed to the custom session below.
session_settings = {
    "spark.driver.memory": "4g",
    "spark.dynamicAllocation.maxExecutors": 64,
    "spark.executor.memory": "16g",
    "spark.executor.cores": 4,
    "spark.sql.shuffle.partitions": 256,
    "spark.driver.maxResultSize": "2g",
}

# Start a YARN-backed session dedicated to this analysis.
spark_session = wmf.spark.create_custom_session(
    master="yarn",
    app_name='automod-activity',
    spark_config=session_settings,
)

# Drop the start-up output from the cell, then restrict Spark logging to errors.
clear_output()
spark_session.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: displays the session summary in the notebook.
spark_session

## query

In [5]:
# Location of the pre-computed revert risk scores, produced with
# https://gitlab.wikimedia.org/repos/research/knowledge_integrity/-/blob/mnz/examples/examples/notebooks/revertrisk_example.ipynb
rr_scores_path = '/user/paragon/riskobservatory/revertrisk_20212022_anonymous_bot.parquet'

# Load the scores and show which columns are available.
rr_scores = spark_session.read.parquet(rr_scores_path)
rr_scores.printSchema()

# Expose the DataFrame to Spark SQL under the name used by the query below.
rr_scores.createOrReplaceTempView('rr_scores')

                                                                                

root
 |-- rev_id: long (nullable = true)
 |-- wiki_db: string (nullable = true)
 |-- rev_timestamp: string (nullable = true)
 |-- revision_is_identity_reverted: boolean (nullable = true)
 |-- revision_seconds_to_identity_revert: long (nullable = true)
 |-- page_id: long (nullable = true)
 |-- revision_revert_risk: float (nullable = true)
 |-- user_is_anonymous: boolean (nullable = true)
 |-- user_is_bot: boolean (nullable = true)



In [8]:
# Risk thresholds: 0.99 down to 0.95 in steps of 0.005, then 0.94 down to
# 0.90 in steps of 0.01. Both ranges are rounded so that binary
# floating-point noise (e.g. 0.9750000000000001) can never leak into the SQL
# literals or the column aliases derived from `str(threshold)` below.
risk_thresholds = (
    [round(0.99 - i * 0.005, 3) for i in range(9)]
    + [round(0.94 - i * 0.01, 2) for i in range(5)]
)

# Name of the per-day count column for each threshold, e.g. 0.985 -> t_0_985.
# Built once so the two generated SQL fragments always agree on the names.
daily_count_columns = [
    f"t_{str(threshold).replace('.', '_')}" for threshold in risk_thresholds
]

# One conditional SUM per threshold: number of edits scoring above it.
risk_case_statements = [
    f"SUM(CASE WHEN risk > {threshold} THEN 1 ELSE 0 END) AS {column}"
    for threshold, column in zip(risk_thresholds, daily_count_columns)
]
risk_case_sql = ',\n        '.join(risk_case_statements)

# One rounded average per threshold. The output alias keeps only the digits
# after the decimal point, so 0.985 -> t_985 and 0.9 -> t_9 (not t_90).
avg_select_statements = [
    f"CAST(ROUND(AVG({column})) AS INT) AS t_{str(threshold).split('.')[1]}"
    for threshold, column in zip(risk_thresholds, daily_count_columns)
]
avg_select_sql = ',\n    '.join(avg_select_statements)

In [9]:
# Wiki comparison snapshot (January 2023), fetched as a tab-separated file.
wiki_comparision = pd.read_csv(
    'https://raw.githubusercontent.com/wikimedia-research/wiki-comparison/main/data-collection/snapshots/Jan_2023.tsv',
    sep='\t',
)

# Database codes of the first 150 Wikipedia rows, in file order.
# NOTE(review): "top 150" relies on the snapshot's row ordering — confirm.
is_wikipedia = wiki_comparision['project code'] == 'wikipedia'
top150_wps = wiki_comparision.loc[is_wikipedia, 'database code'].head(150).tolist()

# SQL-ready tuple of wikis and the mediawiki_history snapshot to query.
wikis_sql = wmf.utils.sql_tuple(top150_wps)
mwh_snapshot = '2023-10'

In [10]:
%%time

# Average number of edits per day, per wiki, whose revert risk score exceeds
# each threshold in `risk_thresholds`.
#
# Pipeline:
#   base              - scored 2022 revisions on the selected wikis, minus page
#                       creations, sysop edits and bot edits
#   excl_self_reverts - reverting revisions that undo at least one edit made
#                       by a different user
#   sample            - non-reverts plus the non-self reverts
#   count_score       - per wiki and day, edits above each threshold
#   final SELECT      - per wiki, rounded daily average for each threshold
#
# Fixes relative to the previous version:
#   * count_score now aggregates `sample`; it used to read `base`, which left
#     the self-revert exclusion unused.
#   * the self-revert lookup also matches on wiki_db, because revision ids
#     are only unique within a wiki.
#   * the lookup is a LEFT SEMI JOIN, so a revert that undoes several edits
#     is kept once instead of once per reverted edit.
query = f"""
WITH base AS (
    SELECT
        rr.wiki_db,
        rr.rev_id,
        revision_revert_risk AS risk,
        event_user_text,
        DATE(event_timestamp) AS date,
        mwh.revision_is_identity_revert,
        CASE 
            WHEN mwh.revision_is_identity_revert THEN 'revert' 
            ELSE 'non_revert' 
        END AS revision_type
    FROM 
        rr_scores rr
    JOIN 
        wmf.mediawiki_history mwh 
        ON rr.wiki_db = mwh.wiki_db AND rr.rev_id = mwh.revision_id
    WHERE 
        snapshot = '{mwh_snapshot}'
        AND rr.wiki_db IN {wikis_sql}
        -- exclude page creations
        AND mwh.revision_parent_id <> 0
        -- exclude adminstrators
        AND NOT ARRAY_CONTAINS(mwh.event_user_groups, 'sysop')
        -- exclude bots
        AND SIZE(event_user_is_bot_by) = 0
        AND YEAR(event_timestamp) = 2022
),

excl_self_reverts AS (
    SELECT
        b.*
    FROM
        base b
    LEFT SEMI JOIN wmf.mediawiki_history mwh
        ON b.wiki_db = mwh.wiki_db
        AND b.rev_id = mwh.revision_first_identity_reverting_revision_id
        AND mwh.snapshot = '{mwh_snapshot}'
        -- exclude self reverts
        AND b.event_user_text <> mwh.event_user_text
    WHERE
        b.revision_type = 'revert'
),

sample AS (
    SELECT * FROM base WHERE revision_type = 'non_revert'
    UNION ALL
    SELECT * FROM excl_self_reverts
),

count_score AS (
    SELECT
        date,
        wiki_db,
        {risk_case_sql}
    FROM
        sample
    GROUP BY
        wiki_db,
        date
)

SELECT 
    wiki_db,
    {avg_select_sql}
FROM
    count_score
GROUP BY
    wiki_db
ORDER BY
    wiki_db
"""

result = wmf.spark.run(query)

23/11/13 07:54:36 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

CPU times: user 390 ms, sys: 110 ms, total: 501 ms
Wall time: 2min 7s


In [11]:
# Order wikis by their daily average at the strictest threshold (0.99),
# highest first, and save the table as a tab-separated file.
ranked_result = result.sort_values(by='t_99', ascending=False)
ranked_result.to_csv('revert_risk_reverts.tsv', sep='\t', index=False)