# How many edits would Automoderator revert per day at different caution levels?

[TASK: T348869](https://phabricator.wikimedia.org/T348869)

**Purpose**<br>As part of the model testing process, we want to understand how many edits we can expect Automoderator to revert per day on average.
This will help the community understand the potential impact of Automoderator. For the analysis, [revert risk scores generated by WMF's Research team](https://gitlab.wikimedia.org/repos/research/knowledge_integrity/-/blob/mnz/examples/examples/notebooks/revertrisk_example.ipynb) were used; edits made by admins and bots, self-reverts, and new page creations were excluded.

**Results**<br>Average number of edits Automoderator would potentially revert per day at different thresholds

# Data-Gathering

In [2]:
import pandas as pd
import wmfdata as wmf

pd.options.display.max_columns = None
from IPython.display import clear_output

import warnings

## spark_session

In [26]:
# Look up the currently active Spark session (if any) and stop it;
# otherwise just report that there was nothing to stop.
spark_session = wmf.spark.get_active_session()

# Identity comparison is the idiomatic (and exact) way to test for None.
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')

no active session


In [27]:
# Create a custom Spark session on YARN for this analysis.
spark_session = wmf.spark.create_custom_session(
    master='yarn',
    app_name='automod-activity',
    spark_config={
        # driver settings
        'spark.driver.memory': '4g',
        'spark.driver.maxResultSize': '2g',
        # executor settings
        'spark.executor.memory': '16g',
        'spark.executor.cores': 4,
        'spark.dynamicAllocation.maxExecutors': 64,
        # number of shuffle partitions for Spark SQL
        'spark.sql.shuffle.partitions': 256,
    },
)

# Clear the cell output, then restrict Spark logging to ERROR level.
clear_output()
spark_session.sparkContext.setLogLevel('ERROR')

# Leave the session object as the cell's displayed value.
spark_session

## query

In [28]:
# Path to the pre-calculated revert risk scores (parquet), generated by
# https://gitlab.wikimedia.org/repos/research/knowledge_integrity/-/blob/mnz/examples/examples/notebooks/revertrisk_example.ipynb
rr_scores_path = '/user/paragon/riskobservatory/revertrisk_20212022_anonymous_bot.parquet'

# Load the scores and expose them to Spark SQL under the view name 'rr_scores',
# which is the name the SQL query later in this notebook selects from.
rr_scores = spark_session.read.parquet(rr_scores_path)
rr_scores.createOrReplaceTempView('rr_scores')

# Print the column names and types of the loaded scores.
rr_scores.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

root
 |-- rev_id: long (nullable = true)
 |-- wiki_db: string (nullable = true)
 |-- rev_timestamp: string (nullable = true)
 |-- revision_is_identity_reverted: boolean (nullable = true)
 |-- revision_seconds_to_identity_revert: long (nullable = true)
 |-- page_id: long (nullable = true)
 |-- revision_revert_risk: float (nullable = true)
 |-- user_is_anonymous: boolean (nullable = true)
 |-- user_is_bot: boolean (nullable = true)



                                                                                

In [29]:
# Risk thresholds, highest first:
#   0.99 down to 0.95 in steps of 0.005, then 0.94 down to 0.90 in steps of 0.01.
# Both halves are rounded to their intended precision so that binary
# floating-point noise (e.g. 0.9750000000000001) can never leak into the SQL
# literals or into the column aliases, which are derived from str(threshold).
risk_thresholds = (
    [round(0.99 - i * 0.005, 3) for i in range(9)]
    + [round(0.94 - i * 0.01, 2) for i in range(5)]
)

# One conditional count per threshold: the number of rows whose risk is
# strictly greater than the threshold. The alias keeps the full decimal form
# with '.' replaced by '_' (0.985 -> t_0_985).
risk_case_statements = [
    f"SUM(CASE WHEN risk > {threshold} THEN 1 ELSE 0 END) AS t_{str(threshold).replace('.', '_')}"
    for threshold in risk_thresholds
]
# Joined with the indentation expected at the insertion point in the query.
risk_case_sql = ',\n        '.join(risk_case_statements)

# One averaged column per threshold: the mean of the counts above, rounded and
# cast to INT. The output alias keeps only the digits after the decimal point
# (0.985 -> t_985, 0.9 -> t_9).
avg_select_statements = [
    f"CAST(ROUND(AVG(t_{str(threshold).replace('.', '_')})) AS INT) AS t_{str(threshold).split('.')[1]}"
    for threshold in risk_thresholds
]
avg_select_sql = ',\n    '.join(avg_select_statements)

In [31]:
# Load the Jan 2023 wiki-comparison snapshot (tab-separated).
wiki_comparision = pd.read_csv(
    'https://raw.githubusercontent.com/wikimedia-research/wiki-comparison/main/data-collection/snapshots/Jan_2023.tsv',
    sep='\t',
)

# Database codes of the first 150 rows whose project code is 'wikipedia',
# in file order.
# NOTE(review): "top 150" presumes the snapshot is already ranked - confirm.
top150_wps = (
    wiki_comparision
    .loc[wiki_comparision['project code'] == 'wikipedia', 'database code']
    .head(150)
    .tolist()
)

# mediawiki_history snapshot to query, and the wiki list rendered as a SQL tuple.
mwh_snapshot = '2023-10'
wikis_sql = wmf.utils.sql_tuple(top150_wps)

In [None]:
%%time

# Spark SQL query producing, per wiki, the average daily number of sampled
# edits whose revert risk exceeds each threshold.
#
# CTEs:
#   base              - rr_scores joined to wmf.mediawiki_history on
#                       (wiki_db, revision id) for the chosen snapshot and
#                       wikis. Keeps events from 2022, drops revisions whose
#                       parent id is 0, drops users with 'sysop' in their
#                       historical groups, and keeps only rows where
#                       event_user_is_bot_by_historical has size 0. Each row is
#                       labelled 'revert' / 'non_revert' from
#                       revision_is_identity_revert.
#   excl_self_reverts - 'revert' rows of base joined to the history rows that
#                       name them as first identity-reverting revision, kept
#                       only when the two user texts differ.
#   sample            - distinct union of the non-revert rows and the
#                       non-self reverts.
#   count_score       - per (wiki_db, date) counts generated by risk_case_sql.
# The final SELECT applies avg_select_sql to average those daily counts per wiki.
#
# NOTE(review): the admin filter tests event_user_groups for NULL but searches
# event_user_groups_historical - confirm the mismatch is intentional.
# NOTE(review): a (wiki, date) pair with no sampled rows yields no count_score
# row, so such days do not enter the average as zeros - confirm this is intended.
query = f"""
WITH 
    base AS (
        SELECT
            rr.wiki_db,
            rr.rev_id,
            revision_revert_risk AS risk,
            mwh.event_user_text,
            DATE(event_timestamp) AS date,
            mwh.revision_is_identity_revert,
            CASE 
                WHEN mwh.revision_is_identity_revert THEN 'revert' 
                ELSE 'non_revert' 
            END AS revision_type
        FROM 
            rr_scores rr
        JOIN 
            wmf.mediawiki_history mwh 
            ON rr.wiki_db = mwh.wiki_db AND rr.rev_id = mwh.revision_id
        WHERE 
            snapshot = '{mwh_snapshot}'
            AND rr.wiki_db IN {wikis_sql}

            -- exclude page creations
            AND NOT mwh.revision_parent_id = 0

            -- exclude adminstrators
            AND 
                (
                    event_user_groups IS NULL
                    OR NOT ARRAY_CONTAINS(mwh.event_user_groups_historical, 'sysop') 
                )

            -- exclude bots
            AND SIZE(event_user_is_bot_by_historical) = 0        
            AND YEAR(event_timestamp) = 2022
    ),

    excl_self_reverts AS (
        SELECT
            b.*
        FROM
            base b
        JOIN 
            wmf.mediawiki_history mwh
            ON b.rev_id = mwh.revision_first_identity_reverting_revision_id AND b.wiki_db = mwh.wiki_db
        WHERE
            snapshot = '{mwh_snapshot}'
            AND b.revision_type = 'revert'

            -- exclude self reverts
            AND NOT b.event_user_text = mwh.event_user_text
    ),

    sample AS (
        SELECT 
            DISTINCT * 
        FROM (
            SELECT * FROM base WHERE revision_type = 'non_revert'
            UNION ALL
            SELECT * FROM excl_self_reverts
        )
    ),


    count_score AS (
        SELECT
            date,
            wiki_db,
            {risk_case_sql}
        FROM
            sample
        GROUP BY
            wiki_db,
            date
    )

SELECT 
    wiki_db,
    {avg_select_sql}
FROM
    count_score
GROUP BY
    wiki_db
ORDER BY
    wiki_db
"""

# Run the query through wmfdata; `result` is used by the following cells via
# sort_values / query / to_csv.
result = wmf.spark.run(query)



In [9]:
# Save the per-wiki averages as a TSV, ordered by the 0.99-threshold column (largest first).
result.sort_values('t_99', ascending=False).to_csv('revert_risk_reverts.tsv', sep='\t', index=False)

In [38]:
# Show the 20 wikis with the largest value in the 0.99-threshold column.
result.sort_values('t_99', ascending=False).head(20)

Unnamed: 0,wiki_db,t_99,t_985,t_98,t_975,t_97,t_965,t_96,t_955,t_95,t_94,t_93,t_92,t_91,t_9
37,enwiki,154,356,691,1098,1540,1995,2457,2922,3402,4406,5472,6592,7736,8874
39,eswiki,57,119,217,331,450,570,690,807,923,1161,1407,1657,1899,2130
68,itwiki,41,62,101,154,217,285,355,427,499,651,804,959,1119,1277
110,ruwiki,35,58,90,130,178,230,285,340,397,511,631,751,871,990
69,jawiki,28,39,50,64,83,106,133,163,196,269,348,434,523,618
47,frwiki,24,40,66,100,139,180,222,266,311,405,503,606,714,826
99,nlwiki,19,27,36,48,60,71,83,96,109,138,168,199,230,260
55,hewiki,16,23,30,39,49,61,73,86,99,125,152,179,205,232
33,dewiki,14,25,43,65,90,115,141,168,197,258,327,401,478,560
42,fawiki,13,26,45,68,94,121,149,178,206,263,316,368,419,469


In [35]:
# Show the 20 wikis with the largest value in the 0.99-threshold column.
result.sort_values('t_99', ascending=False).head(20)

Unnamed: 0,wiki_db,t_99,t_985,t_98,t_975,t_97,t_965,t_96,t_955,t_95,t_94,t_93,t_92,t_91,t_9
37,enwiki,152,350,680,1077,1509,1953,2404,2859,3327,4308,5352,6450,7572,8689
39,eswiki,57,118,215,327,445,562,680,795,909,1143,1385,1631,1869,2098
68,itwiki,40,61,99,151,211,278,345,414,485,633,782,934,1091,1247
110,ruwiki,34,57,88,128,175,225,278,332,387,499,615,732,849,966
69,jawiki,27,37,48,61,79,101,126,155,186,256,333,415,503,595
47,frwiki,24,40,66,98,136,176,218,260,305,396,492,593,699,808
99,nlwiki,19,26,36,47,58,69,80,92,105,133,162,192,223,252
55,hewiki,16,22,30,38,48,59,71,84,96,122,148,174,200,226
33,dewiki,14,25,43,65,89,114,139,166,194,255,323,395,472,553
42,fawiki,13,26,44,67,92,119,146,174,202,257,309,360,410,459


In [39]:
# Show the row for cswiki only.
result.query("""wiki_db == 'cswiki'""")

Unnamed: 0,wiki_db,t_99,t_985,t_98,t_975,t_97,t_965,t_96,t_955,t_95,t_94,t_93,t_92,t_91,t_9
29,cswiki,5,8,11,16,21,26,31,37,43,54,66,78,90,103


In [49]:
# Show the row for idwiki only.
result.query("""wiki_db == 'idwiki'""")

Unnamed: 0,wiki_db,t_99,t_985,t_98,t_975,t_97,t_965,t_96,t_955,t_95,t_94,t_93,t_92,t_91,t_9
63,idwiki,6,11,18,27,40,55,71,88,106,146,188,230,273,313
