## imports

In [None]:
import wmfdata as wmf
import pandas as pd
import numpy as np

In [None]:
import json
import re
import warnings

In [None]:
# Never truncate columns when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# ANSI escape sequences for terminal-style output: `bold` switches bold text on,
# `end` resets formatting.
bold, end = '\033[1m', '\033[0m'

## spark_session

In [None]:
# Look for a Spark session that is already running and stop it if there is one;
# otherwise just report that nothing was active.
spark_session = wmf.spark.get_active_session()

# `is not None` is the idiomatic None check (replaces `type(x) != type(None)`).
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')

In [None]:
# Create the Spark session on YARN that the query in this notebook runs on.
spark_session = wmf.spark.create_custom_session(
    master="yarn",
    app_name="bot-vandal-reverts",
    spark_config={
        # Driver-side limits.
        "spark.driver.memory": "4g",
        "spark.driver.maxResultSize": "2g",
        # Executor sizing and the cap on dynamically allocated executors.
        "spark.executor.memory": "20g",
        "spark.executor.cores": 4,
        "spark.dynamicAllocation.maxExecutors": 64,
        # Number of partitions used for SQL shuffles.
        "spark.sql.shuffle.partitions": 256,
    },
)

In [None]:
# Disabled alternative to the custom session: a predefined 'yarn-large' session.
# NOTE(review): `ig_warn` is not defined in the visible part of this notebook;
# confirm where it comes from before re-enabling these lines.
# ig_warn()
# spark_session = wmf.spark.create_session(type='yarn-large')

# Bare expression: displays the session object as the cell's output.
spark_session

In [None]:
# Restrict Spark logging to ERROR level (presumably to keep cell output uncluttered).
spark_session.sparkContext.setLogLevel("ERROR")

## data

In [None]:
# Wiki database name -> user name of the bot whose reverts are measured on that
# wiki. The keys select the wikis to query; the values identify the bot accounts.
bots = dict(
    enwiki='ClueBot NG',
    eswiki='SeroBOT',
    frwiki='Salebot',
    ptwiki='Salebot',
    fawiki='Dexbot',
    bgwiki='PSS 9',
    simplewiki='ChenzwBot',
    ruwiki='Рейму Хакурей',
    rowiki='PatrocleBot',
)

In [105]:
%%time

query = """
WITH 
    base AS (
        SELECT 
            wiki_db,
            revision_id,
            event_timestamp,
            revision_first_identity_reverting_revision_id,
            revision_seconds_to_identity_revert
        FROM 
            wmf.mediawiki_history
        WHERE 
            snapshot = '{MW_SNAPSHOT}'
            AND wiki_db IN {DBS}
            AND event_entity = 'revision'
            AND event_type = 'create'
            AND revision_is_identity_reverted
            AND page_namespace_is_content
            AND revision_seconds_to_identity_revert <= 24 * 60 * 60
            AND DATE (event_timestamp) >= DATE ('{START_DATE}')
            AND DATE (event_timestamp) <= DATE ('{END_DATE}')
            AND NOT revision_parent_id = 0
        )
            
SELECT 
    mwh.wiki_db,
    YEAR(mwh.event_timestamp) AS year,
    MONTH(mwh.event_timestamp) AS month,
    DAY(mwh.event_timestamp) AS day,
    COUNT(DISTINCT mwh.revision_id) AS all_reverts,
    COUNT(DISTINCT (
            CASE 
                WHEN event_user_text IN {BOTS}
                     THEN mwh.revision_id
            END)) AS bot_reverts,
    COUNT(DISTINCT (
            CASE 
                WHEN event_user_text IN {BOTS}
                     AND mwh.revision_is_identity_reverted = True
                     THEN mwh.revision_id
            END)) AS false_positives
FROM 
    base
JOIN wmf.mediawiki_history mwh
     ON base.revision_first_identity_reverting_revision_id = mwh.revision_id
        AND base.wiki_db = mwh.wiki_db
WHERE 
    snapshot = '{MW_SNAPSHOT}'
GROUP BY 
    YEAR(mwh.event_timestamp),
    MONTH(mwh.event_timestamp),
    DAY(mwh.event_timestamp),
    mwh.wiki_db
"""

time_bounds = ['2020-07-01', '2023-06-30']
mw_snapshot = '2023-06'
bot_revert_counts = wmf.spark.run(query.format(DBS=wmf.utils.sql_tuple(bots.keys()),
                                               BOTS=wmf.utils.sql_tuple(set(bots.values())),
                                               START_DATE=time_bounds[0],
                                               END_DATE=time_bounds[1],
                                               MW_SNAPSHOT=mw_snapshot))



CPU times: user 555 ms, sys: 52.2 ms, total: 607 ms
Wall time: 2min 36s


                                                                                

In [107]:
# Save the daily per-wiki counts as a tab-separated file. The DataFrame index is
# written as the first column because `index=False` is not passed.
# NOTE(review): assumes the `data_outputs/` directory already exists.
bot_revert_counts.to_csv('data_outputs/anti_vandal_bot_revert_counts.tsv', sep='\t')

In [148]:
# Per-wiki summary of the daily counts.
#   all_reverts / bot_reverts / false_positives - mean of the daily counts
#       (only days that produced a row in `bot_revert_counts` contribute)
#   bot_reverts_percent - bot reverts as a percentage of all reverts
#   fpr / fpr_percent   - false positives relative to bot reverts, as a
#       fraction and as a percentage; NaN when a wiki has no bot reverts (0 / 0)
percent_bot_reverts = (
    bot_revert_counts
    .groupby('wiki_db')
    # String alias rather than np.mean: passing the NumPy callable to
    # groupby.agg is deprecated in recent pandas and emits a FutureWarning.
    .agg({'all_reverts': 'mean',
          'bot_reverts': 'mean',
          'false_positives': 'mean'})
    .assign(
        bot_reverts_percent=lambda df: df['bot_reverts'] / df['all_reverts'] * 100,
        fpr=lambda df: df['false_positives'] / df['bot_reverts'],
        # Derived from the `fpr` column created just above (same arithmetic).
        fpr_percent=lambda df: df['fpr'] * 100,
    )
    .reset_index()
    # Label each wiki with its bot's user name.
    .assign(bot=lambda df: df['wiki_db'].map(bots))
    .round({'all_reverts': 0,
            'bot_reverts': 0,
            'false_positives': 1,
            'bot_reverts_percent': 2,
            'fpr': 3,
            'fpr_percent': 2})
    .set_index(['wiki_db', 'bot'])
)
percent_bot_reverts

Unnamed: 0_level_0,Unnamed: 1_level_0,all_reverts,bot_reverts,false_positives,bot_reverts_percent,fpr,fpr_percent
wiki_db,bot,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bgwiki,PSS 9,56.0,4.0,0.3,6.45,0.084,8.35
enwiki,ClueBot NG,6571.0,356.0,33.5,5.41,0.094,9.41
eswiki,SeroBOT,2123.0,894.0,78.3,42.09,0.088,8.76
fawiki,Dexbot,393.0,90.0,9.8,22.83,0.11,10.96
frwiki,Salebot,730.0,26.0,2.9,3.53,0.111,11.09
ptwiki,Salebot,201.0,0.0,0.0,0.0,,
rowiki,PatrocleBot,55.0,3.0,0.3,5.59,0.102,10.19
ruwiki,Рейму Хакурей,709.0,65.0,8.1,9.15,0.125,12.53
simplewiki,ChenzwBot,89.0,13.0,1.5,14.71,0.115,11.55


In [149]:
# Save the per-wiki summary as a tab-separated file; the (wiki_db, bot) index is
# written as the leading columns.
# NOTE(review): assumes the `data_outputs/` directory already exists.
percent_bot_reverts.to_csv('data_outputs/anti_vandal_bot_revert_percentages.tsv', sep='\t')