# Baseline: Median Time to Revert (Probable Vandalism)

**Last updated on 15 February 2024**


[TASK: T348860](https://phabricator.wikimedia.org/T348860)<br>
 ➤ ➤ [View the notebook on nbviewer](https://nbviewer.org/github/wikimedia-research/automoderator-measurement/blob/main/baselines/T348860_median_time_to_revert.ipynb)

# Contents
1. [Summary](#Summary)
2. [Data Gathering](#Data-Gathering)
3. [Analysis](#Analysis)
    * [Median Time to Revert, by Wikipedia](#Median-Time-to-Revert)
    * [Time to Revert Percentiles, by Wikipedia](#Time-to-Revert-Percentiles)

## Summary

The following analysis is to determine a baseline for Median Time to Revert for Probable Vandalism for Wikipedias in consideration. The baseline will be used as a reference for evaluating the impact of Automoderator later. The [operational definition](https://phabricator.wikimedia.org/T349083) for probable vandalism (within the scope of [Automoderator](https://www.mediawiki.org/wiki/Moderator_Tools/Automoderator)) is as the following:
- edit belongs to the content namespace
- edit was reverted within 12 hours
- user is anonymous OR if registered
    - user edit count is less than 15 edits
    - time since user's first edit is less than 48 hours
- revert was made by a different editor

## Baseline: Median Time to Revert for Probable Vandalism

In [10]:
pr_centered('Median Time to Revert for Probable Vandalism (2023)', True)
display_h({
    '': median_ttr_by_wiki
})

NameError: name 'median_ttr_by_wiki' is not defined

# Data-Gathering

## Imports

In [2]:
import pandas as pd
import numpy as np
import wmfdata as wmf
import great_tables as gt

from datetime import timedelta, datetime

pd.options.display.max_columns = None
from IPython.display import clear_output

from IPython.display import display_html
from IPython.display import display, HTML
from IPython.display import clear_output

import warnings

## spark_session

In [3]:
spark_session = wmf.spark.get_active_session()

if type(spark_session) != type(None):
    spark_session.stop()
else:
    print('no active session')

no active session


In [4]:
spark_session = wmf.spark.create_custom_session(
    master="yarn",
    app_name='vandalism-time-to-revert',
    spark_config={
        "spark.driver.memory": "6g",
        "spark.dynamicAllocation.maxExecutors": 64,
        "spark.executor.memory": "24g",
        "spark.executor.cores": 4,
        "spark.sql.shuffle.partitions": 256,
        "spark.driver.maxResultSize": "2g"
        
    }
)

clear_output()

spark_session.sparkContext.setLogLevel("ERROR")
spark_session

## functions

In [5]:
# prints a string at center of the output, bold if needed
def pr_centered(content, bold=False):
    if bold:
        content = f"<b>{content}</b>"
    
    centered_html = f"<div style='text-align:center'>{content}</div>"
    
    display(HTML(centered_html))


# display dataframes horizontally with title for each
def display_h(frames, space=100):
    html = ""
    
    for key in frames.keys():
        html_df =f'<div>{key} {frames[key]._repr_html_()}</div>'
        html += html_df
        
    html = f"""
    <div style="display:flex; justify-content: space-evenly;">
    {html}
    </div>"""
    
    display_html(html, raw=True)

In [6]:
# calculate time difference in seconds between two columns
# note: the columns should be datetime formatted
def time_delta(df, start_column, end_column):
    try: 
        return df.apply(lambda row: (row[end_column] - row[start_column]).total_seconds(), axis=1)
    except:
        return np.NaN

# applies cell color to a given nth percentile
def style_percentile(i, percentile='50th'):
    return ['background-color: Aquamarine' if i.name == percentile else '' for _ in i]

# return quatiles for a given series (dataframe and column name)
def quantiles(frame, col='time_to_revert', style_median=False):    
    qdict = {
        '10th': frame[col].quantile(0.1),
        '25th': frame[col].quantile(0.25),
        '50th': frame[col].quantile(0.5),
        '75th': frame[col].quantile(0.7),
        '90th': frame[col].quantile(0.9),
        '99th': frame[col].quantile(0.99)
    }
    
    df = pd.DataFrame(qdict.values(),
                      index=qdict.keys(),
                      columns=['seconds'])
    
    df['minutes'] = round(df['seconds'] / 60, 2)
    
    df = df.astype({'seconds': int})
    df.index.name = 'percentile'
    
    if style_median:
        df = df.style.apply(style_percentile, axis=1).format("{:.1f}")
        # df = df.astype({'seconds': int})
        return df
    else:
        return df

In [7]:
def split_into_groups(dfs, group_size=4):
    return [dfs[i:i + group_size] for i in range(0, len(dfs), group_size)]

## query

In [8]:
mwh_snapshot = '2024-01'

wikis_list = [f'{lang}wiki' for lang in ['en', 'es', 'ja', 'de', 'fr', 'ru', 'zh', 'it', 'pt', 'fa', 'id']]
wikis_sql = wmf.utils.sql_tuple(wikis_list)

In [9]:
%%time

query = f"""
WITH 
    base AS (
        SELECT
            wiki_db,
            event_user_text AS user_name,
            event_user_is_anonymous AS is_anon,
            revision_seconds_to_identity_revert AS time_to_revert,
            revision_first_identity_reverting_revision_id AS reverting_edit_id,
            event_timestamp,
            event_user_first_edit_timestamp
        FROM 
            wmf.mediawiki_history
        WHERE 
            snapshot = '{mwh_snapshot}'
            AND wiki_db IN {wikis_sql}
            AND event_entity = 'revision'
            AND event_type = 'create'
            AND page_namespace_is_content
            AND 
                (
                    event_user_is_anonymous 
                    OR event_user_revision_count <= 15
                )
            AND SIZE(event_user_is_bot_by_historical) = 0
            AND revision_is_identity_reverted
            AND revision_seconds_to_identity_revert <= 12*60*60
            AND revision_seconds_to_identity_revert >= 0
            AND YEAR(event_timestamp) = 2023
    )
SELECT
    base.*
FROM 
    base
JOIN
    wmf.mediawiki_history mwh
    ON base.wiki_db = mwh.wiki_db 
        AND base.reverting_edit_id = mwh.revision_id
WHERE
    snapshot = '{mwh_snapshot}'
    AND NOT base.user_name = mwh.event_user_text
"""

vandal_edits = wmf.spark.run(query).drop(['user_name', 'reverting_edit_id'], axis=1)
vandal_edits_df1 = vandal_edits.copy()

                                                                                

CPU times: user 23.6 s, sys: 3.41 s, total: 27.1 s
Wall time: 3min 36s


In [10]:
vandal_edits = (
    vandal_edits
    .assign(
        event_timestamp=pd.to_datetime(vandal_edits['event_timestamp'], utc=True),
        event_user_first_edit_timestamp=pd.to_datetime(vandal_edits['event_user_first_edit_timestamp'], utc=True),
        is_anon=pd.Categorical(vandal_edits['is_anon'])
    )
)

vandal_edits['elapsed_user_first_rev'] = time_delta(vandal_edits, 'event_user_first_edit_timestamp', 'event_timestamp')

vandal_edits = (
    vandal_edits
    .query("(elapsed_user_first_rev <= 48*60*60) | (is_anon == True)")
    .drop(['event_timestamp', 'event_user_first_edit_timestamp'], axis=1)
    .reset_index(drop=True)
)

vandal_edits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3598086 entries, 0 to 3598085
Data columns (total 4 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   wiki_db                 object  
 1   is_anon                 category
 2   time_to_revert          int64   
 3   elapsed_user_first_rev  float64 
dtypes: category(1), float64(1), int64(1), object(1)
memory usage: 85.8+ MB


In [12]:
for k, v in dict(vandal_edits.is_anon.value_counts()).items():
    assert v > 0, f'is_anon=={k} has {v} records'

In [13]:
db_names = (
    pd
    .read_csv('https://raw.githubusercontent.com/wikimedia-research/canonical-data/master/wiki/wikis.tsv', sep='\t')
    .query("""database_code == @wikis_list""")[['database_code', 'english_name']]
    .set_index('database_code')['english_name']
    .to_dict()
)

all_dbs_ttr = {db_names[db]: quantiles(vandal_edits.query(f"wiki_db == '{db}'"), style_median=True) for db in vandal_edits.wiki_db.unique()}

# Analysis

## Median-Time-to-Revert

In [14]:
median_ttr_by_wiki = (
    vandal_edits
    .groupby('wiki_db')['time_to_revert']
    .median()
    .reset_index(name='seconds')
    .assign(
        **{
            'minutes': lambda x: round(x['seconds'] / 60, 2),
            'Wikipedia': lambda x: x['wiki_db'].map(db_names)
        }
    )
    .astype({'seconds': int})
    .set_index('Wikipedia')
    .drop('wiki_db', axis=1)
)

In [15]:
pr_centered('Median Time to Revert (by Wikipedia)', True)
display_h({
    '': median_ttr_by_wiki
})

Unnamed: 0_level_0,seconds,minutes
Wikipedia,Unnamed: 1_level_1,Unnamed: 2_level_1
German Wikipedia,189,3.15
English Wikipedia,464,7.73
Spanish Wikipedia,78,1.3
Persian Wikipedia,59,0.98
French Wikipedia,473,7.88
Indonesian Wikipedia,2935,48.92
Italian Wikipedia,641,10.68
Japanese Wikipedia,984,16.4
Portuguese Wikipedia,1147,19.12
Russian Wikipedia,783,13.05


## Time-to-Revert-Percentiles

In [16]:
pr_centered('<big>Time to Revert (by Wikipedia)<big>', True)
pr_centered('<u>coloured cell -> median</u>')

for group in split_into_groups(list(all_dbs_ttr.items())):
    display_h(dict(group))

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,17.0,0.3
25th,39.0,0.7
50th,189.0,3.1
75th,1001.0,16.7
90th,10509.0,175.2
99th,36982.0,616.4

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,17.0,0.3
25th,49.0,0.8
50th,464.0,7.7
75th,3050.0,50.8
90th,17836.0,297.3
99th,39136.0,652.3

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,2.0,0.0
25th,3.0,0.1
50th,78.0,1.3
75th,1354.0,22.6
90th,13562.0,226.1
99th,37859.0,631.0

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,15.0,0.2
25th,31.0,0.5
50th,59.0,1.0
75th,469.0,7.8
90th,9994.0,166.6
99th,37803.0,630.1


Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,28.0,0.5
25th,76.0,1.3
50th,473.0,7.9
75th,2513.0,41.9
90th,15629.0,260.5
99th,38911.0,648.5

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,139.0,2.3
25th,604.0,10.1
50th,2935.0,48.9
75th,8515.0,141.9
90th,23516.0,391.9
99th,40379.0,673.0

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,23.0,0.4
25th,75.0,1.2
50th,641.0,10.7
75th,3854.0,64.2
90th,20826.0,347.1
99th,40404.0,673.4

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,65.0,1.1
25th,199.0,3.3
50th,984.0,16.4
75th,4379.0,73.0
90th,20091.0,334.9
99th,39475.0,657.9


Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,68.0,1.1
25th,235.0,3.9
50th,1147.0,19.1
75th,3622.0,60.4
90th,16144.0,269.1
99th,38562.0,642.7

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,12.0,0.2
25th,94.0,1.6
50th,783.0,13.1
75th,3560.0,59.3
90th,18195.0,303.3
99th,39135.0,652.2

Unnamed: 0_level_0,seconds,minutes
percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
10th,82.0,1.4
25th,358.0,6.0
50th,2057.0,34.3
75th,7156.0,119.3
90th,24163.0,402.7
99th,40732.0,678.9
