# Overview
The goal is to generate a dataset for the [Automoderator](https://www.mediawiki.org/wiki/Moderator_Tools/Automoderator) model-testing interface. The dataset will have the following dimensions:
* revision_id: unique id of an edit
* revision_revert_risk: revert risk score provided by [Language-agnostic revert risk](https://meta.wikimedia.org/wiki/Machine_learning_models/Proposed/Language-agnostic_revert_risk) model
* wiki_db: Wikimedia project
* revision_is_identity_reverted: whether the edit has been reverted
* event_user_revision_count: edit count of the user who made the edit (until the edit)
* user_is_anonymous: whether the user is an anonymous (IP) user; false in this case would mean a registered user
* user_is_bot: whether the user is a bot or not
* is_self_revert: in case the edit was a revert, whether it was reverting a previous edit by the same user
* is_sysop: whether the user has admin privileges on the given wiki
* is_page_creation: whether the edit resulted in a creation of a new page
* is_newcomer_task: whether the edit was made as a result of the newcomer [add-a-link task](https://www.mediawiki.org/wiki/Growth/Personalized_first_day/Structured_tasks/Add_a_link)
* is_cx_edit: whether the edit was made using the [Content Translation tool](https://www.mediawiki.org/wiki/Content_translation)

# Data-Gathering

In [27]:
import pandas as pd
import wmfdata as wmf

import os
import warnings

In [28]:
# Lift pandas' column display limit so wide DataFrames are shown in full.
pd.options.display.max_columns = None
from IPython.display import clear_output

## spark_session

In [13]:
# Stop any Spark session that is still active so the custom session created
# below does not reuse an existing one.
spark_session = wmf.spark.get_active_session()

# Identity check against None is the idiomatic form of the original
# `type(spark_session) != type(None)` comparison.
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')

In [15]:
# Create the YARN-backed Spark session used by every query in this notebook.
spark_session = wmf.spark.create_custom_session(
    app_name="revert-risk-data-sample",
    master="yarn",
    spark_config={
        "spark.driver.memory": "4g",
        "spark.dynamicAllocation.maxExecutors": 64,
        "spark.executor.memory": "16g",
        "spark.executor.cores": 4,
        "spark.sql.shuffle.partitions": 256,
        "spark.driver.maxResultSize": "2g",
    },
)

# Clear whatever the session start-up wrote to the cell output.
clear_output()

# Last expression of the cell: displays the session object.
spark_session

In [17]:
# Set the Spark context's log level to ERROR.
spark_session.sparkContext.setLogLevel("ERROR")

## query

In [18]:
# Path to the pre-calculated revert risk scores,
# generated by https://gitlab.wikimedia.org/repos/research/knowledge_integrity/-/blob/mnz/examples/examples/notebooks/revertrisk_example.ipynb
rr_scores_path = '/user/paragon/riskobservatory/revertrisk_20212022_anonymous_bot.parquet'

# Load the scores and register them as the temp view `rr_scores`, which the
# sampling query in generate_dataset reads from.
rr_scores = spark_session.read.parquet(rr_scores_path)
rr_scores.createOrReplaceTempView('rr_scores')

                                                                                

In [19]:
# Print the column names and types of the loaded revert risk scores.
rr_scores.printSchema()

root
 |-- rev_id: long (nullable = true)
 |-- wiki_db: string (nullable = true)
 |-- rev_timestamp: string (nullable = true)
 |-- revision_is_identity_reverted: boolean (nullable = true)
 |-- revision_seconds_to_identity_revert: long (nullable = true)
 |-- page_id: long (nullable = true)
 |-- revision_revert_risk: float (nullable = true)
 |-- user_is_anonymous: boolean (nullable = true)
 |-- user_is_bot: boolean (nullable = true)



In [20]:
def generate_dataset(wiki, session=None, random_state = 910, sample_size = 25500, snapshot='2023-10'):
    """Build a stratified random sample of scored revisions for one wiki.

    The Spark SQL query:

    1. ``base``: takes the rows of the ``rr_scores`` temp view for ``wiki`` and
       flags revisions whose ``revision_revert_risk`` is >= 0.97 as
       ``is_high_revert_risk``.
    2. ``ranking`` / ``sample``: keeps at most 9000 randomly ordered rows per
       (wiki_db, revision_is_identity_reverted, is_high_revert_risk) stratum.
       The SQL-side shuffle uses a fixed seed (``RAND(0910)``) and does not
       depend on ``random_state``.
    3. ``base_sample``: joins the sampled revisions to
       ``wmf.mediawiki_history`` for the given ``snapshot`` and derives the
       ``is_sysop``, ``is_page_creation``, ``is_newcomer_task``,
       ``is_cx_edit`` and ``reverting_edit`` flags.
    4. ``self_reverts``: for sampled revisions that are themselves reverts,
       sets ``is_self_revert`` by comparing the reverting user with the user
       of a revision whose first identity-reverting revision is the sampled
       one. Non-reverting revisions get ``is_self_revert = NULL``.

    The result is collected to pandas, de-duplicated, and down-sampled to at
    most ``sample_size`` rows.

    Parameters
    ----------
    wiki : str
        Value matched against ``rr_scores.wiki_db``. It is interpolated into
        the SQL text as-is, so it must come from a trusted source.
    session : Spark session, optional
        Session used to run the query. When omitted, the module-level
        ``spark_session`` is looked up at call time, so a session that was
        restarted after this function was defined is still picked up.
    random_state : int
        Seed for the final pandas ``DataFrame.sample`` call.
    sample_size : int
        Maximum number of rows to return.
    snapshot : str
        ``wmf.mediawiki_history`` snapshot to read; applied to both joins.
        Interpolated into the SQL text as-is.

    Returns
    -------
    pandas.DataFrame
        At most ``sample_size`` rows; fewer when the query yields fewer
        distinct rows.
    """
    if session is None:
        session = spark_session

    sample = session.sql("""
        WITH
            base AS (
                SELECT
                    *,
                    CASE
                        WHEN revision_revert_risk >= 0.97 THEN TRUE
                        ELSE FALSE
                    END AS is_high_revert_risk
                FROM
                    rr_scores
                WHERE
                    wiki_db = '{WIKI}'
                ),

            ranking AS (
                SELECT
                    *,
                    ROW_NUMBER() OVER (
                        PARTITION BY wiki_db, revision_is_identity_reverted, is_high_revert_risk
                        ORDER BY RAND(0910)
                    ) AS row_num
                FROM
                    base
            ),

            sample AS (
                SELECT
                    *
                FROM
                    ranking
                WHERE
                    row_num <= 9000
            ),

            base_sample AS (
                SELECT
                    mwh.event_user_text,
                    s.rev_id,
                    revision_revert_risk,
                    s.wiki_db,
                    s.revision_is_identity_reverted,
                    event_user_revision_count,
                    s.user_is_anonymous,
                    user_is_bot,
                    page_title,
                    CASE
                        WHEN ARRAY_CONTAINS(mwh.event_user_groups, 'sysop') THEN TRUE
                        ELSE FALSE
                    END AS is_sysop,
                    CASE
                        WHEN mwh.revision_parent_id = 0 THEN TRUE
                        ELSE FALSE
                    END AS is_page_creation,
                    CASE
                        WHEN ARRAY_CONTAINS(mwh.revision_tags, 'newcomer task add link') THEN TRUE
                        ELSE FALSE
                    END AS is_newcomer_task,
                    CASE
                        WHEN ARRAY_CONTAINS(mwh.revision_tags, 'contenttranslation') THEN TRUE
                        ELSE FALSE
                    END AS is_cx_edit,
                    CASE
                        WHEN revision_is_identity_revert THEN TRUE
                        ELSE FALSE
                    END reverting_edit,
                    is_high_revert_risk
                FROM
                    sample s
                JOIN
                    wmf.mediawiki_history mwh
                    ON s.wiki_db = mwh.wiki_db AND s.rev_id = mwh.revision_id
                WHERE
                    snapshot = '{SNAPSHOT}'
                ),

            reverts AS (
                SELECT
                    *
                FROM
                    base_sample
                WHERE
                    reverting_edit),

            non_reverts AS (
                SELECT
                    *,
                    NULL AS is_self_revert
                FROM
                    base_sample
                    WHERE NOT reverting_edit),

            -- The snapshot predicate sits in the ON clause (not WHERE) so the
            -- LEFT JOIN still keeps reverts without a matching reverted
            -- revision, while only one snapshot of mediawiki_history is read
            -- instead of every available one.
            -- NOTE(review): a revert that undoes several revisions matches
            -- several mwh rows, so one rev_id can appear more than once with
            -- differing is_self_revert values -- confirm this is intended.
            self_reverts AS (
                SELECT
                    rv.*,
                    CASE
                        WHEN rv.event_user_text = mwh.event_user_text THEN TRUE
                        ELSE FALSE
                    END AS is_self_revert
                FROM
                    reverts rv
                    LEFT JOIN wmf.mediawiki_history mwh
                    ON rv.wiki_db = mwh.wiki_db
                        AND rv.rev_id = mwh.revision_first_identity_reverting_revision_id
                        AND mwh.snapshot = '{SNAPSHOT}'
                )

        SELECT * FROM non_reverts
        UNION ALL
        SELECT * FROM self_reverts
        """.format(WIKI=wiki, SNAPSHOT=snapshot))

    # Exact duplicate rows can still come out of the joins; drop them before sampling.
    sample_frame = sample.toPandas().drop_duplicates(ignore_index=True)

    # Never ask pandas for more rows than are available.
    n_rows = min(sample_size, len(sample_frame))
    return sample_frame.sample(n_rows, random_state=random_state)

# Generate

## top 150 Wikipedias

In [52]:
# Wiki comparison snapshot (Jan 2023) used to choose which wikis to sample.
wiki_comparision = pd.read_csv(
    'https://raw.githubusercontent.com/wikimedia-research/wiki-comparison/main/data-collection/snapshots/Jan_2023.tsv',
    sep='\t',
)

# Database codes of the first 150 rows, in file order, whose project code is 'wikipedia'.
top150_wps = (
    wiki_comparision
    .loc[wiki_comparision['project code'] == 'wikipedia', 'database code']
    .head(150)
    .tolist()
)

In [26]:
# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories, and failing after the first Spark query has
# already run would waste that work.
os.makedirs('samples/25K', exist_ok=True)

# Sample each wiki and write one TSV per wiki.
for wiki in top150_wps:
    dataset = generate_dataset(wiki)
    dataset.to_csv(f'samples/25K/revert_risk_dataset_{wiki}.tsv', sep='\t', index=False)

In [47]:
# Read every per-wiki TSV. Filenames are sorted because os.listdir returns
# entries in arbitrary order, which would make the row order of the combined
# file differ between runs.
frames = [
    pd.read_csv(f'samples/25K/{filename}', sep='\t')
    for filename in sorted(os.listdir('samples/25K/'))
    if filename.endswith('.tsv')
]

# Concatenate once instead of growing a DataFrame inside the loop, which
# re-copies all accumulated rows on every iteration. Fall back to an empty
# frame when no TSV files are present (pd.concat rejects an empty list).
dataset_all = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

dataset_all.to_csv('samples/revert_risk_dataset_all_25K.tsv', sep='\t', index=False)
dataset_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264170 entries, 0 to 2264169
Data columns (total 16 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   event_user_text                object 
 1   rev_id                         int64  
 2   revision_revert_risk           float64
 3   wiki_db                        object 
 4   revision_is_identity_reverted  bool   
 5   event_user_revision_count      float64
 6   user_is_anonymous              bool   
 7   user_is_bot                    bool   
 8   page_title                     object 
 9   is_sysop                       bool   
 10  is_page_creation               bool   
 11  is_newcomer_task               bool   
 12  is_cx_edit                     bool   
 13  reverting_edit                 bool   
 14  is_high_revert_risk            bool   
 15  is_self_revert                 object 
dtypes: bool(9), float64(2), int64(1), object(4)
memory usage: 140.4+ MB


### Downsize to 5K per Wikipedia

In [48]:
# Reload the combined 25K-per-wiki sample from disk; this is the input for the down-sampling below.
revert_risk_dataset_all = pd.read_csv('samples/revert_risk_dataset_all_25K.tsv', sep='\t')

In [56]:
def conditional_resample(wiki, df=None, sample_size=5000, bot_false_ratio=0.95, random_state=42):
    """Down-sample one wiki's rows to about ``sample_size`` with a fixed bot quota.

    Parameters
    ----------
    wiki : str
        Value matched against the ``wiki_db`` column.
    df : pandas.DataFrame, optional
        Frame holding all wikis. When omitted, the module-level
        ``revert_risk_dataset_all`` is looked up at call time, so a frame that
        was reloaded after this function was defined is still picked up.
    sample_size : int
        Target number of rows for the wiki.
    bot_false_ratio : float
        Share of ``sample_size`` reserved for rows with
        ``user_is_bot == False``; the remainder is reserved for bot rows.
    random_state : int
        Seed passed to ``DataFrame.sample`` for both groups.

    Returns
    -------
    pandas.DataFrame
        All rows of the wiki, with their original index, when it has at most
        ``sample_size`` rows. Otherwise the sampled non-bot rows followed by
        the sampled bot rows with a fresh 0..n-1 index. If either group has
        fewer rows than its quota, all of its rows are kept and the shortfall
        is not filled from the other group, so the result can be smaller than
        ``sample_size``.
    """
    if df is None:
        df = revert_risk_dataset_all

    wiki_df = df.query("""wiki_db == @wiki""")

    # Nothing to down-sample.
    if len(wiki_df) <= sample_size:
        return wiki_df

    n_false = int(sample_size * bot_false_ratio)
    n_true = sample_size - n_false

    false_df = wiki_df[wiki_df['user_is_bot'] == False]
    true_df = wiki_df[wiki_df['user_is_bot'] == True]

    def _take(frame, quota):
        # Sample only when the group exceeds its quota; otherwise keep it whole.
        if len(frame) > quota:
            return frame.sample(n=quota, random_state=random_state)
        return frame

    return pd.concat([_take(false_df, n_false), _take(true_df, n_true)], ignore_index=True)

In [57]:
# Create the output directory up front: DataFrame.to_csv does not create
# missing parent directories.
os.makedirs('samples/5K', exist_ok=True)

# Down-sample each wiki, write its own TSV, and collect the pieces.
samples_5K = []
for wp in top150_wps:
    sample_5K = conditional_resample(wp)
    sample_5K.to_csv(f'samples/5K/revert_risk_dataset_{wp}.tsv', sep='\t', index=False)
    samples_5K.append(sample_5K)

# Concatenate once instead of growing a DataFrame inside the loop, which
# re-copies all accumulated rows on every iteration. Fall back to an empty
# frame when there is nothing to combine (pd.concat rejects an empty list).
dataset_all_5K = pd.concat(samples_5K, ignore_index=True) if samples_5K else pd.DataFrame()

dataset_all_5K.to_csv('samples/revert_risk_dataset_all_5K.tsv', sep='\t', index=False)
dataset_all_5K.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719693 entries, 0 to 719692
Data columns (total 16 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   event_user_text                718634 non-null  object 
 1   rev_id                         719693 non-null  int64  
 2   revision_revert_risk           691652 non-null  float64
 3   wiki_db                        719693 non-null  object 
 4   revision_is_identity_reverted  719693 non-null  bool   
 5   event_user_revision_count      503967 non-null  float64
 6   user_is_anonymous              719693 non-null  bool   
 7   user_is_bot                    719693 non-null  bool   
 8   page_title                     719685 non-null  object 
 9   is_sysop                       719693 non-null  bool   
 10  is_page_creation               719693 non-null  bool   
 11  is_newcomer_task               719693 non-null  bool   
 12  is_cx_edit                    