# Link Rejection Reasons

The Phabricator task for this work is [T301884](https://phabricator.wikimedia.org/T301884).

To start with, we want to aggregate rejection reasons by wiki, platform, and user experience level, where experience is measured as the number of Add a Link edits the user has made. We might later also want to split by whether they completed or skipped onboarding, but we won't plan for that now.

We started out by restricting the data to rejections made within one week of registration. This turned out to provide us with little data on more experienced contributors. We therefore loosened the limit to within four weeks of registration.

In [1]:
import datetime as dt

import pandas as pd
import numpy as np

from collections import defaultdict

from wmfdata import spark, mariadb

from scipy import stats

In [2]:
## The analysis window opens on the first day of 2022 and closes on the day this
## cell is executed: we want as much data as possible, and right-truncating the
## data points is not essential. ServerSideAccountCreation is partitioned by hand
## in the query further down, which is a reasonable price to pay.

start_date = dt.date(2022, 1, 1)
end_date = dt.date.today()

## Wikis that we gather data from.
wikis = [
    'arwiki',
    'bnwiki',
    'cswiki',
    'viwiki',
    'fawiki',
    'frwiki',
    'huwiki',
    'plwiki',
    'rowiki',
    'ruwiki',
    'eswiki',  # deployed to eswiki by accident, so we'll analyze that too
]

## Lists of known users to ignore (e.g. test accounts and experienced users).
## `known_users` maps a wiki database name to the set of user IDs to exclude on
## that wiki. Because it is a `defaultdict(set)`, looking up a wiki that has no
## hand-curated list below yields (and stores) an empty set.
known_users = defaultdict(set)
known_users['cswiki'].update([14, 127629, 303170, 342147, 349875, 44133, 100304, 307410, 439792, 444907,
                              454862, 456272, 454003, 454846, 92295, 387915, 398470, 416764, 44751, 132801,
                              137787, 138342, 268033, 275298, 317739, 320225, 328302, 339583, 341191,
                              357559, 392634, 398626, 404765, 420805, 429109, 443890, 448195, 448438,
                              453220, 453628, 453645, 453662, 453663, 453664, 440694, 427497, 272273,
                              458025, 458487, 458049, 59563, 118067, 188859, 191908, 314640, 390445,
                              451069, 459434, 460802, 460885, 79895, 448735, 453176, 467557, 467745,
                              468502, 468583, 468603, 474052, 475184, 475185, 475187, 475188, 294174,
                              402906, 298011])

## NOTE(review): kowiki does not appear in the `wikis` list defined in this cell,
## but it still gets an entry here -- confirm whether that is intentional.
known_users['kowiki'].update([303170, 342147, 349875, 189097, 362732, 384066, 416362, 38759, 495265,
                              515553, 537326, 566963, 567409, 416360, 414929, 470932, 472019, 485036,
                              532123, 558423, 571587, 575553, 576758, 360703, 561281, 595100, 595105,
                              595610, 596025, 596651, 596652, 596653, 596654, 596655, 596993, 942,
                              13810, 536529])

known_users['viwiki'].update([451842, 628512, 628513, 680081, 680083, 680084, 680085, 680086, 355424,
                              387563, 443216, 682713, 659235, 700934, 705406, 707272, 707303, 707681, 585762])

known_users['arwiki'].update([237660, 272774, 775023, 1175449, 1186377, 1506091, 1515147, 1538902,
                              1568858, 1681813, 1683215, 1699418, 1699419, 1699425, 1740419, 1759328, 1763990])

## Grab the user IDs of known test accounts so they can be added to the exclusion list

def get_known_users(wiki):
    '''
    Look up the user IDs of accounts on `wiki` whose user name starts with one
    of a fixed list of known prefixes, and return those IDs as a set.

    One query is issued against the `user` table per prefix, and the results
    are merged.
    '''

    # User name prefixes identifying known accounts; each is matched with a
    # trailing SQL wildcard.
    name_prefixes = ("MMiller", "Zilant", "Roan", "KHarlan", "MWang", "SBtest",
                     "Cloud", "Rho2019", "Test")

    query_template = '''
SELECT user_id
FROM user
WHERE user_name LIKE "{name_pattern}%"
    '''

    matched_ids = set()

    for name_prefix in name_prefixes:
        query = query_template.format(name_pattern = name_prefix)
        matches = mariadb.run(query, wiki)
        matched_ids.update(matches['user_id'])

    return matched_ids
        
## Extend each wiki's exclusion set with the accounts found by prefix lookup.
for wiki in wikis:
    known_users[wiki] |= get_known_users(wiki)

## Helper Functions

In [3]:
def make_known_users_sql(kd, wiki_column, user_column):
    '''
    Build a SQL boolean expression that filters out known users per wiki.

    For every wiki in `kd` a clause of the form
    `(<wiki_column> = '<wiki>' AND <user_column> NOT IN (<ids>))` is produced,
    and the clauses are OR'ed together. A row therefore passes if it belongs
    to one of the wikis in `kd` and its user is not one of that wiki's known
    users. Rows from wikis that are not keys of `kd` match no clause.

    A wiki with an empty set of known users yields just
    `(<wiki_column> = '<wiki>')`, because `NOT IN ()` is not valid SQL.

    :param kd: mapping of wiki name to a collection of known user IDs
    :type kd: dict

    :param wiki_column: name of the column holding the wiki name
    :type wiki_column: str

    :param user_column: name of the column holding the user ID
    :type user_column: str

    :returns: the combined expression; an empty string if `kd` is empty
    '''

    expressions = []

    ## Build one clause per wiki
    for wiki_name, wiki_users in kd.items():
        if wiki_users:
            ## Sort the IDs so the generated SQL is the same on every run
            id_list = ','.join(str(u) for u in sorted(wiki_users))
            expressions.append(
                f"({wiki_column} = '{wiki_name}' AND {user_column} NOT IN ({id_list}))"
            )
        else:
            ## Nobody to exclude on this wiki: keep all of its rows
            expressions.append(f"({wiki_column} = '{wiki_name}')")

    ## We then join all the expressions with an OR, and we're done.
    return ' OR '.join(expressions)
    

In [4]:
def _partition_from_day(ts, prefix):
    '''Clause matching the partitions of the month of `ts` from `ts.day` onwards.'''
    return f'''({prefix}year = {ts.year}
     AND {prefix}month = {ts.month}
     AND {prefix}day >= {ts.day})'''


def _partition_until_day(ts, prefix):
    '''Clause matching the partitions of the month of `ts` up to and including `ts.day`.'''
    return f'''({prefix}year = {ts.year}
     AND {prefix}month = {ts.month}
     AND {prefix}day <= {ts.day})'''


def make_partition_statement(start_ts, end_ts, prefix = ''):
    '''
    This takes the two timestamps and creates a statement that selects
    partitions based on `year`, `month`, and `day` in order to make our
    data gathering not use excessive amounts of data. Both end points
    are inclusive.

    Only the `year`, `month`, and `day` attributes of the timestamps are
    read, so `datetime.date` and `datetime.datetime` objects both work.

    An optional prefix can be set to enable selecting partitions for
    multiple tables with different aliases.

    :param start_ts: start timestamp
    :type start_ts: datetime.date or datetime.datetime

    :param end_ts: end timestamp
    :type end_ts: datetime.date or datetime.datetime

    :param prefix: prefix to use in front of partition clauses, "." is added automatically
    :type prefix: str

    :returns: a SQL boolean expression over the partition columns
    :rtype: str

    :raises ValueError: if `start_ts` falls on a later day than `end_ts`
    '''

    if prefix:
        prefix = f'{prefix}.' # adds "." after the prefix

    # Compare on (year, month, day) only, so that a mix of date and datetime
    # arguments cannot trip up the comparison.
    start_key = (start_ts.year, start_ts.month, start_ts.day)
    end_key = (end_ts.year, end_ts.month, end_ts.day)
    if start_key > end_key:
        raise ValueError('start_ts must not be after end_ts')

    # there are four cases:
    # 1: month and year are the same, output a "BETWEEN" statement with the days
    # 2: the two timestamps are in adjacent months (same year, or December
    #    followed by January of the next year): output a statement for each month
    # 3: the years are the same: create a list of statements from start_ts.month
    #    to end_ts.month, return them OR'ed together
    # 4: the range spans several years: rest of the start month, rest of the
    #    start year, any full years in between, the end year up to the end
    #    month, and the end month up to the end day, OR'ed together

    same_year = start_ts.year == end_ts.year

    if same_year and start_ts.month == end_ts.month:
        return f'''{prefix}year = {start_ts.year}
AND {prefix}month = {start_ts.month}
AND {prefix}day BETWEEN {start_ts.day} AND {end_ts.day}'''

    first_month = _partition_from_day(start_ts, prefix)
    last_month = _partition_until_day(end_ts, prefix)

    adjacent_months = (
        (same_year and (end_ts.month - start_ts.month) == 1)
        or ((end_ts.year - start_ts.year) == 1
            and start_ts.month == 12 and end_ts.month == 1)
    )
    if adjacent_months:
        return f'''
(
    {first_month}
 OR {last_month}
)'''

    parts = [first_month]

    if same_year:
        # every full month strictly between the start and end months
        for m in range(start_ts.month + 1, end_ts.month):
            parts.append(f'''({prefix}year = {start_ts.year}
            AND {prefix}month = {m})''')
    else:
        # remaining full months of the start year (none if we start in December)
        if start_ts.month < 12:
            parts.append(f'''({prefix}year = {start_ts.year}
     AND {prefix}month > {start_ts.month})''')
        # full years strictly between the start and end years
        if (end_ts.year - start_ts.year) > 1:
            parts.append(f'''({prefix}year > {start_ts.year}
     AND {prefix}year < {end_ts.year})''')
        # full months of the end year before the end month (none if we end in January)
        if end_ts.month > 1:
            parts.append(f'''({prefix}year = {end_ts.year}
     AND {prefix}month < {end_ts.month})''')

    parts.append(last_month)

    # return a parenthesis OR'ed together of all of it
    return '({})'.format('\nOR\n'.join(parts))


# Rejection Reason and Add a Link Edit query

In [105]:
## Spark SQL template for the rejection-reason dataset. Placeholders are filled
## in with `str.format`:
##   {stimg_partition_statement}, {hpv_partition_statement}, {partition_statement}
##       partition filters for the interaction, homepage visit, and tag change tables
##   {wiki_list}
##       comma-separated, quoted wiki names
##   {hpv_known_user_id_expression}, {known_userid_expression},
##   {known_user_database_expression}
##       known-user exclusion expressions for the respective tables
##
## The CTEs are:
##   rejection_events  - closed rejection dialogs with a non-empty rejection reason,
##                       joined to homepage visits to get the wiki and user ID
##   registrations     - account creations (December 2021 and all of 2022, hard-coded)
##   rejection_reasons - one row per selected reason, limited to rejections within
##                       four weeks of registration
##   edit_revert       - Add a Link tagged edits made within four weeks of
##                       registration, flagged if reverted within 48 hours
##   user_edit_count   - numbers each user's tagged edits in chronological order
## The final SELECT attaches to each rejection reason the number of tagged edits
## the user had made before the rejection event (0 if none).
rejection_query = '''
WITH rejection_events AS (
    SELECT
        hpv.wiki,
        hpv.event.user_id,
        stimg.homepage_pageview_token,
        stimg.is_mobile,
        stimg.dt AS event_dt,
        str_to_map(stimg.action_data, ";", "=") AS action_data
    FROM event.mediawiki_structured_task_article_link_suggestion_interaction AS stimg
    JOIN event.homepagevisit AS hpv
    ON stimg.homepage_pageview_token = hpv.event.homepage_pageview_token
    WHERE {stimg_partition_statement}
    AND {hpv_partition_statement}
    AND hpv.wiki IN ({wiki_list})
    AND ({hpv_known_user_id_expression})
    AND stimg.active_interface = "rejection_dialog"
    AND stimg.action = "close"
    AND str_to_map(stimg.action_data, ";", "=")["acceptance_state"] = "rejected"
    AND str_to_map(stimg.action_data, ";", "=")["rejection_reason"] != ""
),
registrations AS (
    SELECT
        wiki,
        event.userid AS user_id,
        dt AS reg_dt
    FROM event.serversideaccountcreation
    WHERE ((year = 2021 AND month = 12)
           OR year = 2022)
    AND wiki IN ({wiki_list})
    AND ({known_userid_expression})
),
rejection_reasons AS (
    SELECT
        rej.*,
        reg.reg_dt,
        EXPLODE(SPLIT(rej.action_data["rejection_reason"], ",")) AS rejection_reason
    FROM rejection_events AS rej
    JOIN registrations AS reg
    ON rej.wiki = reg.wiki
    AND rej.user_id = reg.user_id
    -- only rejections within four weeks of registration
    WHERE (unix_timestamp(rej.event_dt, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") -
            unix_timestamp(reg.reg_dt, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") < 60*60*24*28)
),
edit_revert AS (
    -- edits tagged with Add a Link (wiki, user_id, page_id, timestamp)
    -- whether the edit was reverted within 48 hours
    SELECT
        `database` AS wiki,
        rev_id,
        FIRST_VALUE(page_id) AS page_id,
        FIRST_VALUE(performer.user_id) AS user_id,
        FIRST_VALUE(performer.user_registration_dt) AS user_registration_dt,
        FIRST_VALUE(rev_timestamp) AS rev_timestamp,
        MAX(IF(array_contains(tags, 'mw-reverted') AND
               (unix_timestamp(meta.dt, "yyyy-MM-dd'T'HH:mm:ss'Z'") -
                unix_timestamp(rev_timestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'") < 60*60*48), 1, 0)) AS was_reverted
    FROM event_sanitized.mediawiki_revision_tags_change
    WHERE {partition_statement}
    AND `database` IN ({wiki_list})
    AND ({known_user_database_expression})
    AND array_contains(tags, "newcomer task add link")
    -- Only counting edits made within four weeks of registration
    AND (unix_timestamp(rev_timestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'") -
                unix_timestamp(performer.user_registration_dt, "yyyy-MM-dd'T'HH:mm:ss'Z'") < 60*60*24*28)
    GROUP BY wiki, rev_id
),
user_edit_count AS (
    SELECT
        *,
        row_number() OVER (PARTITION BY wiki, user_id ORDER BY rev_timestamp) AS edit_number
    FROM edit_revert
)
SELECT
    rej.homepage_pageview_token,
    rej.wiki,
    rej.user_id,
    rej.is_mobile,
    rej.event_dt,
    rej.reg_dt,
    rej.rejection_reason,
    COALESCE(
        MAX(
            IF(unix_timestamp(rev_timestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'") <
                   unix_timestamp(rej.event_dt, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"),
                edit_number, NULL)
        ), 0) AS tagged_edit_count
FROM rejection_reasons AS rej
LEFT JOIN user_edit_count AS edits
ON rej.wiki = edits.wiki
AND rej.user_id = edits.user_id
GROUP BY rej.homepage_pageview_token, rej.wiki, rej.user_id, rej.is_mobile,
         rej.event_dt, rej.reg_dt, rej.rejection_reason
'''

In [106]:
## Fill in the query template and run it on Spark.
rejection_data = spark.run(
    rejection_query.format(
        ## Partition filters, one per table alias used in the query
        stimg_partition_statement = make_partition_statement(start_date, end_date, prefix = 'stimg'),
        hpv_partition_statement = make_partition_statement(start_date, end_date, prefix = 'hpv'),
        partition_statement = make_partition_statement(start_date, end_date),
        ## Quoted, comma-separated wiki names for the IN (...) clauses
        wiki_list = ','.join('"{}"'.format(w) for w in wikis),
        ## Known-user exclusions, one per table because the column names differ
        hpv_known_user_id_expression = make_known_users_sql(known_users, 'wiki', 'hpv.event.user_id'),
        known_userid_expression = make_known_users_sql(known_users, 'wiki', 'event.userid'),
        known_user_database_expression = make_known_users_sql(
            known_users, '`database`', 'performer.user_id'),
        ## Not referenced by the template; str.format ignores unused keywords
        known_user_id_expression = make_known_users_sql(known_users, 'wiki', 'event.user_id'),
    )
)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [107]:
## Number of rows returned by the query
len(rejection_data)

8598

In [None]:
## Eyeball the first 25 rows as a sanity check
rejection_data.head(25)

In [108]:
## Write the dataset to disk, keeping only the four listed columns and
## leaving out the row index.
rejection_data.to_csv(
    'datasets/rejection-reasons-2022-03-04.csv',
    columns = ['wiki', 'is_mobile', 'rejection_reason', 'tagged_edit_count'],
    index = False,
)