# Add an Image Funnel Analysis Part II

This is the second part of the funnel analysis, where we look at what happens after a user reaches "on task". Users have clicked "skip all" on either step 1, 2, or 3 of the onboarding dialogue, or clicked "get started" on step 4. What do they do with the suggestion, do they complete the caption, and do they complete the task?

In [1]:
import datetime as dt

import pandas as pd
import numpy as np

from collections import defaultdict

from wmfdata import spark, mariadb

from scipy import stats

In [91]:
## We'll gather data from December 2021 and January 2022, as we per the data gathering
## time have complete data for both months.

start_date = dt.date(2021, 12, 1)
end_date = dt.date(2022, 1, 31)

## List of wikis that we're gathering data from:
wikis = ['arwiki', 'bnwiki', 'cswiki']

## Name of the temporary table we use to only gather data
## for users who got "on task"
on_task_temp_table = 'on_task_newcomers'

## Lists of known users to ignore (e.g. test accounts and experienced users)
known_users = defaultdict(set)
known_users['cswiki'].update([14, 127629, 303170, 342147, 349875, 44133, 100304, 307410, 439792, 444907,
                              454862, 456272, 454003, 454846, 92295, 387915, 398470, 416764, 44751, 132801,
                              137787, 138342, 268033, 275298, 317739, 320225, 328302, 339583, 341191,
                              357559, 392634, 398626, 404765, 420805, 429109, 443890, 448195, 448438,
                              453220, 453628, 453645, 453662, 453663, 453664, 440694, 427497, 272273,
                              458025, 458487, 458049, 59563, 118067, 188859, 191908, 314640, 390445,
                              451069, 459434, 460802, 460885, 79895, 448735, 453176, 467557, 467745,
                              468502, 468583, 468603, 474052, 475184, 475185, 475187, 475188, 294174,
                              402906, 298011])

known_users['kowiki'].update([303170, 342147, 349875, 189097, 362732, 384066, 416362, 38759, 495265,
                              515553, 537326, 566963, 567409, 416360, 414929, 470932, 472019, 485036,
                              532123, 558423, 571587, 575553, 576758, 360703, 561281, 595100, 595105,
                              595610, 596025, 596651, 596652, 596653, 596654, 596655, 596993, 942,
                              13810, 536529])

known_users['viwiki'].update([451842, 628512, 628513, 680081, 680083, 680084, 680085, 680086, 355424,
                              387563, 443216, 682713, 659235, 700934, 705406, 707272, 707303, 707681, 585762])

known_users['arwiki'].update([237660, 272774, 775023, 1175449, 1186377, 1506091, 1515147, 1538902,
                              1568858, 1681813, 1683215, 1699418, 1699419, 1699425, 1740419, 1759328, 1763990])

## Grab the user IDs of known test accounts so they can be added to the exclusion list

def get_known_users(wiki):
    '''
    Get user IDs of known test accounts and return a set of them.
    '''
    
    username_patterns = ["MMiller", "Zilant", "Roan", "KHarlan", "MWang", "SBtest",
                         "Cloud", "Rho2019", "Test"]

    known_user_query = '''
SELECT user_id
FROM user
WHERE user_name LIKE "{name_pattern}%"
    '''
    
    known_users = set()
    
    for u_pattern in username_patterns:
        new_known = mariadb.run(known_user_query.format(
            name_pattern = u_pattern), wiki)
        known_users = known_users | set(new_known['user_id'])

    return(known_users)
        
for wiki in wikis:
    known_users[wiki] = known_users[wiki] | get_known_users(wiki)

## Helper Functions

In [3]:
def make_known_users_sql(kd, wiki_column, user_column):
    '''
    Based on the dictionary `kd` mapping wiki names to sets of user IDs of known users,
    create a SQL expression to exclude users based on the name of the wiki matching `wiki_column`
    and the user ID not matching `user_column`
    '''
    
    wiki_exp = '''({w_column} = '{wiki}' AND {u_column} NOT IN ({id_list}))'''
    
    expressions = list()

    ## Iteratively build the expression for each wiki
    for wiki_name, wiki_users in kd.items():
        expressions.append(wiki_exp.format(
            w_column = wiki_column,
            wiki = wiki_name,
            u_column = user_column,
            id_list = ','.join([str(u) for u in wiki_users])
        ))
    
    ## We then join all the expressions with an OR, and we're done.
    return(' OR '.join(expressions))
    

In [4]:
def make_partition_statement(start_ts, end_ts, prefix = ''):
    '''
    This takes the two timestamps and creates a statement that selects
    partitions based on `year`, `month`, and `day` in order to make our
    data gathering not use excessive amounts of data. It assumes that
    `start_ts` and `end_ts` are either in the same year, or if spanning
    a year boundary are within a month apart.
    This assumption simplifies the code and output a lot.
    
    An optional prefix can be set to enable selecting partitions for
    multiple tables with different aliases.
    
    :param start_ts: start timestamp
    :type start_ts: datetime.datetime
    
    :param end_ts: end timestamp
    :type end_ts: datetime.datetime
    
    :param prefix: prefix to use in front of partition clauses, "." is added automatically
    :type prefix: str
    '''
    
    if prefix:
        prefix = f'{prefix}.' # adds "." after the prefix
    
    # there are three cases:
    # 1: month and year are the same, output a "BETWEEN" statement with the days
    # 2: the years are the same, and the months differ by 1: output a statement for each month
    # 3: the years are the same: create a list of statements from start_ts.month to end_ts.month,
    #    return them OR'ed together
    # 4: the years differ by 1, start_ts is December and end_ts is January, do the same as #2
    # 5: anything else, raise an exception because this isn't implemented yet.
    
    if start_ts.year == end_ts.year and start_ts.month == end_ts.month:
        return(f'''{prefix}year = {start_ts.year}
AND {prefix}month = {start_ts.month}
AND {prefix}day BETWEEN {start_ts.day} AND {end_ts.day}''')
    elif start_ts.year == end_ts.year and (end_ts.month - start_ts.month) == 1:
        return(f'''
(
    ({prefix}year = {start_ts.year}
     AND {prefix}month = {start_ts.month}
     AND {prefix}day >= {start_ts.day})
 OR ({prefix}year = {end_ts.year}
     AND {prefix}month = {end_ts.month}
     AND {prefix}day <= {end_ts.day})
)''')
    elif start_ts.year == end_ts.year:
        # do the start month as a list
        parts = [f'''({prefix}year = {start_ts.year}
     AND {prefix}month = {start_ts.month}
     AND {prefix}day >= {start_ts.day})''']
        # for month +1 to end month, add each month
        for m in range(start_ts.month+1, end_ts.month):
            parts.append(f'''({prefix}year = {start_ts.year}
            AND {prefix}month = {m})''')
        # then append the end month and return a parenthesis OR'ed together of all of it
        parts.append(f'''({prefix}year = {end_ts.year}
     AND {prefix}month = {end_ts.month}
     AND {prefix}day <= {end_ts.day})''')
        return('({})'.format(
            '\nOR\n'.join(parts)
        ))
    elif (end_ts.year - start_ts.year) == 1 and start_ts.month == 12 and end_ts.month == 1:
        return(f'''
(
    ({prefix}year = {start_ts.year}
     AND {prefix}month = {start_ts.month}
     AND {prefix}day >= {start_ts.day})
 OR ({prefix}year = {end_ts.year}
     AND {prefix}month = {end_ts.month}
     AND {prefix}day <= {end_ts.day})
)''')
    else:
        raise Exception('Difference between start and end timestamps is not implemented. See code for details.')


In [5]:
def get_variant_data(wikis, variant_property = 'growthexperiments-homepage-variant'):
    '''
    Connects to the given wikis and queries for the value of the user property that
    defines what experiment variant the users are in. This can later be used to
    filter out all users who are not in a specific variant
    (e.g. "imagerecommendation" for Add an Image)
    '''
    
    variant_query = f'''
    SELECT
      DATABASE() AS wiki,
      up_user AS user_id,
      up_value AS variant_name
    FROM user_properties
    WHERE up_property = "{variant_property}"
    '''
    
    return(mariadb.run(variant_query, wikis))

In [6]:
def round_perc(x, y, prec = 1):
    return(round(100.0 * x / y, prec))

In [7]:
def round_perc_df(df_x, df_y, prec = 1):
    return(round(100.0 * len(df_x) / len(df_y), prec))

## Onboarding to On Task Funnel

In [80]:
onboarding_funnel_query = '''
WITH hp_visits AS (
    SELECT
        -- HomepageVisit is the authoritative source here, and we're grouping by
        -- homepage_pageview_token to deduplicate the multiple module impressions
        hpv.event.homepage_pageview_token,
        FIRST_VALUE(hpv.wiki) AS wiki,
        FIRST_VALUE(hpv.event.user_id) AS user_id,
        FIRST_VALUE(hpv.event.is_mobile) AS is_mobile,
        FIRST_VALUE(hpv.dt) AS visit_dt,
        FIRST_VALUE(1) AS homepage_visit,
        FIRST_VALUE(IF(ssac.event.userid IS NOT NULL, 1, 0)) AS is_newcomer,
        FIRST_VALUE(IF(unix_timestamp(hpv.dt, "yyyy-MM-dd'T'HH:mm:ss.SSSS'Z'") -
                        unix_timestamp(ssac.dt, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") < 60*60*24, 1, 0))
            AS is_24hr_visit
    FROM event.homepagevisit AS hpv
    JOIN event.homepagemodule AS hpm
    ON hpv.event.homepage_pageview_token = hpm.event.homepage_pageview_token
    LEFT JOIN event.serversideaccountcreation AS ssac
    ON hpv.wiki = ssac.wiki
    AND hpv.event.user_id = ssac.event.userid
    WHERE {hpv_partition_statement}
    AND hpv.wiki IN ({wiki_list})
    AND {hpv_known_user_id_expression}
    AND {hpm_partition_statement}
    AND {ssac_partition_statement}
    AND ssac.wiki IN ({wiki_list})
    AND hpv.event.is_mobile = true
    AND hpm.event.action = "impression"
    GROUP BY hpv.event.homepage_pageview_token
),
newcomer_tasks AS (
    -- grab unique task token/task type data from newcomer tasks
    SELECT
        DISTINCT event.newcomer_task_token, event.task_type, event.page_id
    FROM event.newcomertask
    WHERE {partition_statement}
),
addimage_task_clicks AS (
    -- clicks to Add an Image tasks
    SELECT
        event.homepage_pageview_token,
        dt AS click_dt,
        nt.page_id,
        row_number() OVER (PARTITION BY hpm.wiki, hpm.event.user_id ORDER BY hpm.dt) AS click_number
    FROM hp_visits AS hpv
    JOIN event.homepagemodule AS hpm
    ON hpv.homepage_pageview_token = hpm.event.homepage_pageview_token
    JOIN newcomer_tasks AS nt
    ON str_to_map(hpm.event.action_data, ";", "=")["newcomerTaskToken"] = nt.newcomer_task_token
    WHERE {partition_statement}
    AND hpm.wiki IN ({wiki_list})
    AND event.action IN ("se-task-click", "se-edit-button-click")
    AND nt.task_type = "image-recommendation"
    AND dt > hpv.visit_dt
),
first_task_click AS (
    SELECT
        *
    FROM addimage_task_clicks
    WHERE click_number = 1
),
nosuggestion_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN first_task_click AS ftc
    ON stimg.homepage_pageview_token = ftc.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "nosuggestions_dialog"
    AND action = "impression"
    AND stimg.dt > ftc.click_dt
    GROUP BY stimg.homepage_pageview_token
),
addimage_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN first_task_click AS ftc
    ON stimg.homepage_pageview_token = ftc.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface IN ("machinesuggestions_mode", "recommendedimagetoolbar_dialog")
    AND action = "impression"
    AND stimg.dt > ftc.click_dt
    GROUP BY stimg.homepage_pageview_token
),
onb_step1_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN addimage_impression AS ai
    ON stimg.homepage_pageview_token = ai.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_1_dialog"
    AND action = "impression"
    -- not limiting by timestamp because I'm unsure if it always occurs after
    -- the first impression of the interface
    GROUP BY stimg.homepage_pageview_token 
),
onb_step1_skip AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN onb_step1_impression AS onsi
    ON stimg.homepage_pageview_token = onsi.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_1_dialog"
    AND action = "skip_all"
    AND stimg.dt > onsi.event_dt
    GROUP BY stimg.homepage_pageview_token
),
onb_step2_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN onb_step1_impression AS onsi
    ON stimg.homepage_pageview_token = onsi.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_2_dialog"
    AND action = "impression"
    AND stimg.dt > onsi.event_dt
    GROUP BY stimg.homepage_pageview_token
),
onb_step2_skip AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN onb_step2_impression AS onsi
    ON stimg.homepage_pageview_token = onsi.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_2_dialog"
    AND action = "skip_all"
    AND stimg.dt > onsi.event_dt
    GROUP BY stimg.homepage_pageview_token
),
onb_step3_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN onb_step2_impression AS onsi
    ON stimg.homepage_pageview_token = onsi.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_3_dialog"
    AND action = "impression"
    AND stimg.dt > onsi.event_dt
    GROUP BY stimg.homepage_pageview_token
),
onb_step3_skip AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN onb_step3_impression AS onsi
    ON stimg.homepage_pageview_token = onsi.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_3_dialog"
    AND action = "skip_all"
    AND stimg.dt > onsi.event_dt
    GROUP BY stimg.homepage_pageview_token
),
onb_step4_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN onb_step3_impression AS onsi
    ON stimg.homepage_pageview_token = onsi.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_4_dialog"
    AND action = "impression"
    AND stimg.dt > onsi.event_dt
    GROUP BY stimg.homepage_pageview_token
),
onb_step4_getstarted AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN onb_step4_impression AS onsi
    ON stimg.homepage_pageview_token = onsi.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "onboarding_step_4_dialog"
    AND action = "get_started"
    AND stimg.dt > onsi.event_dt
    GROUP BY stimg.homepage_pageview_token
),
on_task AS (
    SELECT
        homepage_pageview_token,
        MIN(event_dt) AS event_dt
    FROM (
        SELECT homepage_pageview_token, event_dt
        FROM onb_step1_skip
        UNION ALL
        SELECT homepage_pageview_token, event_dt
        FROM onb_step2_skip
        UNION ALL
        SELECT homepage_pageview_token, event_dt
        FROM onb_step3_skip
        UNION ALL
        SELECT homepage_pageview_token, event_dt
        FROM onb_step4_getstarted
    ) AS ontask_events
    GROUP BY homepage_pageview_token
)
SELECT
    hpv.*,
    ftc.click_dt,
    ftc.click_number,
    ftc.page_id,
    IF(nosuggestion_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS nosuggestion_impression,
    nosuggestion_impression.event_dt AS nosuggestion_impression_dt,
    IF(addimage_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS addimage_impression,
    addimage_impression.event_dt AS addimage_impression_dt,
    IF(onb_step1_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step1_impression,
    onb_step1_impression.event_dt AS onboarding_step1_impression_dt,
    IF(onb_step1_skip.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step1_skipall,
    onb_step1_skip.event_dt AS onboarding_step1_skipall_dt,
    IF(onb_step2_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step2_impression,
    onb_step2_impression.event_dt AS onboarding_step2_impression_dt,
    IF(onb_step2_skip.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step2_skipall,
    onb_step2_skip.event_dt AS onboarding_step2_skipall_dt,
    IF(onb_step3_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step3_impression,
    onb_step3_impression.event_dt AS onboarding_step3_impression_dt,
    IF(onb_step3_skip.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step3_skipall,
    onb_step3_skip.event_dt AS onboarding_step3_skipall_dt,
    IF(onb_step4_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step4_impression,
    onb_step4_impression.event_dt AS onboarding_step4_impression_dt,
    IF(onb_step4_getstarted.homepage_pageview_token IS NOT NULL, 1, 0) AS onboarding_step4_getstarted,
    onb_step4_getstarted.event_dt AS onboarding_step4_getstarted_dt,
    IF(on_task.homepage_pageview_token IS NOT NULL, 1, 0) AS on_task,
    on_task.event_dt AS on_task_dt
FROM hp_visits AS hpv
LEFT JOIN first_task_click AS ftc
ON hpv.homepage_pageview_token = ftc.homepage_pageview_token
LEFT JOIN nosuggestion_impression
ON hpv.homepage_pageview_token = nosuggestion_impression.homepage_pageview_token
LEFT JOIN addimage_impression
ON hpv.homepage_pageview_token = addimage_impression.homepage_pageview_token
LEFT JOIN onb_step1_impression
ON hpv.homepage_pageview_token = onb_step1_impression.homepage_pageview_token
LEFT JOIN onb_step1_skip
ON hpv.homepage_pageview_token = onb_step1_skip.homepage_pageview_token
LEFT JOIN onb_step2_impression
ON hpv.homepage_pageview_token = onb_step2_impression.homepage_pageview_token
LEFT JOIN onb_step2_skip
ON hpv.homepage_pageview_token = onb_step2_skip.homepage_pageview_token
LEFT JOIN onb_step3_impression
ON hpv.homepage_pageview_token = onb_step3_impression.homepage_pageview_token
LEFT JOIN onb_step3_skip
ON hpv.homepage_pageview_token = onb_step3_skip.homepage_pageview_token
LEFT JOIN onb_step4_impression
ON hpv.homepage_pageview_token = onb_step4_impression.homepage_pageview_token
LEFT JOIN onb_step4_getstarted
ON hpv.homepage_pageview_token = onb_step4_getstarted.homepage_pageview_token
LEFT JOIN on_task
ON hpv.homepage_pageview_token = on_task.homepage_pageview_token
'''

## TODO

1. [x] Run the mobile onboarding funnel query.
2. [x] Join with variant data as before.
3. [x] Select mobile Add an Image newcomers first visit as before.
4. [x] Select those who got on task from the previous step.
5. [x] Create a PySpark dataframe from that.
6. [x] Register the dataframe as a temporary table.

In [81]:
mobile_onb_funnel_data = spark.run(
    onboarding_funnel_query.format(
        wiki_list = ','.join(['"{}"'.format(w) for w in wikis]),
        hpv_known_user_id_expression = make_known_users_sql(known_users, 'hpv.wiki', 'hpv.event.user_id'),
        hpv_partition_statement = make_partition_statement(start_date, end_date, 'hpv'),
        hpm_partition_statement = make_partition_statement(start_date, end_date, 'hpm'),
        ssac_partition_statement = make_partition_statement(start_date, end_date, 'ssac'),
        partition_statement = make_partition_statement(start_date, end_date),
    ), session_type = 'yarn-large'
)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [82]:
variant_data = get_variant_data(wikis)

In [83]:
variant_data['variant_name'] = variant_data['variant_name'].apply(lambda v: v.decode('utf-8'))

In [None]:
variant_data.head()

In [84]:
mobile_onb_funnel_data = mobile_onb_funnel_data.merge(variant_data,
                                                     on = ['wiki', 'user_id'])

## Discarding Invalid Sessions

Later investigation of task completion based on whether users skipped or completed onboarding revealed that we have some invalid sessions where users did both. We'll discard those sessions.

In [None]:
mobile_onb_funnel_data.loc[
    ((mobile_onb_funnel_data['onboarding_step1_skipall'] == 1) |
     (mobile_onb_funnel_data['onboarding_step2_skipall'] == 1) |
     (mobile_onb_funnel_data['onboarding_step3_skipall'] == 1)
    ) &
    (mobile_onb_funnel_data['onboarding_step4_getstarted'] == 1),
    'homepage_pageview_token'
]

In [85]:
def addimage_mobile_newcomers(df):
    return(df.loc[
        (df['variant_name'] == 'imagerecommendation') &
        (df['is_newcomer'] == 1) &
        (df['is_24hr_visit'] == 1) &
        (df['is_mobile'] == True)])

In [219]:
mob_newcomers = addimage_mobile_newcomers(mobile_onb_funnel_data.loc[
    ~mobile_onb_funnel_data['homepage_pageview_token'].isin(
        mobile_onb_funnel_data.loc[
            ((mobile_onb_funnel_data['onboarding_step1_skipall'] == 1) |
             (mobile_onb_funnel_data['onboarding_step2_skipall'] == 1) |
             (mobile_onb_funnel_data['onboarding_step3_skipall'] == 1)
            ) &
            (mobile_onb_funnel_data['onboarding_step4_getstarted'] == 1),
            'homepage_pageview_token'
        ]
    )
])

We limit the columns to the ones we need in the subsequent query, select for users who got "on task", make a copy, then enforce the `page_id` to `int` so that it has the right data type for the later join.

In [220]:
on_task_colnames = [
    'homepage_pageview_token', 'wiki', 'user_id', 'is_mobile',
    'click_dt', 'page_id', 'on_task', 'on_task_dt'
]

In [221]:
on_task_newcomers = mob_newcomers.loc[mob_newcomers['on_task'] == 1, on_task_colnames].copy()

In [222]:
on_task_newcomers['page_id'] = on_task_newcomers['page_id'].astype(int)

In [223]:
spark_session = spark.get_session()
on_task_sdf = spark_session.createDataFrame(on_task_newcomers)
on_task_sdf.createOrReplaceGlobalTempView(on_task_temp_table)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


## Funnel Definition

We have a complex definition of "on task", and a query that gives us that. What I want to do here is to rerun the onboarding funnel query but store the resulting dataframe on the cluster. Then I'll use that dataframe to query only the sessions of newcomers on mobile who got "on task", and from that identify what they did throughout the remainder of the session.

Let's define that funnel more specfiically:

1. User is on task.
2. Makes a decision (yes/no/not sure).
    1. User chose "not sure" (aka "skip"), confirms , and is returned to the Homepage/Suggested Edits.
    2. User chose "no", and confirms the rejection. Post-edit dialogue is shown with "Thanks for reviewing…" message.
    3. User chose "yes", and is taken to the caption help.
3. User writes a caption, clicks "Publish changes…"
4. User writes an edit summary and clicks "Publish changes" to complete the task.
5. Post-edit dialogue is shown with "You've published an edit" message.
    1. Edit is rejected within 48 hours.
    2. User sees post-edit dialogue, makes choices.

Similarly as for the onboarding funnel, we have multiple ways to task completion, and so they'll come together in a `UNION` step to make aggregation of those straigthforward if we want to do so.

There are two ways into the post-edit dialogue, one is by rejecting the suggestion, the other is by completing the task. Options at the post-edit dialogue step are the same. We'll most likely want to make this into two separate funnels so we can identify differences in behaviour at that step based on how users got there.

In [224]:
task_completion_funnel_query = '''
WITH on_task_sessions AS (
    SELECT
        wiki,
        user_id,
        page_id,
        homepage_pageview_token,
        on_task_dt AS event_dt
    FROM global_temp.{on_task_table}
),
choose_unsure AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN on_task_sessions
    ON stimg.homepage_pageview_token = on_task_sessions.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "recommendedimagetoolbar_dialog"
    AND action = "suggestion_skip"
    AND dt > on_task_sessions.event_dt
    GROUP BY stimg.homepage_pageview_token
),
confirm_skip AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN choose_unsure
    ON stimg.homepage_pageview_token = choose_unsure.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "skip_dialog"
    AND action = "confirm_skip_suggestion"
    AND dt > choose_unsure.event_dt
    GROUP BY stimg.homepage_pageview_token
),
choose_reject AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN on_task_sessions
    ON stimg.homepage_pageview_token = on_task_sessions.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "recommendedimagetoolbar_dialog"
    AND action = "suggestion_reject"
    AND dt > on_task_sessions.event_dt
    GROUP BY stimg.homepage_pageview_token
),
rejection_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN choose_reject
    ON stimg.homepage_pageview_token = choose_reject.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "rejection_dialog"
    AND action = "impression"
    AND dt > choose_reject.event_dt
    GROUP BY stimg.homepage_pageview_token
),
confirm_reject AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN rejection_impression
    ON stimg.homepage_pageview_token = rejection_impression.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "rejection_dialog"
    AND action = "close"
    AND str_to_map(action_data, ";", "=")["acceptance_state"] = "rejected"
    AND dt > rejection_impression.event_dt
    GROUP BY stimg.homepage_pageview_token
),
choose_accept AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN on_task_sessions
    ON stimg.homepage_pageview_token = on_task_sessions.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "recommendedimagetoolbar_dialog"
    AND action = "suggestion_accept"
    AND dt > on_task_sessions.event_dt
    GROUP BY stimg.homepage_pageview_token
),
captioninfo_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN choose_accept
    ON stimg.homepage_pageview_token = choose_accept.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "captioninfo_dialog"
    AND action = "impression"
    AND dt > choose_accept.event_dt
    GROUP BY stimg.homepage_pageview_token
),
captioninfo_close AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN captioninfo_impression
    ON stimg.homepage_pageview_token = captioninfo_impression.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "captioninfo_dialog"
    AND action = "close"
    AND dt > captioninfo_impression.event_dt
    GROUP BY stimg.homepage_pageview_token
),
caption_entry_focus AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN captioninfo_close
    ON stimg.homepage_pageview_token = captioninfo_close.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "caption_entry"
    AND action = "focus"
    AND dt > captioninfo_close.event_dt
    GROUP BY stimg.homepage_pageview_token
),
editsummary_impression AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN caption_entry_focus
    ON stimg.homepage_pageview_token = caption_entry_focus.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "editsummary_dialog"
    AND action = "impression"
    AND dt > caption_entry_focus.event_dt
    GROUP BY stimg.homepage_pageview_token
),
editsummary_save AS (
    SELECT
        stimg.homepage_pageview_token,
        MIN(dt) AS event_dt
    FROM event.mediawiki_structured_task_article_image_suggestion_interaction AS stimg
    JOIN editsummary_impression
    ON stimg.homepage_pageview_token = editsummary_impression.homepage_pageview_token
    WHERE {partition_statement}
    AND active_interface = "editsummary_dialog"
    AND action = "editsummary_save"
    AND dt > editsummary_impression.event_dt
    GROUP BY stimg.homepage_pageview_token
),
edit_revert AS (
    -- edits tagged with add image (wiki, user_id, page_id, timestamp)
    -- whether the edit was reverted within 48 hours
    -- should be able to match editsummary_save event within say 30 seconds?
    SELECT
        `database` AS wiki,
        rev_id,
        FIRST_VALUE(page_id) AS page_id,
        FIRST_VALUE(performer.user_id) AS user_id,
        FIRST_VALUE(rev_timestamp) AS rev_timestamp,
        MAX(IF(array_contains(tags, 'mw-reverted') AND
               (unix_timestamp(meta.dt, "yyyy-MM-dd'T'HH:mm:ss'Z'") -
                unix_timestamp(rev_timestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'") < 60*60*48), 1, 0)) AS was_reverted
    FROM event_sanitized.mediawiki_revision_tags_change
    WHERE {partition_statement}
    AND `database` IN ({wiki_list})
    AND ({known_user_database_expression})
    AND array_contains(tags, "newcomer task image suggestion")
    GROUP BY wiki, rev_id
),
completed_task AS (
    -- union of confirm_reject and editsummary_save
    -- as both of those paths leads to the post-edit dialogue
    SELECT
        homepage_pageview_token,
        MIN(event_dt) AS event_dt
    FROM (
        SELECT homepage_pageview_token, event_dt
        FROM confirm_reject
        UNION ALL
        SELECT homepage_pageview_token, event_dt
        FROM editsummary_save
    ) AS comptask_events
    GROUP BY homepage_pageview_token
)
SELECT
    ots.*,
    IF(choose_unsure.homepage_pageview_token IS NOT NULL, 1, 0) AS chose_unsure,
    choose_unsure.event_dt AS chose_unsure_dt,
    IF(confirm_skip.homepage_pageview_token IS NOT NULL, 1, 0) AS confirmed_skip,
    confirm_skip.event_dt AS confirmed_skip_dt,
    IF(choose_reject.homepage_pageview_token IS NOT NULL, 1, 0) AS chose_reject,
    choose_reject.event_dt AS chose_reject_dt,
    IF(rejection_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS rejection_dialog_impression,
    rejection_impression.event_dt AS rejection_dialog_impression_dt,
    IF(confirm_reject.homepage_pageview_token IS NOT NULL, 1, 0) AS confirmed_reject,
    confirm_reject.event_dt AS confirmed_reject_dt,
    IF(choose_accept.homepage_pageview_token IS NOT NULL, 1, 0) AS choose_accept,
    choose_accept.event_dt AS choose_accept_dt,
    IF(captioninfo_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS captioninfo_dialog_impression,
    captioninfo_impression.event_dt AS captioninfo_dialog_impression_dt,
    IF(captioninfo_close.homepage_pageview_token IS NOT NULL, 1, 0) AS closed_captioninfo_dialog,
    captioninfo_close.event_dt AS closed_captioninfo_dialog_dt,
    IF(caption_entry_focus.homepage_pageview_token IS NOT NULL, 1, 0) AS caption_entry_focused,
    caption_entry_focus.event_dt AS caption_entry_focused_dt,
    IF(editsummary_impression.homepage_pageview_token IS NOT NULL, 1, 0) AS editsummary_dialog_impression,
    editsummary_impression.event_dt AS editsummary_dialog_impression_dt,
    IF(editsummary_save.homepage_pageview_token IS NOT NULL, 1, 0) AS editsummary_chose_save,
    editsummary_save.event_dt AS editsummary_chose_save_dt,
    IF(edit_revert.page_id IS NOT NULL, 1, 0) AS tagged_edit_saved,
    edit_revert.rev_timestamp AS tagged_edit_saved_dt,
    COALESCE(edit_revert.was_reverted, 0) AS tagged_edit_reverted,
    IF(completed_task.homepage_pageview_token IS NOT NULL, 1, 0) AS task_completed,
    completed_task.event_dt AS task_completed_dt
FROM on_task_sessions AS ots
LEFT JOIN choose_unsure
ON ots.homepage_pageview_token = choose_unsure.homepage_pageview_token
LEFT JOIN confirm_skip
ON ots.homepage_pageview_token = confirm_skip.homepage_pageview_token
LEFT JOIN choose_reject
ON ots.homepage_pageview_token = choose_reject.homepage_pageview_token
LEFT JOIN rejection_impression
ON ots.homepage_pageview_token = rejection_impression.homepage_pageview_token
LEFT JOIN confirm_reject
ON ots.homepage_pageview_token = confirm_reject.homepage_pageview_token
LEFT JOIN choose_accept
ON ots.homepage_pageview_token = choose_accept.homepage_pageview_token
LEFT JOIN captioninfo_impression
ON ots.homepage_pageview_token = captioninfo_impression.homepage_pageview_token
LEFT JOIN captioninfo_close
ON ots.homepage_pageview_token = captioninfo_close.homepage_pageview_token
LEFT JOIN caption_entry_focus
ON ots.homepage_pageview_token = caption_entry_focus.homepage_pageview_token
LEFT JOIN editsummary_impression
ON ots.homepage_pageview_token = editsummary_impression.homepage_pageview_token
LEFT JOIN editsummary_save
ON ots.homepage_pageview_token = editsummary_save.homepage_pageview_token
LEFT JOIN edit_revert
ON ots.wiki = edit_revert.wiki
AND ots.user_id = edit_revert.user_id
AND ots.page_id = edit_revert.page_id
LEFT JOIN completed_task
ON ots.homepage_pageview_token = completed_task.homepage_pageview_token
WHERE (edit_revert.page_id IS NULL
       OR editsummary_save.homepage_pageview_token IS NULL
       -- within 1 minute
       OR abs(unix_timestamp(edit_revert.rev_timestamp, "yyyy-MM-dd'T'HH:mm:ss'Z'") -
              unix_timestamp(editsummary_save.event_dt, "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")) < 60)             
'''

In [225]:
task_completion_data = spark.run(
    task_completion_funnel_query.format(
        wiki_list = ','.join(['"{}"'.format(w) for w in wikis]),
        hpv_known_user_id_expression = make_known_users_sql(known_users, 'hpv.wiki', 'hpv.event.user_id'),
        hpv_partition_statement = make_partition_statement(start_date, end_date, 'hpv'),
        hpm_partition_statement = make_partition_statement(start_date, end_date, 'hpm'),
        ssac_partition_statement = make_partition_statement(start_date, end_date, 'ssac'),
        partition_statement = make_partition_statement(start_date, end_date),
        on_task_table = on_task_temp_table,
        known_user_database_expression = make_known_users_sql(known_users,
                                                              '`database`', 'performer.user_id'),
    ), session_type = 'yarn-large'
)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [None]:
task_completion_data.head()

## Inspecting Erroneous Conditions

Confirming to skip the suggestion should end the session and take the user back to Suggested Edits. Confirming to reject the suggestion should end the session and show the post-edit dialogue. Saving the edit works similarly as rejecting the suggestion. This gives us three combinations of pairs of conditions that should not occur in the same session.

In [None]:
task_completion_data.loc[
    (task_completion_data['confirmed_skip'] == 1) &
    (task_completion_data['confirmed_reject'] == 1),
    ['homepage_pageview_token',
     'chose_unsure', 'chose_unsure_dt', 'confirmed_skip', 'confirmed_skip_dt',
     'chose_reject', 'confirmed_reject_dt', 'editsummary_chose_save', 'editsummary_chose_save_dt']
]

In [None]:
task_completion_data.loc[
    (task_completion_data['confirmed_skip'] == 1) &
    (task_completion_data['editsummary_chose_save'] == 1),
    ['chose_unsure', 'chose_unsure_dt', 'confirmed_skip', 'confirmed_skip_dt',
     'chose_reject', 'confirmed_reject_dt', 'editsummary_chose_save', 'editsummary_chose_save_dt']
]

In [None]:
task_completion_data.loc[
    (task_completion_data['confirmed_reject'] == 1) &
    (task_completion_data['editsummary_chose_save'] == 1),
    ['chose_unsure', 'chose_unsure_dt', 'confirmed_skip', 'confirmed_skip_dt',
     'chose_reject', 'confirmed_reject_dt', 'editsummary_chose_save', 'editsummary_chose_save_dt']
]

In [None]:
task_completion_data.loc[
    (task_completion_data['confirmed_reject'] == 1) &
    (task_completion_data['editsummary_chose_save'] == 1) &
    (task_completion_data['confirmed_skip'] == 1),
    ['chose_unsure', 'chose_unsure_dt', 'confirmed_skip', 'confirmed_skip_dt',
     'chose_reject', 'confirmed_reject_dt', 'editsummary_chose_save', 'editsummary_chose_save_dt']
]

We find no session has all three, but there are sessions that have combinations of these. This indicates that we have some data that appears to be combinations of multiple sessions. One approach to dealing with these could be to identify the first of these events and discard anything after it. Another approach would be to discard everything from these sessions.

In this case I choose to do the latter, even though that means that we should also discard them from the analysis of the onboarding funnel in order to keep the numbers consistent. We'll live with this for now, and in future analyses perhaps have a look at whether a Homepage session (as defined by its token) has multiple task clicks. That would allow us to discard them earlier.

## Valid On Task Sessions

In [226]:
valid_ontask_sessions = task_completion_data.loc[
    ~(
        ((task_completion_data['confirmed_skip'] == 1) &
         (task_completion_data['confirmed_reject'] == 1))
        |
        ((task_completion_data['confirmed_reject'] == 1) &
         (task_completion_data['editsummary_chose_save'] == 1))
        |
        ((task_completion_data['confirmed_skip'] == 1) &
         (task_completion_data['editsummary_chose_save'] == 1))
    )
]

In [229]:
len(valid_ontask_sessions)

1092

## Link Decision

For calculations of proportions out of users who clicked a task (and didn't get the "no suggestions" dialogue, we can conveniently calculate that from the `mob_newcomers` dataset we gathered earlier:

In [227]:
len(
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

1454

Number of users who clicked "not sure", and proportion out of all On Task users, and out of users who clicked a task:

In [228]:
len(valid_ontask_sessions.loc[valid_ontask_sessions['chose_unsure'] == 1])

138

In [230]:
round_perc_df(
    valid_ontask_sessions.loc[valid_ontask_sessions['chose_unsure'] == 1],
    valid_ontask_sessions
)

12.6

In [231]:
round_perc_df(
    valid_ontask_sessions.loc[valid_ontask_sessions['chose_unsure'] == 1],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

9.5

Number of users who clicked to reject the suggestion, and proportion out of all On Task users, and out of users who clicked a task:

In [232]:
len(valid_ontask_sessions.loc[valid_ontask_sessions['chose_reject'] == 1])

306

In [233]:
round_perc_df(
    valid_ontask_sessions.loc[valid_ontask_sessions['chose_reject'] == 1],
    valid_ontask_sessions
)

28.0

In [234]:
round_perc_df(
    valid_ontask_sessions.loc[valid_ontask_sessions['chose_reject'] == 1],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

21.0

Number of users who accepted the suggestion, and proportion out of all On Task users, and out of users who clicked a task:

In [235]:
len(valid_ontask_sessions.loc[valid_ontask_sessions['choose_accept'] == 1])

422

In [236]:
round_perc_df(
    valid_ontask_sessions.loc[valid_ontask_sessions['choose_accept'] == 1],
    valid_ontask_sessions
)

38.6

In [237]:
round_perc_df(
    valid_ontask_sessions.loc[valid_ontask_sessions['choose_accept'] == 1],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

29.0

In [238]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_unsure'] == 1) |
        (valid_ontask_sessions['chose_reject'] == 1) |
        (valid_ontask_sessions['choose_accept'] == 1)
    ]
)

717

In [239]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_unsure'] == 1) |
        (valid_ontask_sessions['chose_reject'] == 1) |
        (valid_ontask_sessions['choose_accept'] == 1)
    ],
    valid_ontask_sessions
)

65.7

In [240]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_unsure'] == 1) |
        (valid_ontask_sessions['chose_reject'] == 1) |
        (valid_ontask_sessions['choose_accept'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

49.3

The bounce rate is then the users who didn't do any of these things:

In [241]:
(
    len(valid_ontask_sessions) -
    len(
        valid_ontask_sessions.loc[
            (valid_ontask_sessions['chose_unsure'] == 1) |
            (valid_ontask_sessions['chose_reject'] == 1) |
            (valid_ontask_sessions['choose_accept'] == 1)
        ]
    )
)

375

In [242]:
round_perc(
    len(valid_ontask_sessions) -
    len(
        valid_ontask_sessions.loc[
            (valid_ontask_sessions['chose_unsure'] == 1) |
            (valid_ontask_sessions['chose_reject'] == 1) |
            (valid_ontask_sessions['choose_accept'] == 1)
        ]
    ),
    len(valid_ontask_sessions)
)

34.3

In [243]:
round_perc(
    len(valid_ontask_sessions) -
    len(
        valid_ontask_sessions.loc[
            (valid_ontask_sessions['chose_unsure'] == 1) |
            (valid_ontask_sessions['chose_reject'] == 1) |
            (valid_ontask_sessions['choose_accept'] == 1)
        ]
    ),
    len(mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ])
)

25.8

## Task Review for Skip and Reject Sessions

We'll treat these separately from other sessions and exclude them from further analysis. In other words, while users in these sessions might've accepted the suggestion, gone to the caption stage, then gone back to reject it and confirm the rejection, we'll ignore "accept then caption" part of their session when we later look at that part of the funnel. Getting a deeper understanding of this kind of behaviour would be possible if we switch from a funnel perspective to a flow perspective, meaning that we only look at pairwise transitions between states in the workflow. We'll leave that for a future analysis.

We'll give "not sure" precedence over rejecting the suggestion, because the former takes the user back to Suggested Edits on the Homepage, whereas the latter workflow has a confirmation step and the post-edit dialogue. Later on, we'll analyze the post-edit dialogue separately.

In [244]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_unsure'] == 1) &
        (valid_ontask_sessions['confirmed_skip'] == 1)
    ]
)

103

In [245]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_unsure'] == 1) &
        (valid_ontask_sessions['confirmed_skip'] == 1)
    ],
    valid_ontask_sessions.loc[valid_ontask_sessions['chose_unsure'] == 1]
)

74.6

In [246]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_unsure'] == 1) &
        (valid_ontask_sessions['confirmed_skip'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

7.1

The rejection path means they see the rejection dialog, then confirms it.

In [247]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_reject'] == 1) &
        (valid_ontask_sessions['rejection_dialog_impression'] == 1)
    ]
)

306

In this case, everyone who clicked "reject" also saw the dialog, so that makes proportion calculations easy. 

*Note:* After doing the reject path I noticed that I did not have the `skip_dialog` impression in the skip path. Since we saw that all the users who clicked also saw the rejection dialogue, I've similarly forwarded all the users who clicked "not sure" to also have seen the skip dialogue.

Let's look at how many confirmed once they saw the rejection dialog:

In [248]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_reject'] == 1) &
        (valid_ontask_sessions['rejection_dialog_impression'] == 1) &
        (valid_ontask_sessions['confirmed_reject'] == 1)
    ]
)

181

Proportion out of all users who saw the rejection dialogue, then out of all users who clicked a task:

In [249]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_reject'] == 1) &
        (valid_ontask_sessions['rejection_dialog_impression'] == 1) &
        (valid_ontask_sessions['confirmed_reject'] == 1)
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_reject'] == 1) &
        (valid_ontask_sessions['rejection_dialog_impression'] == 1)
    ]
)

59.2

In [250]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_reject'] == 1) &
        (valid_ontask_sessions['rejection_dialog_impression'] == 1) &
        (valid_ontask_sessions['confirmed_reject'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

12.4

The acceptance path is the main path with several steps. First is seeing the caption dialog:

In [251]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1)
    ]
)

417

Unlike the rejection dialogue, not all users saw the caption info dialogue. This is somewhat concerning, although we note that it's about 2% of the users.

Proportion out of all users who clicked "yes", then out of all users who clicked a task:

In [252]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1)
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1)
    ]
)

98.8

In [253]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

28.7

After seeing the caption dialogue, the user must close it:

In [255]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1)
    ]
)

413

Proportion out of users who saw the dialogue and users who clicked a task:

In [256]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1)
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1)
    ]
)

99.0

In [257]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

28.4

To enter the caption, the caption entry has to get focus:

In [258]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1)
    ]
)

157

Proportion out of users who closed the caption dialogue and users who clicked a task:

In [259]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1)
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1)
    ]
)

38.0

In [260]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

10.8

After entering a caption, the user can click "Publish…" to bring up the edit summary. In the funnel query above, getting to the edit summary dialogue *requires* having focused on the caption entry box. I'm verifying with the team whether that's reasonable.

In [261]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
    ]
)

109

Proportion out of users who focused on the caption entry box, and out of all users who clicked a task:

In [262]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1)
    ]
)

69.4

In [263]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

7.5

Total number of users who go to the "Task review" stage, meaning they either saw the Skip or Reject dialogues, or reached the Edit summary after entering a caption. Hence it's a union of these three:

In [264]:
len(
    valid_ontask_sessions.loc[
        ( # got to the Edit Summary
            (valid_ontask_sessions['choose_accept'] == 1) &
            (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
            (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
            (valid_ontask_sessions['caption_entry_focused'] == 1) &
            (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
        ) |
        ( # Saw the Rejection dialogue
            (valid_ontask_sessions['chose_reject'] == 1) &
            (valid_ontask_sessions['rejection_dialog_impression'] == 1)
        ) |
        ( # Saw the Skip dialogue
            (valid_ontask_sessions['chose_unsure'] == 1)
        )
    ]
)

510

Proportion out of all users who made some kind of link decision, and out of users who clicked a task:

In [265]:
round_perc_df(
    valid_ontask_sessions.loc[
        ( # got to the Edit Summary
            (valid_ontask_sessions['choose_accept'] == 1) &
            (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
            (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
            (valid_ontask_sessions['caption_entry_focused'] == 1) &
            (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
        ) |
        ( # Saw the Rejection dialogue
            (valid_ontask_sessions['chose_reject'] == 1) &
            (valid_ontask_sessions['rejection_dialog_impression'] == 1)
        ) |
        ( # Saw the Skip dialogue
            (valid_ontask_sessions['chose_unsure'] == 1)
        )
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['chose_unsure'] == 1) |
        (valid_ontask_sessions['chose_reject'] == 1) |
        (valid_ontask_sessions['choose_accept'] == 1)
    ]
)

71.1

In [266]:
round_perc_df(
    valid_ontask_sessions.loc[
        ( # got to the Edit Summary
            (valid_ontask_sessions['choose_accept'] == 1) &
            (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
            (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
            (valid_ontask_sessions['caption_entry_focused'] == 1) &
            (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
        ) |
        ( # Saw the Rejection dialogue
            (valid_ontask_sessions['chose_reject'] == 1) &
            (valid_ontask_sessions['rejection_dialog_impression'] == 1)
        ) |
        ( # Saw the Skip dialogue
            (valid_ontask_sessions['chose_unsure'] == 1)
        )
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

35.1

Number of users who bounced after a link decision is then the difference between users who made one and users who saw the task review dialogues:

In [267]:
(
    len(
        valid_ontask_sessions.loc[
            (valid_ontask_sessions['chose_unsure'] == 1) |
            (valid_ontask_sessions['chose_reject'] == 1) |
            (valid_ontask_sessions['choose_accept'] == 1)
        ]    
    ) -
    len(
        valid_ontask_sessions.loc[
            ( # got to the Edit Summary
                (valid_ontask_sessions['choose_accept'] == 1) &
                (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
                (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
                (valid_ontask_sessions['caption_entry_focused'] == 1) &
                (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
            ) |
            ( # Saw the Rejection dialogue
                (valid_ontask_sessions['chose_reject'] == 1) &
                (valid_ontask_sessions['rejection_dialog_impression'] == 1)
            ) |
            ( # Saw the Skip dialogue
                (valid_ontask_sessions['chose_unsure'] == 1)
            )
        ]
    )
)

207

Bounce proportion out of users who made a link decision and users who clicked a task:

In [268]:
round_perc(
    len(
        valid_ontask_sessions.loc[
            (valid_ontask_sessions['chose_unsure'] == 1) |
            (valid_ontask_sessions['chose_reject'] == 1) |
            (valid_ontask_sessions['choose_accept'] == 1)
        ]    
    ) -
    len(
        valid_ontask_sessions.loc[
            ( # got to the Edit Summary
                (valid_ontask_sessions['choose_accept'] == 1) &
                (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
                (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
                (valid_ontask_sessions['caption_entry_focused'] == 1) &
                (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
            ) |
            ( # Saw the Rejection dialogue
                (valid_ontask_sessions['chose_reject'] == 1) &
                (valid_ontask_sessions['rejection_dialog_impression'] == 1)
            ) |
            ( # Saw the Skip dialogue
                (valid_ontask_sessions['chose_unsure'] == 1)
            )
        ]
    ),
    len(
        valid_ontask_sessions.loc[
            (valid_ontask_sessions['chose_unsure'] == 1) |
            (valid_ontask_sessions['chose_reject'] == 1) |
            (valid_ontask_sessions['choose_accept'] == 1)
        ]    
    )
)

28.9

In [269]:
round_perc(
    len(
        valid_ontask_sessions.loc[
            (valid_ontask_sessions['chose_unsure'] == 1) |
            (valid_ontask_sessions['chose_reject'] == 1) |
            (valid_ontask_sessions['choose_accept'] == 1)
        ]    
    ) -
    len(
        valid_ontask_sessions.loc[
            ( # got to the Edit Summary
                (valid_ontask_sessions['choose_accept'] == 1) &
                (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
                (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
                (valid_ontask_sessions['caption_entry_focused'] == 1) &
                (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
            ) |
            ( # Saw the Rejection dialogue
                (valid_ontask_sessions['chose_reject'] == 1) &
                (valid_ontask_sessions['rejection_dialog_impression'] == 1)
            ) |
            ( # Saw the Skip dialogue
                (valid_ontask_sessions['chose_unsure'] == 1)
            )
        ]
    ),
    len(
        mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
    )
)

14.2

The last step in the "Yes" path is to save the edit:

In [270]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ]
)

99

Proportion out of users who saw the edit summary dialogue, and out of all users who clicked a task:

In [271]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
    ]
)

90.8

In [272]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

6.8

Total number of users who completed the task is the union of users who confirmed skip, reject, or saved the edit:

In [273]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['confirmed_skip'] == 1) |
        (valid_ontask_sessions['confirmed_reject'] == 1) |
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ]
)

383

Proportion out of all users who got to the task review stage, and out of all users who clicked a task:

In [274]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['confirmed_skip'] == 1) |
        (valid_ontask_sessions['confirmed_reject'] == 1) |
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ],
    valid_ontask_sessions.loc[
        ( # got to the Edit Summary
            (valid_ontask_sessions['choose_accept'] == 1) &
            (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
            (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
            (valid_ontask_sessions['caption_entry_focused'] == 1) &
            (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
        ) |
        ( # Saw the Rejection dialogue
            (valid_ontask_sessions['chose_reject'] == 1) &
            (valid_ontask_sessions['rejection_dialog_impression'] == 1)
        ) |
        ( # Saw the Skip dialogue
            (valid_ontask_sessions['chose_unsure'] == 1)
        )
    ]
)

75.1

In [275]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['confirmed_skip'] == 1) |
        (valid_ontask_sessions['confirmed_reject'] == 1) |
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ],
    mob_newcomers.loc[
        (mob_newcomers['click_number'] > 0) &
        (mob_newcomers['nosuggestion_impression'] == 0)
    ]
)

26.3

The bounce rate for task completion then becomes the difference:

In [276]:
(
    len(valid_ontask_sessions.loc[
        ( # got to the Edit Summary
            (valid_ontask_sessions['choose_accept'] == 1) &
            (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
            (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
            (valid_ontask_sessions['caption_entry_focused'] == 1) &
            (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
        ) |
        ( # Saw the Rejection dialogue
            (valid_ontask_sessions['chose_reject'] == 1) &
            (valid_ontask_sessions['rejection_dialog_impression'] == 1)
        ) |
        ( # Saw the Skip dialogue
            (valid_ontask_sessions['chose_unsure'] == 1)
        )
    ]) -
    len(valid_ontask_sessions.loc[
        (valid_ontask_sessions['confirmed_skip'] == 1) |
        (valid_ontask_sessions['confirmed_reject'] == 1) |
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ])
)

127

And the proportions:

In [277]:
round_perc(
    len(
        valid_ontask_sessions.loc[
            ( # got to the Edit Summary
                (valid_ontask_sessions['choose_accept'] == 1) &
                (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
                (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
                (valid_ontask_sessions['caption_entry_focused'] == 1) &
                (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
            ) |
            ( # Saw the Rejection dialogue
                (valid_ontask_sessions['chose_reject'] == 1) &
                (valid_ontask_sessions['rejection_dialog_impression'] == 1)
            ) |
            ( # Saw the Skip dialogue
                (valid_ontask_sessions['chose_unsure'] == 1)
            )
        ]
    ) -
    len(valid_ontask_sessions.loc[
        (valid_ontask_sessions['confirmed_skip'] == 1) |
        (valid_ontask_sessions['confirmed_reject'] == 1) |
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ]),
    len(
        valid_ontask_sessions.loc[
            ( # got to the Edit Summary
                (valid_ontask_sessions['choose_accept'] == 1) &
                (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
                (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
                (valid_ontask_sessions['caption_entry_focused'] == 1) &
                (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
            ) |
            ( # Saw the Rejection dialogue
                (valid_ontask_sessions['chose_reject'] == 1) &
                (valid_ontask_sessions['rejection_dialog_impression'] == 1)
            ) |
            ( # Saw the Skip dialogue
                (valid_ontask_sessions['chose_unsure'] == 1)
            )
        ]
    )
)

24.9

In [278]:
round_perc(
    len(valid_ontask_sessions.loc[
        ( # got to the Edit Summary
            (valid_ontask_sessions['choose_accept'] == 1) &
            (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
            (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
            (valid_ontask_sessions['caption_entry_focused'] == 1) &
            (valid_ontask_sessions['editsummary_dialog_impression'] == 1)
        ) |
        ( # Saw the Rejection dialogue
            (valid_ontask_sessions['chose_reject'] == 1) &
            (valid_ontask_sessions['rejection_dialog_impression'] == 1)
        ) |
        ( # Saw the Skip dialogue
            (valid_ontask_sessions['chose_unsure'] == 1)
        )
    ]) -
    len(valid_ontask_sessions.loc[
        (valid_ontask_sessions['confirmed_skip'] == 1) |
        (valid_ontask_sessions['confirmed_reject'] == 1) |
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ]),
    len(
        mob_newcomers.loc[
            (mob_newcomers['click_number'] > 0) &
            (mob_newcomers['nosuggestion_impression'] == 0)
        ]
    )
)

8.7

Lastly, the number of reverts and the revert rate for saved edits. In hindsight, we might be able to get the exact revision ID we should look for through EditAttemptStep, as it's found in that schema's `revision_id` field. For now, we're joining on wiki, user ID, page ID, and restricting it by time. As seen below, we appear to have lost 3 edits that way, or about 3% of our data.

In [280]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1)
    ]
)

99

In [281]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1) &
        (valid_ontask_sessions['tagged_edit_saved'] == 1)
    ]
)

96

Number of those edits that got reverted:

In [282]:
len(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1) &
        (valid_ontask_sessions['tagged_edit_saved'] == 1) &
        (valid_ontask_sessions['tagged_edit_reverted'] == 1)
    ]
)

13

And then we get the overall revert rate:

In [283]:
round_perc_df(
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1) &
        (valid_ontask_sessions['tagged_edit_saved'] == 1) &
        (valid_ontask_sessions['tagged_edit_reverted'] == 1)
    ],
    valid_ontask_sessions.loc[
        (valid_ontask_sessions['choose_accept'] == 1) &
        (valid_ontask_sessions['captioninfo_dialog_impression'] == 1) &
        (valid_ontask_sessions['closed_captioninfo_dialog'] == 1) &
        (valid_ontask_sessions['caption_entry_focused'] == 1) &
        (valid_ontask_sessions['editsummary_dialog_impression'] == 1) &
        (valid_ontask_sessions['editsummary_chose_save'] == 1) &
        (valid_ontask_sessions['tagged_edit_saved'] == 1)
    ]
)

13.5