# Analytics for Section Level Image Suggestions Notifications

[T331290](https://phabricator.wikimedia.org/T331290)

# Purpose

Section-level image suggestions were sent out on July 5th, 2023. These suggestions are sent weekly as notifications to experienced contributors for pages on their watchlist, and are page-level, section-level, or bundled together.

In this report, we will take a look at data from July 2023 for Structured Content's pilot wikis (CA, PT, RU, ID, NO, HU and FI) and answer the following questions:

- Number of notifications sent
    - Filtered by article vs section-level suggestion
- Percentage of notifications read
    - Filtered by article vs section-level suggestion
- Notification opt-out rate
- Number of images suggested that are added to the matched article within a month of receiving the notification
    - Filtered by article vs section-level suggestion
- Revert rate for image additions
    - Filtered by article vs section-level suggestion

# Data Preparation

In [1]:
import re

from wmfdata import hive, mariadb, spark
import wmfdata 

import math
import pandas as pd
import numpy as np

from datetime import datetime, timedelta, date

In [3]:
# Open the Spark session used by the Spark SQL steps in this notebook.
# 'yarn-large' is handed to wmfdata as the session type.
spark_session = wmfdata.spark.create_session(
    type='yarn-large',
    app_name='pyspark regular; media-changes',
)

SPARK_HOME: /usr/lib/spark3
Using Hadoop client lib jars at 3.2.0, provided by Spark.
PYSPARK_PYTHON=/opt/conda-analytics/bin/python3


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 05:17:28 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/08/01 05:17:28 WARN Utils: Service 'sparkDriver' could not bind on port 12000. Attempting port 12001.
23/08/01 05:17:28 WARN Utils: Service 'sparkDriver' could not bind on port 12001. Attempting port 12002.
23/08/01 05:17:28 WARN Utils: Service 'sparkDriver' could not bind on port 12002. Attempting port 12003.
23/08/01 05:17:29 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/08/01 05:17:29 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
23/08/01 05:17:29 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
23/08/01 05:17:36 WARN Utils: Service 'org.apache.spark.netwo

## Parameters

In [4]:
# Snapshot month and the pilot wikis covered by this report.
mw_snapshot = '2023-07'
wiki_dbs = (
    'ptwiki', 'ruwiki', 'idwiki', 'fiwiki',
    'huwiki', 'cawiki', 'nowiki',
)

# Pre-rendered SQL IN-list; formatting the tuple itself leads to confusing
# syntax errors when only a single wiki is listed.
wiki_db_str = "('{}')".format("','".join(wiki_dbs))
wiki_db_list = [*wiki_dbs]

# Lower bound used when collecting the users who received notifications.
notification_timestamp = 20220720000000

# Notification timestamp bounds (numeric yyyyMMddHHmmss values).
m_start_timestamp = 20230701000000
n_start_timestamp = 20230601000000
n_end_timestamp = 20230801000000

# Edit timestamp bounds.
start_timestamp = '2023-07-01'
end_timestamp = '2023-08-01'

media_list_table = 'cchen.media_jul_2023'

## Notification Data

In [5]:
# SQL template: one row per 'image-suggestions' notification whose
# notification_timestamp falls in [{start_timestamp}, {end_timestamp}),
# joined to its echo_event row for the payload (event_extra) and page id.
# The two placeholders are filled in with str.format() before execution.
notification_query = """

SELECT
   notification_event,
   notification_bundle_hash,
   notification_user,
   notification_timestamp,
   notification_read_timestamp,
   event_extra,
   event_page_id
FROM echo_notification
JOIN echo_event on event_id = notification_event
WHERE notification_timestamp >= {start_timestamp}
  AND notification_timestamp < {end_timestamp}
  AND event_type = 'image-suggestions'

"""

In [6]:
# Pull the image-suggestions notifications for every wiki in wiki_db_list and
# stack them into a single frame tagged with the source wiki.
notification_frames = []

for wiki_db in wiki_db_list:

    print('getting data for %s' % wiki_db)

    # NOTE(review): the positional arguments are passed through unchanged from
    # the original call; confirm them against the installed wmfdata version.
    noti_result = mariadb.run(
        notification_query.format(
            start_timestamp = n_start_timestamp,
            end_timestamp = n_end_timestamp
        ),wiki_db,'wikishared','pandas'
    )

    # Tag every row with the wiki it came from (first column).
    noti_result.insert(0, 'wiki_db', wiki_db)
    notification_frames.append(noti_result)

# Concatenate once instead of growing the frame inside the loop, which copies
# all previously accumulated rows on every iteration. An empty wiki list still
# yields an empty DataFrame, as before.
notification_data = (
    pd.concat(notification_frames, sort=False)
    if notification_frames
    else pd.DataFrame()
)

getting data for ptwiki




getting data for ruwiki




getting data for idwiki




getting data for fiwiki




getting data for huwiki




getting data for cawiki




getting data for nowiki




In [7]:
# Work on the event payload as plain text.
event_extra_text = notification_data['event_extra'].astype(str)
notification_data['event_extra'] = event_extra_text

# Name of the suggested image: the text between "File:" and the next double quote.
notification_data['suggested_image'] = event_extra_text.str.extract(r'(?<=File:)(.+?)(?=\")')

# Payloads ending in ";N;}" are flagged as article-level image suggestions.
notification_data['is_article_level'] = event_extra_text.str.endswith(";N;}")

In [8]:
# store in global temp view
# Expose the notification data to Spark SQL as global_temp.notification_data.
# createOrReplaceGlobalTempView keeps this cell re-runnable; createGlobalTempView
# raises when the view already exists in the running Spark application.
notification_sdf = spark_session.createDataFrame(notification_data)
notification_sdf.createOrReplaceGlobalTempView("notification_data")

## User Pref Data

In [9]:
# SQL run against each wiki: every user_properties row whose property name
# matches '%image_suggestions%' (in LIKE, `_` matches any single character).
# NOTE: the name `pref_query` is rebound to a different (Spark SQL) query in the
# opt-out section further down, so the per-wiki fetch must run before that cell.
pref_query = """

SELECT 
    up_property,
    up_value,
    up_user AS local_user_id
FROM user_properties
WHERE up_property like '%image_suggestions%'

"""

In [10]:
# Fetch the local image-suggestions preferences from every wiki in
# wiki_db_list and stack them into one frame tagged with the source wiki.
local_pref_frames = []

for wiki_db in wiki_db_list:

    print('getting data for %s' % wiki_db)

    pref_result = mariadb.run(
        pref_query,wiki_db
    )

    # Tag every row with the wiki it came from (first column).
    pref_result.insert(0, 'wiki_db', wiki_db)
    local_pref_frames.append(pref_result)

# Concatenate once instead of growing the frame inside the loop; the unused
# per-iteration scratch DataFrame from the original cell has been dropped.
# An empty wiki list still yields an empty DataFrame, as before.
local_pref_data = (
    pd.concat(local_pref_frames, sort=False)
    if local_pref_frames
    else pd.DataFrame()
)

getting data for ptwiki




getting data for ruwiki




getting data for idwiki




getting data for fiwiki




getting data for huwiki




getting data for cawiki




getting data for nowiki




In [11]:
## cast up_value to str so the frame can be stored in a GlobalTempView
local_pref_data = local_pref_data.astype({'up_value': str})

In [12]:
# store in global temp view
# Expose the local preferences to Spark SQL as global_temp.local_pref_temp.
# createOrReplaceGlobalTempView keeps this cell re-runnable; createGlobalTempView
# raises when the view already exists in the running Spark application.
local_pref_sdf = spark_session.createDataFrame(local_pref_data)
local_pref_sdf.createOrReplaceGlobalTempView("local_pref_temp")

In [13]:
# SQL template: global preferences whose property name matches
# '%image-suggestions%', mapped to the local user id on each wiki in the
# {wiki_db} IN-list (filled in with str.format()).
# NOTE(review): the WHERE clause filters on gp_property, so localuser rows
# without a matching global preference are discarded and the RIGHT JOIN
# behaves like an inner join — confirm that is the intent.
global_pref_query = """

SELECT
    gp_user,
    gp_property,
    gp_value,
    lu_wiki,
    lu_local_id
FROM global_preferences gp RIGHT JOIN localuser lu ON gp_user = lu_global_id
WHERE gp_property LIKE '%image-suggestions%'
  AND lu_wiki IN {wiki_db}
  
"""

In [14]:
# Fetch the global image-suggestions preferences for the pilot wikis.
# The pre-rendered `wiki_db_str` is used for the IN-list: formatting the
# `wiki_dbs` tuple directly renders "('x',)" when only one wiki is listed,
# which is not valid SQL.
# NOTE(review): the remaining positional arguments are passed through
# unchanged; confirm them against the installed wmfdata version.
global_pref_data = mariadb.run(
        global_pref_query.format(
          wiki_db = wiki_db_str
        ),'centralauth','pandas'
    )



In [15]:
## cast gp_value to str so the frame can be stored in a GlobalTempView
global_pref_data = global_pref_data.astype({'gp_value': str})

In [16]:
# Expose the global preferences to Spark SQL as global_temp.global_pref_temp.
# createOrReplaceGlobalTempView keeps this cell re-runnable; createGlobalTempView
# raises when the view already exists in the running Spark application.
global_pref_sdf = spark_session.createDataFrame(global_pref_data)
global_pref_sdf.createOrReplaceGlobalTempView("global_pref_temp")

## Notification User Data

In [17]:
# SQL template: (event, user) pairs for every 'image-suggestions' notification
# whose notification_timestamp falls in [{start_timestamp}, {end_timestamp}).
# The two placeholders are filled in with str.format() before execution.
notification_user_query = """
SELECT 
   notification_event,
   notification_user
FROM echo_notification
JOIN echo_event on event_id = notification_event
WHERE notification_timestamp >= {start_timestamp}
  AND notification_timestamp < {end_timestamp}
  AND event_type = 'image-suggestions'
"""

In [18]:
# Collect, for every wiki in wiki_db_list, the users who received an
# image-suggestions notification between notification_timestamp and
# n_end_timestamp, and stack the results into one frame.
notification_user_frames = []

for wiki_db in wiki_db_list:

    print('getting data for %s' % wiki_db)

    # NOTE(review): the positional arguments are passed through unchanged from
    # the original call; confirm them against the installed wmfdata version.
    noti_result = mariadb.run(
        notification_user_query.format(
            start_timestamp = notification_timestamp,
            end_timestamp = n_end_timestamp
        ),wiki_db,'wikishared','pandas'
    )

    # Tag every row with the wiki it came from (first column).
    noti_result.insert(0, 'wiki_db', wiki_db)
    notification_user_frames.append(noti_result)

# Concatenate once instead of growing the frame inside the loop, which copies
# all previously accumulated rows on every iteration. An empty wiki list still
# yields an empty DataFrame, as before.
notification_user_data = (
    pd.concat(notification_user_frames, sort=False)
    if notification_user_frames
    else pd.DataFrame()
)

getting data for ptwiki




getting data for ruwiki




getting data for idwiki




getting data for fiwiki




getting data for huwiki




getting data for cawiki




getting data for nowiki




In [19]:
# store in global temp view
# Expose the notification users to Spark SQL as global_temp.notification_user_data.
# createOrReplaceGlobalTempView keeps this cell re-runnable; createGlobalTempView
# raises when the view already exists in the running Spark application.
notification_user_sdf = spark_session.createDataFrame(notification_user_data)
notification_user_sdf.createOrReplaceGlobalTempView("notification_user_data")

# Calculation

## Number of Notifications


In [20]:
# Spark SQL template: daily sent/read notification counts per wiki.
# The `noti` CTE collapses rows to one per (wiki, user, bundle hash), keeping
# the earliest send/read timestamps, the number of article-level rows
# (noti_type) and the total number of rows (count) in the group.
# The outer query then buckets each group by day of send_ts:
#   article  -> noti_type = 1 AND count = 1
#   section  -> noti_type = 0
#   combined -> noti_type = 1 AND count > 1
# NOTE(review): a group with noti_type >= 2 matches none of the three buckets
# while still being counted in all_noti — confirm such groups cannot occur.
# {noti_timestamp} is filled in with str.format() before execution.
daily_notification_query = """

WITH noti AS (
    SELECT 
        wiki_db,
        notification_user,
        notification_bundle_hash,
        MIN(notification_timestamp) AS send_ts,
        MIN(notification_read_timestamp) AS read_ts,
        SUM(CASE WHEN is_article_level THEN 1 ELSE 0 END) AS noti_type, 
        COUNT(*) AS count
    FROM global_temp.notification_data
    WHERE notification_timestamp >= {noti_timestamp}
    GROUP BY wiki_db, notification_user,notification_bundle_hash
)

SELECT 
   wiki_db,
   FROM_UNIXTIME(UNIX_TIMESTAMP(SUBSTR(send_ts,0,8), 'yyyyMMdd')) AS date,
   COUNT(1) AS all_noti,
   COUNT(read_ts) AS all_noti_read,
   COUNT(CASE WHEN noti_type = 1 AND count = 1 THEN 1 END) AS article_noti,
   COUNT(CASE WHEN noti_type = 1 AND count = 1 THEN read_ts END) AS article_noti_read,
   COUNT(CASE WHEN noti_type = 0  THEN 1 END) AS section_noti,
   COUNT(CASE WHEN noti_type = 0  THEN read_ts END) AS section_read,
   COUNT(CASE WHEN noti_type = 1 AND count > 1 THEN 1 END) AS combined_noti,
   COUNT(CASE WHEN noti_type = 1 AND count > 1 THEN read_ts END) AS combined_noti_read
FROM noti
GROUP BY wiki_db, FROM_UNIXTIME(UNIX_TIMESTAMP(SUBSTR(send_ts,0,8), 'yyyyMMdd'))
   
"""

In [21]:
# Run the daily aggregation, keeping only notifications sent from
# m_start_timestamp onwards.
daily_notification_sql = daily_notification_query.format(noti_timestamp=m_start_timestamp)
daily_notification = spark.run(daily_notification_sql)

23/08/01 05:20:01 WARN TaskSetManager: Stage 0 contains a task of very large size (11947 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [22]:
# Output column -> daily column summed per wiki.
stat_sources = {
    'notification_sent': 'all_noti',
    'notification_read': 'all_noti_read',
    'article_sent': 'article_noti',
    'article_read': 'article_noti_read',
    'section_sent': 'section_noti',
    'section_read': 'section_read',
    'combined_sent': 'combined_noti',
    'combined_read': 'combined_noti_read',
}

# Collapse the daily rows into one row of totals per wiki.
notification_stats = (
    daily_notification
    .groupby('wiki_db')
    .agg(**{stat: (source, 'sum') for stat, source in stat_sources.items()})
    .reset_index()
)

In [23]:
# Display the per-wiki totals of notifications sent and read.
notification_stats

Unnamed: 0,wiki_db,notification_sent,notification_read,article_sent,article_read,section_sent,section_read,combined_sent,combined_read
0,cawiki,4007,1066,96,41,3889,1019,22,6
1,fiwiki,1217,277,1217,277,0,0,0,0
2,huwiki,6205,1173,221,77,5934,1081,50,15
3,idwiki,3843,802,101,51,3690,738,52,13
4,nowiki,4070,647,138,30,3886,609,46,8
5,ptwiki,13950,2213,570,181,13233,2000,147,32
6,ruwiki,32994,7075,1070,346,31590,6636,334,93


## Number of opt-outs

In [24]:
# Values substituted for {type} in the opt-out query's
# "echo-subscriptions-{type}-image-suggestions" property names.
pref_type_list = ("push", "web")

In [25]:
# Spark SQL template (rebinds `pref_query`, which earlier held the per-wiki
# user_properties query). For every user in global_temp.notification_user_data
# it returns:
#   local_pref - the non-empty local "echo-subscriptions-{type}-image-suggestions" value
#   all_pref   - the local-exception value when present, otherwise the
#                global preference value (COALESCE(local_ex, global_pref))
# Both columns are NULL for users with no matching row (LEFT JOINs).
# {type} is filled in with str.format() before execution.
pref_query = """

WITH noti_users AS ( --total notification users

SELECT 
    wiki_db,
    notification_user
FROM global_temp.notification_user_data
GROUP BY wiki_db, notification_user

), local_pref AS ( -- local preference 

SELECT
    wiki_db,
    local_user_id,
    up_value AS local_pref
FROM global_temp.local_pref_temp
WHERE up_property = "echo-subscriptions-{type}-image-suggestions"
  and up_value != ''

), global_pref AS ( -- global preference

SELECT
    lu_wiki,
    lu_local_id,
    gp_value AS global_pref
FROM global_temp.global_pref_temp
WHERE gp_property  = "echo-subscriptions-{type}-image-suggestions"
  and gp_value != ''

), local_ex AS ( -- local exceptions

SELECT
    wiki_db,
    local_user_id,
    up_value AS local_ex   
FROM global_temp.local_pref_temp
WHERE up_property = "echo-subscriptions-{type}-image-suggestions-local-exception"
  and up_value != ''

), global_all_pref AS ( -- compare local exception and global preference

SELECT 
    COALESCE(lu_wiki,wiki_db) AS wiki,
    COALESCE(lu_local_id,local_user_id) AS user_id,
    COALESCE(local_ex,global_pref) AS all_pref
FROM global_pref gp FULL OUTER JOIN local_ex le ON (gp.lu_wiki = le.wiki_db AND gp.lu_local_id = le.local_user_id)

)

SELECT 
    nu.wiki_db,
    nu.notification_user,
    gp.all_pref,
    lp.local_pref
FROM noti_users nu 
    LEFT JOIN local_pref lp ON (nu.wiki_db = lp.wiki_db AND nu.notification_user = lp.local_user_id)
    LEFT JOIN global_all_pref gp ON (nu.wiki_db = gp.wiki AND nu.notification_user = gp.user_id)
"""

In [26]:
# Run the opt-out query once per notification type in pref_type_list and
# stack the results into one frame tagged with the type.
pref_frames = []

for pref_type in pref_type_list:

    pref_result = spark.run(pref_query.format(
                           type = pref_type
                        ))

    # Tag every row with the notification type it was computed for (first column).
    pref_result.insert(0, 'type', pref_type)
    pref_frames.append(pref_result)

# Concatenate once instead of growing the frame inside the loop. An empty
# type list still yields an empty DataFrame, as before.
pref_stats = (
    pd.concat(pref_frames, sort=False)
    if pref_frames
    else pd.DataFrame()
)

23/08/01 05:20:11 WARN TaskSetManager: Stage 4 contains a task of very large size (2486 KiB). The maximum recommended task size is 1000 KiB.
23/08/01 05:20:17 WARN TaskSetManager: Stage 13 contains a task of very large size (2486 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [27]:
## total number of users who received notifications, per wiki
# Only the "web" rows are counted; pref_stats holds one set of rows per type.
is_web = pref_stats['type'].eq("web")
total_users = (
    pref_stats.loc[is_web]
    .groupby(['wiki_db'])
    .agg(total_users = ('notification_user', 'count'))
    .reset_index()
)

In [28]:
## opt-out for push notifications and web notifications
# A user counts as opted out when either preference column equals '0'.
opted_out = pref_stats[(pref_stats['all_pref'] == '0')|(pref_stats['local_pref'] == '0')]

# dropna=False keeps the groups whose other preference column is missing.
# With the default (dropna=True) every row with a missing all_pref or
# local_pref was dropped by the groupby, so users who opted out through only
# one of the two preferences were excluded from the counts.
pref = opted_out.groupby(
    ['wiki_db','type', 'all_pref', 'local_pref'],
    dropna=False
).agg(
    optout_users = ('wiki_db', 'count')
).reset_index()

In [29]:
# Per-wiki number of users opted out of push notifications: rows of type
# "push" where either preference column equals '0'.
push_rows = pref_stats.loc[pref_stats['type'].eq("push")]
push_opted_out = push_rows['all_pref'].eq('0') | push_rows['local_pref'].eq('0')

mobile_optout = (
    push_rows.loc[push_opted_out]
    .groupby(['wiki_db'])
    .agg(mobile_optouts = ('wiki_db', 'count'))
    .reset_index()
)

In [30]:
# Per-wiki number of users opted out of web notifications: rows of type
# "web" where either preference column equals '0'.
web_rows = pref_stats.loc[pref_stats['type'].eq("web")]
web_opted_out = web_rows['all_pref'].eq('0') | web_rows['local_pref'].eq('0')

desktop_optout = (
    web_rows.loc[web_opted_out]
    .groupby(['wiki_db'])
    .agg(desktop_optout = ('wiki_db', 'count'))
    .reset_index()
)

In [31]:
# Per-wiki summary: notified users next to push and web opt-out counts.
# Left merges keep every wiki from total_users; the original inner merges
# dropped a wiki entirely when it had no opt-outs of either type. Missing
# counts are reported as 0 and cast back to integers.
(
    total_users
    .merge(mobile_optout, on='wiki_db', how='left')
    .merge(desktop_optout, on='wiki_db', how='left')
    .fillna({'mobile_optouts': 0, 'desktop_optout': 0})
    .astype({'mobile_optouts': int, 'desktop_optout': int})
)

Unnamed: 0,wiki_db,total_users,mobile_optouts,desktop_optout
0,cawiki,789,7,5
1,fiwiki,1059,38,44
2,huwiki,1278,21,26
3,idwiki,959,13,14
4,nowiki,817,21,21
5,ptwiki,3990,36,38
6,ruwiki,7299,162,171
