In [2]:
from wmfdata import hive, spark

### Issues
- The number of events in editattemptstep is higher than in mediawiki_edit_attempt. This is because the Metrics Platform is not capturing events from the wikitext editor interface. This was discovered in Data QA testing Round 3 (see the notebook in this folder).

##### Notes
- editattemptstep: event.session_token = mediawiki_edit_attempt: performer.session_id

## Compare metrics between editattemptstep and event.mediawiki_edit_attempt

First, we check a few events at the hour level. Here I am looking at VisualEditor events on testwiki from 2022-11-03, hour 4.

In [11]:
# Count VisualEditor events ('eas.ve*') per event name in
# event.mediawiki_edit_attempt for the single hour 2022-11-03 04, limited to
# hosts whose normalized_host.project is 'test'.
# GROUP BY already yields exactly one row per name, so DISTINCT is not needed.
name_query1 = spark.run('''
SELECT
  name,
  count(1) AS ve_events
  FROM event.mediawiki_edit_attempt
  WHERE name like 'eas.ve%'
  AND year = 2022
  AND month = 11
  AND day = 3
  AND hour = 4
  AND normalized_host.project = 'test'
  GROUP BY name
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [12]:
# Show the per-name counts from largest to smallest; equal counts are ordered by name, descending.
name_query1.sort_values(['ve_events', 'name'], ascending=[False, False])

Unnamed: 0,name,ve_events
5,eas.ve.ready,17
7,eas.ve.loaded,17
3,eas.ve.init,17
2,eas.ve.save_success,15
4,eas.ve.save_intent,15
6,eas.ve.save_attempt,15
0,eas.ve.first_change,15
1,eas.ve.abort,2


In [15]:
# Count VisualEditor events per action in event.editattemptstep for the same
# hour (2022-11-03 04) on testwiki, to compare against the
# mediawiki_edit_attempt counts.
# GROUP BY already yields exactly one row per action, so DISTINCT is not needed.
action_query1 = spark.run('''
SELECT
  event.action AS action,
  count(1) AS ve_events
  FROM event.editattemptstep
  WHERE event.editor_interface = 'visualeditor'
  AND year = 2022
  AND month = 11
  AND day = 3
  AND hour = 4
  AND wiki = 'testwiki'
  GROUP BY action
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

This matches exactly with the events from mediawiki_edit_attempt

In [16]:
# Show the per-action counts from largest to smallest; equal counts are ordered by action, descending.
action_query1.sort_values(['ve_events', 'action'], ascending=[False, False])

Unnamed: 0,action,ve_events
2,ready,17
1,loaded,17
3,init,17
5,saveSuccess,15
6,saveIntent,15
0,saveAttempt,15
7,firstChange,15
4,abort,2


Next, I compared wikitext editor events from 2022-10-09, hour 8.

In [84]:
# Count 'eas.mf*' events per event name in event.mediawiki_edit_attempt for
# the single hour 2022-10-09 08, limited to hosts whose
# normalized_host.project is 'test'.
# GROUP BY already yields exactly one row per name, so DISTINCT is not needed.
name_query2 = spark.run('''
SELECT
  name,
  count(1) AS wikitext_events
  FROM event.mediawiki_edit_attempt
  WHERE name like 'eas.mf%'
  AND year = 2022
  AND month = 10
  AND day = 9
  AND hour = 8
  AND normalized_host.project = 'test'
  GROUP BY name
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [85]:
# Show the per-name counts from largest to smallest; equal counts are ordered by name, descending.
name_query2.sort_values(['wikitext_events', 'name'], ascending=[False, False])

Unnamed: 0,name,wikitext_events
0,eas.mf.ready,5
6,eas.mf.loaded,5
1,eas.mf.init,5
3,eas.mf.save_success,2
2,eas.mf.save_intent,2
5,eas.mf.save_attempt,2
4,eas.mf.first_change,2


In [39]:
# Count wikitext-editor events per action in event.editattemptstep for the
# single hour 2022-10-09 08 on testwiki.
# GROUP BY already yields exactly one row per action, so DISTINCT is not needed.
action_query2 = spark.run('''
SELECT
  event.action AS action,
  count(1) AS wikitext_events
  FROM event.editattemptstep
  WHERE event.editor_interface = 'wikitext'
  AND year = 2022
  AND month = 10
  AND day = 9
  AND hour = 8
  AND wiki = 'testwiki'
  GROUP BY action
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Note that the number of events captured in the same "hour" is different in both tables.

In [40]:
# Show the per-action counts from largest to smallest; equal counts are ordered by action, descending.
action_query2.sort_values(['wikitext_events', 'action'], ascending=[False, False])

Unnamed: 0,action,wikitext_events
2,ready,1
1,loaded,1
0,init,1


In [74]:
# Fetch every column of all event.editattemptstep rows for wiki = 'testwiki'
# in the hour 2022-10-09 08, so individual events can be inspected row by row.
eas_query1=spark.run('''
SELECT 
  *
  FROM event.editattemptstep
  WHERE year = 2022
  AND month= 10
  AND day = 9
  AND hour = 8 
  AND wiki = 'testwiki'  
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [75]:
# Display the raw editattemptstep rows for 2022-10-09 hour 8 on testwiki.
eas_query1

Unnamed: 0,dt,event,ip,recvfrom,revision,schema,seqid,useragent,uuid,webhost,...,_schema,client_dt,http,meta,user_agent_map,normalized_host,year,month,day,hour
0,2022-10-09T08:45:38.473Z,"(init, ecc907e1cd3cc9b88ac4, wikitext, click, ...",,,,EditAttemptStep,,"(Chrome, 106, None, Lenovo TB-7104F, False, Fa...",,test.wikipedia.org,...,,2022-10-09T08:45:44.524Z,"(None, None, None, None, {'user-agent': 'Mozil...","(None, 195de194-b0e5-4239-81e2-d9e99ac6f282, 2...","{'os_family': 'Android', 'browser_major': '106...","(wikipedia, test, [], org, wikipedia)",2022,10,9,8
1,2022-10-09T08:45:38.483Z,"(ready, ecc907e1cd3cc9b88ac4, wikitext, None, ...",,,,EditAttemptStep,,"(Chrome, 106, None, Lenovo TB-7104F, False, Fa...",,test.wikipedia.org,...,,2022-10-09T08:45:45.137Z,"(None, None, None, None, {'user-agent': 'Mozil...","(None, 93c276f5-6e37-4c5d-a693-7d1820f6c8f7, e...","{'os_family': 'Android', 'browser_major': '106...","(wikipedia, test, [], org, wikipedia)",2022,10,9,8
2,2022-10-09T08:45:38.823Z,"(loaded, ecc907e1cd3cc9b88ac4, wikitext, None,...",,,,EditAttemptStep,,"(Chrome, 106, None, Lenovo TB-7104F, False, Fa...",,test.wikipedia.org,...,,2022-10-09T08:45:45.144Z,"(None, None, None, None, {'user-agent': 'Mozil...","(None, deec3806-b63d-42c9-9bb8-ef56c4935a5c, c...","{'os_family': 'Android', 'browser_major': '106...","(wikipedia, test, [], org, wikipedia)",2022,10,9,8


In [86]:
# Fetch every column of the event.mediawiki_edit_attempt rows in the hour
# 2022-10-09 08 whose normalized_host.project is 'test', narrowed to the
# minute 08:45 via a string-prefix match on dt.
mea_query1=spark.run('''
SELECT 
  *
  FROM event.mediawiki_edit_attempt
  WHERE year = 2022
  AND month= 10
  AND day = 9
  AND hour = 8
  AND normalized_host.project = 'test'
  AND dt like '2022-10-09T08:45%'
   
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [87]:
# Display the raw mediawiki_edit_attempt rows whose dt falls in 2022-10-09T08:45.
mea_query1

Unnamed: 0,_schema,agent,custom_data,dt,http,mediawiki,meta,name,page,performer,user_agent_map,is_wmf_domain,normalized_host,datacenter,year,month,day,hour
0,,"(None, mediawiki_js, mobile_browser)","{'integration': ('string', 'page'), 'editing_s...",2022-10-09T08:45:14.517Z,"(None, None, None, {'user-agent': 'Mozilla/5.0...","(None, None, None, None, None, 1.40.0-wmf.4, N...","(test.wikipedia.org, 2022-10-09T08:45:11.643Z,...",eas.mf.save_intent,"(Requests/Permissions/SK fanella, None, 0, Non...","(None, 0, None, None, 55333, None, True, None,...","{'os_family': 'Android', 'browser_major': '106...",True,"(wikipedia, test, [], org, wikipedia)",eqiad,2022,10,9,8
1,,"(None, mediawiki_js, mobile_browser)","{'integration': ('string', 'page'), 'editing_s...",2022-10-09T08:45:45.133Z,"(None, None, None, {'user-agent': 'Mozilla/5.0...","(None, None, None, None, None, 1.40.0-wmf.4, N...","(test.wikipedia.org, 2022-10-09T08:45:38.483Z,...",eas.mf.ready,"(Requests/Permissions/SK fanella, None, 146702...","(None, 1, None, None, 55333, None, True, None,...","{'os_family': 'Android', 'browser_major': '106...",True,"(wikipedia, test, [], org, wikipedia)",eqiad,2022,10,9,8
2,,"(None, mediawiki_js, mobile_browser)","{'integration': ('string', 'page'), 'editing_s...",2022-10-09T08:45:17.384Z,"(None, None, None, {'user-agent': 'Mozilla/5.0...","(None, None, None, None, None, 1.40.0-wmf.4, N...","(test.wikipedia.org, 2022-10-09T08:45:11.645Z,...",eas.mf.save_attempt,"(Requests/Permissions/SK fanella, None, 0, Non...","(None, 0, None, None, 55333, None, True, None,...","{'os_family': 'Android', 'browser_major': '106...",True,"(wikipedia, test, [], org, wikipedia)",eqiad,2022,10,9,8
3,,"(None, mediawiki_js, mobile_browser)","{'integration': ('string', 'page'), 'editing_s...",2022-10-09T08:45:18.690Z,"(None, None, None, {'user-agent': 'Mozilla/5.0...","(None, None, None, None, None, 1.40.0-wmf.4, N...","(test.wikipedia.org, 2022-10-09T08:45:11.656Z,...",eas.mf.save_success,"(Requests/Permissions/SK fanella, None, 0, Non...","(None, 0, None, None, 55333, None, True, None,...","{'os_family': 'Android', 'browser_major': '106...",True,"(wikipedia, test, [], org, wikipedia)",eqiad,2022,10,9,8
4,,"(None, mediawiki_js, mobile_browser)","{'integration': ('string', 'page'), 'editing_s...",2022-10-09T08:45:44.516Z,"(None, None, None, {'user-agent': 'Mozilla/5.0...","(None, None, None, None, None, 1.40.0-wmf.4, N...","(test.wikipedia.org, 2022-10-09T08:45:38.458Z,...",eas.mf.init,"(Requests/Permissions/SK fanella, None, 146702...","(None, 1, None, None, 55333, None, True, None,...","{'os_family': 'Android', 'browser_major': '106...",True,"(wikipedia, test, [], org, wikipedia)",eqiad,2022,10,9,8
5,,"(None, mediawiki_js, mobile_browser)","{'integration': ('string', 'page'), 'editing_s...",2022-10-09T08:45:45.139Z,"(None, None, None, {'user-agent': 'Mozilla/5.0...","(None, None, None, None, None, 1.40.0-wmf.4, N...","(test.wikipedia.org, 2022-10-09T08:45:38.813Z,...",eas.mf.loaded,"(Requests/Permissions/SK fanella, None, 146702...","(None, 1, None, None, 55333, None, True, None,...","{'os_family': 'Android', 'browser_major': '106...",True,"(wikipedia, test, [], org, wikipedia)",eqiad,2022,10,9,8


In [91]:
# Fetch all event.editattemptstep rows that belong to one session
# (event.session_token corresponds to performer.session_id in
# mediawiki_edit_attempt).
# The year/month/day predicates restrict the scan to 2022-10-09, the day under
# investigation; without them Spark has to read the entire table to find a
# single session. Widen the date range if the session could span other days.
eas_query2 = spark.run('''
SELECT
  *
  FROM event.editattemptstep
  WHERE year = 2022
  AND month = 10
  AND day = 9
  AND event.session_token = '0ace36c3e6f09d3668b0'
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [92]:
# Display the editattemptstep rows found for session token '0ace36c3e6f09d3668b0'.
eas_query2

Unnamed: 0,dt,event,ip,recvfrom,revision,schema,seqid,useragent,uuid,webhost,...,_schema,client_dt,http,meta,user_agent_map,normalized_host,year,month,day,hour
0,2022-10-09T08:45:38.473Z,"(init, ecc907e1cd3cc9b88ac4, wikitext, click, ...",,,,EditAttemptStep,,"(Chrome, 106, None, Lenovo TB-7104F, False, Fa...",,test.wikipedia.org,...,,2022-10-09T08:45:44.524Z,"(None, None, None, None, {'user-agent': 'Mozil...","(None, 195de194-b0e5-4239-81e2-d9e99ac6f282, 2...","{'os_family': 'Android', 'browser_major': '106...","(wikipedia, test, [], org, wikipedia)",2022,10,9,8
1,2022-10-09T08:45:38.483Z,"(ready, ecc907e1cd3cc9b88ac4, wikitext, None, ...",,,,EditAttemptStep,,"(Chrome, 106, None, Lenovo TB-7104F, False, Fa...",,test.wikipedia.org,...,,2022-10-09T08:45:45.137Z,"(None, None, None, None, {'user-agent': 'Mozil...","(None, 93c276f5-6e37-4c5d-a693-7d1820f6c8f7, e...","{'os_family': 'Android', 'browser_major': '106...","(wikipedia, test, [], org, wikipedia)",2022,10,9,8
2,2022-10-09T08:45:38.823Z,"(loaded, ecc907e1cd3cc9b88ac4, wikitext, None,...",,,,EditAttemptStep,,"(Chrome, 106, None, Lenovo TB-7104F, False, Fa...",,test.wikipedia.org,...,,2022-10-09T08:45:45.144Z,"(None, None, None, None, {'user-agent': 'Mozil...","(None, deec3806-b63d-42c9-9bb8-ef56c4935a5c, c...","{'os_family': 'Android', 'browser_major': '106...","(wikipedia, test, [], org, wikipedia)",2022,10,9,8


#### Checking overall data

In [7]:
# Total number of event.editattemptstep events on testwiki for November 2022,
# bucketed by month (yyyy-MM) of dt.
# NOTE: this rebinds eas_query1, which an earlier cell used for the raw
# hour-level rows; after this cell it holds the monthly aggregate instead.
eas_query1 = spark.run('''
SELECT
  date_format(dt, 'yyyy-MM') AS date,
  count(1) AS events
  FROM event.editattemptstep
  WHERE year = 2022
  AND month = 11
  AND wiki = 'testwiki'
  GROUP BY date
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [8]:
# Show the monthly editattemptstep totals, most recent month first.
eas_query1.sort_values('date', ascending=False)

Unnamed: 0,date,events
0,2022-11,3619


In [9]:
# Total number of event.mediawiki_edit_attempt events for November 2022 on
# hosts whose normalized_host.project is 'test', bucketed by month (yyyy-MM)
# of dt. This is the counterpart of the editattemptstep monthly total.
# NOTE: this rebinds mea_query1, which an earlier cell used for raw rows.
# NOTE(review): the editattemptstep query filters on wiki = 'testwiki' while
# this one filters on normalized_host.project = 'test'; confirm the two
# filters select the same set of hosts before comparing totals.
mea_query1=spark.run('''
SELECT 
  date_format(dt, 'yyyy-MM') AS date,
  count(1) AS events
  FROM event.mediawiki_edit_attempt
  WHERE 
  year = 2022
  AND normalized_host.project = 'test'
  AND month=11
  GROUP BY date 
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [10]:
# Show the monthly mediawiki_edit_attempt totals, most recent month first.
mea_query1.sort_values('date', ascending=False)

Unnamed: 0,date,events
0,2022-11,751


MEA and EAS are both sampled at 100% on group 0 wikis; however,
### The number of events in MEA is far lower, almost 5 times lower, than in EAS. This is a huge data QA issue and needs to be looked into.

One potential reason for this is that the Metrics Platform is not firing wikitext editor events, which I discovered in Round 3.

In [11]:
# Break the November 2022 mediawiki_edit_attempt events down by event name,
# keeping only names that start with 'eas.mf' and hosts whose
# normalized_host.project is 'test'.
mea_query2=spark.run('''
SELECT 
  date_format(dt, 'yyyy-MM') AS date,
  name,
  count(1) AS events
  FROM event.mediawiki_edit_attempt
  WHERE 
  year = 2022
  AND normalized_host.project = 'test'
  AND name like 'eas.mf%'
  AND month=11
  GROUP BY date, name 
''')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

All wikitext events in 2022-11

In [15]:
# Show the per-name event counts from largest to smallest.
mea_query2.sort_values('events', ascending=False)

Unnamed: 0,date,name,events
4,2022-11,eas.mf.init,21
0,2022-11,eas.mf.ready,20
3,2022-11,eas.mf.loaded,20
5,2022-11,eas.mf.abort,7
1,2022-11,eas.mf.save_intent,5
2,2022-11,eas.mf.first_change,5
6,2022-11,eas.mf.save_attempt,4
7,2022-11,eas.mf.save_success,4


MEA has only 21 wikitext init events for November 2022 as of 2022-11-09, which is very low.