# Collect all cx translations

In [60]:
from wmfdata import hive, mariadb
import pandas as pd

In [61]:
#collect content translation and associated MT engine
# Need to query cx_corpus table and cross it with cx_translations table and then group and sort

query = """ 
    SELECT
    translation_start_timestamp AS translation_start_time,
    translation_id,
    translation_target_revision_id  AS target_revision_id,
    translation_target_title AS page_title,
    json_extract(translation_progress, '$.mt') AS mt_translated_percent, 
    json_extract(translation_progress, '$.human') AS human_translated_percent, 
    translation_status AS translation_status,
    CASE
    WHEN cc.cxc_origin = 'source' THEN 'source'
    WHEN cc.cxc_origin = 'user' THEN 'final'
    ELSE 'mt_engine'
    END AS translation_type,
    cc.cxc_origin AS mt_service,
    translation_source_language AS source_language,
    translation_target_language AS target_language
    FROM 
    cx_translations
    JOIN 
    cx_corpora AS cc
    ON translation_id = cc.cxc_translation_id
"""


In [None]:
mt_data = mariadb.run(commands = query, dbs = "wikishared", use_x1 = False, format="pandas", date_col=None,
 index_col=None)

In [7]:
#save data to csv
mt_data.to_csv('mt_data.csv', index = False)

# Collect Deletion Ratios

In [74]:
# final all published flores articles
flores_data = mt_data[(mt_data["mt_service"]=="Flores") & (mt_data["translation_status"]=='published')]


In [68]:
# identify all revision ids and create lis
revision_ids = flores_data["target_revision_id"].round().astype(int)
revision_list = ','.join([str(u) for u in revision_ids])

In [69]:
#obtain deletion ratio for all articles on target language wikis since FLores was deployed
query = """
-- find both cx and non-cx created articles 
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history
WHERE
    snapshot = '2022-03'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create' 
-- identified as Flores revision_id
    AND revision_id in ({revisions})
    AND wiki_db IN ('igwiki','iswiki', 'lgwiki', 'ocwiki', 'zhwiki', 'zuwiki')
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
GROUP BY  
  wiki_db
),

--find all deleted articles that were created with cx 

deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history
WHERE
       snapshot = '2022-03'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
-- find revisions moved to the archive table
    AND event_type = 'create'
    AND revision_is_deleted_by_page_deletion = TRUE
    -- identified as Flores revision_id
    AND revision_id in ({revisions})
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0  -- not a bot
    AND wiki_db IN ('igwiki','iswiki', 'lgwiki', 'ocwiki', 'zhwiki', 'zuwiki')
GROUP BY  
  wiki_db
)

-- main query to aggregate and join sources above
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON 
    created_articles.wiki = deleted_articles.wiki
"""

In [70]:
flores_deletion_data = hive.run(query.format(revisions=revision_list))

In [71]:
flores_deletion_data

Unnamed: 0,wiki,created_cx_total,deleted_cx_total
0,zhwiki,12,
1,lgwiki,29,
2,igwiki,19,
3,iswiki,13,


In [75]:
#save data to csv
flores_deletion_data.to_csv('flores_deletion_data.csv', index = False)