# Collect all cx translations

In [45]:
import pandas as pd
import numpy as np
import math

import datetime as dt

from wmfdata import hive, mariadb, spark

In [2]:
# Collect Content Translation (cx) translations together with the origin of each
# stored corpus entry.
# The query inner-joins cx_translations to cx_corpora on the translation id, so
# the result has one row per matching cx_corpora row (a translation with several
# corpus rows appears several times; one with none is dropped).
# translation_type buckets cxc_origin into 'source' / 'final' / 'mt_engine', and
# mt_service carries the raw cxc_origin value.
# NOTE(review): the query itself does no grouping or sorting; any aggregation
# has to happen downstream.

query = """ 
    SELECT
    translation_start_timestamp AS translation_start_time,
    translation_id,
    translation_target_revision_id  AS target_revision_id,
    translation_target_title AS page_title,
    json_extract(translation_progress, '$.mt') AS mt_translated_percent, 
    json_extract(translation_progress, '$.human') AS human_translated_percent, 
    translation_status AS translation_status,
    CASE
    WHEN cc.cxc_origin = 'source' THEN 'source'
    WHEN cc.cxc_origin = 'user' THEN 'final'
    ELSE 'mt_engine'
    END AS translation_type,
    cc.cxc_origin AS mt_service,
    translation_source_language AS source_language,
    translation_target_language AS target_language
    FROM 
    cx_translations
    JOIN 
    cx_corpora AS cc
    ON translation_id = cc.cxc_translation_id
"""


In [3]:
# Run the translation/corpus query against the "wikishared" database and keep
# the result as `mt_data`. Relies on `query` as defined by the cell above.
mt_data = mariadb.run(
    commands=query,
    dbs="wikishared",
    use_x1=False,
    format="pandas",
    date_col=None,
    index_col=None,
)

In [4]:
# Persist the translation/corpus rows to mt_data.csv, omitting the row index.
mt_data.to_csv(
    'mt_data.csv',
    index=False,
)

# Collect Deletion Ratios

## Elia Deletion Ratios

In [51]:

# Per-wiki counts of Content Translation (cx) articles attributed to the Elia MT
# service that were created on or after 2022-02-01, and how many of those
# creation revisions were later removed by a page deletion
# (wmf.mediawiki_history, snapshot 2022-09).
# Bot-authored revisions are excluded from BOTH counts so that the deleted
# total is a subset of the created total; previously only the deleted count
# filtered bots.
# Because of the LEFT JOIN, deleted_cx_total is NULL for wikis that have
# created articles but no matching deletions.
query = """
-- Denominator: cx page creations attributed to Elia, per wiki
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history mwh
-- limit to only cx revision ids
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- identified as Elia
    AND mtc.mt_service = 'Elia'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots, matching the filter used for the deleted count
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
),

-- Numerator: the same creations, restricted to those removed by a page deletion
deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history mwh
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- creation revision was deleted along with its page
    AND revision_is_deleted_by_page_deletion = TRUE
-- identified as Elia
    AND mtc.mt_service = 'Elia'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
)

-- main query: one row per wiki that has at least one created article
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON
    created_articles.wiki = deleted_articles.wiki
"""

In [52]:
# Run the Elia deletion query on Spark. `query` is reassigned in every section
# of this notebook, so this cell must run right after the Elia query cell.
elia_deletion_data = spark.run(query)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [53]:
# Persist the per-wiki Elia created/deleted counts, omitting the row index.
elia_deletion_data.to_csv(
    'elia_deletion_data.csv',
    index=False,
)

## Yandex

In [55]:
# Per-wiki counts of Content Translation (cx) articles attributed to the Yandex
# MT service that were created on or after 2022-02-01, and how many of those
# creation revisions were later removed by a page deletion
# (wmf.mediawiki_history, snapshot 2022-09).
# Bot-authored revisions are excluded from BOTH counts so that the deleted
# total is a subset of the created total; previously only the deleted count
# filtered bots.
# Because of the LEFT JOIN, deleted_cx_total is NULL for wikis that have
# created articles but no matching deletions.
query = """
-- Denominator: cx page creations attributed to Yandex, per wiki
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history mwh
-- limit to only cx revision ids
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- identified as Yandex
    AND mtc.mt_service = 'Yandex'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots, matching the filter used for the deleted count
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
),

-- Numerator: the same creations, restricted to those removed by a page deletion
deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history mwh
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- creation revision was deleted along with its page
    AND revision_is_deleted_by_page_deletion = TRUE
-- identified as Yandex
    AND mtc.mt_service = 'Yandex'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
)

-- main query: one row per wiki that has at least one created article
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON
    created_articles.wiki = deleted_articles.wiki
"""

In [56]:
# Run the Yandex deletion query on Spark. `query` is reassigned in every section
# of this notebook, so this cell must run right after the Yandex query cell.
yandex_deletion_data = spark.run(query)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [57]:
# Persist the per-wiki Yandex created/deleted counts, omitting the row index.
yandex_deletion_data.to_csv(
    'yandex_deletion_data.csv',
    index=False,
)

## OpusMT

In [58]:
# Per-wiki counts of Content Translation (cx) articles attributed to the OpusMT
# MT service that were created on or after 2022-02-01, and how many of those
# creation revisions were later removed by a page deletion
# (wmf.mediawiki_history, snapshot 2022-09).
# Bot-authored revisions are excluded from BOTH counts so that the deleted
# total is a subset of the created total; previously only the deleted count
# filtered bots.
# Because of the LEFT JOIN, deleted_cx_total is NULL for wikis that have
# created articles but no matching deletions.
query = """
-- Denominator: cx page creations attributed to OpusMT, per wiki
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history mwh
-- limit to only cx revision ids
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- identified as OpusMT
    AND mtc.mt_service = 'OpusMT'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots, matching the filter used for the deleted count
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
),

-- Numerator: the same creations, restricted to those removed by a page deletion
deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history mwh
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- creation revision was deleted along with its page
    AND revision_is_deleted_by_page_deletion = TRUE
-- identified as OpusMT
    AND mtc.mt_service = 'OpusMT'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
)

-- main query: one row per wiki that has at least one created article
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON
    created_articles.wiki = deleted_articles.wiki
"""

In [60]:
# Run the OpusMT deletion query on Spark. `query` is reassigned in every section
# of this notebook, so this cell must run right after the OpusMT query cell.
opus_deletion_data = spark.run(query)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                ]

In [61]:
# Persist the per-wiki OpusMT created/deleted counts, omitting the row index.
opus_deletion_data.to_csv(
    'opus_deletion_data.csv',
    index=False,
)

## LingoCloud

In [62]:
# Per-wiki counts of Content Translation (cx) articles attributed to the
# LingoCloud MT service that were created on or after 2022-02-01, and how many
# of those creation revisions were later removed by a page deletion
# (wmf.mediawiki_history, snapshot 2022-09).
# Bot-authored revisions are excluded from BOTH counts so that the deleted
# total is a subset of the created total; previously only the deleted count
# filtered bots.
# Because of the LEFT JOIN, deleted_cx_total is NULL for wikis that have
# created articles but no matching deletions.
query = """
-- Denominator: cx page creations attributed to LingoCloud, per wiki
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history mwh
-- limit to only cx revision ids
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- identified as LingoCloud
    AND mtc.mt_service = 'LingoCloud'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots, matching the filter used for the deleted count
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
),

-- Numerator: the same creations, restricted to those removed by a page deletion
deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history mwh
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- creation revision was deleted along with its page
    AND revision_is_deleted_by_page_deletion = TRUE
-- identified as LingoCloud
    AND mtc.mt_service = 'LingoCloud'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
)

-- main query: one row per wiki that has at least one created article
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON
    created_articles.wiki = deleted_articles.wiki
"""

In [63]:
# Run the LingoCloud deletion query on Spark. `query` is reassigned in every
# section of this notebook, so this cell must run right after the LingoCloud
# query cell.
lingocloud_deletion_data = spark.run(query)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [64]:
# Persist the per-wiki LingoCloud created/deleted counts, omitting the row index.
lingocloud_deletion_data.to_csv(
    'lingocloud_deletion_data.csv',
    index=False,
)

## Apertium

In [65]:
# Per-wiki counts of Content Translation (cx) articles attributed to the
# Apertium MT service that were created on or after 2022-02-01, and how many of
# those creation revisions were later removed by a page deletion
# (wmf.mediawiki_history, snapshot 2022-09).
# Bot-authored revisions are excluded from BOTH counts so that the deleted
# total is a subset of the created total; previously only the deleted count
# filtered bots.
# Because of the LEFT JOIN, deleted_cx_total is NULL for wikis that have
# created articles but no matching deletions.
query = """
-- Denominator: cx page creations attributed to Apertium, per wiki
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history mwh
-- limit to only cx revision ids
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- identified as Apertium
    AND mtc.mt_service = 'Apertium'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots, matching the filter used for the deleted count
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
),

-- Numerator: the same creations, restricted to those removed by a page deletion
deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history mwh
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- creation revision was deleted along with its page
    AND revision_is_deleted_by_page_deletion = TRUE
-- identified as Apertium
    AND mtc.mt_service = 'Apertium'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
)

-- main query: one row per wiki that has at least one created article
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON
    created_articles.wiki = deleted_articles.wiki
"""

In [66]:
# Run the Apertium deletion query on Spark. `query` is reassigned in every
# section of this notebook, so this cell must run right after the Apertium
# query cell.
apertium_deletion_data = spark.run(query)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                

In [67]:
# Persist the per-wiki Apertium created/deleted counts, omitting the row index.
apertium_deletion_data.to_csv(
    'apertium_deletion_data.csv',
    index=False,
)

## Google

In [71]:
# Per-wiki counts of Content Translation (cx) articles attributed to the Google
# MT service that were created on or after 2022-02-01, and how many of those
# creation revisions were later removed by a page deletion
# (wmf.mediawiki_history, snapshot 2022-09).
# The 2022-02-01 cutoff is applied to both counts, as in the queries for the
# other MT services, so the Google figures cover the same time window.
# NOTE(review): the cutoff was missing here; confirm the omission was not
# deliberate.
# Bot-authored revisions are excluded from BOTH counts so that the deleted
# total is a subset of the created total; previously only the deleted count
# filtered bots.
# Because of the LEFT JOIN, deleted_cx_total is NULL for wikis that have
# created articles but no matching deletions.
query = """
-- Denominator: cx page creations attributed to Google, per wiki
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history mwh
-- limit to only cx revision ids
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- identified as Google
    AND mtc.mt_service = 'Google'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots, matching the filter used for the deleted count
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
),

-- Numerator: the same creations, restricted to those removed by a page deletion
deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history mwh
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- creation revision was deleted along with its page
    AND revision_is_deleted_by_page_deletion = TRUE
-- identified as Google
    AND mtc.mt_service = 'Google'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
)

-- main query: one row per wiki that has at least one created article
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON
    created_articles.wiki = deleted_articles.wiki
"""

In [72]:
# Run the Google deletion query on Spark. `query` is reassigned in every section
# of this notebook, so this cell must run right after the Google query cell.
google_deletion_data = spark.run(query)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                ]

In [74]:
# Persist the per-wiki Google created/deleted counts, omitting the row index.
google_deletion_data.to_csv(
    'google_deletion_data.csv',
    index=False,
)

## NLLB-200

In [68]:
# Per-wiki counts of Content Translation (cx) articles attributed to the
# NLLB-200 MT service that were created on or after 2022-02-01, and how many of
# those creation revisions were later removed by a page deletion
# (wmf.mediawiki_history, snapshot 2022-09).
# Bot-authored revisions are excluded from BOTH counts so that the deleted
# total is a subset of the created total; previously only the deleted count
# filtered bots.
# Because of the LEFT JOIN, deleted_cx_total is NULL for wikis that have
# created articles but no matching deletions.
query = """
-- Denominator: cx page creations attributed to NLLB-200, per wiki
WITH created_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS created_cx_total
FROM wmf.mediawiki_history mwh
-- limit to only cx revision ids
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- identified as NLLB-200
    AND mtc.mt_service = 'NLLB-200'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots, matching the filter used for the deleted count
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
),

-- Numerator: the same creations, restricted to those removed by a page deletion
deleted_articles AS (

SELECT
    wiki_db AS wiki,
    COUNT(*) AS deleted_cx_total
FROM wmf.mediawiki_history mwh
JOIN mneisler.cx_revision_ids_by_mtservice mtc
ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '2022-09'
-- only look at new page creations
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
-- created on or after 2022-02-01
    AND event_timestamp >= '2022-02-01'
-- creation revision was deleted along with its page
    AND revision_is_deleted_by_page_deletion = TRUE
-- identified as NLLB-200
    AND mtc.mt_service = 'NLLB-200'
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
-- remove all bots
    AND SIZE(event_user_is_bot_by_historical) = 0
GROUP BY
  wiki_db
)

-- main query: one row per wiki that has at least one created article
SELECT
    created_articles.wiki,
    created_cx_total,
    deleted_cx_total
FROM created_articles
LEFT JOIN deleted_articles ON
    created_articles.wiki = deleted_articles.wiki
"""

In [69]:
# Run the NLLB-200 deletion query on Spark (the result keeps the `flores_` name).
# `query` is reassigned in every section of this notebook, so this cell must run
# right after the NLLB-200 query cell.
flores_deletion_data = spark.run(query)

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
                                                                                4]

In [70]:
# Persist the per-wiki NLLB-200 created/deleted counts, omitting the row index.
flores_deletion_data.to_csv(
    'flores_deletion_data.csv',
    index=False,
)