# Collect all cx translations

In [2]:
import pandas as pd
import wmfdata as wmf
import duckdb




You are using Wmfdata v2.0.0, but v2.0.1 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md.


In [11]:
#collect content translation and associated MT engine
#need to query cx_corpus table and cross it with cx_translations table and then group and sort

query = """ 
SELECT
    DATE(translation_start_timestamp) AS translation_start_time,
    translation_id,
    translation_target_revision_id  AS target_revision_id,
    translation_target_title AS page_title,
    CAST(JSON_EXTRACT(translation_progress, '$.mt') AS FLOAT) AS mt_translated_percent, 
    CAST(JSON_EXTRACT(translation_progress, '$.human') AS FLOAT) AS human_translated_percent, 
    translation_status AS translation_status,
    CASE
        WHEN cc.cxc_origin = 'Yandex.Translate' THEN 'Yandex'
        WHEN cc.cxc_origin = 'Google Translate' THEN 'Google'
        WHEN cc.cxc_origin = 'Flores' THEN 'NLLB-200'
        ELSE cc.cxc_origin
    END AS mt_service,
    translation_source_language AS source_language,
    translation_target_language AS target_language
FROM 
    cx_translations
JOIN 
    cx_corpora AS cc
    ON translation_id = cc.cxc_translation_id
WHERE
    DATE(translation_start_timestamp) >= DATE('2022-07-01')
    AND translation_status = 'published'
    AND NOT cc.cxc_origin IN ('source', 'user', 'null', 'Youdao', 'original')
"""

In [12]:
%%time
mt_data = wmf.mariadb.run(commands = query, dbs = "wikishared")



CPU times: user 25.9 s, sys: 5.26 s, total: 31.2 s
Wall time: 38.9 s


In [13]:
mt_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6233548 entries, 0 to 6233547
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   translation_start_time    object 
 1   translation_id            int64  
 2   target_revision_id        int64  
 3   page_title                object 
 4   mt_translated_percent     float64
 5   human_translated_percent  float64
 6   translation_status        object 
 7   mt_service                object 
 8   source_language           object 
 9   target_language           object 
dtypes: float64(2), int64(2), object(6)
memory usage: 475.6+ MB


## Export to a database

In [14]:
conn = duckdb.connect('secrets/mt_data.db')

In [15]:
conn.execute("""
CREATE OR REPLACE TABLE mt_logs AS
SELECT
    translation_start_time::DATE AS translation_start_time,
    * EXCLUDE(translation_start_time, translation_status)
FROM
    mt_data
""")

<duckdb.DuckDBPyConnection at 0x7fa5ae245a30>

In [16]:
conn.sql("""DESCRIBE mt_logs""")

┌──────────────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│       column_name        │ column_type │  null   │   key   │ default │ extra │
│         varchar          │   varchar   │ varchar │ varchar │ varchar │ int32 │
├──────────────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ translation_start_time   │ DATE        │ YES     │ NULL    │ NULL    │  NULL │
│ translation_id           │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ target_revision_id       │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ page_title               │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ mt_translated_percent    │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
│ human_translated_percent │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
│ mt_service               │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ source_language          │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ target_language          │

In [17]:
conn.close()