# Collect Deletion Ratios for each MT engine

## Imports & Spark session

In [15]:
import wmfdata as wmf
import pandas as pd
import duckdb

from IPython.display import clear_output

import warnings

In [7]:
# Stop any Spark session that is still active in this kernel so that the
# custom session created in the next cell starts from a clean slate.
spark_session = wmf.spark.get_active_session()

# Identity comparison is the idiomatic (PEP 8) way to test for None.
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')

no active session


In [8]:
# Create the notebook's own Spark session on YARN with explicit sizing.
spark_session = wmf.spark.create_custom_session(
    master="yarn",
    app_name="mt-deletion-ratios",
    spark_config={
        # Driver-side settings.
        "spark.driver.memory": "4g",
        "spark.driver.maxResultSize": "2g",
        # Executor sizing and the dynamic-allocation ceiling.
        "spark.executor.memory": "16g",
        "spark.executor.cores": 4,
        "spark.dynamicAllocation.maxExecutors": 64,
        # Partition count used for Spark SQL shuffles.
        "spark.sql.shuffle.partitions": 256,
    },
)

# Clear anything printed to the cell output so far, then display the session.
clear_output()

spark_session


In [9]:
# Set the Spark context's log level to ERROR to keep cell output readable.
spark_session.sparkContext.setLogLevel("ERROR")

## Load data

In [47]:
# Open the DuckDB database file (path is relative to the working directory).
# This connection is used both to read `mt_logs` and to write the results,
# and is closed in the notebook's last cell.
conn = duckdb.connect('secrets/mt_data.db')

In [23]:
# Fetch the distinct (target_revision_id, mt_service) pairs from `mt_logs`
# for translations started between 2023-08-01 and 2023-10-31 (inclusive),
# returned as a pandas DataFrame.
#
# The upper bound is written as a half-open interval (< first day of the next
# month). Comparing against the bare date '2023-10-31' with <= would only
# match values up to the start of that day, silently dropping translations
# started later on Oct 31 whenever the column carries a time component.
revs_by_mt = conn.sql("""
    SELECT
        DISTINCT target_revision_id,
        mt_service
    FROM
        mt_logs
    WHERE
        translation_start_time >= '2023-08-01'
        AND translation_start_time < '2023-11-01'
""").df()

In [24]:
# Convert the pandas frame returned by the DuckDB query into a Spark DataFrame
# and register it as the temp view `revs_by_mt_service`, the name the
# deletion-ratio query joins against.
revs = spark_session.createDataFrame(revs_by_mt)
revs.createOrReplaceTempView('revs_by_mt_service')

In [30]:
# Distinct service labels found in the extracted logs; this list is later
# rendered into the IN (...) filter of the deletion-ratio query.
mt_services = revs_by_mt['mt_service'].unique().tolist()
print(f'Available machine translation services: {mt_services}')

Available machine translation services: ['Google', 'scratch', 'MinT', 'Yandex', 'Apertium', 'LingoCloud', 'Elia']


## Query deletion ratios

In [36]:
# Spark SQL template that counts, per wiki and MT service, how many pages were
# created via Content Translation and how many of those creations were later
# removed by page deletion.
#
# Placeholders filled in with str.format():
#   {MW_SNAPSHOT}  - mediawiki_history snapshot to read, e.g. '2023-10'
#   {MT_SERVICES}  - SQL tuple literal of mt_service values to keep
#
# Row selection: page-creating revisions only (revision_parent_id = 0), with an
# empty event_user_is_bot_by array, tagged 'contenttranslation', and matched
# to the `revs_by_mt_service` temp view on revision id.
#
# The date window is half-open (>= 2023-08-01, < 2023-11-01). An upper bound
# of <= '2023-10-31' would exclude every event that happened after the very
# start of Oct 31, because a bare date compares lower than any same-day value
# that carries a time of day.
deletion_ratios_query = """
SELECT
    wiki_db AS wiki,
    mtc.mt_service AS mt_service,
    COUNT(*) AS created_cx_total,
    SUM(CASE
            WHEN revision_is_deleted_by_page_deletion THEN 1
            ELSE 0
        END) AS deleted_cx_total
FROM
    wmf.mediawiki_history mwh
JOIN
    revs_by_mt_service mtc
    ON mwh.revision_id = mtc.target_revision_id
WHERE
    snapshot = '{MW_SNAPSHOT}'
    AND revision_parent_id = 0
    AND event_entity = 'revision'
    AND event_type = 'create'
    AND event_timestamp >= '2023-08-01'
    AND event_timestamp < '2023-11-01'
    AND SIZE(event_user_is_bot_by) = 0
    AND mtc.mt_service IN {MT_SERVICES}
    AND ARRAY_CONTAINS(revision_tags, 'contenttranslation')
GROUP BY
    wiki_db,
    mtc.mt_service
"""

In [37]:
%%time 

# Fill in the query template and execute it on Spark; the result comes back
# as a pandas DataFrame (see the .info() output in the next cell).
deletion_ratios = wmf.spark.run(
    deletion_ratios_query.format(
        MW_SNAPSHOT='2023-10',
        MT_SERVICES=wmf.utils.sql_tuple(mt_services),
    )
)



CPU times: user 402 ms, sys: 102 ms, total: 505 ms
Wall time: 1min 51s


                                                                                

In [39]:
# Sanity-check the result: row count, column dtypes and non-null counts.
deletion_ratios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   wiki              438 non-null    object
 1   mt_service        438 non-null    object
 2   created_cx_total  438 non-null    int64 
 3   deleted_cx_total  438 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 13.8+ KB


## Export to database

In [48]:
# Persist the per-wiki / per-service counts to DuckDB as `mt_deletion_ratios`,
# adding a derived `deletion_ratio` column (deleted / created).
# `deletion_ratios` in the FROM clause matches the pandas DataFrame produced by
# the Spark query above - presumably resolved through DuckDB's replacement scan
# of Python variables; confirm no table of that name exists in the database.
# CREATE OR REPLACE overwrites an existing table, so the cell can be re-run.
conn.execute("""
CREATE OR REPLACE TABLE mt_deletion_ratios AS
SELECT
    *,
    deleted_cx_total/created_cx_total AS deletion_ratio
FROM 
    deletion_ratios
""")

<duckdb.DuckDBPyConnection at 0x7f811050da30>

In [49]:
# Show the schema of the table that was just written, to verify column types.
conn.sql("""DESCRIBE mt_deletion_ratios""")

┌──────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│   column_name    │ column_type │  null   │   key   │ default │ extra │
│     varchar      │   varchar   │ varchar │ varchar │ varchar │ int32 │
├──────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ wiki             │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ mt_service       │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ created_cx_total │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ deleted_cx_total │ BIGINT      │ YES     │ NULL    │ NULL    │  NULL │
│ deletion_ratio   │ DOUBLE      │ YES     │ NULL    │ NULL    │  NULL │
└──────────────────┴─────────────┴─────────┴─────────┴─────────┴───────┘

In [51]:
# Release external resources now that the results are stored in DuckDB:
# close the database connection and stop the Spark session created earlier,
# so it does not keep holding cluster resources after the notebook is done.
conn.close()
spark_session.stop()