# Collect primary service information for language pairs

In [4]:
import duckdb
import pandas as pd
import json
import requests

In [14]:
# Open the DuckDB database file that the queries below read from and write to.
conn = duckdb.connect('secrets/mt_data.db')

In [6]:
# Every distinct (source, target) language combination recorded in mt_logs,
# materialised as a pandas DataFrame; the shape is shown as the cell output.
unique_pairs = (
    conn
    .sql("""SELECT DISTINCT source_language, target_language FROM mt_logs""")
    .df()
)
unique_pairs.shape

(2837, 2)

## Get default service data
The data comes from https://cxserver.wikimedia.org, where the first item in the array of machine translation engines returned for a language pair is the default. For example, for [en-hi](https://cxserver.wikimedia.org/v1/list/mt/en/hi), it is Google.

In [7]:
def get_default_service(lang1, lang2, timeout=30):
    """Look up the default machine translation engine for a language pair.

    Queries the cxserver ``list/mt`` endpoint; the first engine in the
    returned ``mt`` array is taken to be the default for the pair.

    Parameters
    ----------
    lang1 : str
        Source language code.
    lang2 : str
        Target language code.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a single
        stalled request cannot hang the whole collection run.

    Returns
    -------
    list
        ``[service, service_type]`` where ``service_type`` is ``'no_mt'``
        (``service`` is ``None``), ``'only_mt'`` (exactly one engine is
        listed) or ``'default_mt'`` (the first of several engines).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If a non-empty payload has no ``mt`` entry.
    """
    response = requests.get(
        f'https://cxserver.wikimedia.org/v1/list/mt/{lang1}/{lang2}',
        timeout=timeout,
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response.raise_for_status()
    output = response.json()

    # An empty payload means no MT engine is offered for this pair.
    if not output:
        return [None, 'no_mt']

    services = output['mt']
    # An empty engine list means the same thing; guard it so that
    # services[0] below cannot raise IndexError.
    if not services:
        return [None, 'no_mt']

    service_type = 'only_mt' if len(services) == 1 else 'default_mt'
    return [services[0], service_type]

In [9]:
%%time

# Look up the default engine for every pair (one HTTP request per row).
# `result_type='expand'` spreads each returned [service, service_type] list
# across two columns, which are then stored under their final names.
unique_pairs[['service', 'service_type']] = unique_pairs.apply(
    lambda pair: get_default_service(pair['source_language'], pair['target_language']),
    axis=1,
    result_type='expand',
)

CPU times: user 37.3 s, sys: 1.44 s, total: 38.8 s
Wall time: 1min


In [77]:
# Sanity-check the collected results: print the row/column count, then preview the first rows.
# NOTE(review): the saved output of this cell reports 2790 rows and lists
# `Unnamed: 0` and `language_pair` columns, while the cells visible above yield
# 2837 rows and only add `service` / `service_type`. The frame was presumably
# reloaded or altered in cells that were not kept — confirm with a fresh-kernel run.
print(unique_pairs.shape)
unique_pairs.head()

(2790, 5)


Unnamed: 0,source_language,target_language,service,service_type,language_pair
0,en,uz,Google,default_mt,en-uz
1,en,nb,MinT,default_mt,en-nb
2,en,pa,Google,default_mt,en-pa
3,en,zh,Google,default_mt,en-zh
4,en,eo,Apertium,default_mt,en-eo


## Export to database

In [15]:
# Persist the per-pair defaults as the `mt_defaults` table. DuckDB resolves
# `unique_pairs` in the FROM clause to the pandas DataFrame of that name in
# the notebook namespace. Columns are listed explicitly rather than with `*`
# so that stray columns on the frame (e.g. a saved index column) cannot leak
# into the table schema. The trailing `;` keeps the connection repr out of
# the cell output.
conn.execute("""
CREATE OR REPLACE TABLE mt_defaults AS
SELECT
    source_language,
    target_language,
    service,
    service_type,
    source_language||'-'||target_language AS pair
FROM
    unique_pairs
""");

<duckdb.DuckDBPyConnection at 0x7fc8dc2812f0>

In [16]:
# Inspect the schema of the mt_defaults table to confirm its columns and types.
conn.sql("""DESCRIBE mt_defaults""")

┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│   column_name   │ column_type │  null   │   key   │ default │ extra │
│     varchar     │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ source_language │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ target_language │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ service         │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ service_type    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ pair            │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
└─────────────────┴─────────────┴─────────┴─────────┴─────────┴───────┘

In [17]:
# Release the database connection now that the export is finished.
conn.close()