In [30]:
import duckdb
import pandas as pd
import json
import requests

In [34]:
# Open the local DuckDB database file that the queries below read from.
db_path = 'secrets/mt_data.db'
conn = duckdb.connect(db_path)

In [42]:
# One row per distinct (source_language, target_language) combination in mt_logs.
pairs_query = """SELECT DISTINCT source_language, target_language FROM mt_logs"""
unique_pairs = conn.sql(pairs_query).df()
unique_pairs.shape

(2790, 2)

In [54]:
def get_default_service(lang1, lang2, session=None, timeout=30):
    """Look up the default machine-translation service for a language pair.

    Sends a GET request to the cxserver ``/v1/list/mt/{lang1}/{lang2}``
    endpoint and classifies the answer by how many services it lists. The
    first listed service is treated as the default.

    Parameters
    ----------
    lang1 : str
        Language code placed first in the URL path (the source language in
        the calling cells).
    lang2 : str
        Language code placed second in the URL path (the target language in
        the calling cells).
    session : object, optional
        Any object exposing ``get(url, timeout=...)``, e.g. a
        ``requests.Session`` so that many lookups reuse one connection.
        When omitted, the module-level ``requests.get`` is used.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        single stalled connection would block the whole run indefinitely.

    Returns
    -------
    list
        ``[service, service_type]`` where ``service_type`` is one of:

        * ``'no_mt'``      - no service listed (``service`` is ``None``)
        * ``'only_mt'``    - exactly one service listed
        * ``'default_mt'`` - several services listed; the first one is returned

    Raises
    ------
    json.JSONDecodeError
        If the response body is not valid JSON.
    Exception
        Whatever ``response.raise_for_status()`` raises when the payload has
        no ``mt`` key and the HTTP status signals an error.
    """
    http = requests if session is None else session
    response = http.get(
        f'https://cxserver.wikimedia.org/v1/list/mt/{lang1}/{lang2}',
        timeout=timeout,
    )
    output = json.loads(response.text)

    # An empty payload means the pair has no MT service at all.
    if not output:
        return [None, 'no_mt']

    if 'mt' not in output:
        # A non-empty payload without an `mt` key is most likely an error
        # body: surface the HTTP failure instead of an opaque KeyError.
        response.raise_for_status()
        return [None, 'no_mt']

    services = output['mt']
    # Guard against an empty list, which would otherwise raise IndexError.
    if not services:
        return [None, 'no_mt']

    service_type = 'only_mt' if len(services) == 1 else 'default_mt'
    return [services[0], service_type]

In [68]:
%%time

# Look up the MT service for every language pair (one HTTP request per row)
# and store the two returned values as new columns.
# `result_type='expand'` spreads each returned [service, service_type] list
# across two columns, which are assigned positionally to the names on the left.
unique_pairs[['service', 'service_type']] = unique_pairs.apply(
    lambda row: get_default_service(row['source_language'], row['target_language']),
    axis=1,
    result_type='expand',
)

CPU times: user 37.9 s, sys: 1.48 s, total: 39.3 s
Wall time: 1min 2s


In [63]:
%%time

# NOTE(review): this cell fills the same `service` / `service_type` columns as
# the `.apply` cell above, issuing one HTTP request per row a second time.
unique_pairs['service'] = ''
unique_pairs['service_type'] = ''

for row_label in unique_pairs.index:
    service, service_type = get_default_service(
        unique_pairs.loc[row_label, 'source_language'],
        unique_pairs.loc[row_label, 'target_language'],
    )
    unique_pairs.loc[row_label, 'service'] = service
    unique_pairs.loc[row_label, 'service_type'] = service_type

CPU times: user 36.2 s, sys: 1.32 s, total: 37.5 s
Wall time: 1min


In [77]:
# Sanity check: dimensions plus the first rows of the enriched table.
# NOTE(review): the saved output shows a `language_pair` column that no visible
# cell creates - confirm where it is added before relying on Run All.
n_rows, n_cols = unique_pairs.shape
print((n_rows, n_cols))
unique_pairs.head(5)

(2790, 5)


Unnamed: 0,source_language,target_language,service,service_type,language_pair
0,en,uz,Google,default_mt,en-uz
1,en,nb,MinT,default_mt,en-nb
2,en,pa,Google,default_mt,en-pa
3,en,zh,Google,default_mt,en-zh
4,en,eo,Apertium,default_mt,en-eo


In [74]:
# Persist the enriched pairs as the `mt_defaults` table, replacing any earlier
# version. `unique_pairs` in the FROM clause is the pandas DataFrame built
# above, which DuckDB reads directly from the Python scope.
create_defaults_sql = """
CREATE OR REPLACE TABLE mt_defaults AS
SELECT
    *
FROM
    unique_pairs
"""
conn.execute(create_defaults_sql)

<duckdb.DuckDBPyConnection at 0x7f8f69425ab0>

In [76]:
# Release the DuckDB connection opened at the top of the notebook.
conn.close()