In [2]:
import numpy as np
import pandas as pd
from tira.rest_api_client import Client

In [3]:
tira = Client()

# Identifiers of the shared task and of the two dataset splits used below.
task_id = "nlpbuw-fsu-sose-24"
train_dataset_id = "language-identification-train-20240429-training"
validation_dataset_id = "language-identification-validation-20240429-training"

# Training split: input texts and their ground-truth labels.
text_train = tira.pd.inputs(task_id, train_dataset_id)
targets_train = tira.pd.truths(task_id, train_dataset_id)

# Validation split (automatically replaced by test data when run on tira).
text_validation = tira.pd.inputs(task_id, validation_dataset_id)
targets_validation = tira.pd.truths(task_id, validation_dataset_id)

In [4]:
# Language codes grouped by the script they are detected with.
# All ranges are inclusive Unicode code-point ranges written in hexadecimal.

# Languages detected via the range '0041-024F'.
latin = [
    'af', 'az', 'cs', 'da', 'de', 'en', 'es',
    'fi', 'fr', 'hr', 'it', 'nl', 'no', 'pl',
]

# Languages detected via the ranges '0400-04FF' and '0500-052F'.
cyrillic = ['bg', 'ru']

# Remaining languages, each mapped to the single range used to recognise it.
# Key order matters: it breaks ties in classify_remainders (first key wins).
non_latin_blocks = {
    'el': '0370-03FF',
    'zh': '4E00-9FFF',
    'ko': 'AC00-D7AF',
    'ur': '0600-06FF',
}

In [41]:
def get_block(*ranges):
    """Expand hexadecimal code-point ranges into a flat list of integers.

    Parameters
    ----------
    *ranges : str
        Any number of strings of the form ``'START-END'`` where both bounds
        are hexadecimal numbers, e.g. ``'0400-04FF'``. Both bounds are
        inclusive.

    Returns
    -------
    list of int
        All code points covered by the given ranges, in argument order.
        Empty when no range is given.
    """
    code_points = []
    for spec in ranges:
        bounds = spec.split('-')
        first = int(bounds[0], 16)
        last = int(bounds[1], 16)
        # range() excludes its stop value, hence the +1 for an inclusive end.
        code_points.extend(range(first, last + 1))
    return code_points

def comp_freq(text, block):
    """Return the fraction of characters in ``text`` whose code point is in ``block``.

    Parameters
    ----------
    text : str
        The text to inspect. Every character counts towards the denominator,
        including whitespace and punctuation.
    block : sequence of int
        Code points that count as a match (e.g. the output of ``get_block``).

    Returns
    -------
    float
        Value in ``[0, 1]``. An empty ``text`` yields ``0.0`` rather than the
        NaN (plus RuntimeWarning) a 0/0 division would produce.
    """
    # Guard against division by zero for empty strings.
    if len(text) == 0:
        return 0.0
    encoded = np.array([ord(c) for c in text])
    return np.sum(np.isin(encoded, block)) / len(encoded)

# Element-wise version of comp_freq over a collection of texts. Argument 1
# (the code-point block) is excluded from vectorisation so it is passed to
# every call unchanged. `otypes` is given explicitly because np.vectorize
# otherwise infers the output type by calling the function on the first
# element, which raises ValueError when the input is empty.
freq_vec = np.vectorize(comp_freq, otypes=[float], excluded={1})

def is_latin(texts):
    """Flag texts in which more than half of the characters are Latin.

    A character counts as Latin when its code point lies in the inclusive
    range U+0041..U+024F.

    Parameters
    ----------
    texts : array-like of str
        The texts to test.

    Returns
    -------
    numpy.ndarray of bool
        One entry per text; True where the Latin share exceeds 0.5.
    """
    code_points = get_block('0041-024F')
    return freq_vec(texts, code_points) > 0.5

def is_cyrillic(texts):
    """Flag texts in which more than half of the characters are Cyrillic.

    A character counts as Cyrillic when its code point lies in one of the
    inclusive ranges U+0400..U+04FF or U+0500..U+052F.

    Parameters
    ----------
    texts : array-like of str
        The texts to test.

    Returns
    -------
    numpy.ndarray of bool
        One entry per text; True where the Cyrillic share exceeds 0.5.
    """
    code_points = get_block('0400-04FF', '0500-052F')
    return freq_vec(texts, code_points) > 0.5

def classify_remainders(texts):
    """Assign each text the language whose code-point range it matches best.

    For every language in the module-level ``non_latin_blocks`` mapping, the
    share of characters falling into that language's range is computed; the
    language with the highest share wins. On ties the language listed first
    in ``non_latin_blocks`` is chosen (``argmax`` returns the first maximum).

    Parameters
    ----------
    texts : array-like of str
        Texts to classify; must expose ``.shape`` (e.g. ndarray or Series).

    Returns
    -------
    numpy.ndarray of str
        Predicted language code for each text.
    """
    langs = np.array(list(non_latin_blocks))
    # One row per text, one column per candidate language.
    scores = np.empty((texts.shape[0], len(langs)))
    for col, hex_range in enumerate(non_latin_blocks.values()):
        scores[:, col] = freq_vec(texts, get_block(hex_range))
    best_col = np.argmax(scores, axis=1)
    return langs[best_col]

In [42]:
# Stage 1: Latin vs. non-Latin, evaluated on the training split.
pred_latin = is_latin(text_train['text'])
# Ground truth: True where the labelled language is one of the Latin-script ones.
truth = np.isin(targets_train['lang'], latin)
# Accuracy = share of texts where prediction and ground truth agree.
acc_latin = np.mean(pred_latin == truth)
acc_latin

0.999253125

In [43]:
# Stage 2: Cyrillic vs. the rest, evaluated only on texts that stage 1
# predicted as non-Latin.
# NOTE(review): masking texts and labels with the same positional mask assumes
# text_train and targets_train are in the same row order; confirm.
text_non_latin = text_train['text'][~pred_latin]
targets_non_latin = targets_train['lang'][~pred_latin]
pred_cyrillic = is_cyrillic(text_non_latin)
# Uses `targets_non_latin` defined above; the previous spelling
# `target_non_latin` was never defined and raised NameError on a fresh kernel.
truth = np.isin(targets_non_latin, cyrillic)
acc_cyrillic = np.sum(pred_cyrillic == truth) / len(pred_cyrillic)
acc_cyrillic

0.9985713838806219

In [44]:
# Stage 3: texts predicted as neither Latin nor Cyrillic are assigned to one
# of the remaining languages by their best-matching code-point range.
text_remainders = text_non_latin[np.logical_not(pred_cyrillic)]
pred_remainders = classify_remainders(text_remainders)

In [45]:
# Labels of the texts handled in stage 3, selected with the same mask as the texts.
targets_remainders = targets_non_latin[np.logical_not(pred_cyrillic)]
# Accuracy = share of positions where the predicted code equals the label
# (compared position by position).
acc_remainders = np.mean(pred_remainders == np.asarray(targets_remainders))
acc_remainders

0.9972975928269053