In [4]:
import numpy as np
import pandas as pd
from tira.rest_api_client import Client
from tira.third_party_integrations import get_output_directory

In [5]:
# TIRA REST API client; the cells below use it to fetch dataset inputs and truths.
tira = Client()

In [6]:
    # loading train data
text_train = tira.pd.inputs(
    "nlpbuw-fsu-sose-24",
    "language-identification-train-20240429-training",
)

In [7]:
# Ground-truth labels belonging to the training inputs.
targets_train = tira.pd.truths(
    "nlpbuw-fsu-sose-24",
    "language-identification-train-20240429-training",
)

In [8]:
 # loading validation data (automatically replaced by test data when run on tira)
text_validation = tira.pd.inputs(
    "nlpbuw-fsu-sose-24",
    "language-identification-validation-20240429-training",
)

In [9]:
# Ground-truth labels belonging to the validation inputs.
targets_validation = tira.pd.truths(
    "nlpbuw-fsu-sose-24",
    "language-identification-validation-20240429-training",
)

In [10]:
# Preview the training inputs (columns: id, text).
text_train

Unnamed: 0,id,text
0,1,Der Flughafen Berlin Brandenburg verfügt über ...
1,2,"Успешное развитие общества, однако, возможно л..."
2,3,I øvrigt er kendetegnene for en magnetisk svag...
3,4,Sowohl über den historischen Simon als auch üb...
4,5,"Emmure е формирана през 2003 г., когато Франки..."
...,...,...
319995,399994,Sociální náklady stejně jako soukromé náklady ...
319996,399995,"Sljedećeg dana, glumac je malo zakasnio na set..."
319997,399996,"İstiqlal Sarayı (), Yenidən Birleşme Sarayı ()..."
319998,399998,"Nella serie ""Magico Vento"" figura il personagg..."


In [11]:
# Preview the training labels (columns: id, lang).
targets_train

Unnamed: 0,id,lang
0,1,de
1,2,ru
2,3,da
3,4,de
4,5,bg
...,...,...
319995,399994,cs
319996,399995,hr
319997,399996,az
319998,399998,it


In [12]:
# Print a few samples: the language label with its text, followed by the
# text's raw UTF-8 byte representation.
for row in range(5, 10):
    label = targets_train['lang'][row]
    sample = text_train['text'][row]
    print(f"{label}: {sample}\n")
    print(f"    {sample.encode('utf-8')}\n")

az: Çoxillik hündür otdur. Tamkənarlı yarpaqları növbəli düzülür. Qırmızı-bənövşəyi, göy, çəhrayı, sarı, ağ və s. çiçəkləri qıvrım çiçək qrupunda yerləşir. Ləçəkləri boruvarıdır. Meyvəsi toxumçadır. Avropa, Çənubi Asiya və Şimali Afrikada 25-dək, Azərbaycanda 3 növü məlumdur. Rütubətli yerlərdə bitir. Dərman xəndəkotunun ("Symphytum officinalis") kök və kökümsovunda alkaloid və aşı maddəsi var. Tibdə və baytarlıqda iltihaba qarşı və qankəsən dərman kimi istifadə edilir. Azərbaycanda bitən bərk xəndəkotu ("Symphytum asperum") donuz və adadovşanları üçün yemdir. Hər 2 növ yaxşı balverən, həmçinin boyaq bitkisidir. Yoğunkök xəndəkotu yeyilir. Qafqaz xəndəkotu, iriçiçək xəndəkotu və s. dekorativ bitki kimi becərilir.

    b'\xc3\x87oxillik h\xc3\xbcnd\xc3\xbcr otdur. Tamk\xc9\x99narl\xc4\xb1 yarpaqlar\xc4\xb1 n\xc3\xb6vb\xc9\x99li d\xc3\xbcz\xc3\xbcl\xc3\xbcr. Q\xc4\xb1rm\xc4\xb1z\xc4\xb1-b\xc9\x99n\xc3\xb6v\xc5\x9f\xc9\x99yi, g\xc3\xb6y, \xc3\xa7\xc9\x99hray\xc4\xb1, sar\xc4\xb1, a\xc4\x9

# Train Dataset
The dataset covers 20 different languages. Each language occurs in 16,000 texts of the training dataset.

In [13]:
# Requires the third-party package `iso-639` (pip install iso-639).
from iso639 import languages

# Count every language code in a single pass instead of rescanning the whole
# label column once per language.
langs, lang_counts = np.unique(targets_train['lang'], return_counts=True)
for lang_code, lang_count in zip(langs, lang_counts):
    # Output format: "<code> (<number of texts>) :  <ISO 639 language name>"
    print(f"{lang_code} ({lang_count}) :  {languages.get(alpha2=lang_code).name}")

af (16000) :  Afrikaans
az (16000) :  Azerbaijani
bg (16000) :  Bulgarian
cs (16000) :  Czech
da (16000) :  Danish
de (16000) :  German
el (16000) :  Modern Greek (1453-)
en (16000) :  English
es (16000) :  Spanish
fi (16000) :  Finnish
fr (16000) :  French
hr (16000) :  Croatian
it (16000) :  Italian
ko (16000) :  Korean
nl (16000) :  Dutch
no (16000) :  Norwegian
pl (16000) :  Polish
ru (16000) :  Russian
ur (16000) :  Urdu
zh (16000) :  Chinese


In [14]:
def get_block(*ranges):
    """Expand hexadecimal code-point ranges into one flat list of integers.

    Parameters
    ----------
    *ranges : str
        Any number of strings of the form ``'START-END'``, where START and
        END are hexadecimal numbers (e.g. ``'0370-03FF'``).

    Returns
    -------
    list of int
        All code points of every range, END included, concatenated in the
        order the ranges were passed. Empty when no range is given.
    """
    code_points = []
    for spec in ranges:
        bounds = spec.split('-')
        first = int(bounds[0], 16)
        last = int(bounds[1], 16)
        # +1 because the END bound of a range is inclusive.
        code_points.extend(range(first, last + 1))
    return code_points

# Greek
Measuring the frequency of Greek letters (Unicode block "Greek and Coptic", U+0370–U+03FF) in the Greek (`el`) training texts:

In [29]:
el_texts = text_train['text'][targets_train['lang'] == 'el']
el_block = get_block('0370-03FF')
# Build the lookup array once; np.isin would otherwise convert the list again
# for every single text.
el_codes = np.asarray(el_block)

# Mean, over all Greek training texts, of the per-text share of characters
# whose code point lies inside the block. Characters outside the block are
# spaces, commas, periods, brackets, ...
mean_freq = 0
for el_text in el_texts:
    encoded = np.array([ord(c) for c in el_text])
    freq = np.sum(np.isin(encoded, el_codes)) / len(encoded)
    mean_freq += freq
# Normalise by the actual number of texts instead of a hard-coded 16000.
mean_freq /= len(el_texts)
mean_freq

0.7820118375034697

# Chinese and Korean
Chinese, Japanese and Korean symbols and radicals are grouped under the term CJK characters.
The following blocks are treated here as exclusive to Chinese (`zh`) text:
- CJK Radicals Supplement (2E80–2EFF)
- Kangxi Radicals (2F00–2FDF)
- Bopomofo (3100–312F)
- Bopomofo Extended (31A0–31BF)
- CJK Unified Ideographs Extension A (3400–4DBF)
- **CJK Unified Ideographs (4E00–9FFF)**

The following blocks are treated here as exclusive to Korean (`ko`) text:
- Hangul Jamo (1100–11FF)
- Hangul Compatibility Jamo (3130–318F)
- **Hangul Syllables (AC00–D7AF)**
- Hangul Jamo Extended-B (D7B0–D7FF)

In [45]:
zh_texts = text_train['text'][targets_train['lang'] == 'zh']
zh_block = get_block('2E80-2EFF', '2F00-2FDF', '3100-312F', '31A0-31BF', '3400-4DBF', '4E00-9FFF')
# Build the lookup array once; np.isin would otherwise convert this large list
# again for every single text.
zh_codes = np.asarray(zh_block)

# Mean, over all Chinese training texts, of the per-text share of characters
# whose code point lies inside one of the blocks above.
mean_freq = 0
for zh_text in zh_texts:
    encoded = np.array([ord(c) for c in zh_text])
    freq = np.sum(np.isin(encoded, zh_codes)) / len(encoded)
    mean_freq += freq
# Normalise by the actual number of texts instead of a hard-coded 16000.
mean_freq /= len(zh_texts)
mean_freq

0.8018195731077342

In [38]:
ko_texts = text_train['text'][targets_train['lang'] == 'ko']
ko_block = get_block('1100-11FF', '3130-318F', 'AC00-D7AF', 'D7B0-D7FF')
# Build the lookup array once; np.isin would otherwise convert the list again
# for every single text.
ko_codes = np.asarray(ko_block)

# Mean, over all Korean training texts, of the per-text share of characters
# whose code point lies inside one of the Hangul blocks above.
mean_freq = 0
for ko_text in ko_texts:
    encoded = np.array([ord(c) for c in ko_text])
    freq = np.sum(np.isin(encoded, ko_codes)) / len(encoded)
    mean_freq += freq
# Normalise by the actual number of texts instead of a hard-coded 16000.
mean_freq /= len(ko_texts)
mean_freq

0.674295940409025

# Russian
- Cyrillic (0400–04FF)
- Cyrillic Supplement (0500–052F)

In [46]:
ru_texts = text_train['text'][targets_train['lang'] == 'ru']
ru_block = get_block('0400-04FF', '0500-052F')
# Build the lookup array once; np.isin would otherwise convert the list again
# for every single text.
ru_codes = np.asarray(ru_block)

# Mean, over all Russian training texts, of the per-text share of characters
# whose code point lies inside the Cyrillic blocks above.
mean_freq = 0
for ru_text in ru_texts:
    encoded = np.array([ord(c) for c in ru_text])
    freq = np.sum(np.isin(encoded, ru_codes)) / len(encoded)
    mean_freq += freq
# Normalise by the actual number of texts instead of a hard-coded 16000.
mean_freq /= len(ru_texts)
mean_freq

0.8028722989161122

# Azerbaijani
Azerbaijani uses a Turkish-style alphabet that consists mostly of Latin letters.

In [53]:
az_texts = text_train['text'][targets_train['lang'] == 'az']
# NOTE: U+0000-U+0300 also contains ASCII whitespace, digits and punctuation,
# so this share counts them too, unlike the script-specific blocks used for
# the other languages.
az_block = get_block('0000-0300')
# Build the lookup array once; np.isin would otherwise convert the list again
# for every single text.
az_codes = np.asarray(az_block)

# Mean, over all Azerbaijani training texts, of the per-text share of
# characters whose code point lies inside the range above.
mean_freq = 0
for az_text in az_texts:
    encoded = np.array([ord(c) for c in az_text])
    freq = np.sum(np.isin(encoded, az_codes)) / len(encoded)
    mean_freq += freq
# Normalise by the actual number of texts instead of a hard-coded 16000.
mean_freq /= len(az_texts)
mean_freq

0.9969088010647223

# Bulgarian
Like Russian, Bulgarian uses a Cyrillic alphabet.

In [54]:
bg_texts = text_train['text'][targets_train['lang'] == 'bg']
bg_block = get_block('0400-04FF', '0500-052F')
# Build the lookup array once; np.isin would otherwise convert the list again
# for every single text.
bg_codes = np.asarray(bg_block)

# Mean, over all Bulgarian training texts, of the per-text share of characters
# whose code point lies inside the Cyrillic blocks above.
mean_freq = 0
for bg_text in bg_texts:
    encoded = np.array([ord(c) for c in bg_text])
    freq = np.sum(np.isin(encoded, bg_codes)) / len(encoded)
    mean_freq += freq
# Normalise by the actual number of texts instead of a hard-coded 16000.
mean_freq /= len(bg_texts)
mean_freq

0.7739431015115734

# Czech and Croatian
Both use mostly Latin letters with some supplementary letters.

# Urdu
Urdu is written with an Arabic-based alphabet:
- Arabic (0600–06FF)

In [60]:
ur_texts = text_train['text'][targets_train['lang'] == 'ur']
ur_block = get_block('0600-06FF')
# Build the lookup array once; np.isin would otherwise convert the list again
# for every single text.
ur_codes = np.asarray(ur_block)

# Mean, over all Urdu training texts, of the per-text share of characters
# whose code point lies inside the Arabic block.
mean_freq = 0
for ur_text in ur_texts:
    encoded = np.array([ord(c) for c in ur_text])
    freq = np.sum(np.isin(encoded, ur_codes)) / len(encoded)
    mean_freq += freq
# Normalise by the actual number of texts instead of a hard-coded 16000.
mean_freq /= len(ur_texts)
mean_freq

0.7618597741812074