In [None]:
import numpy as np
import pandas as pd
from tira.rest_api_client import Client
from tira.third_party_integrations import get_output_directory

In [2]:
# Client used to fetch the task's inputs and ground-truth labels from TIRA.
tira = Client()

# Identifiers of the task and of the two dataset splits used in this notebook.
TIRA_TASK = "nlpbuw-fsu-sose-24"
TRAIN_DATASET = "language-identification-train-20240429-training"
VALIDATION_DATASET = "language-identification-validation-20240429-training"

# Training split: texts and their language labels.
text_train = tira.pd.inputs(TIRA_TASK, TRAIN_DATASET)
targets_train = tira.pd.truths(TIRA_TASK, TRAIN_DATASET)

# Validation split (automatically replaced by test data when run on tira).
text_validation = tira.pd.inputs(TIRA_TASK, VALIDATION_DATASET)
targets_validation = tira.pd.truths(TIRA_TASK, VALIDATION_DATASET)

: 

In [None]:
# Display the training inputs; the rendered table has the columns `id` and `text`.
text_train

Unnamed: 0,id,text
0,1,Der Flughafen Berlin Brandenburg verfügt über ...
1,2,"Успешное развитие общества, однако, возможно л..."
2,3,I øvrigt er kendetegnene for en magnetisk svag...
3,4,Sowohl über den historischen Simon als auch üb...
4,5,"Emmure е формирана през 2003 г., когато Франки..."
...,...,...
319995,399994,Sociální náklady stejně jako soukromé náklady ...
319996,399995,"Sljedećeg dana, glumac je malo zakasnio na set..."
319997,399996,"İstiqlal Sarayı (), Yenidən Birleşme Sarayı ()..."
319998,399998,"Nella serie ""Magico Vento"" figura il personagg..."


In [None]:
# Display the training labels; the rendered table has the columns `id` and `lang`.
targets_train

Unnamed: 0,id,lang
0,1,de
1,2,ru
2,3,da
3,4,de
4,5,bg
...,...,...
319995,399994,cs
319996,399995,hr
319997,399996,az
319998,399998,it


In [None]:
# Print five training samples (index labels 5 to 9): the language label with
# the text, followed by the UTF-8 encoded bytes of the same text.
for row in range(5, 10):
    label = targets_train['lang'][row]
    sample = text_train['text'][row]
    print(f"{label}: {sample}\n")
    print(f"    {sample.encode('utf-8')}\n")

az: Çoxillik hündür otdur. Tamkənarlı yarpaqları növbəli düzülür. Qırmızı-bənövşəyi, göy, çəhrayı, sarı, ağ və s. çiçəkləri qıvrım çiçək qrupunda yerləşir. Ləçəkləri boruvarıdır. Meyvəsi toxumçadır. Avropa, Çənubi Asiya və Şimali Afrikada 25-dək, Azərbaycanda 3 növü məlumdur. Rütubətli yerlərdə bitir. Dərman xəndəkotunun ("Symphytum officinalis") kök və kökümsovunda alkaloid və aşı maddəsi var. Tibdə və baytarlıqda iltihaba qarşı və qankəsən dərman kimi istifadə edilir. Azərbaycanda bitən bərk xəndəkotu ("Symphytum asperum") donuz və adadovşanları üçün yemdir. Hər 2 növ yaxşı balverən, həmçinin boyaq bitkisidir. Yoğunkök xəndəkotu yeyilir. Qafqaz xəndəkotu, iriçiçək xəndəkotu və s. dekorativ bitki kimi becərilir.

    b'\xc3\x87oxillik h\xc3\xbcnd\xc3\xbcr otdur. Tamk\xc9\x99narl\xc4\xb1 yarpaqlar\xc4\xb1 n\xc3\xb6vb\xc9\x99li d\xc3\xbcz\xc3\xbcl\xc3\xbcr. Q\xc4\xb1rm\xc4\xb1z\xc4\xb1-b\xc9\x99n\xc3\xb6v\xc5\x9f\xc9\x99yi, g\xc3\xb6y, \xc3\xa7\xc9\x99hray\xc4\xb1, sar\xc4\xb1, a\xc4\x9

# Train Dataset
The dataset covers 20 different languages. Each language occurs in 16,000 texts of the training dataset.

In [None]:
# pip install iso-639
from iso639 import languages

# One pass over the label column: np.unique returns the sorted language codes
# together with how often each of them occurs, so the column does not have to
# be rescanned once per language.
langs, lang_counts = np.unique(targets_train['lang'], return_counts=True)

# Print "<code> (<number of texts>) :  <language name>" for every language.
for lang_code, n_texts in zip(langs, lang_counts):
    print(f"{lang_code} ({n_texts}) :  {languages.get(alpha2=lang_code).name}")

af (16000) :  Afrikaans
az (16000) :  Azerbaijani
bg (16000) :  Bulgarian
cs (16000) :  Czech
da (16000) :  Danish
de (16000) :  German
el (16000) :  Modern Greek (1453-)
en (16000) :  English
es (16000) :  Spanish
fi (16000) :  Finnish
fr (16000) :  French
hr (16000) :  Croatian
it (16000) :  Italian
ko (16000) :  Korean
nl (16000) :  Dutch
no (16000) :  Norwegian
pl (16000) :  Polish
ru (16000) :  Russian
ur (16000) :  Urdu
zh (16000) :  Chinese


In [None]:
def get_block(*ranges):
    """Expand hexadecimal code-point ranges into one flat list of integers.

    Args:
        *ranges: Strings of the form ``'START-END'`` where both parts are
            hexadecimal numbers (e.g. ``'0370-03FF'``). Both end points are
            part of the range.

    Returns:
        list[int]: The integers of all ranges, concatenated in the order in
        which the ranges were passed. Empty if no range is given.
    """
    code_points = []
    for spec in ranges:
        bounds = spec.split('-')
        first = int(bounds[0], 16)
        last = int(bounds[1], 16)
        # `last + 1` because range() excludes its upper bound.
        code_points.extend(range(first, last + 1))
    return code_points

# Greek
Measuring the frequency of Greek letters (Unicode block U+0370–U+03FF) in the Greek training texts. Spaces are counted as Greek characters as well:

In [None]:
# Share of characters per Greek training text that belong to the Unicode block
# U+0370-U+03FF. The space character (code point 32) is added to the block, so
# spaces are counted as Greek as well.
greek_texts = text_train['text'][targets_train['lang'] == 'el']
greek_block = get_block('0370-03FF') + [32]

for greek_text in greek_texts:
    if len(greek_text) == 0:
        # Nothing to measure; also avoids a division by zero below.
        continue
    encoded = np.array([ord(c) for c in greek_text])
    in_block = np.isin(encoded, greek_block)  # membership mask, computed once per text
    # `freq` is a fraction in [0, 1]; the `%` format type below renders it as a percentage.
    freq = np.sum(in_block) / len(encoded)
    # Everything outside the block: commas, periods, brackets, digits, Latin letters, ...
    other_chars = np.array([chr(c) for c in encoded[~in_block]])
    rest_chars, rest_counts = np.unique(other_chars, return_counts=True)
    rest = [(str(c), int(n)) for c, n in zip(rest_chars, rest_counts)]
    print(f"{freq:.0%} of characters are greek\nThe rest consists of: {rest}")

0.91 % of characters are greek
The rest consits of: [('(', 1), (')', 1), (',', 2), ('-', 2), ('.', 2), ('1', 2), ('4', 1), ('9', 1), ('A', 1), ('L', 1), ('a', 1), ('e', 2), ('i', 2), ('l', 1), ('n', 1), ('o', 1), ('q', 1), ('r', 1), ('t', 2), ('u', 1)]
0.95 % of characters are greek
The rest consits of: [(',', 3), ('.', 2), ('1', 2), ('2', 1), ('3', 2), ('7', 1), ('8', 1), ('9', 2)]
0.93 % of characters are greek
The rest consits of: [('"', 8), ('(', 5), (')', 5), (',', 7), ('-', 1), ('.', 8), ('«', 4), ('»', 4)]
0.95 % of characters are greek
The rest consits of: [('"', 4), ('(', 1), (')', 1), (',', 5), ('.', 2), ('1', 1), ('4', 2), ('8', 1), ('9', 1)]
0.95 % of characters are greek
The rest consits of: [("'", 1), ('(', 1), (')', 1), (',', 2), ('.', 4), ('1', 2), ('2', 2), ('3', 3), ('6', 1), ('7', 1), (':', 1), ('c', 1), ('e', 1), ('i', 1), ('l', 1), ('o', 1), ('r', 1), ('s', 1), ('t', 1), ('·', 1), ('́', 1)]
0.98 % of characters are greek
The rest consits of: [('(', 1), (')', 1), ('

# Chinese and Korean
Chinese, Korean and Japanese symbols and radicals are grouped under the term CJK characters.
The following Unicode blocks are exclusive to Chinese characters:
- CJK Radicals Supplement (2E80–2EFF)
- Kangxi Radicals (2F00–2FDF)
- Bopomofo (3100–312F)
- Bopomofo Extended (31A0–31BF)
- CJK Unified Ideographs Extension A (3400–4DBF)
- CJK Unified Ideographs (4E00–9FFF)

The following Unicode block is exclusive to Korean characters:
- Hangul Compatibility Jamo (3130–318F)

In [None]:
# Share of characters per Chinese training text that belong to the Unicode
# blocks which the markdown section above lists for Chinese characters.
chinese_texts = text_train['text'][targets_train['lang'] == 'zh']
chinese_block = get_block(
    '2E80-2EFF',  # CJK Radicals Supplement
    '2F00-2FDF',  # Kangxi Radicals
    '3100-312F',  # Bopomofo
    '31A0-31BF',  # Bopomofo Extended
    '3400-4DBF',  # CJK Unified Ideographs Extension A
    '4E00-9FFF',  # CJK Unified Ideographs
)
# Converted once up front; np.isin would otherwise convert the long list for every text.
chinese_codes = np.asarray(chinese_block)

for chinese_text in chinese_texts:
    if len(chinese_text) == 0:
        # Nothing to measure; also avoids a division by zero below.
        continue
    encoded = np.array([ord(c) for c in chinese_text])
    in_block = np.isin(encoded, chinese_codes)  # membership mask, computed once per text
    # `freq` is a fraction in [0, 1]; the `%` format type below renders it as a percentage.
    freq = np.sum(in_block) / len(encoded)
    # Everything outside the Chinese blocks: spaces, punctuation, digits, Latin letters, ...
    other_chars = np.array([chr(c) for c in encoded[~in_block]])
    rest_chars, rest_counts = np.unique(other_chars, return_counts=True)
    rest = [(str(c), int(n)) for c, n in zip(rest_chars, rest_counts)]
    print(f"{freq:.0%} of characters are chinese\nThe rest consists of: {rest}")