## Data Pre-processing

In [1]:
import re
from tqdm import tqdm
import gzip
import os
import regex
import numpy as np

#### Preparing language files

In [45]:
def _iter_chunks(raw_lines, chunk_len=10):
    """Yield every full ``chunk_len``-character chunk of the cleaned lines, in order."""
    for raw in raw_lines:
        line = raw.decode('utf-8')
        # Remove every hyphen together with the letter/underscore that follows it.
        line = regex.sub(r'-[\p{L}_]', '', line)
        # Drop the last three characters of the line (kept from the original;
        # NOTE(review): presumably trailing punctuation/newline - confirm),
        # lower-case it and turn spaces into underscores.
        line = line[:-3].lower().replace(' ', '\x5f')
        for start in range(0, len(line), chunk_len):
            piece = line[start:start + chunk_len]
            # The trailing, shorter remainder of a line is discarded.
            if len(piece) == chunk_len:
                yield piece


def preparedata(lan, repeats, train=0, transl=0, chunk_len=10, chunks_per_file=100):
    """Cut a gzipped corpus into fixed-length chunks and write them to split files.

    The corpus is read from ``./data/translit/{lan}_part_1.txt.gz`` when
    ``transl`` is truthy, otherwise from
    ``./OSCAR-2109/packaged/{lan}/{lan}_part_1.txt.gz``.  Each output file
    ``./data/{lan}[_transl]_{repeat}.{test|train}`` receives the next
    ``chunks_per_file`` chunks of the stream, one chunk per line.

    Compared with the previous implementation:

    * files are closed deterministically (``with``),
    * consecutive files no longer overlap: the old window test wrote chunk
      number ``100 * repeat`` both as the last line of file ``repeat - 1`` and
      as the first line of file ``repeat`` (so later files had 101 lines and a
      chunk leaked from the last test file into the first train file),
    * the archive is streamed once instead of being loaded completely and
      re-decoded from the beginning for every repeat.

    Parameters
    ----------
    lan : str
        Language code used in the input and output paths.
    repeats : int
        Number of output files to write (indices ``0 .. repeats - 1``).
    train : int, optional
        The last ``train`` files get the ``.train`` extension; all earlier
        ones get ``.test``.
    transl : int or bool, optional
        Truthy to read the transliterated corpus and add ``_transl`` to the
        output names.
    chunk_len : int, optional
        Number of characters per chunk (default 10).
    chunks_per_file : int, optional
        Number of chunks written to each output file (default 100).

    Returns
    -------
    None
    """
    from itertools import islice

    if transl:
        source = f'./data/translit/{lan}_part_1.txt.gz'
    else:
        source = f'./OSCAR-2109/packaged/{lan}/{lan}_part_1.txt.gz'

    with gzip.open(source, 'rb') as corpus:
        # A single generator shared by all repeats: every file continues
        # exactly where the previous one stopped.
        chunks = _iter_chunks(corpus, chunk_len)

        for repeat in range(repeats):
            split = 'train' if repeat > repeats - train - 1 else 'test'
            if transl:
                target = f'./data/{lan}_transl_{repeat}.{split}'
            else:
                target = f'./data/{lan}_{repeat}.{split}'

            # The file is created even when the corpus is exhausted, as before.
            with open(target, 'w') as out:
                for piece in islice(chunks, chunks_per_file):
                    out.write(f'{piece}\n')

In [49]:
languages = ['ar', 'de', 'el', 'en', 'fr', 'hi', 'la', 'pl', 'ru', 'sw', 'zh']
tlanguages = ['ar', 'el', 'hi', 'ru', 'zh']

# All languages: three test files each, read from the OSCAR corpus.
for lan in tqdm(languages):
    preparedata(lan, 3)

# Transliterated versions only: three test files each.
for lan in tqdm(tlanguages):
    preparedata(lan, 3, train=0, transl=1)

# Esperanto only: six files, the last three of which are train files.
# preparedata returns None, so it must not be wrapped in tqdm (that only
# produced an empty "0it" progress bar).
preparedata('eo', 6, train=3)







  0%|          | 0/11 [00:00<?, ?it/s][A[A[A[A[A[A





  9%|▉         | 1/11 [00:08<01:28,  8.87s/it][A[A[A[A[A[A





 18%|█▊        | 2/11 [00:19<01:23,  9.29s/it][A[A[A[A[A[A





 27%|██▋       | 3/11 [00:27<01:10,  8.87s/it][A[A[A[A[A[A





 36%|███▋      | 4/11 [00:37<01:05,  9.30s/it][A[A[A[A[A[A





 45%|████▌     | 5/11 [00:50<01:03, 10.57s/it][A[A[A[A[A[A





 55%|█████▍    | 6/11 [00:57<00:47,  9.43s/it][A[A[A[A[A[A





 64%|██████▎   | 7/11 [00:57<00:26,  6.65s/it][A[A[A[A[A[A





 73%|███████▎  | 8/11 [01:10<00:24,  8.31s/it][A[A[A[A[A[A





 82%|████████▏ | 9/11 [01:19<00:17,  8.64s/it][A[A[A[A[A[A





 91%|█████████ | 10/11 [01:19<00:06,  6.09s/it][A[A[A[A[A[A





100%|██████████| 11/11 [01:26<00:00,  7.89s/it][A[A[A[A[A[A






100%|██████████| 5/5 [00:00<00:00, 351.49it/s][A[A[A






0it [00:00, ?it/s][A[A[A[A[A[A

0it [00:00, ?it/s]

#### Make alphabets

In [50]:
def makealphabet():
    """Re-wrap the UTF-8 sequence dump into lines of at most 10 characters.

    Reads ``./utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt``
    as bytes, decodes each line as UTF-8, cuts it into consecutive slices of
    10 characters (the last slice of a line may be shorter and still contains
    the line's own newline) and writes every slice followed by ``'\\n'`` to
    ``./alphabet.txt``.

    Both files are managed by a ``with`` statement so the output is flushed
    and closed deterministically, and the input is streamed line by line
    instead of being loaded completely.

    Returns
    -------
    None
    """
    source = './utf8_sequence_0-0x10ffff_assigned_including-unprintable-asis.txt'

    # The input is opened first, so a missing source file does not leave an
    # empty ./alphabet.txt behind.
    with open(source, 'rb') as src, open('./alphabet.txt', 'w') as writefile:
        for raw in src:
            line = raw.decode('utf-8')
            for start in range(0, len(line), 10):
                writefile.write(f'{line[start:start + 10]}\n')

# Build ./alphabet.txt from the UTF-8 sequence dump.
# NOTE(review): a later cell opens ./alphabet.txt in 'w' mode again, so this
# output appears to be overwritten there - confirm it is still needed.
makealphabet()

In [51]:
# Language codes passed to makealphabet1 for the non-transliterated alphabets.
# This rebinds the `languages` name used by the data-preparation cell:
# 'eo' is added here and 'zh' (Chinese) is left out.
languages = ['ar', 'de', 'el', 'en', 'eo', 'fr', 'hi', 'la', 'pl', 'ru', 'sw'] #no chinese
# Language codes passed to makealphabet1 for the transliterated alphabets.
# NOTE(review): makealphabet1 has a parameter that is also named `transl`,
# which hides this list inside that function.
transl = ['ar', 'el', 'hi', 'ru']

def makealphabet1(lan_set, transl=0): #lan_set: transl or not, if transl put transl=1
    """Write one alphabet file per language and return all alphabets concatenated.

    For every language in ``lan_set`` the characters (newlines included) of
    ``./data/{lan}[_transl]_{r}.test`` and of ``./data/eo_{r}.train`` for
    ``r`` in 0..2 are collected.  The sorted, de-duplicated characters are
    written without separators to ``./alphabets/alphabet_{lan}[_transl].txt``.

    Characters are gathered in a ``set`` and sorted once per language; the
    previous version called ``np.append`` + ``np.unique`` for every single
    line, which re-sorted the growing array each time.  The resulting
    alphabets are the same.

    NOTE(review): ``preparedata('eo', 6, 3)`` names its train files
    ``eo_3.train`` .. ``eo_5.train``, so ``eo_0.train`` .. ``eo_2.train`` read
    here have to come from somewhere else - confirm on a fresh run.

    Parameters
    ----------
    lan_set : iterable of str
        Language codes to process.
    transl : int or bool, optional
        Truthy to read the ``_transl`` test files and write ``_transl``
        alphabet files.

    Returns
    -------
    numpy.ndarray
        The per-language alphabets appended one after another, in the order
        of ``lan_set``.  A character shared by several languages appears once
        per language, so callers de-duplicate with ``np.unique``.
    """
    def _chars_in(path):
        # Set of all characters in the file, line terminators included.
        with open(path, 'r') as f:
            return set(f.read())

    suffix = '_transl' if transl else ''
    gabet = np.array([])

    for lan in tqdm(lan_set):
        chars = set()
        for r in range(3):
            chars |= _chars_in(f'./data/{lan}{suffix}_{r}.test')
            chars |= _chars_in(f'./data/eo_{r}.train')

        # Sorted by code point, the same order np.unique produced before.
        abet = np.array(sorted(chars))
        gabet = np.append(gabet, abet)

        with open(f'./alphabets/alphabet_{lan}{suffix}.txt', 'w') as f:
            f.write(''.join(abet))

    return gabet

In [54]:
# Alphabets of the original-script data, one run per language.
gabet = makealphabet1(languages)
# Alphabets of the transliterated data.
tgabet = makealphabet1(transl, 1)
# Combined (global) alphabet: append all transliterated characters in one call
# instead of growing the array one element at a time.
gabet = np.append(gabet, tgabet)









  0%|          | 0/11 [00:00<?, ?it/s][A[A[A[A[A[A[A[A







100%|██████████| 11/11 [00:00<00:00, 57.04it/s][A[A[A[A[A[A[A[A








100%|██████████| 4/4 [00:00<00:00, 71.01it/s]A[A[A[A[A[A


In [55]:
# Write the global alphabets to disk and report how many characters each has.
global_chars = np.unique(gabet)
translit_chars = np.unique(tgabet)

# Combined alphabet (original-script plus transliterated characters).
with open('./alphabet.txt', 'w') as f:
    f.writelines(global_chars)
c = len(global_chars)

# Alphabet of the transliterated data only.
with open('./talphabet.txt', 'w') as f:
    f.writelines(translit_chars)
c1 = len(translit_chars)

print('Global alphabet length:', c)
print('Global (transliterated) alphabet length:', c1)

Global alphabet length: 260
Global (transliterated) alphabet length: 79
