## Data Pre-processing

In [6]:
import re
from tqdm import tqdm
import gzip
import os
import regex
import numpy as np
#import epitran

### Prepare language files

In [51]:
def preparedata(lan, repeats, lines=100, train=0, transl=0):
    if transl:
        with gzip.open(f'./data/translit/{lan}_part_1_diacritics.txt.gz', 'rb') as f:
            text = f.readlines()
    else:
        with gzip.open(f'./OSCAR-2109/packaged/{lan}/{lan}_part_1.txt.gz', 'rb') as f:
                text = f.readlines()
                
    type = 'test'
    for repeat in range(repeats):
        if repeat > repeats-train-1:
            type = 'train'
        if transl:
            file = open(f'./data/{lan}_transl_{repeat}.{type}', 'w')
        else:    
            file = open(f'./data/{lan}_{repeat}.{type}', 'w')
        i = 0
        skip = False
        for line in (text):
            if skip:
                break
            line = line.decode('utf-8')
            line = regex.sub(r'-[\p{L}_]', '', line)
            line = line[:-3].lower().replace(' ', '\x5f')
            line = [line[i:i+10] for i in range(0, len(line), 10)]
            for string in line:
                if len(string) == 10:
                    i += 1
                    if i < lines*(repeat):
                        continue
                    if i > lines*(repeat+1):
                        skip = True
                        break
                    file.write(f'{string}\n')

In [48]:
languages = ['ar', 'de', 'el', 'en', 'fr', 'hi', 'la', 'pl', 'ru', 'sw', 'zh']
tlanguages = ['ar', 'el', 'hi', 'ru', 'zh']

#all languages
for lan in tqdm(languages):
    preparedata(lan, 3)

#only transliterated
for lan in tqdm(tlanguages):
    preparedata(lan, 3, 0, 1)

#only Esperanto test
preparedata('eo', 3) #STILL NEED TO FIX THIS, MORE DATA FOR TRAIN
#only Esperanto train
preparedata('eo', 6, 3500, 6)


  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:08<01:25,  8.51s/it][A
 18%|█▊        | 2/11 [00:20<01:25,  9.52s/it][A
 27%|██▋       | 3/11 [00:28<01:12,  9.10s/it][A
 36%|███▋      | 4/11 [00:38<01:06,  9.46s/it][A
 45%|████▌     | 5/11 [00:49<00:58,  9.83s/it][A
 55%|█████▍    | 6/11 [00:56<00:44,  8.88s/it][A
 64%|██████▎   | 7/11 [00:56<00:25,  6.26s/it][A
 73%|███████▎  | 8/11 [01:07<00:23,  7.86s/it][A
 82%|████████▏ | 9/11 [01:16<00:16,  8.12s/it][A
 91%|█████████ | 10/11 [01:16<00:05,  5.73s/it][A
100%|██████████| 11/11 [01:24<00:00,  7.64s/it][A

100%|██████████| 5/5 [00:00<00:00, 319.23it/s]


### Make alphabets

In [54]:
languages = ['ar', 'de', 'el', 'en', 'eo', 'fr', 'hi', 'la', 'pl', 'ru', 'sw'] #no chinese
transl = ['ar', 'el', 'hi', 'ru']

def makealphabet1(lan_set, transl=0): #lan_set: transl or not, if transl put transl=1
    gabet=np.array([])
    
    for lan in tqdm(lan_set):
        abet = np.array([])

        for r in range(3):
            if transl:
                with open(f'./data/{lan}_transl_{r}.test', 'r') as f:
                    text = f.readlines()
                    
            else:
                with open(f'./data/{lan}_{r}.test', 'r') as f:
                    text = f.readlines()

            with open(f'./data/eo_{r}.train', 'r') as f:
                eo = f.readlines()

            for line in text:
                chars = list(line)
                abet = np.append(abet, chars)
                abet = np.unique(abet)

            for line in eo:
                chars = list(line)
                abet = np.append(abet, chars)
                abet = np.unique(abet)

        gabet=np.append(gabet, abet)
        
        if transl:
            with open(f'./alphabets/alphabet_{lan}_transl.txt', 'w') as f:
                for char in abet:
                    f.write(char)
        else:
            with open(f'./alphabets/alphabet_{lan}.txt', 'w') as f:
                for char in abet:
                    f.write(char)
        
    return gabet

In [55]:
gabet = makealphabet1(languages) #literal
tgabet = makealphabet1(transl, 1) #transliteral
for ch in tgabet:
    gabet = np.append(gabet, ch) #combined (global)


  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:00<00:02,  3.78it/s][A
 18%|█▊        | 2/11 [00:00<00:02,  3.70it/s][A
 27%|██▋       | 3/11 [00:00<00:02,  3.56it/s][A
 36%|███▋      | 4/11 [00:01<00:01,  3.73it/s][A
 45%|████▌     | 5/11 [00:01<00:01,  3.89it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  4.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:01,  3.72it/s][A
 73%|███████▎  | 8/11 [00:02<00:00,  3.82it/s][A
 82%|████████▏ | 9/11 [00:02<00:00,  3.66it/s][A
 91%|█████████ | 10/11 [00:02<00:00,  3.78it/s][A
100%|██████████| 11/11 [00:02<00:00,  3.83it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:00,  3.63it/s][A
 50%|█████     | 2/4 [00:00<00:00,  3.45it/s][A
 75%|███████▌  | 3/4 [00:00<00:00,  3.72it/s][A
100%|██████████| 4/4 [00:01<00:00,  3.81it/s][A


In [57]:
#writing global aplphabets
c=0
c1=0

#literal
with open('./alphabets/alphabet.txt', 'w') as f:
    for char in np.unique(gabet):
        c+=1
        f.write(char)

#transliteral
with open('./alphabets/talphabet.txt', 'w') as f:
    for char in np.unique(tgabet):
        c1+=1
        f.write(char)
        
print('Global alphabet length:', c)
print('Global (transliterated) alphabet length:', c1)

Global alphabet length: 330
Global (transliterated) alphabet length: 163


### Convert to phonetics (IPA)