## Data Pre-processing

In [1]:
!pip install epitran
#You furthermore need to install Flite following the epitran documentation 
#chapterInstallation of Flite (for English G2P)at https://pypi.org/project/epitran/
#if you want to use english
#
#downloadlink: http://tts.speech.cs.cmu.edu/awb/flite-2.0.5-current.tar.bz2



In [2]:
import re
from tqdm import tqdm
import gzip
import os
import regex
import numpy as np
import epitran as epi

### Convert texts to IPA

In [8]:
def filter_lines(text):
    print(f'Text length before: {len(text)}')
    filtered_text = [l for l in text if l not in ["\n", "\r\n", "\r"]]
    print(f'Text length after: {len(filtered_text)}')
    return filtered_text

In [15]:
def getLangNotSupportedByEpitran(lan, repeats, lines=100, train=0, offset=0):
    with gzip.open(f'./OSCAR-2109/packaged/{lan}/{lan}_part_1.txt.gz', 'rt', encoding="utf-8") as f:
        text = filter_lines(f.readlines())    
    typ = 'test'
    for repeat in range(repeats):
        if repeat > repeats-train-1:
            typ = 'train'
        with open(f'./data/{lan}_raw_{repeat}.{typ}', 'w') as f:
            f.writelines(text[offset+lines*repeat : offset+lines*(repeat+1)])

In [16]:
getLangNotSupportedByEpitran("eo", 3, 10)
#train data should be split from test data therefore offset 3*100
getLangNotSupportedByEpitran("eo", 6, 100, 6, 300)
#translated with https://www.internationalphoneticalphabet.org/esperanto-to-ipa-translator/
getLangNotSupportedByEpitran("el", 6), 10
#translated with 
getLangNotSupportedByEpitran("la", 6, 10)
#translated with 

Text length before: 1137969
Text length after: 1017290
Text length before: 1137969
Text length after: 1017290
Text length before: 2120779
Text length after: 1887544
Text length before: 67295
Text length after: 41856


In [30]:
from epitran.backoff import Backoff
de = epi.Epitran("deu-Latn")
de2 = epi.Epitran("deu-Latn-np")
de3 = epi.Epitran("deu-Latn-nar")
de4 = Backoff(["deu-Latn", "eng-Latn"])
# flite needs to be installed for english
en = epi.Epitran("eng-Latn")
pl = epi.Epitran("pol-Latn")
pl2 = Backoff(["pol-Latn", "eng-Latn"])
hi = epi.Epitran("hin-Deva")
hi2 = Backoff(["hin-Deva", "eng-Latn"])
fr = epi.Epitran("fra-Latn")
fr2 = epi.Epitran("fra-Latn-np")
ru = epi.Epitran("rus-Cyrl")
sw = epi.Epitran("swa-Latn")
sw2 = epi.Epitran("swa-Latn-red")
# cedict_file="cedict_1_0_ts_utf-8_mdbg.txt" needed for chinese
zh = epi.Epitran("cmn-Hant", cedict_file="cedict_1_0_ts_utf-8_mdbg.txt")
zh2 = epi.Epitran("cmn-Hans", cedict_file="cedict_1_0_ts_utf-8_mdbg.txt")
ar = epi.Epitran("ara-Arab")
   
def getIPATexts(lan, epit, variant, repeats, lines=100, train=0, punc=False, lig=False, offset=0):
    '''
    Args:
        epit(Epitran Object): has to fit the language selected
        variant(string): should correspond to epitran model
        punc(bool): toggles normpunc of transliterate function, enables punctuation normalization
        lig(bool): toggles ligatures of transliterate function, enables non-standard IPA ligatures
    '''
    with gzip.open(f'./OSCAR-2109/packaged/{lan}/{lan}_part_1.txt.gz', 'rt', encoding="utf-8") as f:
        text = filter_lines(f.readlines())
    
    pu = li = ""
    if punc:
        pu="p" 
    if lig:
        li="l"   
    typ = 'test'
    
    for repeat in range(repeats):
        if repeat > repeats-train-1:
            typ = 'train'
        with open(f'./data/{lan}_{repeat}_ipa_{variant}{pu}{li}.{typ}', 'w') as f:
            i = 0
            for line in (text[offset+lines*repeat : offset+lines*(repeat+1)]):
                i += 1
                #print(line)
                #line = line.decode('utf-8')
                #line = line.encode('utf-8')
                l = ""
                nl = "\n"
                for word in line:
                    if isinstance(epit, Backoff): 
                        l = l + epit.transliterate(word)
                    else:
                        l = l + epit.transliterate(word, normpunc=punc, ligatures=lig)
                f.writelines([l, nl])

In [31]:
languages = ['ar', 'de', 'en', 'fr', 'hi', 'pl', 'ru', 'sw', 'zh']
variants = [[ar], [de, de2, de3, de4], [en], [fr, fr2], [hi, hi2], [pl, pl2], [ru], [sw, sw2], [zh, zh2]]
for l, lan in enumerate(languages):
    for v in range(len(variants[l])):
        if v == 0:
            for lig in [True, False]:
                getIPATexts(lan, variants[l][v], v, 3, 10, lig=lig)
        else:
            getIPATexts(lan, variants[l][v], v, 3, 10)

Text length before: 2314610
Text length after: 2055182
Text length before: 2314610
Text length after: 2055182
Text length before: 3520868
Text length after: 3144435
Text length before: 3520868
Text length after: 3144435
Text length before: 3520868
Text length after: 3144435
Text length before: 3520868
Text length after: 3144435
Text length before: 3520868
Text length after: 3144435
Text length before: 3366485
Text length after: 3037083
Text length before: 3366485
Text length after: 3037083
Text length before: 3553903
Text length after: 3197087
Text length before: 3553903
Text length after: 3197087
Text length before: 3553903
Text length after: 3197087
Text length before: 1474984
Text length after: 1287035
Text length before: 1474984
Text length after: 1287035
Text length before: 1474984
Text length after: 1287035
Text length before: 3492916
Text length after: 3108816
Text length before: 3492916
Text length after: 3108816
Text length before: 3492916
Text length after: 3108816
Text lengt

In [19]:
with open(f'./data/eo_raw_0.train', 'rt', encoding='utf-8') as f:
    text = f.readlines()
    i = 0
    for line in text:
        for w in line:
            i+= 1
    print(i)

30470


### Prepare language files (chunking)

In [32]:
def preparedata(lan, repeats, lines=100, train=0, transl=0, offset=0):
    '''
    Args:
    tranls(bool): 0 orthographic literal text, 1 transliterated text
    offset(int): offset of lines train test split, should be the same as in getLangNotSupportedEpitran()
    '''    
    if transl:
        with gzip.open(f'./data/translit/{lan}_part_1_diacritics.txt.gz', 'rb') as f:
            text = filter_lines(f.readlines())
    else:
        with gzip.open(f'./OSCAR-2109/packaged/{lan}/{lan}_part_1.txt.gz', 'rb') as f:
            text = filter_lines(f.readlines())
                
    typ = 'test'
    for repeat in range(repeats):
        if repeat > repeats-train-1:
            type = 'train'
        if transl:
            file = open(f'./data/{lan}_transl_{repeat}.{typ}', 'w')
        else:    
            file = open(f'./data/{lan}_{repeat}.{typ}', 'w')
        i = 0
        skip = False
        for line in (text[offset:]):
            if skip:
                break
            line = line.decode('utf-8')
            line = regex.sub(r'-[\p{L}_]', '', line)
            line = line[:-3].lower().replace(' ', '\x5f')
            line = [line[i:i+10] for i in range(0, len(line), 10)]
            for string in line:
                if len(string) == 10:
                    i += 1
                    if i < lines*(repeat):
                        continue
                    if i > lines*(repeat+1):
                        skip = True
                        break
                    file.write(f'{string}\n')

In [35]:
def prepareipadata(lan, repeats, lines=100, typ="test"):   
    for repeat in range(repeats):
        with open(f'./data/{lan}_{repeat}_ipa_0.{typ}', 'rt', encoding='utf-8') as f:
                text = f.readlines()
        file = open(f'./data/{lan}_{repeat}.{typ}', 'w')
        i = 0
        for line in (text):
            #line = line.decode('utf-8')
            line = regex.sub(r'-[\p{L}_]', '', line)
            line = line[:-3].lower().replace(' ', '\x5f')
            line = [line[i:i+10] for i in range(0, len(line), 10)]
            for string in line:
                if len(string) == 10:
                    i += 1
                    if i < lines*(repeat):
                        continue
                    if i > lines*(repeat+1):
                        break
                    file.write(f'{string}\n')

In [36]:
languages = ['ar', 'de', 'el', 'en', 'fr', 'hi', 'la', 'pl', 'ru', 'sw', 'zh']
tlanguages = ['ar', 'el', 'hi', 'ru', 'zh']

#all languages
for lan in tqdm(languages):
    preparedata(lan, 3)
    if lan in ['el', 'la']:
        continue
    prepareipadata(lan, 3)

#only transliterated
for lan in tqdm(tlanguages):
    preparedata(lan, 3, 100, 0, 1)

#only Esperanto test
preparedata('eo', 3) 
#prepareipadata('eo', 3)
#only Esperanto train
preparedata('eo', 6, 3500, 6, offset=300)
prepareipadata('eo', 3, 3500, "train")

  0%|          | 0/11 [00:00<?, ?it/s]

Text length before: 2314610
Text length after: 2314610


  9%|▉         | 1/11 [00:04<00:41,  4.11s/it]

Text length before: 3520868
Text length after: 3520868


 18%|█▊        | 2/11 [00:09<00:42,  4.73s/it]

Text length before: 2120779
Text length after: 2120779


 27%|██▋       | 3/11 [00:13<00:35,  4.40s/it]

Text length before: 3366454
Text length after: 3366454


 36%|███▋      | 4/11 [00:18<00:32,  4.67s/it]

Text length before: 3553891
Text length after: 3553891


 45%|████▌     | 5/11 [00:23<00:28,  4.81s/it]

Text length before: 1474984
Text length after: 1474984


 55%|█████▍    | 6/11 [00:26<00:21,  4.31s/it]

Text length before: 67295
Text length after: 67295
Text length before: 3492892
Text length after: 3492892


 73%|███████▎  | 8/11 [00:32<00:10,  3.51s/it]

Text length before: 2165965
Text length after: 2165965


 82%|████████▏ | 9/11 [00:36<00:07,  3.61s/it]

Text length before: 61286
Text length after: 61286


100%|██████████| 11/11 [00:39<00:00,  3.62s/it]


Text length before: 1076683
Text length after: 1076683


100%|██████████| 5/5 [00:00<00:00, 850.19it/s]

Text length before: 10
Text length after: 10
Text length before: 50
Text length after: 50
Text length before: 18
Text length after: 18
Text length before: 50
Text length after: 50
Text length before: 25
Text length after: 25





Text length before: 1137967
Text length after: 1137967
Text length before: 1137967
Text length after: 1137967


FileNotFoundError: [Errno 2] No such file or directory: './data/eo_0_ipa_0.train'

### Make alphabets

In [54]:
languages = ['ar', 'de', 'el', 'en', 'eo', 'fr', 'hi', 'la', 'pl', 'ru', 'sw'] #no chinese
transl = ['ar', 'el', 'hi', 'ru']

def makealphabet1(lan_set, transl=0): #lan_set: transl or not, if transl put transl=1
    gabet=np.array([])
    
    for lan in tqdm(lan_set):
        abet = np.array([])

        for r in range(3):
            if transl:
                with open(f'./data/{lan}_transl_{r}.test', 'r') as f:
                    text = f.readlines()
                    
            else:
                with open(f'./data/{lan}_{r}.test', 'r') as f:
                    text = f.readlines()

            with open(f'./data/eo_{r}.train', 'r') as f:
                eo = f.readlines()

            for line in text:
                chars = list(line)
                abet = np.append(abet, chars)
                abet = np.unique(abet)

            for line in eo:
                chars = list(line)
                abet = np.append(abet, chars)
                abet = np.unique(abet)

        gabet=np.append(gabet, abet)
        
        if transl:
            with open(f'./alphabets/alphabet_{lan}_transl.txt', 'w') as f:
                for char in abet:
                    f.write(char)
        else:
            with open(f'./alphabets/alphabet_{lan}.txt', 'w') as f:
                for char in abet:
                    f.write(char)
        
    return gabet

In [55]:
gabet = makealphabet1(languages) #literal
tgabet = makealphabet1(transl, 1) #transliteral
for ch in tgabet:
    gabet = np.append(gabet, ch) #combined (global)


  0%|          | 0/11 [00:00<?, ?it/s][A
  9%|▉         | 1/11 [00:00<00:02,  3.78it/s][A
 18%|█▊        | 2/11 [00:00<00:02,  3.70it/s][A
 27%|██▋       | 3/11 [00:00<00:02,  3.56it/s][A
 36%|███▋      | 4/11 [00:01<00:01,  3.73it/s][A
 45%|████▌     | 5/11 [00:01<00:01,  3.89it/s][A
 55%|█████▍    | 6/11 [00:01<00:01,  4.01it/s][A
 64%|██████▎   | 7/11 [00:01<00:01,  3.72it/s][A
 73%|███████▎  | 8/11 [00:02<00:00,  3.82it/s][A
 82%|████████▏ | 9/11 [00:02<00:00,  3.66it/s][A
 91%|█████████ | 10/11 [00:02<00:00,  3.78it/s][A
100%|██████████| 11/11 [00:02<00:00,  3.83it/s][A

  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [00:00<00:00,  3.63it/s][A
 50%|█████     | 2/4 [00:00<00:00,  3.45it/s][A
 75%|███████▌  | 3/4 [00:00<00:00,  3.72it/s][A
100%|██████████| 4/4 [00:01<00:00,  3.81it/s][A


In [57]:
#writing global aplphabets
c=0
c1=0

#literal
with open('./alphabets/alphabet.txt', 'w') as f:
    for char in np.unique(gabet):
        c+=1
        f.write(char)

#transliteral
with open('./alphabets/talphabet.txt', 'w') as f:
    for char in np.unique(tgabet):
        c1+=1
        f.write(char)
        
print('Global alphabet length:', c)
print('Global (transliterated) alphabet length:', c1)

Global alphabet length: 330
Global (transliterated) alphabet length: 163


### Convert to phonetics (IPA)