In [1]:
import textgrid
import re
from pydub import AudioSegment
from pathlib import Path
from tqdm.notebook import tqdm
from unidecode import unidecode

In [2]:
def get_tier(textgrid: textgrid.textgrid.TextGrid, name: str) -> textgrid.textgrid.IntervalTier:
    """Look up a tier by name.

    Iterates over ``textgrid`` and returns the first item whose ``name``
    attribute equals ``name``. When nothing matches, the result is ``None``.
    """
    matching = (candidate for candidate in textgrid if candidate.name == name)
    return next(matching, None)

In [5]:
# Vowel marks to extract per language; intervals with any other mark are skipped.
est_ = ['i', 'iː', 'iːː', 'æ', 'æː', 'æːː', 'ie', 'æe', 'ea', 'e', 'eː', 'eːː']
kld_ = ['ie', 'ea', 'iː', 'aː', 'eː']
skolt_ = ['eä', 'ie', 'iä', 'iõ', 'e', 'eː', 'õ', 'õː', 'â', 'âː', 'i', 'iː']

# (source directory, output label, allowed vowel marks).
# An empty list of marks means "keep every labelled vowel" (used for './lithuanian/').
paths = [
    ('./estonian/Liisa/', 'est_liisa', est_),
    ('./estonian/Vello/', 'est_vello', est_),
    ('./kildin/', 'kld', kld_),
    ('./lithuanian/', 'lit', []),
    ('./skolt/', 'skolt', skolt_)
]

# Extra audio kept on each side of a vowel interval, in milliseconds.
PADDING_MS = 25


def _free_clip_path(out_dir, label, vowel_mark, word_mark):
    """Return the first ``<label>_<vowel>_<word>_<n>.wav`` path under ``out_dir`` that does not exist.

    ``n`` starts at 1 and is incremented until an unused file name is found,
    so repeated vowel/word pairs never overwrite an earlier clip.
    NOTE(review): because existing files are never overwritten, re-running
    this cell adds a second set of clips next to the first one.
    """
    n = 1
    while True:
        # ':' is swapped for the length mark 'ː' in the file name.
        file_name = f'{label}_{vowel_mark}_{word_mark}_{n}.wav'.replace(':', 'ː')
        candidate = out_dir / file_name
        if not candidate.exists():
            return candidate
        n += 1


for source_dir, label, allowed_vowels in tqdm(paths):
    out_dir = Path('./divided') / label / 'sounds'
    out_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so that the numbering of repeated vowel/word pairs does not depend
    # on the order in which the OS lists the directory.
    wavs = sorted(file for file in Path(source_dir).iterdir() if file.suffix == '.wav')

    for wav in wavs:
        # Each recording is expected to have a TextGrid with the same stem next to it.
        tg_path = wav.with_suffix('.TextGrid')
        audio = AudioSegment.from_wav(wav)
        tg = textgrid.TextGrid.fromFile(tg_path)
        vowels = get_tier(tg, 'vowel')
        words = get_tier(tg, 'word')
        if vowels is None or words is None:
            # get_tier returns None for an unknown tier name; fail with the
            # offending file instead of a TypeError inside zip().
            raise ValueError(f"{tg_path} has no 'vowel' and/or 'word' tier")
        if label == 'north':
            # NOTE(review): no entry in `paths` uses the label 'north', so this
            # branch is currently never taken — confirm whether it can go.
            vowels = vowels[1::2]

        # NOTE(review): vowel and word intervals are paired purely by position;
        # this assumes both tiers are segmented in parallel — confirm.
        for vowel, word in zip(vowels, words):
            if not vowel.mark:
                continue
            if allowed_vowels and vowel.mark not in allowed_vowels:
                continue

            # Interval times are scaled by 1000 for pydub's millisecond slicing.
            # The start is clamped at 0: pydub treats a negative slice start as
            # an offset from the END of the audio, which would yield a wrong
            # clip for vowels in the first PADDING_MS of a file.
            start = max(0, vowel.minTime * 1000 - PADDING_MS)
            end = vowel.maxTime * 1000 + PADDING_MS

            # The combining diacritic in the first literal is rewritten as 'ː'
            # before unidecode() runs; double quotes become two single quotes.
            word_ = unidecode(word.mark.replace('̄', 'ː')).replace('"', "''")

            target = _free_clip_path(out_dir, label, vowel.mark, word_)
            audio[start:end].export(str(target), format='wav')

  0%|          | 0/1 [00:00<?, ?it/s]