In [1]:
import re
import subprocess
import shutil
import json
from pathlib import Path

In [2]:
def convert_wav(
    src_path: Path,
    dst_path: Path,
    sample_rate: int = 16000,
    channels: int = 1,
    bits: int = 16,
) -> None:
    """Convert a WAV file to signed-integer PCM WAV with the ``sox`` CLI.

    Parent directories of ``dst_path`` are created when they are missing.

    Args:
        src_path: Input file; sox is told to read it as WAV.
        dst_path: Output WAV file to write.
        sample_rate: Output sample rate in Hz.
        channels: Number of output channels.
        bits: Output bits per sample. Passed explicitly because sox otherwise
            carries the input's sample size over to the output.

    Raises:
        subprocess.CalledProcessError: If sox exits with a non-zero status.
    """
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    command = [
        "sox",
        # Input side: force the WAV reader for the source file.
        "-t", "wav", str(src_path),
        # Output side: format options have to precede the output file name.
        "-r", str(sample_rate),
        "-e", "signed-integer",
        "-b", str(bits),
        "-c", str(channels),
        "-t", "wav", str(dst_path),
    ]
    subprocess.check_call(command)

In [3]:
def clean_string(s: str) -> str:
    """Normalize a transcript line.

    Surrounding whitespace is trimmed first, the text is lowercased, and then
    every ``.``, ``,``, ``?``, ``'`` and ``:`` character is removed.
    """
    normalized = s.strip().lower()
    punctuation = re.compile(r"[.,?':]")
    return punctuation.sub("", normalized)

---

In [4]:
# Maps a file id ("<dataset>/.../<utterance>") to its cleaned transcription text.
# The dataset cells fill it in; the export cells read from it.
transcriptions = {}

## LapsBM

In [5]:
# Collect the LapsBM transcripts and convert each recording into wav/lapsbm.
# Layout read here: data/lapsbm/<speaker>/<utterance>.wav with a sibling
# <utterance>.txt that holds the transcript.
dataset_dir = Path("wav/lapsbm")
for speaker_dir in Path("data/lapsbm").glob("*"):
    speaker_id = speaker_dir.name
    for wav_file in speaker_dir.glob("*.wav"):
        utterance_id = wav_file.stem
        file_id = f"lapsbm/{speaker_id}/{utterance_id}"

        # The transcript shares the recording's stem.
        text_file = speaker_dir / f"{utterance_id}.txt"
        raw_text = text_file.read_text()
        transcriptions[file_id] = clean_string(raw_text)

        dst_path = dataset_dir / speaker_id / f"{utterance_id}.wav"
        if dst_path.exists():
            # Output already present; skip the conversion.
            continue
        convert_wav(wav_file, dst_path)

In [6]:
# Number of transcriptions collected so far.
len(transcriptions)

700

## Sid

In [7]:
# Collect the Sid transcripts and convert each recording into wav/sid.
# Every speaker directory is expected to hold a prompts.txt of "<number>=<text>"
# lines; the matching recording is "<speaker><number zero-padded to 3 digits>.wav".
dataset_dir = Path("wav/sid")
for speaker_dir in Path("data/sid").glob("*"):
    if not speaker_dir.is_dir():
        # A stray file beside the speaker folders cannot contain prompts.txt.
        continue
    speaker_id = speaker_dir.name
    with open(speaker_dir / "prompts.txt", "r") as prompts_file:
        for line in prompts_file:
            line = line.strip()
            if not line:
                # An empty line has no "=" and would break the unpacking below.
                continue
            prompt_num, prompt = line.split("=", maxsplit=1)
            prompt_num = int(prompt_num)
            utterance_id = "{0}{1:03d}".format(speaker_id, prompt_num)
            file_id = f"sid/{speaker_id}/{utterance_id}"
            src_path = speaker_dir / f"{utterance_id}.wav"
            if not src_path.exists():
                # Prompt without a recording: leave it out of the corpus.
                continue
            transcriptions[file_id] = clean_string(prompt)
            dst_path = dataset_dir / speaker_id / f"{utterance_id}.wav"
            if not dst_path.exists():
                convert_wav(src_path, dst_path)

In [8]:
# Number of transcriptions collected so far.
len(transcriptions)

5793

## Voxforge pt-br

In [9]:
# Collect the Voxforge pt-br transcripts and convert each recording into
# wav/voxforge-ptbr. Two layouts are handled per speaker directory: PROMPTS and
# the .wav files directly inside it, or under its "etc" and "wav" sub-directories.
dataset_dir = Path("wav/voxforge-ptbr")
for speaker_dir in Path("data/voxforge-ptbr").glob("*"):
    if not speaker_dir.is_dir():
        # A stray file beside the speaker folders has no PROMPTS file inside.
        continue
    speaker_id = speaker_dir.name
    prompts_path = speaker_dir / "PROMPTS"
    if not prompts_path.exists():
        prompts_path = speaker_dir / "etc" / "PROMPTS"
    with open(prompts_path, "r") as prompts_file:
        for line in prompts_file:
            line = line.strip()
            if not line:
                # An empty line has no separator and would break the unpacking below.
                continue
            prompt_id, prompt = line.split(" ", maxsplit=1)
            # Only the last "/"-separated component of the id names the recording.
            prompt_num = prompt_id.split("/")[-1]
            file_id = f"voxforge-ptbr/{speaker_id}/{prompt_num}"
            src_path = speaker_dir / f"{prompt_num}.wav"
            if not src_path.exists():
                src_path = speaker_dir / "wav" / f"{prompt_num}.wav"
            if not src_path.exists():
                # Prompt without a recording in either location: skip it.
                continue
            transcriptions[file_id] = clean_string(prompt)
            dst_path = dataset_dir / speaker_id / f"{prompt_num}.wav"
            if not dst_path.exists():
                convert_wav(src_path, dst_path)

In [10]:
# Number of transcriptions collected so far.
len(transcriptions)

9923

## Test

In [43]:
# Collect the test utterances from the flat data/test directory
# (<utterance>.wav next to <utterance>.txt) and convert them into wav/test.
# Unlike the dataset cells above, dataset_dir is the *source* directory here.
dataset_dir = Path("data/test")
for wav_file in dataset_dir.glob("*.wav"):
    utterance_id = wav_file.stem
    file_id = f"test/{utterance_id}"

    text_file = dataset_dir / f"{utterance_id}.txt"
    raw_text = text_file.read_text()
    transcriptions[file_id] = clean_string(raw_text)

    dst_path = Path("wav") / "test" / f"{utterance_id}.wav"
    if dst_path.exists():
        # Output already present; skip the conversion.
        continue
    convert_wav(wav_file, dst_path)

In [41]:
# Number of transcriptions collected so far.
len(transcriptions)

9933

## Sphinx

In [39]:
# Write every collected file id, one per line, in sorted order.
with open("etc/pt-synesthesiam.fileids", "w") as fileids_file:
    for key in sorted(transcriptions.keys()):
        fileids_file.write(f"{key}\n")

In [40]:
# Write one "<s> text </s> (file_id)" line per utterance, ordered by file id.
with open("etc/pt-synesthesiam.transcription", "w") as trans_file:
    # Keys are unique, so sorting the (key, text) pairs orders them by key alone.
    for key, t in sorted(transcriptions.items()):
        print(f"<s> {t} </s>", f"({key})", file=trans_file)

## Missing Words

In [24]:
# Load the full pronunciation dictionary: the first whitespace-separated token
# of each line is the word, the remainder of the line is its phoneme string.
known_words = set()
known_dict = {}
with open("etc/pt-synesthesiam.dic.full", "r") as dict_file:
    for line in map(str.strip, dict_file):
        # Skip empty lines and any entry containing "(".
        # NOTE(review): presumably alternate pronunciations such as "word(2)" — confirm.
        if not line or "(" in line:
            continue

        word, phonemes = re.split(r"\s+", line, maxsplit=1)
        known_dict[word] = phonemes
        known_words.add(word)

In [25]:
# Number of distinct words loaded from the full dictionary.
len(known_words)

30710

In [26]:
# Vocabulary of the corpus: every non-empty whitespace-separated token that
# appears in any transcription.
trans_words = set()
for sentence in transcriptions.values():
    words = list(filter(None, re.split(r"\s+", sentence)))
    trans_words.update(words)

In [27]:
# Words used in the transcriptions that the full dictionary has no entry for.
unknown_words = trans_words.difference(known_words)
len(unknown_words)

0

In [28]:
# Display the words that still lack a dictionary entry.
unknown_words

set()

In [95]:
# Save the words without a dictionary entry, one per line.
with open("etc/unknown_words.txt", "w") as unknown_file:
    for word in unknown_words:
        unknown_file.write(f"{word}\n")

In [48]:
# Start the working dictionary from a fresh copy of the original one.
shutil.copy("etc/pt-synesthesiam.dic.original", "etc/pt-synesthesiam.dic")

'etc/pt-synesthesiam.dic'

In [50]:
# Append one line per word in etc/guess.json to the working dictionary, using
# the first element of that word's entry as the pronunciation.
with open("etc/pt-synesthesiam.dic", "a") as dict_file, \
        open("etc/guess.json", "r") as guess_file:
    guesses = json.load(guess_file)
    for word, prons in guesses.items():
        print(word, prons[0], file=dict_file)

## Language Model

In [29]:
# Dump every transcription as one line of language-model training text, each
# line ending in ".".
# NOTE(review): the "." is appended without a space, so a whitespace tokenizer
# would see the final "word." as a token of its own — confirm this is intended.
with open("etc/corpus.txt", "w") as corpus_file:
    for sentence in transcriptions.values():
        corpus_file.write(sentence + ".\n")

In [30]:
# Build the symbol table etc/corpus.syms from the corpus text.
!ngramsymbols etc/corpus.txt etc/corpus.syms

In [31]:
# Compile the corpus lines into etc/corpus.far using the symbols in etc/corpus.syms.
!farcompilestrings -keep_symbols=1 -symbols=etc/corpus.syms etc/corpus.txt etc/corpus.far

In [32]:
# Count n-grams of order 3 from the compiled corpus into etc/corpus.cnts.
!ngramcount -order=3 etc/corpus.far etc/corpus.cnts

In [33]:
# Build the n-gram model etc/corpus.mod from the counts (tool defaults are used).
!ngrammake etc/corpus.cnts etc/corpus.mod

In [34]:
# Export the model as an ARPA-format language model file.
!ngramprint --ARPA etc/corpus.mod etc/pt-synesthesiam.lm

In [35]:
# Convert the ARPA language model to etc/pt-synesthesiam.lm.DMP.
# NOTE(review): no -ofmt is given, so the output format is left to the tool — confirm.
!sphinx_lm_convert -i etc/pt-synesthesiam.lm -o etc/pt-synesthesiam.lm.DMP

Current configuration:
[NAME]		[DEFLT]	[VALUE]
-case			
-help		no	no
-i			etc/pt-synesthesiam.lm
-ifmt			
-logbase	1.0001	1.000100e+00
-mmap		no	no
-o			etc/pt-synesthesiam.lm.DMP
-ofmt			

INFO: ngram_model_trie.c(354): Trying to read LM in trie binary format
INFO: ngram_model_trie.c(365): Header doesn't match
INFO: ngram_model_trie.c(177): Trying to read LM in arpa format
INFO: ngram_model_trie.c(193): LM of order 3
INFO: ngram_model_trie.c(195): #1-grams: 7611
INFO: ngram_model_trie.c(195): #2-grams: 21052
INFO: ngram_model_trie.c(195): #3-grams: 25744
INFO: lm_trie.c(474): Training quantizer
INFO: lm_trie.c(482): Building LM trie


## Dictionary

In [36]:
# Write a pronunciation dictionary restricted to the words that occur in the
# transcriptions, taking each pronunciation from known_dict.
# NOTE(review): this truncates the same etc/pt-synesthesiam.dic that the
# copy/append cells under "Missing Words" write — confirm which one is final.
missing_words = sorted(w for w in trans_words if w not in known_dict)
if missing_words:
    # Fail before opening the file so an existing dictionary is not replaced
    # by a partially written one.
    raise KeyError(f"no pronunciation for: {', '.join(missing_words)}")
with open("etc/pt-synesthesiam.dic", "w") as dict_file:
    # Sorted so the file comes out identical on every run; iterating the set
    # directly gives no stable order.
    for word in sorted(trans_words):
        print(word, known_dict[word], file=dict_file)

## Transcriptions

In [44]:
# Split the transcriptions by file id: ids starting with "test/" go to the test
# transcription file (and their ids to the test file list); all others go to the
# train transcription file.
# NOTE(review): no matching train file-id list is written here — confirm the
# training setup gets one from elsewhere.
with open("etc/pt-synesthesiam_train.transcription", "w") as train_file, \
        open("etc/pt-synesthesiam_test.transcription", "w") as test_file, \
        open("etc/pt-synesthesiam_test.fileids", "w") as test_ids_file:
    for key in sorted(transcriptions):
        sentence = transcriptions[key]
        is_test = key.startswith("test/")
        target_file = test_file if is_test else train_file
        print("<s>", sentence, "</s>", f"({key})", file=target_file)
        if is_test:
            print(key, file=test_ids_file)