-
Notifications
You must be signed in to change notification settings - Fork 5
/
mk-g2p-model.sh
52 lines (39 loc) · 1.56 KB
/
mk-g2p-model.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
# Building g2p model with Phonetisaurus
# Written by Ye Kyaw Thu, LST Lab., NECTEC, Thailand
# Last updated: 17 Jan 2020
# Used for ASR Winter School at CU Banmaw
# How to run: bash ./mk-g2p-model.sh <dict-file> <test-file>
# e.g. ./mk-g2p-model.sh ./train.dict ./test.my | tee run.log
# You have to prepare train.dict file and test.my in advance
# e.g. train.dict file is as follows:
#
# $ head -n 3 train.dict
# ဇေယျစန္ဒ Ze Ya Sanda
# ကောင်းပြည့်ဇော် Kaung Pyah Zaw
# ရူပါ Yu Par
#
# e.g. test.my file is as follows:
# $ tail -n 3 ./test.my
# ကေသီစန်း
# အိမ့်ပန်းချီ
# မာလာရီ
echo -e ">>> Corpus building ...\n";
phonetisaurus-align --input=$1 --ofile=./$1.corpus --seq1_del=false
echo -e ">>> Create symbol file ...\n";
ngramsymbols < ./$1.corpus > $1.syms
echo -e ">>> Preparing far file ...\n";
farcompilestrings --symbols=$1.syms --keep_symbols=1 ./$1.corpus > $1.far
echo -e ">>> running ngramcount ...\n";
ngramcount --order=8 $1.far > $1.cnts
echo ">>> running ngrammake ...";
ngrammake --v=2 --bins=3 --method=kneser_ney ./$1.cnts > $1.mod
echo -e ">>> Converting to ARPA file format ...\n";
ngramprint --ARPA ./$1.mod > $1.arpa
# FST Model Building
echo -e ">>> FST model building ...\n";
phonetisaurus-arpa2wfst --lm=$1.arpa > $1.fst
echo -e ">>> Finished FST model building ...\n";
# Testing ...
echo -e ">>> Start testing with filename=$2\n";
phonetisaurus-g2pfst --model=$1.fst --wordlist=./$2 | perl -e'while(<>){s/\|/ /g; print $_;}' > $2.out