In [116]:
import sys
import os
import re
import random
import ujson

from typing import List, Dict

from collections import defaultdict, Counter
from tqdm.notebook import tqdm

from nltk.tokenize import word_tokenize

In [2]:
cmudict_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/CMUdict/cmudict-0.7b.txt'

In [19]:
entry_lines = []

with open(cmudict_path, 'r', encoding='latin-1') as f:
    for l in f:
        if len(l.strip()) > 0 and (not l.startswith(';;;')):
            entry_lines.append(l.strip())

len(entry_lines)

133854

In [23]:
entry_lines[5::10000]

['"IN-QUOTES  IH1 N K W OW1 T S',
 'BELDOCK  B EH1 L D AA2 K',
 'CHANNELED  CH AE1 N AH0 L D',
 'DEMILLE(1)  D IH0 M IH1 L',
 'EXTORTIONISTS  EH0 K S T AO1 R SH AH0 N IH0 S T S',
 'GRIPPED  G R IH1 P T',
 'INTERSPERSES  IH2 N T ER0 S P ER1 S AH0 Z',
 'LIMESTONE  L AY1 M S T OW2 N',
 'MONROEVILLE  M AA0 N R OW1 V IH2 L',
 'PENDERGAST  P EH1 N D ER0 G AE2 S T',
 'REPEATS(1)  R IY0 P IY1 T S',
 'SIELAFF  S IY0 L AE1 F',
 'THIBEDEAU  TH IH1 B IH0 D OW0',
 'WHITEAKER(1)  HH W IH1 T AH0 K ER0']

In [34]:
def strip_stress(phone: str) -> str:
    _m = re.match(r'(.*)\d+$', phone)
    if _m is not None:
        phone = _m.group(1)
    return phone

# test 
for l in entry_lines[5::10000]:
    _word, _pron = l.split('  ')
    
    if _word.endswith(')'):
        # variant
        _m = re.match(r'(.*)\((.*)\)$', _word)
        _word = _m.group(1)
        _variant = _m.group(2)
    else:
        # no variant
        _variant = None
    
    _phones = tuple([strip_stress(_phone) for _phone in _pron.split(' ')])
    
    print(_word, _variant, _phones)

"IN-QUOTES None ('IH', 'N', 'K', 'W', 'OW', 'T', 'S')
BELDOCK None ('B', 'EH', 'L', 'D', 'AA', 'K')
CHANNELED None ('CH', 'AE', 'N', 'AH', 'L', 'D')
DEMILLE 1 ('D', 'IH', 'M', 'IH', 'L')
EXTORTIONISTS None ('EH', 'K', 'S', 'T', 'AO', 'R', 'SH', 'AH', 'N', 'IH', 'S', 'T', 'S')
GRIPPED None ('G', 'R', 'IH', 'P', 'T')
INTERSPERSES None ('IH', 'N', 'T', 'ER', 'S', 'P', 'ER', 'S', 'AH', 'Z')
LIMESTONE None ('L', 'AY', 'M', 'S', 'T', 'OW', 'N')
MONROEVILLE None ('M', 'AA', 'N', 'R', 'OW', 'V', 'IH', 'L')
PENDERGAST None ('P', 'EH', 'N', 'D', 'ER', 'G', 'AE', 'S', 'T')
REPEATS 1 ('R', 'IY', 'P', 'IY', 'T', 'S')
SIELAFF None ('S', 'IY', 'L', 'AE', 'F')
THIBEDEAU None ('TH', 'IH', 'B', 'IH', 'D', 'OW')
WHITEAKER 1 ('HH', 'W', 'IH', 'T', 'AH', 'K', 'ER')


In [38]:
word2pron = defaultdict(set) # all possible prons, Dict[str, Set[Tuple[str]]]
pron2word = defaultdict(set) # all possible words, Dict[Tuple[str], Set[str]]

for l in tqdm(entry_lines):
    _word, _pron = l.split('  ')
    
    if _word.endswith(')'):
        # variant
        _m = re.match(r'(.*)\((.*)\)$', _word)
        _word = _m.group(1)
        _variant = _m.group(2)
    else:
        # no variant
        _variant = None
    
    _phones = tuple([strip_stress(_phone) for _phone in _pron.split(' ')])
    
    word2pron[_word].add(_phones)
    pron2word[_phones].add(_word)

len(word2pron), len(pron2word)

HBox(children=(IntProgress(value=0, max=133854), HTML(value='')))




(125074, 113745)

In [40]:
idx2word = list(word2pron.keys())
word2idx = {w : idx for idx, w in enumerate(idx2word)}
idx2pron = list(pron2word.keys())
pron2idx = {p : idx for idx, p in enumerate(idx2pron)}
len(word2idx), len(pron2idx)

(125074, 113745)

In [39]:
word2pron['RECORD']

{('R', 'AH', 'K', 'AO', 'R', 'D'),
 ('R', 'EH', 'K', 'ER', 'D'),
 ('R', 'IH', 'K', 'AO', 'R', 'D')}

In [42]:
word2idx['RECORD'], idx2word[91994]

(91994, 'RECORD')

In [45]:
pron2idx[('R', 'EH', 'D')], idx2pron[83079]

(83079, ('R', 'EH', 'D'))

In [61]:
# Suffixes
suffixes = [
    ['S'],
    ['IY NG'],
    ['IH NG'],
    ['D']
]

# Similar phone clusters 
clusters = [
    ['Z', 'S'],
    ['AA', 'AO', 'EY', 'UH'],
    ['AXR', 'AX'],
    ['P', 'B', 'F'],
    ['DH', 'CH', 'ZH', 'T', 'SH'],
    ['IY', 'AY', 'OW'],
    ['EH', 'AH', 'IH', 'AW', 'ER', 'UW']
]

phone2cluster = defaultdict(list) # Dict[str, List(str)]

for c in clusters:
    for p in c:
        phone2cluster[p] = c

len(phone2cluster)

25

In [62]:
phone2cluster['IY']

['IY', 'AY', 'OW']

In [73]:
# 1-step confusion 
pron_confusions = [set() for _ in idx2pron] # confusable pron ids, List[int]; [i] has j: i can be replaced by j  

remove_consonant_cnt = 0
double_vowel_cnt = 0
add_suffix_cnt = 0
substitute_cnt = 0

for _idx, _pron in tqdm(enumerate(idx2pron), total=len(idx2pron)):
    _phones = list(_pron)
    
    # remove a consonant (outgoing edge only)
    for j in range(len(_phones)):
        _confs_pron = tuple(_phones[:j] + _phones[j+1:])
        try:
            _confs_idx = pron2idx[_confs_pron]
            pron_confusions[_idx].add(_confs_idx)
            remove_consonant_cnt += 1
            
            if remove_consonant_cnt <= 5:
                _src_pron = _pron
                _src_w = next(iter(pron2word[_src_pron]))
                _tgt_pron = _confs_pron
                _tgt_w = next(iter(pron2word[_tgt_pron]))
                print(f'Remove consonant: {_src_w}{_src_pron} -> {_tgt_w}{_tgt_pron}')
        except KeyError:
            continue
    
    # remove a doubled vowel (incoming edge only, for doubling vowel)
    for j in range(1, len(_phones)):
        if _phones[j] != _phones[j-1] or (not _phones[j][0] in 'AEIOU'):
            continue
            
        _confs_pron = tuple(_phones[:j] + _phones[j+1:])
        try:
            _confs_idx = pron2idx[_confs_pron]
            pron_confusions[_confs_idx].add(_idx)
            double_vowel_cnt += 1
            
            if double_vowel_cnt <= 5:
                _src_pron = _confs_pron
                _src_w = next(iter(pron2word[_src_pron]))
                _tgt_pron = _pron
                _tgt_w = next(iter(pron2word[_tgt_pron]))
                print(f'Double vowel: {_src_w}{_src_pron} -> {_tgt_w}{_tgt_pron}')
        except KeyError:
            continue
    
    # add a suffix (outgoing edge only)
    for _suffix in suffixes:
        _confs_pron = tuple(_phones + _suffix)
        try:
            _confs_idx = pron2idx[_confs_pron]
            pron_confusions[_idx].add(_confs_idx)
            add_suffix_cnt += 1
            
            if add_suffix_cnt <= 5:
                _src_pron = _pron
                _src_w = next(iter(pron2word[_src_pron]))
                _tgt_pron = _confs_pron
                _tgt_w = next(iter(pron2word[_tgt_pron]))
                print(f'Add suffix: {_src_w}{_src_pron} -> {_tgt_w}{_tgt_pron}')
        except KeyError:
            continue
    
    # substitute a phone (outgoing edge only; it's bidirectional in essense)
    for j in range(len(_phones)):
        _ph = _phones[j]
        _ph_cluster = phone2cluster[_ph]
        if len(_ph_cluster) <= 1:
            continue
        
        # has a cluster, do replace
        for _confs_ph in _ph_cluster:
            if _confs_ph == _ph:
                continue
            
            _confs_pron = tuple(_phones[:j] + [_confs_ph] + _phones[j+1:])
            try:
                _confs_idx = pron2idx[_confs_pron]
                pron_confusions[_idx].add(_confs_idx)
                substitute_cnt += 1

                if substitute_cnt <= 5:
                    _src_pron = _pron
                    _src_w = next(iter(pron2word[_src_pron]))
                    _tgt_pron = _confs_pron
                    _tgt_w = next(iter(pron2word[_tgt_pron]))
                    print(f'Substitute phone: {_src_w}{_src_pron} -> {_tgt_w}{_tgt_pron}')
            except KeyError:
                continue



HBox(children=(IntProgress(value=0, max=113745), HTML(value='')))

Remove consonant: QUOTE('K', 'W', 'OW', 'T') -> COAT('K', 'OW', 'T')
Remove consonant: QUOTE('K', 'W', 'OW', 'T') -> QUO('K', 'W', 'OW')
Add suffix: QUOTE('K', 'W', 'OW', 'T') -> QUOTES('K', 'W', 'OW', 'T', 'S')
Substitute phone: QUOTE('K', 'W', 'OW', 'T') -> QUIETT('K', 'W', 'IY', 'T')
Substitute phone: QUOTE('K', 'W', 'OW', 'T') -> QUITE('K', 'W', 'AY', 'T')
Remove consonant: "UNQUOTE('AH', 'N', 'K', 'W', 'OW', 'T') -> UNCOAT('AH', 'N', 'K', 'OW', 'T')
Remove consonant: PERCENT('P', 'ER', 'S', 'EH', 'N', 'T') -> PERSET('P', 'ER', 'S', 'EH', 'T')
Add suffix: PERCENT('P', 'ER', 'S', 'EH', 'N', 'T') -> PERCENTS('P', 'ER', 'S', 'EH', 'N', 'T', 'S')
Substitute phone: PERCENT('P', 'ER', 'S', 'EH', 'N', 'T') -> PRESENT('P', 'ER', 'Z', 'EH', 'N', 'T')
Remove consonant: 'ALLO('AA', 'L', 'OW') -> LOE('L', 'OW')
Substitute phone: 'ALLO('AA', 'L', 'OW') -> OLLY('AA', 'L', 'IY')
Add suffix: BOUT('B', 'AW', 'T') -> BOUTS('B', 'AW', 'T', 'S')
Substitute phone: BOUT('B', 'AW', 'T') -> POUT('P', 'AW'

In [74]:
remove_consonant_cnt, double_vowel_cnt, add_suffix_cnt, substitute_cnt

(86039, 86, 7700, 51210)

In [78]:
sorted(Counter([len(x) for x in pron_confusions]).most_common())

[(0, 45130),
 (1, 33351),
 (2, 15480),
 (3, 8802),
 (4, 5215),
 (5, 3086),
 (6, 1489),
 (7, 759),
 (8, 292),
 (9, 109),
 (10, 18),
 (11, 11),
 (12, 3)]

In [80]:
# Word confusion 
word_confusions = defaultdict(set) # confusable words (not ids!), Dict[str, Set[str]]

for _word in idx2word:
    for _pron in word2pron[_word]:
        _pron_idx = pron2idx[_pron]
        _confs_pron_ids = pron_confusions[_pron_idx]
        for _confs_pron_idx in _confs_pron_ids:
            _confs_pron = idx2pron[_confs_pron_idx]
            _confs_words = pron2word[_confs_pron]
            word_confusions[_word].update(_confs_words)

In [81]:
sorted(Counter([len(x) for x in word_confusions]).most_common())

[(1, 25),
 (2, 223),
 (3, 1513),
 (4, 6440),
 (5, 13435),
 (6, 17669),
 (7, 15616),
 (8, 10357),
 (9, 6634),
 (10, 4123),
 (11, 2422),
 (12, 1307),
 (13, 695),
 (14, 322),
 (15, 152),
 (16, 70),
 (17, 23),
 (18, 12),
 (19, 3),
 (20, 3)]

In [83]:
word2pron['RED']

{('R', 'EH', 'D')}

In [85]:
[(w, word2pron[w]) for w in word_confusions['RED']]

[('RUUD', {('R', 'UW', 'D')}),
 ('ROOD', {('R', 'UW', 'D')}),
 ('RUDE', {('R', 'UW', 'D')}),
 ('ED', {('EH', 'D')}),
 ('RHUDE', {('R', 'UW', 'D')}),
 ('REH', {('R', 'EH')}),
 ('RUDD', {('R', 'AH', 'D')}),
 ('ROODE', {('R', 'UW', 'D')}),
 ('RUD', {('R', 'AH', 'D')}),
 ('RID', {('R', 'IH', 'D')})]

In [102]:
# Sentence confusion 

def SentenceAcousticConfusion(sentence: List[str], p: float) -> List[str]:
    assert 0 <= p <= 1
    
    sen_len = len(sentence)
    confs_cnt = int(p * sen_len)
    
    confusable_positions = []
    for pos in range(sen_len):
        word = sentence[pos].upper()
        if len(word_confusions[word]) > 0:
            confusable_positions.append(pos)
    
    if len(confusable_positions) <= confs_cnt:
        # not enough positions for confusion
        confs_positions = confusable_positions
    else:
        confs_positions = random.sample(confusable_positions, k=confs_cnt)
        
    confs_sentence = list(sentence)
    for pos in confs_positions:
        word = sentence[pos].upper()
        confs_word = random.choice(list(word_confusions[word])).lower()
        confs_sentence[pos] = confs_word
    
    return confs_sentence

In [104]:
sentence = [
    "What",
    "are",
    "the",
    "names",
    "of",
    "all",
    "European",
    "countries",
    "with",
    "at",
    "least",
    "three",
    "manufacturers",
    "?"
]

SentenceAcousticConfusion(sentence, p=0.2)

['What',
 'are',
 'though',
 'names',
 'of',
 'auld',
 'European',
 'countries',
 'with',
 'at',
 'least',
 'three',
 'manufacturers',
 '?']

In [113]:
# Apply on tables.jsonl 
tabert_dataset_path = '/Users/mac/Desktop/syt/Deep-Learning/Dataset/TaBERT_datasets/tables_sample.jsonl'

with open(tabert_dataset_path, 'r') as f:
    l = f.readline()

# print(ujson.dumps(ujson.loads(l), indent=4))
d = ujson.loads(l)
d

{'uuid': 'common-crawl/crawl-data/CC-MAIN-2015-32/segments/1438042992201.62/warc/CC-MAIN-20150728002312-00130-ip-10-236-191-2.ec2.internal.warc.gz_567892730_567906623',
 'table': {'caption': '',
  'header': [{'name': 'Class',
    'name_tokens': None,
    'type': 'text',
    'sample_value': {'value': '0', 'tokens': ['0'], 'ner_tags': ['']},
    'sample_value_tokens': None,
    'is_primary_key': False,
    'foreign_key': None},
   {'name': 'Interpretation',
    'name_tokens': None,
    'type': 'text',
    'sample_value': {'value': 'Negative',
     'tokens': ['Negative'],
     'ner_tags': ['']},
    'sample_value_tokens': None,
    'is_primary_key': False,
    'foreign_key': None}],
  'data': [['0', 'Negative'],
   ['1', 'Equivocal'],
   ['2', 'Positive'],
   ['3', 'Positive'],
   ['4', 'Strongly positive'],
   ['5', 'Strongly positive'],
   ['6', 'Strongly positive']]},
 'context_before': ['Reference Values Describes reference intervals and additional information for interpretation of te

In [119]:
p = 0.15

confs_d = d.copy()
for idx, sen in enumerate(d['context_before']):
    sen_tokens = word_tokenize(sen)
    sen_tokens_confused = SentenceAcousticConfusion(sen_tokens, p)
    sen_confused = ' '.join(sen_tokens_confused)
    confs_d['context_before'][idx] = sen_confused
for idx, sen in enumerate(d['context_after']):
    sen_tokens = word_tokenize(sen)
    sen_tokens_confused = SentenceAcousticConfusion(sen_tokens, p)
    sen_confused = ' '.join(sen_tokens_confused)
    confs_d['context_after'][idx] = sen_confused
confs_d

{'uuid': 'common-crawl/crawl-data/CC-MAIN-2015-32/segments/1438042992201.62/warc/CC-MAIN-20150728002312-00130-ip-10-236-191-2.ec2.internal.warc.gz_567892730_567906623',
 'table': {'caption': '',
  'header': [{'name': 'Class',
    'name_tokens': None,
    'type': 'text',
    'sample_value': {'value': '0', 'tokens': ['0'], 'ner_tags': ['']},
    'sample_value_tokens': None,
    'is_primary_key': False,
    'foreign_key': None},
   {'name': 'Interpretation',
    'name_tokens': None,
    'type': 'text',
    'sample_value': {'value': 'Negative',
     'tokens': ['Negative'],
     'ner_tags': ['']},
    'sample_value_tokens': None,
    'is_primary_key': False,
    'foreign_key': None}],
  'data': [['0', 'Negative'],
   ['1', 'Equivocal'],
   ['2', 'Positive'],
   ['3', 'Positive'],
   ['4', 'Strongly positive'],
   ['5', 'Strongly positive'],
   ['6', 'Strongly positive']]},
 'context_before': ['Reference Values Describes reference intervals uhde additional information for interpretation irv 