Utilities for alignments under module `condense_new` in `dysfluencies/pipeline/util/condense_new.py`

In [78]:
import difflib
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from copy import deepcopy

In [4]:
# Load the kid_recwords table; later cells read its original_session, word,
# sframe, nframes and f0s columns.
kid_recwords = pd.read_csv('kid_recwords_f0s.csv')

In [14]:
# Sanity check: the kid table must contain exactly 70 distinct
# original_session values.
assert len(set(kid_recwords.original_session)) == 70

In [15]:
# Load the model_recwords table; later cells read its item, session, word,
# sframe, nframes and f0s columns.
model_recwords = pd.read_csv('model_recwords_f0s.csv')

In [16]:
# Sanity check: the model table must contain exactly 27 distinct values in
# its 'item' column.
assert len(set(model_recwords['item'])) == 27

In [32]:
# Parse the reading examples from disk.
with open('../reading_examples_with_token_features_normalized.json') as f:
    reading_examples = json.load(f)

# Map int(ReaderID) -> int(PassageID) for every example. Built once the file
# is closed, since only the parsed data is needed. If a ReaderID repeats, the
# last example wins.
reader_to_item = dict(
    (int(ex['ReaderID']), int(ex['PassageID']))
    for ex in reading_examples
)

In [45]:
# Group the kid rows by original_session, keeping the table's row order.
# Each stored entry is [word, sframe, nframes, f0s].
reader_to_alignment = defaultdict(list)
for idx, row in kid_recwords.iterrows():
    reader_id = row.original_session
    # Any token whose text contains '<' is normalized to the '<pause>' marker.
    if '<' in row.word:
        word = '<pause>'
    else:
        word = row.word
    align = [word, row.sframe, row.nframes, row.f0s]
    reader_to_alignment[reader_id].append(align)

In [46]:
# Convert to a plain dict so that looking up an unknown reader raises
# KeyError instead of silently inserting an empty list.
reader_to_alignment = dict(reader_to_alignment)

In [74]:
# Group the model rows first by 'item' (used as the passage id) and then by
# session, keeping the table's row order. Each stored entry is
# [word, sframe, nframes, f0s].
passage_id_to_sessions_to_alignments = defaultdict(lambda: defaultdict(list))
for idx, row in model_recwords.iterrows():
    # row['item'] rather than row.item: Series.item is a method, so attribute
    # access would not return the column value.
    passage_id = row['item']
    session = row.session
    # Any token whose text contains '<' is normalized to the '<pause>' marker.
    if '<' in row.word:
        word = '<pause>'
    else:
        word = row.word
    align = [word, row.sframe, row.nframes, row.f0s]
    passage_id_to_sessions_to_alignments[passage_id][session].append(align)

# NOTE: only the outer mapping becomes a plain dict here; the per-passage
# session mappings remain defaultdict(list) instances.
passage_id_to_sessions_to_alignments = dict(passage_id_to_sessions_to_alignments)

In [77]:
# with open('../reader_to_item.json', 'x') as f:
#     json.dump(reader_to_item, f, indent=2)
# with open('../reader_to_alignment.json', 'x') as f:
#     json.dump(reader_to_alignment, f, indent=2)
# with open('../passage_id_to_sessions_to_alignments.json', 'x') as f:
#     json.dump(passage_id_to_sessions_to_alignments, f, indent=2)

In [81]:
# Deep-copy the alignment stored for reader 5157 so that changes to
# sample_alignment cannot alter the entry held in reader_to_alignment.
sample_alignment = deepcopy(reader_to_alignment[5157])

In [96]:
def strip_pauses(alignment):
    """Return a copy of *alignment* without leading and trailing pauses.

    Each entry is a list whose first three fields are
    ``[word, sframe, nframes]``; any further fields are carried along
    untouched. The input is deep-copied and never mutated.

    Every leading ``'<pause>'`` entry is removed and the start frame of all
    remaining entries is shifted back by that pause's ``nframes``, so the
    first remaining entry starts where the original first entry started.
    Trailing ``'<pause>'`` entries are dropped; no shift is needed for them.

    An empty alignment, or one made up only of pauses, yields an empty
    result instead of raising IndexError.
    """
    rv = deepcopy(alignment)

    # The emptiness guard stops the loop once every entry has been consumed.
    while rv and rv[0][0] == '<pause>':
        pause_len = rv[0][2]
        for term in rv:
            term[1] -= pause_len
        rv = rv[1:]

    while rv and rv[-1][0] == '<pause>':
        rv = rv[:-1]

    return rv

In [98]:
def is_valid(alignment):
    """Return True when the alignment is contiguous.

    An alignment is contiguous when, for every pair of neighbouring entries,
    the earlier entry's start frame plus its frame count equals the later
    entry's start frame. Alignments with fewer than two entries are
    trivially valid.
    """
    neighbours = zip(alignment, alignment[1:])
    return all(
        current[1] + current[2] == following[1]
        for current, following in neighbours
    )

In [187]:
def collapse_pauses(alignment):
    """Fold every pause into the token that follows it.

    Each entry is a list whose first three fields are
    ``[word, sframe, nframes]``. A run of ``'<pause>'`` entries is absorbed
    by the next non-pause entry: the resulting entry keeps that token's
    word, starts at the first absorbed pause's ``sframe`` and lasts for the
    combined ``nframes`` of the run plus the token.

    The first entry of the alignment is always emitted on its own, even if
    it is a pause. Output entries contain exactly three fields
    ``[word, sframe, nframes]``; any extra input fields are not copied.

    Returns a new list; the input is not modified. An empty alignment
    yields an empty list.

    Raises:
        IndexError: if the alignment has more than one entry and ends with
            a pause, since there is no following token to absorb it.
    """
    if not alignment:
        return []
    if len(alignment) > 1 and alignment[-1][0] == '<pause>':
        # Same exception type the unguarded lookup used to raise, but with a
        # message that names the actual problem.
        raise IndexError(
            'alignment ends with a pause that has no following token to '
            'merge into; remove trailing pauses first'
        )

    # Walk backwards so that each pause can be attached to the group of the
    # token that comes after it in time.
    groups = []
    for term in reversed(alignment[1:]):
        if term[0] != '<pause>':
            groups.append([term])
        else:
            groups[-1].append(term)
    groups.append([alignment[0]])
    groups.reverse()

    # In each group the token is first and the earliest absorbed pause is
    # last, so the last member supplies the start frame.
    return [
        [group[0][0], group[-1][1], sum(t[2] for t in group)]
        for group in groups
    ]

In [188]:
# Hand-built contiguous fixture of [word, sframe, nframes] entries, 10 frames
# each, containing one single pause (before 'd') and one run of two pauses
# (before 'e').
dummy_alignment = [
    ['a', 0, 10],
    ['b', 10, 10],
    ['c', 20, 10],
    ['<pause>', 30, 10],
    ['d', 40, 10],
    ['<pause>', 50, 10],
    ['<pause>', 60, 10],
    ['e', 70, 10],
    ['f', 80, 10]
]

In [190]:
# Check that collapsing the pauses in the fixture still yields a contiguous
# alignment.
is_valid(collapse_pauses(dummy_alignment))

True