Create more features.

The code below has not been optimized, so some calculations are unnecessarily repeated. Because the training set is small, the runtime is mostly negligible anyway. If these features need to be recreated, detailed documentation is located in `dysfluencies/pipeline/more_features.md`.

In [1]:
import pandas as pd
import sys, json, difflib
sys.path.append('/Users/ycm/Desktop/dysfluencies/pipeline/util/')
import condense_new
from scipy import spatial

In [2]:
# Read the existing per-example feature table as raw lines; the first line is
# used as the header row further down when the table is extended.
with open('res/features_and_labels_by_example/features_by_ex_NormalizedTokenScores_WithReaderID.tsv') as tsv_file:
    features = [row.strip() for row in tsv_file]
# labels = pd.read_csv('res/features_and_labels_by_example/labels_by_ex_NormalizedTokenScores_WithReaderID.tsv')

In [3]:
# Paths of the JSON resources, in the order they are unpacked from `rs` later.
json_resources = [
    'res/reader_to_item.json',
    'res/reader_to_alignment.json',
    'res/gold_passage_id_to_sessions_to_alignments.json'
]

# Parse each resource and keep the results in the same order as the paths.
rs = []
for resource_path in json_resources:
    with open(resource_path) as resource_file:
        parsed = json.load(resource_file)
    rs.append(parsed)

In [4]:
# Name the three parsed resources (same order as `json_resources`).
(
    reader_to_item,
    reader_to_alignment,
    passage_id_to_session_to_gold_alignments,
) = rs

# JSON object keys are always strings; convert the passage ids to ints.
# NOTE(review): presumably so they match the values of reader_to_item, which
# are used as lookup keys later -- confirm against the data.
passage_id_to_session_to_gold_alignments = dict(
    (int(passage_id), sessions)
    for passage_id, sessions in passage_id_to_session_to_gold_alignments.items()
)

In [5]:
# Number of entries in each loaded resource, shown as a quick sanity check.
tuple(
    len(resource)
    for resource in (
        reader_to_item,
        reader_to_alignment,
        passage_id_to_session_to_gold_alignments,
    )
)

(70, 70, 27)

In [6]:
def in_gold(token):
    '''
    True if a difflib.ndiff line belongs to the gold (second) sequence,
    i.e. it is shared ('  ') or only present in gold ('+ ').
    '''
    return token.startswith(('  ', '+ '))
def in_ex(token):
    '''
    True if a difflib.ndiff line belongs to the example (first) sequence,
    i.e. it is shared ('  ') or only present in the example ('- ').
    '''
    return token.startswith(('  ', '- '))

In [7]:
def align_sequence(c_seq, g_seq):
    '''
    Aligns an example sequence with a gold sequence and cuts both into
    parallel groups at the gold <pause> tokens.

    Each element of c_seq / g_seq is a term whose first item (term[0]) is the
    token text used for the alignment.

    Returns (ex_pause_sep, gold_pause_sep): two lists of groups of equal
    length. Every gold group except the last ends with a gold '<pause>' term;
    the example group at the same index holds the example terms that the diff
    places up to (and including) that gold pause.
    '''
    ex_words = [term[0] for term in c_seq]
    gold_words = [term[0] for term in g_seq]

    # ndiff line prefixes:
    #   '  ' -> token present in both sequences
    #   '+ ' -> gold token missed
    #   '- ' -> incorrect insertion
    #   '? ' -> intraline hint, not a token; dropped here
    diff = [entry for entry in difflib.ndiff(ex_words, gold_words) if entry[0] != '?']

    # Sanity check: the diff must account for every term exactly once.
    n_ex_entries = sum(1 for entry in diff if in_ex(entry))
    n_gold_entries = sum(1 for entry in diff if in_gold(entry))
    assert n_ex_entries == len(c_seq) and n_gold_entries == len(g_seq)

    ex_terms = iter(c_seq)
    gold_terms = iter(g_seq)

    # Walk the diff once, filling the current group of each side and opening
    # a new pair of groups right after every gold <pause>.
    ex_pause_sep = [[]]
    gold_pause_sep = [[]]
    for entry in diff:
        if in_ex(entry):
            ex_pause_sep[-1].append(next(ex_terms))
        if in_gold(entry):
            gold_pause_sep[-1].append(next(gold_terms))
            if entry[2:] == '<pause>':
                ex_pause_sep.append([])
                gold_pause_sep.append([])

    return ex_pause_sep, gold_pause_sep

In [8]:
def cosine(l1, l2):
    '''
    Cosine similarity of two equal-length numeric vectors, computed as
    1 - scipy's cosine distance.
    '''
    distance = spatial.distance.cosine(l1, l2)
    return 1 - distance

def jaccard(l1, l2):
    '''
    Unused.

    Jaccard similarity of two iterables of hashable items: the size of the
    intersection of their element sets divided by the size of the union.

    When both inputs are empty the ratio is 0/0; two empty sets are treated
    as identical and 1.0 is returned instead of raising ZeroDivisionError.
    '''
    s1, s2 = set(l1), set(l2)
    union = s1 | s2
    if not union:
        return 1.0
    return len(s1 & s2) / len(union)

def get_avg_f0(alignment):
    '''
    Mean of all F0 values in an alignment.

    Each term's fourth item (term[3]) is a string that evaluates to a
    sequence of numbers; the values of all terms are pooled before averaging.
    Raises ZeroDivisionError if the alignment contains no F0 values at all.
    '''
    f0_values = []
    for term in alignment:
        # NOTE(review): eval() executes arbitrary code from the alignment
        # data. Acceptable only while the JSON resources are trusted; if the
        # strings are plain list literals, ast.literal_eval would be safer.
        f0_values.extend(eval(term[3]))
    return sum(f0_values) / len(f0_values)

def slope(f0s):
    '''
    Overall change of a value sequence per element: (last - first) divided
    by the number of values. A single value therefore gives 0; an empty
    sequence raises IndexError.
    '''
    last_value = f0s[-1]
    first_value = f0s[0]
    return (last_value - first_value) / len(f0s)

def compute_f0_match_score(matched_ex, matched_gold, avg_ex_f0, avg_gold_f0):
    '''
    Computes the F0-related score for a pair of groups.

    matched_ex and matched_gold are parallel lists of terms; term[3] is a
    string that evaluates to the term's F0 values. For every pair, each
    side's F0 values are shifted by that side's average F0 and reduced to a
    slope. The score is the cosine similarity between the two resulting
    slope vectors, or 0 when that similarity is NaN.
    '''
    ex_slopes = []
    gold_slopes = []
    for ex_term, gold_term in zip(matched_ex, matched_gold):
        centered_ex = [f0 - avg_ex_f0 for f0 in eval(ex_term[3])]
        centered_gold = [f0 - avg_gold_f0 for f0 in eval(gold_term[3])]
        ex_slopes += [slope(centered_ex)]
        gold_slopes += [slope(centered_gold)]

    similarity = cosine(ex_slopes, gold_slopes)
    # NaN is the only value that is not equal to itself.
    return 0 if similarity != similarity else similarity
        

def compute_single_group_scores(ex_group, gold_group, avg_ex_f0, avg_gold_f0):
    '''
    Computes similarity scores between a single gold GROUP
    (<pause>-delimited) and its corresponding example group.

    ex_group / gold_group are lists of terms: term[0] is the token text used
    for alignment, term[2] is the value compared for duration similarity and
    term[3] is consumed by compute_f0_match_score.

    Returns a 4-tuple:
        curr_group_dur_sim  -- cosine similarity of the term[2] values of the
                               matched terms, scaled by the fraction of gold
                               terms that were matched (0 if nothing matched)
        curr_group_f0_sim   -- F0 match score scaled by the same fraction
                               (0 if nothing matched)
        n_pauses_inserted   -- pauses present only in the example group
        n_pauses_omitted    -- pauses present only in the gold group
    '''
    ex_words = [term[0] for term in ex_group]
    gold_words = [term[0] for term in gold_group]

    # Keep only real diff entries; '? ' lines are ndiff's intraline hints.
    diff = [x for x in difflib.ndiff(ex_words, gold_words) if x[0] != '?']

    iter_ex = iter(ex_group)
    iter_gold = iter(gold_group)

    matched_ex = []
    matched_gold = []
    for token in diff:
        if not in_ex(token):
            # gold-only entry
            next(iter_gold)
        elif not in_gold(token):
            # example-only entry
            next(iter_ex)
        else:
            # Entry present in both sequences. Always consume one term from
            # each side so the iterators stay in step with the diff; a pause
            # matched in both is consumed but not scored.
            ex_term = next(iter_ex)
            gold_term = next(iter_gold)
            if '<pause>' in token:
                continue
            matched_ex.append(ex_term)
            matched_gold.append(gold_term)

    n_pauses_inserted = diff.count('- <pause>')
    n_pauses_omitted = diff.count('+ <pause>')

    if matched_ex:
        ex_group_nframes = [x[2] for x in matched_ex]
        gold_group_nframes = [x[2] for x in matched_gold]
        nframes_similarity = cosine(ex_group_nframes, gold_group_nframes)
        # Fraction of the gold group that found a counterpart in the example;
        # gold_group is non-empty whenever something matched.
        matched_tokens_similarity = len(matched_ex) / len(gold_group)
        curr_group_dur_sim = nframes_similarity * matched_tokens_similarity

        f0_match_score = compute_f0_match_score(
            matched_ex,
            matched_gold,
            avg_ex_f0,
            avg_gold_f0
        )

        curr_group_f0_sim = f0_match_score * matched_tokens_similarity
    else:
        curr_group_dur_sim = 0
        curr_group_f0_sim = 0

    return curr_group_dur_sim, curr_group_f0_sim, n_pauses_inserted, n_pauses_omitted

In [9]:
def compute_group_based_features(ex_pause_sep, gold_pause_sep, avg_ex_f0, avg_gold_f0):
    '''
    Aggregates the per-group scores of one example/gold alignment.

    ex_pause_sep and gold_pause_sep are parallel lists of groups of terms;
    each pair of groups is scored with compute_single_group_scores.

    Returns a 6-tuple:
        mean duration similarity over groups,
        duration similarity weighted by each gold group's share of gold terms,
        mean F0 similarity over groups,
        F0 similarity weighted the same way,
        inserted pauses divided by the number of gold pauses,
        omitted pauses divided by the number of gold pauses.

    Raises ZeroDivisionError when the gold groups contain no terms or no
    '<pause>' term, or when there are no groups to score.
    '''
    gold_len = sum(len(group) for group in gold_pause_sep)
    n_pauses_in_gold = sum(
        1
        for group in gold_pause_sep
        for term in group
        if term[0] == '<pause>'
    )

    duration_scores = []
    f0_scores = []
    weighted_duration = 0
    weighted_f0 = 0
    total_inserted = 0
    total_omitted = 0

    for ex_terms, gold_terms in zip(ex_pause_sep, gold_pause_sep):
        scores = compute_single_group_scores(ex_terms, gold_terms, avg_ex_f0, avg_gold_f0)
        duration_score, f0_score, inserted, omitted = scores

        duration_scores.append(duration_score)
        f0_scores.append(f0_score)

        # Each group contributes in proportion to its number of gold terms.
        weighted_duration += duration_score * len(gold_terms) / gold_len
        weighted_f0 += f0_score * len(gold_terms) / gold_len

        total_inserted += inserted
        total_omitted += omitted

    mean_duration = sum(duration_scores) / len(duration_scores)
    mean_f0 = sum(f0_scores) / len(f0_scores)

    return (
        mean_duration,
        weighted_duration,
        mean_f0,
        weighted_f0,
        total_inserted / n_pauses_in_gold,
        total_omitted / n_pauses_in_gold,
    )

In [10]:
def _condense_alignment(alignment):
    '''
    Runs the condense_new pause pipeline on an alignment and returns
    (stripped, condensed): the output of strip_pauses, and the result of
    applying combine_adjacent_pauses followed by collapse_pauses to it.
    '''
    stripped = condense_new.strip_pauses(alignment)
    condensed = condense_new.collapse_pauses(
        condense_new.combine_adjacent_pauses(stripped)
    )
    return stripped, condensed


# For every reader, score their alignment against each gold session of the
# passage they read and average the six group-based features over sessions.
# Feature order: GroupDurationSim, WeightedGroupDurationSim, GroupF0Sim,
# WeightedGroupF0Sim, NormalizedPausesInserted, NormalizedPausesOmitted.
reader_to_additional_features = {}
for reader, alignment in reader_to_alignment.items():
    passage_id = reader_to_item[reader]
    c_stripped, c_condensed = _condense_alignment(alignment)

    avg_c_f0 = get_avg_f0(c_condensed)

    assert condense_new.is_valid(c_stripped) and condense_new.is_valid(c_condensed)

    # get gold alignment for current items
    gold_alignments = passage_id_to_session_to_gold_alignments[passage_id].values()

    per_gold_features = []
    for g_alignment in gold_alignments:
        g_stripped, g_condensed = _condense_alignment(g_alignment)
        assert condense_new.is_valid(g_stripped) and condense_new.is_valid(g_condensed)

        avg_g_f0 = get_avg_f0(g_condensed)
        # The alignment uses the stripped sequences; the condensed ones only
        # feed the average-F0 computation and the validity checks.
        ex_pause_sep, gold_pause_sep = align_sequence(c_stripped, g_stripped)
        per_gold_features.append(
            compute_group_based_features(ex_pause_sep, gold_pause_sep, avg_c_f0, avg_g_f0)
        )

    # Average each feature (column) across the gold sessions.
    n_gold_sessions = len(per_gold_features)
    reader_to_additional_features[reader] = [
        sum(column) / n_gold_sessions
        for column in zip(*per_gold_features)
    ]

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [11]:
# Sanity check: number of readers that received an additional feature vector.
len(reader_to_additional_features)

70

In [12]:
# Header row: the original header followed by the six new column names.
header = '\t'.join((
    features[0],
    'GroupDurationSim',
    'WeightedGroupDurationSim',
    'GroupF0Sim',
    'WeightedGroupF0Sim',
    'NormalizedPausesInserted',
    'NormalizedPausesOmitted',
))
extended_features = [header]

# Data rows: the first tab-separated field of each row is the reader id used
# to look up that reader's additional features.
extended_features.extend(
    row + '\t' + '\t'.join(map(str, reader_to_additional_features[row.split('\t')[0]]))
    for row in features[1:]
)

In [13]:
# with open('res/features_and_labels_by_example/features_by_ex_NormalizedTokenScores_WithReaderID_WithAdditionalFeatures.tsv', 'w') as f:
#     for line in extended_features:
#         print(line, file=f)