Difference metrics, without regard for frame count

In [1]:
import sys
import difflib
import json
import copy
sys.path.append('util')
import condense

In [2]:
def only_tokens(lst):
    """Return the first element of every entry in ``lst``, newline-terminated.

    Each entry must be indexable and its element 0 must support ``+ '\\n'``
    (i.e. be a string). The result is a new list in the same order as ``lst``.
    """
    tokens = []
    for entry in lst:
        tokens.append(entry[0] + '\n')
    return tokens

In [3]:
def compute_diff(l1, l2):
    """Diff the token sequences of two alignments with ``difflib.ndiff``.

    Both arguments are passed through ``only_tokens``, so only element 0 of
    each entry takes part in the comparison.

    Returns a tuple ``(pos_diff, neg_diff, minor_diff)`` of lists of
    whitespace-stripped ndiff lines:

    * ``pos_diff``   -- lines coded ``'+ '`` (token only in ``l2``)
    * ``neg_diff``   -- lines coded ``'- '`` (token only in ``l1``)
    * ``minor_diff`` -- lines coded ``'? '`` (ndiff intraline hint lines)

    Lines coded ``'  '`` (token common to both) are dropped.
    """
    pos_diff, neg_diff, minor_diff = [], [], []
    bucket_for_code = {'+ ': pos_diff, '- ': neg_diff, '? ': minor_diff}
    for line in difflib.ndiff(only_tokens(l1), only_tokens(l2)):
        # Classify on ndiff's two-character code *before* stripping: once the
        # leading blanks of a common line are gone, a token that itself starts
        # with '+', '-' or '?' would be mistaken for a diff line.
        bucket = bucket_for_code.get(line[:2])
        if bucket is not None:
            bucket.append(line.strip())
    return pos_diff, neg_diff, minor_diff

In [4]:
# Gold alignments from the "with pauses, no leading or trailing" file.
# Used later as gold_stripped[passage_id], a dict whose values are alignments.
gold_stripped_path = 'res/model_readings/item_to_session_to_alignment_with_pauses_no_leading_or_trailing.json'
with open(gold_stripped_path) as gold_stripped_file:
    gold_stripped = json.load(gold_stripped_file)

In [5]:
# Gold alignments from the "no pauses" file.
# Used later as gold_no_pause[passage_id], a dict whose values are alignments.
gold_no_pause_path = 'res/model_readings/item_to_session_to_alignment_no_pauses.json'
with open(gold_no_pause_path) as gold_no_pause_file:
    gold_no_pause = json.load(gold_no_pause_file)

In [6]:
# Reading examples to featurize; each one is used below through its
# 'PassageID' and 'Words' entries.
reading_examples_path = 'res/reading_examples_numerized.json'
with open(reading_examples_path) as reading_examples_file:
    reading_examples = json.load(reading_examples_file)

print('Featurizing {} examples:'.format(len(reading_examples)))

# Feature names, in the order the values are produced in the loop below:
# three diff counts against the stripped gold readings, then three against
# the no-pause gold readings.
diff_types = [
    'StrippedPosDiff',
    'StrippedNegDiff',
    'StrippedMinorDiff',
    'NoPausePosDiff',
    'NoPauseNegDiff',
    'NoPauseMinorDiff'
]


def _mean_diff_counts(alignment, gold_sessions):
    """Average the three ``compute_diff`` line counts over all gold sessions.

    ``gold_sessions`` is a dict whose values are gold alignments. Returns
    ``[mean_pos, mean_neg, mean_minor]``, each the number of lines in the
    corresponding ``compute_diff`` list averaged over the sessions.

    Raises ``ValueError`` if ``gold_sessions`` is empty, since no average
    exists in that case.
    """
    if not gold_sessions:
        raise ValueError('no gold sessions to compare against')
    totals = [0, 0, 0]
    for gold_alignment in gold_sessions.values():
        diff_lists = compute_diff(alignment, gold_alignment)
        for slot, diff_lines in enumerate(diff_lists):
            totals[slot] += len(diff_lines)
    # Divide by the session count of *this* gold set, not of another one.
    return [total / len(gold_sessions) for total in totals]


for ex_i, reading in enumerate(reading_examples):
    passage_id = reading['PassageID']
    # Copy first so that, whatever the condense helpers do to their input,
    # reading['Words'] stays exactly as loaded.
    original_alignment = copy.deepcopy(reading['Words'])
    collapsed_pauses_alignment = condense.collapse_adjacent_pauses(original_alignment)
    stripped_alignment = condense.remove_leading_and_trailing_pauses(collapsed_pauses_alignment)
    merged_pauses_alignment = condense.merge_pauses(stripped_alignment)

    # Exactly six values: one averaged triple per gold set, in diff_types order.
    curr_diff_values = (
        _mean_diff_counts(stripped_alignment, gold_stripped[passage_id])
        + _mean_diff_counts(merged_pauses_alignment, gold_no_pause[passage_id])
    )

    for diff_type, diff_value in zip(diff_types, curr_diff_values):
        reading[diff_type] = diff_value
    if not (ex_i + 1) % 10:
        print('Featurized example {}.'.format(ex_i + 1))

Featurizing 70 examples:
Featurized example 10.
Featurized example 20.
Featurized example 30.
Featurized example 40.
Featurized example 50.
Featurized example 60.
Featurized example 70.


In [7]:
# Save the examples, including the diff features stored on each one above,
# as JSON indented by two spaces.
token_features_path = 'res/reading_examples_with_token_features.json'
with open(token_features_path, 'w') as token_features_file:
    json.dump(reading_examples, token_features_file, indent=2)