Difference metrics, without regard for frame count

In [1]:
import sys
import difflib
import json
import copy
sys.path.append('util')
import condense

In [2]:
def only_tokens(lst):
    return [x[0] + '\n' for x in lst]

In [3]:
def compute_diff(l1, l2):
    diff = ''.join([
        x.strip() + '\n'
        for x in difflib.ndiff(only_tokens(l1), only_tokens(l2))
    ])
    diff = [x for x in diff.split('\n') if x]
    pos_diff = [x for x in diff if x[0] == '+']
    neg_diff = [x for x in diff if x[0] == '-']
    minor_diff = [x for x in diff if x[0] == '?']
    return pos_diff, neg_diff, minor_diff

In [4]:
with open('res/model_readings/item_to_session_to_alignment_with_pauses_no_leading_or_trailing.json') as f:
    gold_stripped = json.load(f)

In [5]:
with open('res/model_readings/item_to_session_to_alignment_no_pauses.json') as f:
    gold_no_pause = json.load(f)

In [15]:
with open('res/reading_examples_numerized.json') as f:
    reading_examples = json.load(f)

print('Featurizing {} examples:'.format(len(reading_examples)))
    
diff_types = [
    'StrippedPosDiff',
    'StrippedNegDiff',
    'StrippedMinorDiff',
    'NoPausePosDiff',
    'NoPauseNegDiff',
    'NoPauseMinorDiff'
]
    
for ex_i, reading in enumerate(reading_examples):
    passage_id = reading['PassageID']
    original_alignment = copy.deepcopy(reading['Words'])
    collapsed_pauses_alignment = condense.collapse_adjacent_pauses(original_alignment)
    stripped_alignment = condense.remove_leading_and_trailing_pauses(collapsed_pauses_alignment)
    merged_pauses_alignment = condense.merge_pauses(stripped_alignment)
    
    curr_diff_values = []
    avg_gold_len = sum(len(v) for v in gold_stripped[passage_id].values()) / len(gold_stripped[passage_id])
    avg_diffs = [0, 0, 0]
#     curr_diff_values.extend([
#         a / len(gold_stripped[passage_id])
#         for a in [
#             x + y
#             for k, v in gold_stripped[passage_id].items()
#             for x, y in zip([len(x) for x in compute_diff(stripped_alignment, v)], avg_diffs)
#     ]])
    for v in gold_stripped[passage_id].values():
        avg_diffs = [
            (x + y) / avg_gold_len
            for x, y
            in zip([len(x) for x in compute_diff(stripped_alignment, v)], avg_diffs)
        ]
    curr_diff_values.extend(avg_diffs)
    
    avg_diffs = [0, 0, 0]
    avg_gold_len = sum(
        len(v)
        for v in gold_no_pause[passage_id].values()) / len(gold_no_pause[passage_id]
    )
#     curr_diff_values.extend([
#         x / len(gold_stripped[passage_id])
#         for x in [
#             x + y
#             for k, v in gold_no_pause[passage_id].items()
#             for x, y in zip([len(x) for x in compute_diff(merged_pauses_alignment, v)], avg_diffs)
#     ]])
    for v in gold_no_pause[passage_id].values():
        avg_diffs = [
            (x + y) / avg_gold_len
            for x, y
            in zip([len(x) for x in compute_diff(merged_pauses_alignment, v)], avg_diffs)
        ]
    curr_diff_values.extend(avg_diffs)
    
    for idx, diff_type in enumerate(diff_types):
        reading[diff_type] = curr_diff_values[idx]
    if not (ex_i + 1) % 10:
        print('Featurized example {}.'.format(ex_i + 1))

Featurizing 70 examples:
[0.1719630310755348, 0.11248122055597393, 0.02242990647402421, 0.172413792183362, 0.0517241376550086, 0.0258620688275043]
[0.03982370536936644, 0.06675467628502371, 0.006666666653843333, 0.007407407385754781, 0.007407407385754781, 0.007407407385754781]
[0.044694183825181394, 0.1272684476411072, 0.0, 0.0086206896091681, 0.0344827584366724, 0.0]
[0.07320818840918028, 0.21344881114180447, 0.01470588231119576, 0.03333333317783087, 0.09166666623903491, 0.016666666588915434]
[0.07048229966327073, 0.3228734706604103, 0.03143418455964391, 0.03571428549524331, 0.11607142785954076, 0.03571428549524331]
[0.0860609238016048, 0.4512085104171093, 0.02653399663321381, 0.05925925908603825, 0.19259259202962428, 0.029629629543019124]
[0.05970211653772195, 0.18713619330169617, 0.01495327098268281, 0.0258620688275043, 0.1034482753100172, 0.0172413792183362]
[0.04678652121722913, 0.09424495329600967, 0.02346041013324983, 0.027777776799626428, 0.06944444199906608, 0.0277777767996264

In [17]:
# with open('res/reading_examples_with_token_features_normalized.json', 'w') as f:
#     json.dump(reading_examples, f, indent=2)

In [7]:
# with open('res/reading_examples_with_token_features.json', 'w') as f:
#     json.dump(reading_examples, f, indent=2)