In [1]:
import sys, os, json, itertools
import numpy as np
import pandas as pd

In [2]:
with open('item_to_session_to_alignment.json') as f:
    item_to_session_to_alignment = json.load(f)

In [3]:
# wikisource
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]

In [4]:
for item, session_to_alignment in item_to_session_to_alignment.items():
    tokens_across_readers = [
        [x[0] for x in alignment]
        for session, alignment in session_to_alignment.items()
    ]
    n_readings = len(tokens_across_readers)
    avg_lev_across_readings = sum([
        levenshtein(tokens_across_readers[pair[0]], tokens_across_readers[pair[1]])
        for pair in itertools.combinations(range(n_readings), 2)
    ]) / n_readings
    print(item, avg_lev_across_readings)

1505 7.0
1703 12.5
1902 15.5
2101 7.75
2003 19.5
2102 5.5
2203 10.5
2304 13.0
2602 15.25
2603 13.5
2704 6.25
2701 10.0
2801 13.0
2802 18.5
2803 18.75
2904 12.0
2902 14.75
2901 18.75
3001 14.0
3003 23.75
3201 28.75
3202 14.25
3401 14.0
1502 9.5
4701 9.0
3302 17.25
3402 13.25


In [5]:
with open('item_to_session_to_alignment_collapsed_pauses.json') as f:
    condensed = json.load(f)

In [6]:
for item, session_to_alignment in condensed.items():
    tokens_across_readers = [
        [x[0] for x in alignment]
        for session, alignment in session_to_alignment.items()
    ]
    n_readings = len(tokens_across_readers)
    avg_lev_across_readings = sum([
        levenshtein(tokens_across_readers[pair[0]], tokens_across_readers[pair[1]])
        for pair in itertools.combinations(range(n_readings), 2)
    ]) / n_readings
    print(item, avg_lev_across_readings)

1505 5.5
1703 11.75
1902 13.75
2101 6.25
2003 18.75
2102 4.0
2203 9.75
2304 11.25
2602 13.25
2603 11.75
2704 4.75
2701 8.25
2801 11.25
2802 16.75
2803 17.0
2904 10.0
2902 13.25
2901 17.25
3001 12.25
3003 22.0
3201 27.0
3202 13.5
3401 13.25
1502 8.75
4701 8.0
3302 15.5
3402 11.5


In [7]:
with open('item_to_session_to_alignment_no_pauses.json') as f:
    item_to_session_to_alignment_no_pauses = json.load(f)

In [8]:
for item, session_to_alignment in item_to_session_to_alignment_no_pauses.items():
    tokens_across_readers = [
        [x[0] for x in alignment]
        for session, alignment in session_to_alignment.items()
    ]
    n_readings = len(tokens_across_readers)
    avg_lev_across_readings = sum([
        levenshtein(tokens_across_readers[pair[0]], tokens_across_readers[pair[1]])
        for pair in itertools.combinations(range(n_readings), 2)
    ]) / n_readings
    print(item, avg_lev_across_readings)

1505 0.0
1703 0.0
1902 0.0
2101 0.0
2003 0.0
2102 0.0
2203 0.0
2304 0.0
2602 0.0
2603 0.0
2704 0.0
2701 0.0
2801 0.0
2802 0.0
2803 0.0
2904 0.0
2902 0.0
2901 0.0
3001 0.0
3003 0.0
3201 0.0
3202 0.0
3401 0.0
1502 0.0
4701 0.0
3302 0.0
3402 0.0
