# Compiling performance matrix

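This notebook is intended to be self-contained: it reads its inputs from `data/moby-passages-36/` and writes one performance matrix per session to `output/performance_matrix/`.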

In [1]:
import json, string, difflib
import pandas as pd
from collections import defaultdict

**For each item, load recstring and token-by-token representation**

In [2]:
# Recstrings: map each item number to its 'Rec String' entry.
df_full_texts_and_recstrings = pd.read_csv(
    'data/moby-passages-36/passages-with-line-break-and-recstring.tsv',
    sep='\t',
)
item_number_to_recstring = dict(
    zip(df_full_texts_and_recstrings['Item'], df_full_texts_and_recstrings['Rec String'])
)

# Passages with layout markers. Column names are supplied explicitly, so
# pandas treats the first line of this file as data, not as a header.
df_text_with_line_breaks = pd.read_csv(
    'data/moby-passages-36/passages-with-line-breaks.tsv',
    sep='\t',
    names=['Item', 'Passage'],
)
raw_passages = dict(
    zip(df_text_with_line_breaks['Item'], df_text_with_line_breaks['Passage'])
)

# Clean each passage:
#   * keep only the text before the first '#'
#   * '$$' -> PARBREAK and '$' -> LINEBREAK ('$$' must be replaced first,
#     otherwise it would be consumed as two single '$')
#   * spell out the standalone number ' 45 '
item_number_to_text_with_line_breaks = {
    item_number: (
        raw_passage[:raw_passage.index('#')]
        .replace('$$', ' PARBREAK ')
        .replace('$', ' LINEBREAK ')
        .replace(' 45 ', ' forty five ')
    )
    for item_number, raw_passage in raw_passages.items()
}

Sanity check that the token-by-token representation minus line/paragraph breaks matches the recstring.

In [3]:
# Spot-check the cleaned passage for item 310: PARBREAK / LINEBREAK markers
# and the spelled-out "forty five" should be visible in the displayed text.
item_number_to_text_with_line_breaks[310]

'Meg wanted to bake a pie. She asked her mother for help. PARBREAK They measured the flour, sugar, and butter for the crust.  LINEBREAK Her mother showed her how to roll out the crust. Meg sliced  LINEBREAK the apples and put them in the pie pan. They put the pie  LINEBREAK in the hot oven and waited forty five minutes. PARBREAK As the pie baked, a delicious smell filled the kitchen. When  LINEBREAK the timer rang, they took the pie out of the oven and  LINEBREAK set it on the window sill to cool.'

In [4]:
def get_processed_token(token):
    """Normalise a passage token for comparison against the recstring.

    Leading and trailing ASCII punctuation (``string.punctuation``) is
    removed and the remainder is lower-cased. Punctuation inside the token
    (e.g. the apostrophe in ``don't``) is kept.

    Parameters
    ----------
    token : str
        A whitespace-delimited token from a passage.

    Returns
    -------
    str
        The normalised token. An empty or all-punctuation token (e.g. ``--``)
        yields ``''`` rather than raising ``IndexError``.
    """
    # str.strip with a character set removes any run of those characters from
    # both ends, and is safe when nothing is left afterwards.
    return token.strip(string.punctuation).lower()

# Sanity check: once the break markers are dropped and every remaining token
# is normalised, each passage must equal its recstring word for word.
for item_number, passage in item_number_to_text_with_line_breaks.items():
    expected_words = item_number_to_recstring[item_number].split()
    normalised_words = []
    for raw_token in passage.split():
        if raw_token in ('LINEBREAK', 'PARBREAK'):
            continue  # layout markers are not part of the recstring
        normalised_words.append(get_processed_token(raw_token))
    assert expected_words == normalised_words

In [5]:
def gen(lst):
    """Return a fresh single-pass generator over the elements of ``lst``."""
    return (element for element in lst)

**Compile relevant information about a reading first, before aligning to recstring.**

In [6]:
df_alignment_child = pd.read_csv('data/moby-passages-36/data-330-child/Alignment.csv', sep=',')

# session id -> list of [word, sframe, nframes] rows, in file order.
# (sframe / nframes are presumably the start frame and frame count of the
# word -- confirm against the alignment tool's documentation.)
session_to_alignment = defaultdict(list)

# Walk the four columns in lock step with zip rather than stepping separate
# generators with next(): rows cannot drift out of sync, and the column names
# are not left shadowed by exhausted generators.
for session_id, sframe_val, nframes_val, word_val in zip(
    df_alignment_child.session,
    df_alignment_child.sframe,
    df_alignment_child.nframes,
    df_alignment_child.word,
):
    # Any token that starts with '<' is collapsed to a single '<pause>' symbol.
    # startswith (unlike word_val[0]) does not fail on an empty string.
    if word_val.startswith('<'):
        word_val = '<pause>'
    session_to_alignment[session_id].append([word_val, sframe_val, nframes_val])

## Process dependencies

Load $F_0$ information.

In [7]:
df_f0_child = pd.read_csv('data/moby-passages-36/data-330-child/f0_184.csv', sep=',')

# Each raw `value` cell is a whitespace-separated run of colon-delimited
# fields; keep only the part before the first ':' of every field, as a float.
df_f0_child['value'] = [
    [float(field.split(':')[0]) for field in raw_track.split()]
    for raw_track in df_f0_child['value']
]

In [8]:
# session id -> list of F0 values for that recording.
# Built with dict(zip(...)) instead of stepping two generators by hand; if a
# session id appears more than once, the last row wins (as before).
session_to_f0 = dict(zip(df_f0_child.session, df_f0_child.value))

## Compile into full data matrix

In [9]:
def CREATE_FEATURE_VECTOR(
    token='',
    matches_expected=0,
    sframe=0,
    nframes=0,
    pitch_mean=0,
    pitch_start=0,
    pitch_end=0,
    pitch_high=0,
    pitch_low=0,
    pitch_slope=0):
    """Build one feature row as a new dict.

    Every argument is stored under the key of the same name, in the order of
    the signature. Omitted arguments fall back to an empty token and zeros,
    so calling with no arguments yields a placeholder row.
    """
    return dict(
        token=token,
        matches_expected=matches_expected,
        sframe=sframe,
        nframes=nframes,
        pitch_mean=pitch_mean,
        pitch_start=pitch_start,
        pitch_end=pitch_end,
        pitch_high=pitch_high,
        pitch_low=pitch_low,
        pitch_slope=pitch_slope,
    )

In [10]:
# session id -> list of per-token feature rows (filled in by the loop below).
session_to_features = {}

# Names of the per-token features, one entry per feature.
# Every entry ends with a comma: adjacent string literals without one are
# silently concatenated ('matches_expected' 'sframe' -> 'matches_expectedsframe').
new_feature_names = [
    'token',
    'matches_expected',
    'sframe',
    'nframes',
    'pitch_mean',
    'pitch_start',
    'pitch_end',
    'pitch_high',
    'pitch_low',
    'pitch_slope',
]

# For every aligned token, summarise the F0 samples in its frame window.
for session, alignment in session_to_alignment.items():
    f0_track = session_to_f0[session]
    recording_mean = sum(f0_track) / len(f0_track)
    session_rows = []
    session_to_features[session] = session_rows

    for token, sframe, nframes in alignment:
        # F0 samples in [sframe, sframe + nframes), centred on the mean of
        # the whole recording.
        centred = [f0 - recording_mean for f0 in f0_track[sframe:sframe + nframes]]
        n_samples = len(centred)

        window_mean = sum(centred) / n_samples
        first_value = centred[0]
        last_value = centred[-1]

        session_rows.append(
            CREATE_FEATURE_VECTOR(
                token=token,
                sframe=sframe,
                nframes=nframes,
                pitch_mean=window_mean,
                pitch_start=first_value,
                pitch_end=last_value,
                pitch_high=max(centred),
                pitch_low=min(centred),
                # end-to-end change divided by the number of samples in the window
                pitch_slope=(last_value - first_value) / n_samples,
            )
        )

## Align readings with recstrings

In [11]:
# Expected token sequence for item 330 with a '<pause>' slot before the first
# word, between every pair of words, and after the last word.
recstring_330_words = item_number_to_recstring[330].split()
recstring_330_joined = ' <pause> '.join(recstring_330_words)
recstring_330_with_pauses = ' '.join(['<pause>', recstring_330_joined, '<pause>']).split()
recstring_330_with_pauses[:5]

['<pause>', 'sam', '<pause>', 'and', '<pause>']

We use a naive `diff` algorithm (`difflib.Differ`) to align readings with recstrings. This method does not capture self-corrections.

In [12]:
differ = difflib.Differ()

session_to_naive_alignment = {}
for session, features in session_to_features.items():
    transcribed_tokens = [row['token'] for row in features]

    # One bucket per expected slot; `slot` is the bucket currently being filled.
    n_slots = len(recstring_330_with_pauses)
    naive_alignment = [[] for _ in range(n_slots)]
    slot = 0

    for diff_line in differ.compare(transcribed_tokens, recstring_330_with_pauses):
        if slot == n_slots:
            # Every expected slot is closed; any remaining diff lines are ignored.
            break

        marker, aligned_token = diff_line[0], diff_line[2:]
        if marker == '?':
            # intraline hint emitted by Differ, not a token
            continue

        if marker == '+':
            # in full recstring but not in transcribed tokens (often, dummy
            # <pause>s): close the slot without adding anything
            slot += 1
        elif marker == ' ':
            # token present on both sides: record it and close the slot
            naive_alignment[slot].append(aligned_token)
            slot += 1
        else:
            # '-': in transcribed tokens but not in full recstring (child added
            # words); keep it in the slot that is still open
            naive_alignment[slot].append(aligned_token)

    session_to_naive_alignment[session] = naive_alignment

Here's what this naive alignment looks like:

In [13]:
# Show each expected slot next to the transcribed tokens aligned to it,
# for session 30908.
alignment_30908 = session_to_naive_alignment[30908]
for expected_token, aligned_tokens in zip(recstring_330_with_pauses, alignment_30908):
    print(expected_token, aligned_tokens)

<pause> ['<pause>']
sam ['sam']
<pause> []
and ['and']
<pause> []
jo ['jo']
<pause> ['<pause>']
went ['went']
<pause> []
for ['for']
<pause> ['<pause>']
a ['a']
<pause> []
hike ['hike']
<pause> ['<pause>']
they ['they']
<pause> ['<pause>']
took ['took', 'a', 'path', '<pause>', 'through', 'the', 'woods', '<pause>', '<pause>', 'their', 'heads', '<pause>', '<pause>', '<pause>', 'through', 'the', '<pause>', '<pause>', 'they', 'took']
<pause> ['<pause>']
a ['a']
<pause> ['<pause>']
path ['path']
<pause> ['<pause>']
through ['through']
<pause> ['<pause>']
the ['their']
<pause> ['<pause>']
woods ['mother']
<pause> []
suddenly []
<pause> []
sam []
<pause> ['<pause>']
heard ['heard']
<pause> []
a ['a']
<pause> []
noise ['noise']
<pause> []
coming ['coming']
<pause> ['<pause>']
from ['from']
<pause> []
the ['the']
<pause> []
tree ['tree']
<pause> ['<pause>']
above ['above']
<pause> ['<pause>']
their ['their']
<pause> []
heads ['heads']
<pause> []
jo ['jo']
<pause> []
climbed []
<pause> []
up []


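Now we recompile the performance matrix based on this alignment, producing exactly one row per expected token (including the `<pause>` slots). Note that:

- An expected token with no aligned tokens gets a placeholder row of default features (empty token, zeros everywhere).
- An expected token with exactly one aligned token keeps that token's features.
- An expected token with several aligned tokens gets a single merged row: `sframe` comes from the first token in the group, `nframes` is the sum over the group, and the pitch features are those of the last token in the group. The row's `token` is the expected token if the last aligned token matches it, and otherwise the whole group joined by spaces.

Also, add a `matches_expected` flag that is 1 if the token in the naive alignment matches the expected token in the recstring and 0 otherwise. Rows merged from several tokens are always flagged 0.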

In [14]:
# Recompile every session into exactly one feature row per slot of
# `recstring_330_with_pauses`, driven by the naive alignment:
#   * empty group     -> placeholder row of defaults (nothing aligned to the slot)
#   * one token       -> a copy of that token's features, with `matches_expected`
#                        set to 1 if the token equals the expected token, else 0
#   * several tokens  -> one merged row spanning the whole group
# Rows have the keys produced by CREATE_FEATURE_VECTOR.

N_FEATURES = len(new_feature_names)

new_compiled_features = {}
for session, naive_alignment in session_to_naive_alignment.items():
    features = session_to_features[session]
    recompiled_features = []
    # Index into `features` of the first transcribed token not yet consumed.
    previous_features_matrix_row_idx = 0

    for expected_token, aligned_token_group in zip(recstring_330_with_pauses, naive_alignment):
        num_tokens = len(aligned_token_group)

        if num_tokens == 0:
            recompiled_features.append(CREATE_FEATURE_VECTOR(matches_expected=0))
            continue

        # Feature rows of the transcribed tokens that belong to this slot.
        features_for_token_group = [
            features[i]
            for i in range(previous_features_matrix_row_idx, previous_features_matrix_row_idx + num_tokens)
        ]
        previous_features_matrix_row_idx += num_tokens

        if num_tokens == 1:
            is_correct_token = 1 if aligned_token_group[0] == expected_token else 0
            # Copy instead of writing into the row: the dicts in
            # `session_to_features` are shared, and mutating them here would
            # silently change the earlier cell's data.
            recompiled_features.append(
                dict(features_for_token_group[0], matches_expected=is_correct_token)
            )
            continue

        # Several tokens were aligned to this slot.
        features_for_last_token = features_for_token_group[-1]
        if aligned_token_group[-1] == expected_token:
            token = expected_token
        else:
            token = ' '.join(aligned_token_group)

        # NOTE(review): merged rows are always flagged matches_expected=0, even
        # when the last token equals the expected token -- confirm this is intended.
        new_feature_to_add = CREATE_FEATURE_VECTOR(
            token=token,
            matches_expected=0,
            # the merged row starts where the first token of the group starts
            sframe=features_for_token_group[0]['sframe'],
            nframes=sum(f['nframes'] for f in features_for_token_group),
            # pitch-related features match those of the last token, even if it
            # is not the expected token
            pitch_mean=features_for_last_token['pitch_mean'],
            pitch_start=features_for_last_token['pitch_start'],
            pitch_end=features_for_last_token['pitch_end'],
            pitch_high=features_for_last_token['pitch_high'],
            pitch_low=features_for_last_token['pitch_low'],
            pitch_slope=features_for_last_token['pitch_slope'],
        )
        recompiled_features.append(new_feature_to_add)

    new_compiled_features[session] = recompiled_features

In [15]:
FILENAME_PREFIX = '20190720_child_readings_'

# Write one tab-separated matrix per session; rows are labelled with the
# expected tokens (including the '<pause>' slots).
for session, features in new_compiled_features.items():
    output_path = 'output/performance_matrix/' + FILENAME_PREFIX + str(session) + '.tsv'
    pd.DataFrame(features, index=recstring_330_with_pauses).to_csv(output_path, sep='\t')