# Compiling performance matrix

This notebook should be self-contained.

In [138]:
import string
import pandas as pd
import json
import difflib
from collections import defaultdict

**For each item, load the recstring and the token-by-token representation.**

In [76]:
# Recstrings: this file has a header row; only the 'Item' and 'Rec String' columns are used here.
df_full_texts_and_recstrings = pd.read_csv(
    'data/moby-passages-36/passages-with-line-break-and-recstring.tsv',
    sep='\t',
)
item_number_to_recstring = dict(
    zip(df_full_texts_and_recstrings['Item'], df_full_texts_and_recstrings['Rec String'])
)

# Passages with layout markers: column names are supplied explicitly via `names`.
df_text_with_line_breaks = pd.read_csv(
    'data/moby-passages-36/passages-with-line-breaks.tsv',
    sep='\t',
    names=['Item', 'Passage'],
)
raw_passage_by_item = dict(
    zip(df_text_with_line_breaks['Item'], df_text_with_line_breaks['Passage'])
)

item_number_to_text_with_line_breaks = {}
for item_number, raw_passage in raw_passage_by_item.items():
    # Keep only the text before the first '#'.
    passage = raw_passage[:raw_passage.index('#')]
    # '$$' must be replaced before '$', otherwise every '$$' would become two LINEBREAKs.
    passage = passage.replace('$$', ' PARBREAK ')
    passage = passage.replace('$', ' LINEBREAK ')
    # Spell out the numeral so the passage tokens match the recstring wording.
    passage = passage.replace(' 45 ', ' forty five ')
    item_number_to_text_with_line_breaks[item_number] = passage

Sanity check that the token-by-token representation minus line/paragraph breaks matches the recstring.

In [77]:
# Spot-check one preprocessed passage (item 310) with its PARBREAK / LINEBREAK markers.
item_number_to_text_with_line_breaks[310]

'Meg wanted to bake a pie. She asked her mother for help. PARBREAK They measured the flour, sugar, and butter for the crust.  LINEBREAK Her mother showed her how to roll out the crust. Meg sliced  LINEBREAK the apples and put them in the pie pan. They put the pie  LINEBREAK in the hot oven and waited forty five minutes. PARBREAK As the pie baked, a delicious smell filled the kitchen. When  LINEBREAK the timer rang, they took the pie out of the oven and  LINEBREAK set it on the window sill to cool.'

In [78]:
def get_processed_token(token):
    """Normalize a passage token for comparison with a recstring token.

    Leading and trailing characters found in ``string.punctuation`` are
    removed and the result is lowercased. Punctuation inside the token
    (e.g. the apostrophe in "don't") is kept.

    Parameters
    ----------
    token : str
        A whitespace-delimited token from a passage.

    Returns
    -------
    str
        The stripped, lowercased token. An empty or all-punctuation token
        yields ``''`` instead of raising ``IndexError``.
    """
    # str.strip with a character set removes the same leading/trailing
    # characters as the original one-at-a-time loops, but is safe when
    # nothing is left after stripping.
    return token.strip(string.punctuation).lower()

# Verify, for every item, that the passage tokens (layout markers removed,
# punctuation stripped, lowercased) are exactly the recstring tokens.
for item, text_with_breaks in item_number_to_text_with_line_breaks.items():
    recstring = item_number_to_recstring[item]
    text_with_breaks_processed = [
        get_processed_token(token)
        for token in text_with_breaks.split()
        if token not in ('LINEBREAK', 'PARBREAK')
    ]
    # Name the offending item so a failure can be traced back to its passage.
    assert recstring.split() == text_with_breaks_processed, (
        'Item {}: tokenized passage does not match its recstring'.format(item)
    )

In [81]:
def gen(lst):
    """Return a generator that yields the elements of ``lst`` in order."""
    return (element for element in lst)

**Compile relevant information about a reading first, before aligning to recstring.**

In [90]:
df_alignment_child = pd.read_csv('data/moby-passages-36/data-330-child/Alignment.csv', sep=',')

# Group alignment rows by session, keeping file order within each session.
# Each entry is [word, sframe, nframes].
session_to_alignment = defaultdict(list)
# zip walks the four columns in lock-step; this replaces four separately
# advanced generators that also shadowed the column names.
for session_id, sframe_val, nframes_val, word_val in zip(
    df_alignment_child.session,
    df_alignment_child.sframe,
    df_alignment_child.nframes,
    df_alignment_child.word,
):
    # Any token whose text starts with '<' is collapsed into a single '<pause>' label
    # (presumably these are non-word markers from the aligner -- TODO confirm).
    if word_val[0] == '<':
        word_val = '<pause>'
    session_to_alignment[session_id].append([word_val, sframe_val, nframes_val])

## Process dependencies

Load the per-session $F_0$ tracks, which are used to compute the pitch features below.

In [106]:
df_f0_child = pd.read_csv('data/moby-passages-36/data-330-child/f0_184.csv', sep=',')

# Each 'value' cell is a whitespace-separated string of entries; only the part of
# each entry before the first ':' is kept, parsed as a float.
# NOTE(review): the meaning of the text after ':' is not visible here -- confirm it is safe to drop.
parsed_f0_tracks = []
for raw_track in df_f0_child['value']:
    parsed_f0_tracks.append([float(entry.split(':')[0]) for entry in raw_track.split()])
df_f0_child['value'] = parsed_f0_tracks

In [107]:
# Map each session id to its parsed F0 track (a list of floats).
# Built with dict(zip(...)) like the other lookup tables in this notebook;
# if a session id appears more than once, the last row wins.
session_to_f0 = dict(zip(df_f0_child.session, df_f0_child.value))

## Compile into full data matrix

In [116]:
# session id -> list of per-token feature rows, in alignment order.
session_to_features = {}
# Column names for the rows built below, in the same order as `new_features`.
new_feature_names = [
    'token',
    'sframe',
    'nframes',
    'pitch_mean',
    'pitch_start',
    'pitch_end',
    'pitch_high',
    'pitch_low',
    'pitch_slope'
]

for session, alignment in session_to_alignment.items():
    # Look the track up once per session instead of once per use.
    f0_track = session_to_f0[session]
    # Mean over the whole recording; token-level values are centered on it.
    recording_mean = sum(f0_track) / len(f0_track)
    session_to_features[session] = []
    for token, sframe, nframes in alignment:
        # The F0 track is indexed by frame: take the token's window of frames.
        f0s = f0_track[sframe:sframe + nframes]
        f0s = [x - recording_mean for x in f0s] # center
        if f0s:
            pitch_mean = sum(f0s) / len(f0s)
            pitch_start = f0s[0]
            pitch_end = f0s[-1]
            pitch_high = max(f0s)
            pitch_low = min(f0s)
            # Change from first to last frame, divided by the number of frames.
            pitch_slope = (pitch_end - pitch_start) / len(f0s)
        else:
            # Empty window (zero-length token, or sframe beyond the end of the
            # track). Keep the row so the token sequence stays complete for the
            # later alignment step, but mark every pitch feature as missing
            # instead of crashing on a division by zero / empty-list index.
            pitch_mean = pitch_start = pitch_end = float('nan')
            pitch_high = pitch_low = pitch_slope = float('nan')
        new_features = [
            token,
            sframe,
            nframes,
            pitch_mean,
            pitch_start,
            pitch_end,
            pitch_high,
            pitch_low,
            pitch_slope
        ]
        session_to_features[session].append(new_features)

## Align readings with recstrings

In [140]:
# Reference token sequence for item 330 with a '<pause>' slot before the first
# word, between every pair of words, and after the last word.
recstring_330_words = item_number_to_recstring[330].split()
recstring_330_pause_delimited = '<pause> ' + ' <pause> '.join(recstring_330_words) + ' <pause>'
recstring_330_with_pauses = recstring_330_pause_delimited.split()
recstring_330_with_pauses[:5]

['<pause>', 'sam', '<pause>', 'and', '<pause>']

In [148]:
differ = difflib.Differ()

for session, features in session_to_features.items():
    # First element of every feature row is the transcribed token.
    transcribed_tokens = [x[0] for x in features]

    # Each Differ line is a one-character code, a space, then the token.
    # '?' lines are intraline hints, not tokens, so they are dropped.
    diff = [
        x for x in differ.compare(transcribed_tokens, recstring_330_with_pauses)
        if x[0] != '?'
    ]

    # +: in full recstring but not in transcribed tokens (often, dummy <pause>s)
    # -: in transcribed tokens but not in full recstring (child added words)

    # One slot per recstring position; a slot receives the transcribed token
    # matched to that position, or stays empty when nothing matched.
    naive_alignment = [[] for _ in recstring_330_with_pauses]
    idx = 0
    for diff_line in diff:
        code, diff_token = diff_line[0], diff_line[2:]
        if code == '-':
            # Extra transcribed token: it has no recstring position, so the
            # slot index must NOT advance (advancing here would shift every
            # later match into the wrong slot).
            continue
        if code == ' ':
            naive_alignment[idx].append(diff_token)
        # Both ' ' and '+' lines correspond to exactly one recstring position.
        idx += 1
    # Every recstring position is accounted for by a ' ' or '+' line.
    assert idx == len(naive_alignment)

    # Only the first session is processed for now (work in progress).
    break

  <pause>
  sam
+ <pause>
  and
+ <pause>
  jo
  <pause>
  went
+ <pause>
  for
  <pause>
  a
+ <pause>
  hike
  <pause>
  they
  <pause>
- took
- a
- path
- <pause>
- through
- the
- woods
- <pause>
- <pause>
- their
- heads
- <pause>
- <pause>
- <pause>
- through
- the
- <pause>
- <pause>
- they
  took
  <pause>
  a
  <pause>
  path
  <pause>
  through
  <pause>
- their
+ the
  <pause>
- mother
+ woods
+ <pause>
+ suddenly
+ <pause>
+ sam
  <pause>
  heard
+ <pause>
  a
+ <pause>
  noise
+ <pause>
  coming
  <pause>
  from
+ <pause>
  the
+ <pause>
  tree
  <pause>
  above
  <pause>
  their
+ <pause>
  heads
+ <pause>
  jo
+ <pause>
+ climbed
+ <pause>
+ up
+ <pause>
+ to
+ <pause>
+ see
  <pause>
  what
  <pause>
  the
  <pause>
- and
- jo
- went
- <pause>
- for
- a
- <pause>
- hike
- <pause>
- <pause>
- up
- <pause>
- to
- see
- what
- <pause>
- the
  noise
  <pause>
  was
  <pause>
  and
  <pause>
  found
+ <pause>
  two
+ <pause>
  baby
+ <pause>
  squirrels
  <pause>
  the
+ <pa

In [149]:
# Show the transcribed tokens of the session processed in the alignment cell as one string.
' '.join(transcribed_tokens)

'<pause> sam and jo <pause> went for <pause> a hike <pause> they <pause> took a path <pause> through the woods <pause> <pause> their heads <pause> <pause> <pause> through the <pause> <pause> they took <pause> a <pause> path <pause> through <pause> their <pause> mother <pause> heard a noise coming <pause> from the tree <pause> above <pause> their heads jo <pause> what <pause> the <pause> and jo went <pause> for a <pause> hike <pause> <pause> up <pause> to see what <pause> the noise <pause> was <pause> and <pause> found two baby squirrels <pause> the babies were alone <pause> but their <pause> mother <pause> must be somewhere near <pause> the <pause> children <pause> watched and waited <pause> sure enough the mother soon returned <pause> with <pause> a mouthful of nuts <pause> the <pause> noises <pause> <pause> babies <pause> stopped <pause> as the baby squirrels began to <pause> eat <pause> sam <pause> and <pause> jo smiled knowing <pause> the squirrels were safe <pause> <pause> safe <p

In [153]:
# Reference recstring for item 330, for comparison with the transcription above.
item_number_to_recstring[330]

'sam and jo went for a hike they took a path through the woods suddenly sam heard a noise coming from the tree above their heads jo climbed up to see what the noise was and found two baby squirrels the babies were alone but their mother must be somewhere near the children watched and waited sure enough the mother soon returned with a mouthful of nuts the noises stopped as the baby squirrels began to eat sam and jo smiled knowing the squirrels were safe with their mother'