# Compile machine scores

In [1]:
import os
import json
import constituency_retrieval_util

import pandas as pd

from scipy import spatial

In [2]:
# Mapping read from JSON; indexed below as [item number][level] -> index sets.
constituency_ids_path = 'data/moby/jsons/item_to_level_to_constituency_ids.json'
with open(constituency_ids_path) as constituency_ids_file:
    item_to_level_to_constituency_ids = json.load(constituency_ids_file)

In [25]:
# Item number being scored. Every path below is derived from it so that the
# inputs and the output cannot drift out of sync when the item is changed.
ITEM_NUMBER = '2202'

# Directories of per-session, tab-separated performance matrices.
CHILD_PERFORMANCE_DIR = 'output/performance-matrix/' + ITEM_NUMBER + '_child_updated_20200907/'
GOLD_PERFORMANCE_DIR = 'output/performance-matrix/' + ITEM_NUMBER + '_gold_updated_20200907'

# Mean word durations (JSON files keyed by session name).
CHILD_SESSION_TO_MEAN_WORD_DURATION = 'data/moby/mean-word-frame-duration/' + ITEM_NUMBER + '-child.json'
GOLD_SESSION_TO_MEAN_WORD_DURATION = 'data/moby/mean-word-frame-duration/' + ITEM_NUMBER + '-gold.json'

# Where the compiled machine scores are written.
OUTPUT_SUFFIX = '_20200907.json'
output_path = 'output/performance-scores/' + ITEM_NUMBER + OUTPUT_SUFFIX

In [26]:
def _load_session_dfs(performance_dir):
    """Read every tab-separated performance file found in *performance_dir*.

    Args:
        performance_dir: directory containing one TSV file per session.

    Returns:
        dict mapping session name (the file name up to its first '.') to the
        DataFrame parsed from that file.

    Hidden entries (names starting with '.') and anything that is not a
    regular file are skipped, so stray files such as ``.DS_Store`` or an
    ``.ipynb_checkpoints`` folder neither crash ``read_csv`` nor create a
    bogus ``''`` session.  Entries are visited in sorted order because
    ``os.listdir`` returns them in arbitrary order; sorting keeps the
    resulting dict (and everything derived from it) reproducible.
    """
    session_to_df = {}
    for file_name in sorted(os.listdir(performance_dir)):
        file_path = os.path.join(performance_dir, file_name)
        if file_name.startswith('.') or not os.path.isfile(file_path):
            continue
        session = file_name.split('.')[0]
        session_to_df[session] = pd.read_csv(file_path, sep='\t')
    return session_to_df


session_to_child_df = _load_session_dfs(CHILD_PERFORMANCE_DIR)
session_to_gold_df = _load_session_dfs(GOLD_PERFORMANCE_DIR)

In [27]:
# Session name -> mean word duration, for the child recordings.
with open(CHILD_SESSION_TO_MEAN_WORD_DURATION) as child_durations_file:
    child_session_to_mean_word_duration = json.load(child_durations_file)

# Same mapping for the gold recordings.
with open(GOLD_SESSION_TO_MEAN_WORD_DURATION) as gold_durations_file:
    gold_session_to_mean_word_duration = json.load(gold_durations_file)

In [28]:
def center_word_durations(session_to_dfs, session_to_mean_word_duration):
    """Add a ``centered_nframes`` column to every session DataFrame, in place.

    For rows whose ``token`` is a word, the new column holds ``nframes`` minus
    the session's mean word duration.  Rows whose token is ``<pause>`` or is
    missing (NaN, which stringifies to ``'nan'``) keep their raw ``nframes``.

    Args:
        session_to_dfs: dict mapping session name to a DataFrame that has
            ``token`` and ``nframes`` columns.  Each DataFrame is mutated.
        session_to_mean_word_duration: dict mapping the same session names to
            that session's mean word duration.

    Returns:
        None; the DataFrames are modified in place.

    Raises:
        KeyError: if a session is missing from session_to_mean_word_duration.
    """
    uncentered_tokens = ('<pause>', 'nan')
    for session, df in session_to_dfs.items():
        # The mean is truncated to an int before centering.
        mean_word_duration = int(session_to_mean_word_duration[session])
        df['centered_nframes'] = [
            nframes if str(token) in uncentered_tokens else nframes - mean_word_duration
            for token, nframes in zip(df['token'], df['nframes'])
        ]

# Add the centered_nframes column to the child sessions, then to the gold ones.
center_word_durations(
    session_to_dfs=session_to_child_df,
    session_to_mean_word_duration=child_session_to_mean_word_duration,
)
center_word_durations(
    session_to_dfs=session_to_gold_df,
    session_to_mean_word_duration=gold_session_to_mean_word_duration,
)

In [29]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity of v1 and v2, i.e. 1 minus SciPy's cosine distance."""
    cosine_distance = spatial.distance.cosine(v1, v2)
    return 1 - cosine_distance

In [30]:
# categories = [
#     'pitch_slope_score_lower_constituents_with_pauses',
#     'pitch_slope_score_lower_constituents_no_pauses',
#     'pitch_slope_score_middle_constituents_with_pauses',
#     'pitch_slope_score_middle_constituents_no_pauses',
#     'pitch_slope_score_higher_constituents_with_pauses',
#     'pitch_slope_score_higher_constituents_no_pauses',
    
#     'duration_score_lower_constituents_with_pauses',
#     'duration_score_lower_constituents_no_pauses',
#     'duration_score_middle_constituents_with_pauses',
#     'duration_score_middle_constituents_no_pauses',
#     'duration_score_higher_constituents_with_pauses',
#     'duration_score_higher_constituents_no_pauses',
# ]

In [31]:
def compute_average_score_from_index_sets(child_column, gold_column, indices_list):
    """Average the child/gold cosine similarity over several index sets.

    For every index set, the values at those positions are taken from both
    columns and compared with ``cosine_similarity``; the mean of those
    similarities is returned.

    Args:
        child_column: positionally indexable sequence of child values.
        gold_column: positionally indexable sequence of gold values.
        indices_list: list of index sets (each an iterable of positions).

    Returns:
        The mean similarity, or NaN when ``indices_list`` is empty.  NaN is
        used rather than raising ZeroDivisionError so that a level with no
        constituencies is handled like any other undefined score.
    """
    if len(indices_list) == 0:
        return float('nan')

    similarities = [
        cosine_similarity(
            [child_column[i] for i in indices],
            [gold_column[i] for i in indices],
        )
        for indices in indices_list
    ]
    return sum(similarities) / len(similarities)

In [32]:
# Flatten the per-level index sets of the current item into a single list.
constituency_indices = []
for level_index_sets in item_to_level_to_constituency_ids[ITEM_NUMBER].values():
    constituency_indices.extend(level_index_sets)

# Bucket the constituencies by size: up to 3 indices -> lower,
# 4 or 5 -> middle, 6 or more -> higher.
lower_level_constituencies = []
middle_level_constituencies = []
higher_level_constituencies = []
for constituency in constituency_indices:
    size = len(constituency)
    if size <= 3:
        lower_level_constituencies.append(constituency)
    elif size < 6:
        middle_level_constituencies.append(constituency)
    else:
        higher_level_constituencies.append(constituency)

In [33]:
# Run every constituency through the retrieval helper.
# NOTE(review): presumably this expands each index set so that it also covers
# the pause rows - confirm against constituency_retrieval_util.
add_pause_indices = constituency_retrieval_util.get_full_indices_includings_pauses

lower_level_constituencies_with_pauses = list(map(add_pause_indices, lower_level_constituencies))
middle_level_constituencies_with_pauses = list(map(add_pause_indices, middle_level_constituencies))
higher_level_constituencies_with_pauses = list(map(add_pause_indices, higher_level_constituencies))

In [34]:
def _is_nan(score):
    """Return True if *score* is NaN (the only value not equal to itself)."""
    return score != score


# One (output key, DataFrame column, constituency index sets) entry per
# reported score, listed in the order the keys appear in the output:
# pitch slope first, then duration; within each, lower/middle/higher level,
# each with and without pauses.
score_specs = []
for metric, column in (
    ('pitch_slope', 'pitch_slope'),
    ('duration', 'centered_nframes'),
):
    for level, index_sets_with_pauses, index_sets_no_pauses in (
        ('lower', lower_level_constituencies_with_pauses, lower_level_constituencies),
        ('middle', middle_level_constituencies_with_pauses, middle_level_constituencies),
        ('higher', higher_level_constituencies_with_pauses, higher_level_constituencies),
    ):
        key_prefix = 'best_{}_score_{}_constituents_'.format(metric, level)
        score_specs.append((key_prefix + 'with_pauses', column, index_sets_with_pauses))
        score_specs.append((key_prefix + 'no_pauses', column, index_sets_no_pauses))

score_columns = ('pitch_slope', 'centered_nframes')

# Without any gold session there is nothing to compare against; fail loudly
# instead of reporting a score of 0 for every child.
if session_to_child_df and not session_to_gold_df:
    raise ValueError('no gold sessions available to score the child sessions against')

# The gold columns are identical for every child session, so convert them to
# plain lists (positional indexing) once, outside the child loop.
gold_columns_by_session = [
    {column: list(gold_df[column]) for column in score_columns}
    for gold_df in session_to_gold_df.values()
]

child_session_to_machine_scores = {}

for child_session, child_df in session_to_child_df.items():
    child_columns = {column: list(child_df[column]) for column in score_columns}

    machine_scores = {}
    for score_key, column, index_sets in score_specs:
        # Score this child against every gold session.
        scores = [
            compute_average_score_from_index_sets(
                child_columns[column],
                gold_columns[column],
                index_sets,
            )
            for gold_columns in gold_columns_by_session
        ]
        # Drop undefined (NaN) scores *before* taking the best one: max() over
        # a list containing NaN depends on where the NaN sits, so a single
        # undefined comparison could otherwise mask valid scores from other
        # gold sessions.  If no comparison is defined, the score is 0.
        valid_scores = [score for score in scores if not _is_nan(score)]
        machine_scores[score_key] = max(valid_scores, default=0)

    child_session_to_machine_scores[child_session] = machine_scores

In [35]:
# Create the output directory on demand so a completed scoring run is not
# lost to a missing folder at write time.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the child session -> machine scores mapping as JSON.
with open(output_path, 'w') as f:
    json.dump(child_session_to_machine_scores, f)