In [9]:
import sys, os, json, termcolor
import numpy as np, pandas as pd

from collections import *

In [10]:
def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object."""
    # JSON text is UTF-8 by specification; pin the codec so decoding does not
    # silently depend on the platform's locale default.
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# item -> passage text (later split on whitespace into tokens).
item_to_passage = _load_json('../data/moby/jsons/item_to_passage_new.json')
# item -> whitespace-separated string, indexed in parallel with the passage tokens.
item_to_recstring = _load_json('../data/moby/jsons/item_to_recstring_new.json')
# item -> form name (used as the key into the threshold dictionaries).
item_to_form = _load_json('../data/moby/jsons/item_to_form.json')
# item -> sequence of flags, one per non-final punctuation location.
item_to_expected_pauses_mask = _load_json('item_to_expected_pauses_mask.json')

In [11]:
# A passage token counts as punctuation-final when its last character is one of these.
target_punct = set('.,?!')

In [13]:
# For every item, record each passage token that ends in target punctuation as
# (token position, passage token, recstring token at the same position).
# Double quotes are stripped from the passage before tokenising, and the
# abbreviation 'Ms.' is not treated as punctuation-final.
# Items without any such token get no entry at all.
punct_hits_by_item = {}
for item, passage in item_to_passage.items():
    rec_tokens = item_to_recstring[item].split()
    passage_tokens = passage.replace('"', '').split()
    hits = [
        (position, word, rec_tokens[position])
        for position, word in enumerate(passage_tokens)
        if word != 'Ms.' and word[-1] in target_punct
    ]
    if hits:
        punct_hits_by_item[item] = hits
item_to_punctuation_indices = punct_hits_by_item

In [36]:
_SEASONS = ('Fall', 'Winter', 'Spring')


def _expand_by_form(per_grade):
    """Flatten {grade: (fall, winter, spring)} into {'Grade<g><Season>': value}."""
    return {
        'Grade{}{}'.format(grade, season): value
        for grade, season_values in per_grade.items()
        for season, value in zip(_SEASONS, season_values)
    }


# Per-form "red" thresholds; each row lists (Fall, Winter, Spring) for one grade.
eval_thresholds_red = _expand_by_form({
    1: (0.5, 0.4, 0.4),
    2: (0.4, 0.4, 0.4),
    3: (0.4, 0.4, 0.3),
    4: (0.3, 0.3, 0.3),
})
# Per-form "yellow" thresholds, same layout as above.
eval_thresholds_yellow = _expand_by_form({
    1: (0.3, 0.3, 0.3),
    2: (0.2, 0.2, 0.2),
    3: (0.2, 0.2, 0.2),
    4: (0.2, 0.2, 0.2),
})

In [43]:
# Constants written unchanged on every row of the per-item table.
PAUSE_SIGNIF_THRESHOLD = 30
CHILD_NORM_SIGNIF_THRESHOLD = 10
PASSAGE_CORRECTNESS_THRESHOLD = 0.5

# Parallel lists: position k in every list describes the same item.
list_item = []
list_form = []
list_prepunct_indices = []
list_prepunct_tokens = []
list_signif_indices = []
list_signif_tokens = []
list_red_threshold = []
list_yellow_threshold = []
list_pause_signif_threshold = []
list_child_norm_signif_threshold = []
list_passage_correctness_threshold = []

for item, idxs in item_to_punctuation_indices.items():
    # The last punctuation location of each item is excluded throughout.
    idxs_no_final = idxs[:-1]
    mask = item_to_expected_pauses_mask[item]

    # Validate BEFORE the mask is used: a mask that is too short would
    # otherwise surface as a bare IndexError, and a mask that is too long
    # would only be caught after the filtering had already run.
    assert len(mask) == len(idxs) - 1, item

    # Resolve everything that can fail before appending anything, so the
    # parallel lists never end up with different lengths.
    form = item_to_form[item]
    red_threshold = eval_thresholds_red[form]
    yellow_threshold = eval_thresholds_yellow[form]

    # Locations whose mask flag is truthy.
    signif = [entry for entry, keep in zip(idxs_no_final, mask) if keep]

    list_item.append(item)
    list_form.append(form)

    # Each entry is (token position, passage token, recstring token); the
    # comma-joined cells carry the position and the recstring token.
    list_prepunct_indices.append(','.join(str(entry[0]) for entry in idxs_no_final))
    list_prepunct_tokens.append(','.join(entry[-1] for entry in idxs_no_final))

    list_signif_indices.append(','.join(str(entry[0]) for entry in signif))
    list_signif_tokens.append(','.join(entry[-1] for entry in signif))

    list_red_threshold.append(red_threshold)
    list_yellow_threshold.append(yellow_threshold)

    list_pause_signif_threshold.append(PAUSE_SIGNIF_THRESHOLD)
    list_child_norm_signif_threshold.append(CHILD_NORM_SIGNIF_THRESHOLD)
    list_passage_correctness_threshold.append(PASSAGE_CORRECTNESS_THRESHOLD)

In [46]:
# Disabled export: when uncommented, this assembles the parallel list_* columns
# into a DataFrame and writes it, tab-separated and without the index, to
# 'prepunct_table_with_signif.tsv'.
# pd.DataFrame({
#     'Item': list_item,
#     'Form': list_form,
#     'PrePunctTokens': list_prepunct_tokens,
#     'PrePunctTokenIndices': list_prepunct_indices,
#     'SignifTokens': list_signif_tokens,
#     'SignifTokenIndices': list_signif_indices,
#     'RedThreshold': list_red_threshold,
#     'YellowThreshold': list_yellow_threshold,
#     'PauseSignificanceThreshold': list_pause_signif_threshold,
#     'ChildNormalizedPauseSignificanceThreshold': list_child_norm_signif_threshold,
#     'PassageCorrectnessThreshold': list_passage_correctness_threshold
# }).to_csv('prepunct_table_with_signif.tsv', sep='\t', index=None)

In [5]:
def get_metrics_for_response(table, idx):
    """Read the pause and pitch measurements surrounding row ``idx`` of ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Must provide ``nframes`` and ``pitch_mean`` columns. Rows are read by
        position (``iloc``).
    idx : int
        Positional row of the token of interest.

    Returns
    -------
    dict
        ``pause_length_after_token``
            ``nframes`` of row ``idx + 1``.
        ``ratio_before``
            ``pitch_mean[idx - 4] / pitch_mean[idx - 2]``; 0 when the ratio is NaN.
        ``ratio_after``
            ``pitch_mean[idx + 2] / pitch_mean[idx + 4]``; 0 when the ratio is NaN.
        ``ratio_before_max``
            Larger of the two pitch means before ``idx`` (a pitch value, not
            a ratio, despite the key name).
        ``ratio_after_max``
            Larger of the two pitch means after ``idx`` (likewise not a ratio).

    Raises
    ------
    IndexError
        If ``idx + 4`` lies past the last row of ``table``.

    Notes
    -----
    NOTE(review): rows are presumably laid out so that offsets of +/-2 and
    +/-4 land on neighbouring tokens and odd offsets on the gaps between
    them -- confirm against the performance-matrix format.
    NOTE(review): for ``idx < 4`` the negative positions wrap around and read
    from the END of the table instead of failing -- confirm callers never
    pass such an index.
    """
    def pitch_at(offset):
        return table.iloc[idx + offset].pitch_mean

    pause_length_after_token = table.iloc[idx + 1].nframes

    f0_minus_2 = pitch_at(-4)
    f0_minus_1 = pitch_at(-2)
    f0_plus_1 = pitch_at(2)
    f0_plus_2 = pitch_at(4)

    ratio_before = f0_minus_2 / f0_minus_1
    ratio_after = f0_plus_1 / f0_plus_2

    # NaN is the only value that compares unequal to itself. Infinite ratios
    # (division by a zero pitch) are deliberately left as they are.
    if ratio_before != ratio_before:
        ratio_before = 0
    if ratio_after != ratio_after:
        # Previously assigned to a misspelled name, so NaN leaked through.
        ratio_after = 0

    return {
        'pause_length_after_token': pause_length_after_token,
        'ratio_before': ratio_before,
        'ratio_after': ratio_after,
        'ratio_before_max': max(f0_minus_2, f0_minus_1),
        'ratio_after_max': max(f0_plus_1, f0_plus_2),
    }

In [6]:
perfmat_dir = '../output/performance-matrix/all-fixed/'
# Only sub-directories carrying this suffix are processed.
ITEM_DIR_SUFFIX = '-182'

# item -> list of [perfmat row, passage token, recstring token, [metrics dict per response file]],
# ordered by perfmat row.
item_to_punct_location_to_stats = {}

# os.listdir returns entries in arbitrary order; sort so repeated runs visit
# directories and files in the same sequence.
for item_dir in sorted(os.listdir(perfmat_dir)):
    if not item_dir.endswith(ITEM_DIR_SUFFIX):
        continue

    # Strip only the trailing suffix. str.replace would also delete any
    # earlier occurrence of the same text inside the item name.
    item = item_dir[:-len(ITEM_DIR_SUFFIX)]
    punct_indices = item_to_punctuation_indices[item]
    item_path = os.path.join(perfmat_dir, item_dir)

    punct_location_to_stats = defaultdict(list)

    for perfmat_file in sorted(os.listdir(item_path)):
        if not perfmat_file.endswith('tsv'):
            continue
        perf_mat = pd.read_csv(os.path.join(item_path, perfmat_file), sep='\t')

        # The final punctuation location of each item is skipped.
        for token_idx, passage_token, rec_token in punct_indices[:-1]:
            # Token i of the passage is expected on row 2*i + 1 of the matrix.
            row = token_idx * 2 + 1

            # Check the alignment BEFORE reading any metrics, so a misaligned
            # file is reported as such rather than producing stray values or
            # an unrelated indexing error.
            found = perf_mat.iloc[row]['Unnamed: 0']
            assert found == rec_token, (item, perfmat_file, row, found, rec_token)

            punct_location_to_stats[(row, passage_token, rec_token)].append(
                get_metrics_for_response(perf_mat, row))

    item_to_punct_location_to_stats[item] = sorted(
        ([*location, stats] for location, stats in punct_location_to_stats.items()),
        key=lambda entry: entry[0])

  # Remove the CWD from sys.path while we load stuff.
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


In [7]:
def make_stats_table(item):
    """Build a DataFrame with one row per punctuation location of ``item``.

    Reads ``item_to_punct_location_to_stats[item]`` (entries of the form
    ``[perfmat row, passage token, recstring token, list of metric dicts]``)
    and ``item_to_form[item]``.

    Columns, in order:
      * ``form``, ``item`` -- repeated on every row;
      * ``perfmat_idx`` -- first field of the entry;
      * ``token`` -- third field of the entry;
      * ``punct`` -- last character of the second field;
      * ``pause_*`` -- summary of ``pause_length_after_token``;
      * ``f0_before_max_med`` / ``f0_after_max_med`` -- medians of
        ``ratio_before_max`` / ``ratio_after_max``;
      * ``f0ratio_before_*`` / ``f0ratio_after_*`` -- summaries of
        ``ratio_before`` / ``ratio_after`` after discarding NaN and
        infinite values.

    Each summary consists of min, max, 1q (25th percentile), 3q (75th
    percentile), med, mean and sstd (standard deviation with ``ddof=1``).
    """
    suffixes = ('min', 'max', '1q', '3q', 'med', 'mean', 'sstd')

    def summarize(sample):
        """Return the seven summary statistics of ``sample`` keyed by column suffix."""
        return {
            'min': np.min(sample),
            'max': np.max(sample),
            '1q': np.percentile(sample, 25),
            '3q': np.percentile(sample, 75),
            'med': np.median(sample),
            'mean': np.mean(sample),
            'sstd': np.std(sample, ddof=1),
        }

    def as_columns(prefix, summaries):
        """Turn a list of per-location summaries into ``prefix_<suffix>`` columns."""
        return {
            prefix + '_' + suffix: [summary[suffix] for summary in summaries]
            for suffix in suffixes
        }

    punct_data = item_to_punct_location_to_stats[item]

    pause_summaries = []
    before_summaries = []
    after_summaries = []
    before_peak_medians = []
    after_peak_medians = []

    for _, _, _, responses in punct_data:
        pauses = [r['pause_length_after_token'] for r in responses]
        before_raw = np.array([r['ratio_before'] for r in responses])
        after_raw = np.array([r['ratio_after'] for r in responses])
        before_peaks = np.array([r['ratio_before_max'] for r in responses])
        after_peaks = np.array([r['ratio_after_max'] for r in responses])

        # np.isfinite is False for NaN as well as for +/-inf.
        before = [value for value in before_raw if np.isfinite(value)]
        after = [value for value in after_raw if np.isfinite(value)]

        pause_summaries.append(summarize(pauses))
        before_peak_medians.append(np.median(before_peaks))
        after_peak_medians.append(np.median(after_peaks))
        before_summaries.append(summarize(before))
        after_summaries.append(summarize(after))

    columns = {
        'form': item_to_form[item],
        'item': item,
        'perfmat_idx': [entry[0] for entry in punct_data],
        'token': [entry[2] for entry in punct_data],
        'punct': [entry[1][-1] for entry in punct_data],
    }
    columns.update(as_columns('pause', pause_summaries))
    columns['f0_before_max_med'] = before_peak_medians
    columns['f0_after_max_med'] = after_peak_medians
    columns.update(as_columns('f0ratio_before', before_summaries))
    columns.update(as_columns('f0ratio_after', after_summaries))

    return pd.DataFrame(columns)

In [8]:
# One summary table per item, stacked into a single frame with a fresh 0..n-1 index.
stats_dfs = list(map(make_stats_table, item_to_punct_location_to_stats))
big_df = pd.concat(stats_dfs, ignore_index=True)

In [9]:
# Order the rows by form, then item, renumber them, and write the result
# (without the index column) to CSV.
sorted_metrics = big_df.sort_values(['form', 'item']).reset_index(drop=True)
sorted_metrics.to_csv('gold_punct_metrics_updated.csv', index=False)