In [1]:
import os
import numpy as np
import pandas as pd
import pickle as pkl
from collections import defaultdict

In [2]:
# Load the punctuation metrics table; a later cell reads its `item` and
# `perfmat_idx` columns.
gold_punct_path = 'gold_punct_metrics_updated.csv'
gold_punct_df = pd.read_csv(gold_punct_path)

In [3]:
# Map each item to the perfmat row indices listed for it in gold_punct_df,
# in the order the rows appear in the frame.
#
# The two columns are iterated directly rather than via DataFrame.iterrows():
# iterrows packs every row into a single Series and therefore does not preserve
# per-column dtypes (integer indices can come back as floats when the frame
# also holds float columns), and a float index cannot be used with `.iloc`.
item_to_pause_idxs = defaultdict(list)
for item, perfmat_idx in zip(gold_punct_df['item'], gold_punct_df['perfmat_idx']):
    item_to_pause_idxs[item].append(perfmat_idx)

In [4]:
def get_average_pause_length_over_span(perfmat):
    """Summarise the pause durations found in a performance matrix.

    The first and last rows of ``perfmat`` are discarded. Then, for as long as
    the leading (respectively trailing) remaining row has ``nframes == 0``, two
    rows are removed from that end. Of what is left, every second row starting
    with the second one is inspected, and it counts as a pause when its
    ``token`` stringifies to ``'<pause>'`` or ``'nan'``.

    Args:
        perfmat: DataFrame providing ``token`` and ``nframes`` columns.

    Returns:
        ``(median, mean)`` of ``nframes`` over the pauses found, or ``(0, 0)``
        when there are none.
    """
    # A missing (NaN) token stringifies to 'nan'.
    pause_tokens = {'<pause>', 'nan'}

    # (token, nframes) for each row, leaving out the first and the last one.
    span = [(entry.token, entry.nframes) for _, entry in perfmat.iterrows()][1:-1]

    # Trim from the front, two rows at a time, while the first row is empty.
    while span and span[0][1] == 0:
        del span[:2]
    # Same from the back.
    while span and span[-1][1] == 0:
        del span[-2:]

    durations = [nframes for token, nframes in span[1::2] if str(token) in pause_tokens]
    if not durations:
        return 0, 0
    return np.median(durations), np.mean(durations)

In [20]:
# Per-item directories are expected to be named "<item>-184" (inferred from the
# suffix handling below -- TODO confirm what 184 stands for).
ITEM_DIR_SUFFIX = '-184'
# A row counts as a pause when str(token) is one of these; a missing (NaN)
# token stringifies to 'nan'.
PAUSE_TOKENS = {'nan', '<pause>'}

# session name -> list of (item, pause-data dict), one entry per response file.
session_to_data = defaultdict(list)

perfmat_dirs = '../output/performance-matrix/all-fixed/'
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# per-session lists the same on every run.
for perfmat_dir in sorted(os.listdir(perfmat_dirs)):
    if not perfmat_dir.endswith(ITEM_DIR_SUFFIX):
        continue
    # Remove only the trailing suffix; str.replace would also remove the same
    # substring if it occurred elsewhere in the name.
    item = int(perfmat_dir[:-len(ITEM_DIR_SUFFIX)])
    # .get() avoids inserting empty entries into the defaultdict for items that
    # have no gold pause positions.
    pause_idxs = item_to_pause_idxs.get(item, [])
    item_dir = os.path.join(perfmat_dirs, perfmat_dir)

    for perfmat_file in sorted(os.listdir(item_dir)):
        perfmat = pd.read_csv(os.path.join(item_dir, perfmat_file), sep='\t')

        # For every gold position: the row at idx is the word, the row right
        # after it (idx + 1) is where the pause is looked for.
        word_rows = [perfmat.iloc[idx] for idx in pause_idxs]
        pause_rows = [perfmat.iloc[idx + 1] for idx in pause_idxs]

        # Span-level averages are computed with the rows at the gold pause
        # positions removed. read_csv yields a default integer index, so the
        # labels passed to drop() coincide with the positions used above.
        span_perfmat = perfmat.drop(index=[idx + 1 for idx in pause_idxs])
        median_span_pause_length, mean_span_pause_length = (
            get_average_pause_length_over_span(span_perfmat)
        )

        response_pause_data = {
            'pause_len': [row.nframes for row in pause_rows],
            # NOTE(review): plain truthiness -- a NaN in matches_expected would
            # count as "correct"; confirm the column is always boolean.
            'correct_word_read': [1 if row.matches_expected else 0 for row in word_rows],
            'is_pause': [1 if str(row.token) in PAUSE_TOKENS else 0 for row in pause_rows],
            'median_span_pause_len': median_span_pause_length,
            'mean_span_pause_len': mean_span_pause_length
        }

        session_to_data[perfmat_file.replace('.tsv', '')].append((item, response_pause_data))

In [28]:
# Serialize session_to_data to disk with pickle.
output_pkl_path = '../data/moby/pkls/session_to_pause_length_data-updated.pkl'
with open(output_pkl_path, 'wb') as pkl_file:
    pkl.dump(session_to_data, pkl_file)