In [1]:
import sys, os, json
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
with open('../data/moby/jsons/item_to_line_indices.json') as f:
    item_to_line_indices = json.load(f)

In [3]:
test_response_df = pd.read_csv('../output/performance-matrix/all-fixed/1401-184/27877.tsv', sep='\t')

In [4]:
item = '1401'

In [5]:
item_to_line_indices['1401']

[[0, 1, 2, 3, 4],
 [5, 6, 7, 8],
 [9, 10, 11, 12, 13],
 [14, 15, 16, 17, 18],
 [19, 20, 21, 22, 23],
 [24, 25, 26, 27, 28],
 [29, 30, 31, 32, 33],
 [34, 35, 36, 37, 38]]

In [12]:
print('item\tline_indices')
for item, line_indices in item_to_line_indices.items():
    print(item + '\t' + ';'.join([','.join([str(x) for x in line]) for line in line_indices]))

item	line_indices
310	0,1,2,3,4,5,6,7,8,9,10,11;12,13,14,15,16,17,18,19,20,21;22,23,24,25,26,27,28,29,30,31,32,33;34,35,36,37,38,39,40,41,42,43,44,45,46;47,48,49,50,51,52,53,54,55;56,57,58,59,60,61,62,63,64,65,66;67,68,69,70,71,72,73,74,75,76,77,78;79,80,81,82,83,84,85,86
320	0,1,2,3,4,5,6,7,8,9,10,11;12,13,14,15,16,17,18,19,20;21,22,23,24,25,26,27,28,29;30,31,32,33,34,35,36,37,38;39,40,41,42,43,44,45,46,47,48,49;50,51,52,53,54,55,56,57,58,59,60;61,62,63,64,65,66,67,68,69,70;71,72,73,74,75,76,77,78,79,80;81,82,83,84,85,86,87,88
330	0,1,2,3,4,5,6,7,8,9,10,11,12;13,14,15,16,17,18,19,20,21,22;23,24,25,26,27,28,29,30,31,32,33,34;35,36,37,38,39,40,41,42,43,44;45,46,47,48,49,50,51,52,53;54,55;56,57,58,59,60,61,62,63,64,65,66;67,68,69,70,71,72,73,74,75,76;77,78,79,80,81,82,83,84,85,86;87,88
350	0,1,2,3,4,5,6,7,8,9,10;11,12,13,14,15,16,17,18,19,20;21,22,23,24,25,26,27,28,29,30;31,32,33,34,35,36,37,38,39;40,41,42,43,44,45;46,47,48,49,50,51,52,53,54;55,56,57,58,59,60,61,62;63,64,65,66,67,68,69,7

In [68]:
def get_missed_line_nums(test_response_df, item, window_size=3, correct_threshold_in_window=0.66):
    line_num_to_info = {
        i: {
            'missed': 0,
            'hit_beginning': 0,
            'hit_end': 0
        }
        for i in range(len(item_to_line_indices[item]))
    }
    missed = []
    skipped = []
    for line_num, line_indices in enumerate(item_to_line_indices[item]):
        perf_mat_indices = [i * 2 + 1 for i in line_indices]
        
        ####### 'Lenience' for shorter lines
        if len(line_indices) <= window_size: # if a line is no longer than, say, 3 tokens:
            tokens_read = [test_response_df['token'][i]
                           for i in perf_mat_indices
                           if test_response_df['token'][i] == test_response_df['token'][i]] # handle nans?
            tokens_read = ' '.join(tokens_read)
            tokens_read = tokens_read.split()
            expected_tokens = [test_response_df['Unnamed: 0'][i]
                               for i in perf_mat_indices]
#             print(expected_tokens, tokens_read, line_num)
            if any([token in expected_tokens for token in tokens_read]):
                line_num_to_info[line_num]['missed'] = 0
                line_num_to_info[line_num]['hit_beginning'] = 1
                line_num_to_info[line_num]['hit_end'] = 1
                continue
        #######
                
#         if all([test_response_df.matches_expected[i] == 0 for i in perf_mat_indices]) and\
#                 sum(test_response_df.nframes[i] for i in perf_mat_indices) == 0:
        if sum(test_response_df.nframes[i] for i in perf_mat_indices) == 0:
            line_num_to_info[line_num]['missed'] = 1
            
        if np.mean([test_response_df.matches_expected[i]\
                   for i in perf_mat_indices[:window_size]]) > correct_threshold_in_window:
            line_num_to_info[line_num]['hit_beginning'] = 1
        if np.mean([test_response_df.matches_expected[i]\
                   for i in perf_mat_indices[-window_size:]]) > correct_threshold_in_window:
            line_num_to_info[line_num]['hit_end'] = 1
    
        ###### LENIENCE FOR ITEM 2102 Line 5 (sixth line) 'under the ground / underground'
        if item == '2102' and line_num == 5:
            perf_mat_indices_for_line_4 = [i * 2 + 1 for i in item_to_line_indices[item][4]]
            line_4_tokens_read = ' '.join([
                test_response_df.token[i] for i in perf_mat_indices_for_line_4
                if test_response_df.token[i] == test_response_df.token[i] # remove nans
            ])
            
            perf_mat_indices_for_line_6 = [i * 2 + 1 for i in item_to_line_indices[item][6]]
            line_6_tokens_read = ' '.join([
                test_response_df.token[i] for i in perf_mat_indices_for_line_6
                if test_response_df.token[i] == test_response_df.token[i] # remove nans
            ])
            
            if 'underground' in line_4_tokens_read or 'underground' in line_6_tokens_read:
                line_num_to_info[line_num]['missed'] = 0
                line_num_to_info[line_num]['hit_beginning'] = 1
                line_num_to_info[line_num]['hit_end'] = 1
            
    
#     print(line_num_to_info)
    
    for line_num, info in line_num_to_info.items():
        if info['missed']:
            missed.append(line_num)
            
            start_of_skip = line_num
            while (start_of_skip > 0) and line_num_to_info[start_of_skip - 1]['missed']:
                start_of_skip -= 1
            end_of_skip = line_num
            while (end_of_skip < len(line_num_to_info) - 1) and line_num_to_info[end_of_skip + 1]['missed']:
                end_of_skip += 1
            
            if start_of_skip == 0 or end_of_skip == len(line_num_to_info) - 1:
                continue
            if line_num_to_info[start_of_skip - 1]['hit_end'] and\
                line_num_to_info[end_of_skip + 1]['hit_beginning']:
                skipped.append(line_num)
    return {'missed': missed, 'skipped': skipped}

In [75]:
PERFMAT_DIR = '../output/performance-matrix/all-fixed/'
PERFMAT_DIR = '../output/performance-matrix/all-194/'

SESSION_SET = '184'
SESSION_SET = '194'

print('item', 'session', 'lines_skipped', sep='\t')

for item_dir in os.listdir(PERFMAT_DIR):
    if item_dir.endswith(SESSION_SET):
        item = item_dir[:item_dir.index('-')]
        for session_file in os.listdir(os.path.join(PERFMAT_DIR, item_dir)):
    #         if '31930' not in session_file:
    #             continue
            df = pd.read_csv(os.path.join(PERFMAT_DIR, item_dir, session_file), sep='\t')
            missed_and_skipped = get_missed_line_nums(df, item)
            if missed_and_skipped['skipped']:
                print(item,
                      session_file.replace('.tsv', ''),
                      ', '.join([str(i) for i in missed_and_skipped['skipped']]), sep='\t')

    #     break

item	session	lines_skipped
1401	61622	1
1401	67273	3
1401	77717	5, 6
1401	66862	3
420	95710	2
420	82745	2
420	61586	4
420	102011	5
350	77669	4
350	77672	4
350	67129	3
350	80307	4
350	79349	7
2904	83267	2
410	108909	4
2701	86418	4, 5, 6, 7, 8
2203	72036	5
2401	80332	2
2401	67093	5
2401	73962	5
2401	72047	5
2401	78583	5
2401	78180	2
310	108707	1
2402	99941	1, 2, 3
1903	61622	4
1903	83932	4
1903	77721	1
