In [1]:
import sys, os, json
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from collections import defaultdict

In [3]:
with open('../data/moby/jsons/item_to_line_indices.json') as f:
    item_to_line_indices = json.load(f)

In [7]:
test_response_df = pd.read_csv('../output/performance-matrix/all-fixed/1401-184/27877.tsv', sep='\t')

In [9]:
item = '1401'

In [10]:
item_to_line_indices['1401']

[[0, 1, 2, 3, 4],
 [5, 6, 7, 8],
 [9, 10, 11, 12, 13],
 [14, 15, 16, 17, 18],
 [19, 20, 21, 22, 23],
 [24, 25, 26, 27, 28],
 [29, 30, 31, 32, 33],
 [34, 35, 36, 37, 38]]

In [58]:
def get_missed_line_nums(test_response_df, item, window_size=3, correct_threshold_in_window=0.66):
    line_num_to_info = {
        i: {
            'missed': 0,
            'hit_beginning': 0,
            'hit_end': 0
        }
        for i in range(len(item_to_line_indices[item]))
    }
    missed = []
    skipped = []
    for line_num, line_indices in enumerate(item_to_line_indices[item]):
        perf_mat_indices = [i * 2 + 1 for i in line_indices]
        
        ####### 'Lenience' for shorter lines
        if len(line_indices) <= window_size: # if a line is no longer than, say, 3 tokens:
            tokens_read = [test_response_df['token'][i]
                           for i in perf_mat_indices
                           if test_response_df['token'][i] == test_response_df['token'][i]] # handle nans?
            tokens_read = ' '.join(tokens_read)
            tokens_read = tokens_read.split()
            expected_tokens = [test_response_df['Unnamed: 0'][i]
                               for i in perf_mat_indices]
#             print(expected_tokens, tokens_read, line_num)
            if any([token in expected_tokens for token in tokens_read]):
                line_num_to_info[line_num]['missed'] = 0
                line_num_to_info[line_num]['hit_beginning'] = 1
                line_num_to_info[line_num]['hit_end'] = 1
                continue
        #######
                
        if all([test_response_df.matches_expected[i] == 0 for i in perf_mat_indices]):
            line_num_to_info[line_num]['missed'] = 1
        if np.mean([test_response_df.matches_expected[i]\
                   for i in perf_mat_indices[:window_size]]) > correct_threshold_in_window:
            line_num_to_info[line_num]['hit_beginning'] = 1
        if np.mean([test_response_df.matches_expected[i]\
                   for i in perf_mat_indices[-window_size:]]) > correct_threshold_in_window:
            line_num_to_info[line_num]['hit_end'] = 1
    
#     print(line_num_to_info)
    
    for line_num, info in line_num_to_info.items():
        if info['missed']:
            missed.append(line_num)
            
            start_of_skip = line_num
            while (start_of_skip > 0) and line_num_to_info[start_of_skip - 1]['missed']:
                start_of_skip -= 1
            end_of_skip = line_num
            while (end_of_skip < len(line_num_to_info) - 1) and line_num_to_info[end_of_skip + 1]['missed']:
                end_of_skip += 1
            
            if start_of_skip == 0 or end_of_skip == len(line_num_to_info) - 1:
                continue
            if line_num_to_info[start_of_skip - 1]['hit_end'] and\
                line_num_to_info[end_of_skip + 1]['hit_beginning']:
                skipped.append(line_num)
    return {'missed': missed, 'skipped': skipped}

In [59]:
PERFMAT_DIR = '../output/performance-matrix/all-fixed/'
for item_dir in os.listdir(PERFMAT_DIR):
    if item_dir.endswith('182'):
        continue
#     if '2201' not in item_dir:
#         continue
    item = item_dir[:item_dir.index('-')]
    for session_file in os.listdir(os.path.join(PERFMAT_DIR, item_dir)):
#         if '31930' not in session_file:
#             continue
        df = pd.read_csv(os.path.join(PERFMAT_DIR, item_dir, session_file), sep='\t')
        missed_and_skipped = get_missed_line_nums(df, item)
        if missed_and_skipped['skipped']:
            print(item, session_file, missed_and_skipped['skipped'])
        
#     break

2402 38887.tsv [2, 3, 4, 5, 6, 7]
2601 41124.tsv [6]
2403 28051.tsv [2]
1903 21004.tsv [1]
2203 31213.tsv [2]
2201 35171.tsv [9]
2201 50140.tsv [5]
2201 31525.tsv [8]
2201 48155.tsv [8]
2102 34053.tsv [5]
2102 26860.tsv [5]
2102 35138.tsv [5]
2102 46924.tsv [5]
2102 34140.tsv [5]
2102 48709.tsv [5]
2102 46650.tsv [5]
2102 50227.tsv [5]
2102 39861.tsv [5]
2102 53173.tsv [5]
1804 50616.tsv [3]
350 29650.tsv [4]
2902 51417.tsv [7]
1401 35160.tsv [3]
1603 51620.tsv [1]
1603 33303.tsv [3, 4, 5, 6]
3106 33654.tsv [9]
1505 41671.tsv [6]
1505 50492.tsv [3]
330 41120.tsv [7]
3105 51790.tsv [4, 5, 6, 7]
1703 46016.tsv [4]
1703 30056.tsv [3]
1703 29522.tsv [3]
2803 27575.tsv [2]
2803 33659.tsv [2]
2803 39310.tsv [2]
2803 28812.tsv [3]
2803 45977.tsv [2]
2803 26674.tsv [2]
1502 42844.tsv [5]
1502 52098.tsv [3]
1502 52416.tsv [5]
1502 49397.tsv [5]
1502 52872.tsv [5]
