In [1]:
import sys
import os
import itertools

import numpy as np
import sklearn.metrics

# Append the parent of the current working directory to the module search
# path (presumably so the project's local modules can be imported below — confirm).
sys.path.append(os.path.dirname(os.path.realpath('.')))
# Set as an environment variable before the model modules are imported;
# presumably turns off Weights & Biases reporting — confirm.
os.environ["WANDB_DISABLED"] = "true"

# Evaluation utility methods

In [2]:
def range_equals(left: 'Tuple[float, float]', right: 'Tuple[float, float]', eps: float) -> bool:
    """Tell whether two ``(start, end)`` ranges coincide within a tolerance.

    The ranges are considered equal when their starts differ by at most
    ``eps`` and their ends differ by at most ``eps`` (both bounds inclusive).
    The ends are only compared when the starts are already close enough.
    """
    (start_a, end_a), (start_b, end_b) = left, right
    starts_close = abs(start_a - start_b) <= eps
    return starts_close and abs(end_a - end_b) <= eps

def count_range_equals(pairs, eps: float) -> int:
    """Count the ``(left, right)`` pairs whose two ranges match within ``eps``.

    Every element of ``pairs`` is unpacked into two ranges, which are
    compared with ``range_equals``.
    """
    return sum(1 for left, right in pairs if range_equals(left, right, eps))

def range_negation(base: 'Tuple[float, float]', ranges: 'List[Tuple[float, float]]') -> 'List[Tuple[float, float]]':
    """
    Return the parts of ``base`` that are not covered by ``ranges``.

    base:    |-------------|
    ranges:  | ***   **    |
    Return:  |#   ###  ####|

    ``ranges`` is processed in the order given. A gap runs from the end of
    one range (or the start of ``base``) to the start of the next range (or
    the end of ``base``); gaps whose two bounds are equal are left out.
    """
    spans = list(ranges)
    # Candidate gaps open at base start and at every range end ...
    gap_starts = [base[0]] + [span[1] for span in spans]
    # ... and close at every range start and at base end.
    gap_ends = [span[0] for span in spans] + [base[1]]
    return [(lo, hi) for lo, hi in zip(gap_starts, gap_ends) if lo != hi]

In [32]:
def create_labels_from_range(captions, sponsor_ranges):
    """Expand caption-index sponsor ranges into one boolean label per token.

    ``sponsor_ranges`` holds ``(start_idx, end_idx)`` pairs of caption
    indices, inclusive on both ends; a pair with a ``None`` endpoint is
    skipped. Each caption's label is repeated once for every
    whitespace-separated token of ``caption.text``, and the per-token labels
    of all captions are returned as a single flat list.
    """
    is_sponsor = np.zeros(len(captions), dtype=bool)
    for first, last in sponsor_ranges:
        if first is not None and last is not None:
            # The end index is part of the range, hence the + 1.
            for idx in range(first, last + 1):
                is_sponsor[idx] = True

    labels = []
    for flag, caption in zip(is_sponsor, captions):
        labels += [flag] * len(caption.text.split())
    return labels

def create_labels_from_times(captions, sponsor_times):
    """Build per-token sponsor labels from ``sponsor_times``.

    For every entry of ``sponsor_times``, the items of ``entry[1]`` are
    passed as extra positional arguments to ``get_intersection_range``
    together with ``captions``; the collected results are then handed to
    ``create_labels_from_range`` as the sponsor ranges.

    NOTE(review): ``get_intersection_range`` is neither defined nor imported
    in the cells visible here, so it must already exist in the kernel
    namespace. Presumably it returns a ``(start_idx, end_idx)`` pair of
    caption indices — confirm.
    """
    index_ranges = []
    for entry in sponsor_times:
        index_ranges.append(get_intersection_range(captions, *entry[1]))
    return create_labels_from_range(captions, index_ranges)

def merge_ranges(ranges):
    """Lazily merge consecutive ranges that touch end-to-start.

    Walks ``ranges`` in order and yields ``(start, end)`` tuples. A range is
    folded into the current run only when its start is equal to the end of
    the previous range; any other range closes the run and opens a new one.
    Nothing is yielded for an empty input.
    """
    pending = None  # [start, end] of the run currently being built
    for current in ranges:
        if pending is None:
            pending = [current[0], current[1]]
        elif pending[1] == current[0]:
            # Touches the run: extend its end.
            pending[1] = current[1]
        else:
            yield pending[0], pending[1]
            pending = [current[0], current[1]]

    if pending is not None:
        yield pending[0], pending[1]

def compute_results(videos, model, eps=5):
    """Run ``model`` over ``videos`` and collect token labels and range-match counts.

    Args:
        videos: iterable of ``(video_id, captions, sponsor_ranges)`` triples.
        model: object whose ``predict(captions)`` returns an iterable of
            ``(start, end)`` ranges. The ranges are passed through
            ``merge_ranges`` first; merged ranges with a ``None`` endpoint
            are ignored.
        eps: tolerance handed to ``range_equals`` when a predicted range is
            compared against the expected ranges of the same video.

    Returns:
        A dict with the keys
        ``'predictions'`` and ``'references'`` (float arrays with one 0/1
        label per caption token, concatenated over all videos),
        ``'exact_match'`` (number of predicted ranges that match at least one
        expected range within ``eps``) and
        ``'predicted_ranges'`` (number of predicted ranges that were kept).
    """
    from tqdm.auto import tqdm

    # Labels are gathered in plain lists and converted to arrays once at the
    # end: np.append returns a fresh copy on every call, which made the
    # per-video accumulation quadratic in the total number of tokens.
    predicted_labels = []
    actual_labels = []
    # Values for our close match metric (exact match with threshold)
    # Number of matches
    close_matches = 0
    # Number of predicted ranges
    total_predicted_ranges = 0

    for video_id, captions, sponsor_ranges in tqdm(videos):
        predicted_sponsor_ranges = []

        for predicted_range in merge_ranges(model.predict(captions)):
            if predicted_range[0] is None or predicted_range[1] is None:
                continue

            predicted_sponsor_ranges.append(predicted_range)

            if any(range_equals(predicted_range, r, eps) for r in sponsor_ranges):
                close_matches += 1
            total_predicted_ranges += 1

        predicted_labels.extend(create_labels_from_range(captions, predicted_sponsor_ranges))
        actual_labels.extend(create_labels_from_range(captions, sponsor_ranges))

        print(f'\tPredicted={predicted_sponsor_ranges},\n\tExpected={sponsor_ranges}')

    return {
        # dtype=float keeps the float64 arrays this function has always
        # returned (boolean labels become 0.0 / 1.0), including for no input.
        'predictions': np.array(predicted_labels, dtype=float),
        'references': np.array(actual_labels, dtype=float),
        'exact_match': close_matches,
        'predicted_ranges': total_predicted_ranges,
    }

def evaluate(videos, model, eps=1):
    """Print range-level and token-level evaluation metrics for ``model``.

    Calls ``compute_results(videos, model, eps)`` and prints:

    * the share of predicted ranges that match an expected range within
      ``eps`` (0.0 when the model predicted no range at all), and
    * the confusion matrix, accuracy, precision, recall, precision-recall
      curve and ROC curve of the per-token labels.

    Args:
        videos: passed through to ``compute_results``.
        model: passed through to ``compute_results``.
        eps: matching tolerance passed through to ``compute_results``.

    Returns:
        None; all results are printed.

    NOTE(review): the printed message labels ``eps`` as seconds, while
    ``compute_results`` feeds the same ranges to ``create_labels_from_range``
    as caption indices — confirm the unit.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, precision_recall_curve, roc_curve

    outputs = compute_results(videos, model, eps)
    predictions = outputs['predictions']
    references = outputs['references']
    exact_matches = outputs['exact_match']
    total_predictions = outputs['predicted_ranges']

    # A model that predicts no range would otherwise raise ZeroDivisionError
    # here; report 0.0, like scikit-learn's default for undefined precision.
    exact_match_score = exact_matches / total_predictions if total_predictions else 0.0
    print(f'Exact match (with {eps}s threshold)', exact_match_score)
    # scikit-learn metrics take the ground truth first and the model output
    # second. Passing them the other way round swaps precision with recall
    # and transposes the confusion matrix.
    print('\tConfusion matrix', confusion_matrix(references, predictions))
    print('\tAccuracy', accuracy_score(references, predictions))
    print('\tPrecision', precision_score(references, predictions))
    print('\tRecall', recall_score(references, predictions))
    print('\tP@R', precision_recall_curve(references, predictions))
    print('\tRoC', roc_curve(references, predictions))

# Load videos to evaluate

In [33]:
from data_loader import load_captions_from_chunks

# islice(..., 1, 20) skips the first item produced by the loader and keeps
# the items at positions 1 through 19, i.e. at most 19 videos.
# NOTE(review): confirm that dropping the first video is intentional.
test_videos = list(itertools.islice(load_captions_from_chunks('data', './', [16]), 1, 20))

[34mOpening ./data.16.json.gz for reading...[0m


Dropping YzgTMh21zhI because sponsor times do not match the captions
Dropping yzhnRt6ZDKM because sponsor times do not match the captions
Dropping YZhVE7X0zwk because sponsor times do not match the captions
Dropping yzJokj2gelY because sponsor times do not match the captions
Dropping YzMAxdSdkzo because sponsor times do not match the captions
Dropping YZMrBCxarlk because sponsor times do not match the captions


# Evaluate Sequence Classification

In [34]:
from sequence_classification import SponsorSequenceClassification

# Evaluate the sequence-classification model on test_videos; eps=1 is the
# tolerance used when matching predicted ranges against expected ones.
evaluate(
    videos=test_videos,
    model=SponsorSequenceClassification('distilbert-classification-uncased/checkpoint-7210'),
    eps=1
)

  0%|          | 0/19 [00:00<?, ?it/s]

	Predicted=[(230, 235), (351, 357)],
	Expected=[[21, 46]]
	Predicted=[(264, 270)],
	Expected=[[267, 300]]
	Predicted=[(13, 17), (33, 38), (56, 58)],
	Expected=[[14, 41]]
	Predicted=[(51, 56), (187, 192), (256, 259), (292, 297)],
	Expected=[[187, 231]]
	Predicted=[],
	Expected=[[156, 196]]
	Predicted=[(0, 10), (147, 153), (161, 167), (181, 193)],
	Expected=[[145, 194]]
	Predicted=[],
	Expected=[[9, 32]]
	Predicted=[],
	Expected=[[0, 0]]
	Predicted=[(0, 4), (101, 115), (124, 128)],
	Expected=[[0, 5], [97, 125]]
	Predicted=[(17, 24), (48, 53)],
	Expected=[[20, 58]]
	Predicted=[(0, 4)],
	Expected=[[0, 5]]
	Predicted=[],
	Expected=[[2, 4], [276, 302]]
	Predicted=[(0, 3), (469, 475), (641, 648)],
	Expected=[[0, 1]]
	Predicted=[(0, 3), (869, 873), (880, 884), (891, 895)],
	Expected=[[0, 4]]
	Predicted=[(0, 2)],
	Expected=[[0, 0], [62, 77]]
	Predicted=[(0, 6), (14, 20)],
	Expected=[[2, 28]]
	Predicted=[(129, 134)],
	Expected=[[73, 98]]
	Predicted=[(27, 31)],
	Expected=[[14, 40]]
	Predicted=[(1

# Evaluate Sequence Labelling

In [36]:
from sequence_labelling import SponsorTokenClassification

# Evaluate the sequence-labelling model on the same test_videos, with the
# same eps=1 range-matching tolerance.
evaluate(
    videos=test_videos,
    model=SponsorTokenClassification('seq_labelling.model'),
    eps=1
)

  0%|          | 0/19 [00:00<?, ?it/s]

	Predicted=[(350, 362)],
	Expected=[[21, 46]]
	Predicted=[(264, 264), (265, 297)],
	Expected=[[267, 300]]
	Predicted=[(0, 0), (14, 39)],
	Expected=[[14, 41]]
	Predicted=[],
	Expected=[[187, 231]]
	Predicted=[],
	Expected=[[156, 196]]
	Predicted=[(0, 1), (2, 6), (139, 139), (140, 140), (141, 141), (142, 142), (143, 194), (538, 538), (540, 540), (541, 541), (542, 542), (543, 543), (544, 546), (547, 547)],
	Expected=[[145, 194]]
	Predicted=[(14, 14), (15, 28), (29, 30)],
	Expected=[[9, 32]]
	Predicted=[],
	Expected=[[0, 0]]
	Predicted=[(1, 1), (2, 3), (4, 4), (5, 5), (64, 64), (97, 129)],
	Expected=[[0, 5], [97, 125]]
	Predicted=[],
	Expected=[[20, 58]]
	Predicted=[(0, 4)],
	Expected=[[0, 5]]
	Predicted=[(0, 3), (270, 270), (272, 272), (273, 273), (274, 274), (275, 276), (277, 302), (308, 308)],
	Expected=[[2, 4], [276, 302]]
	Predicted=[(463, 463), (469, 517)],
	Expected=[[0, 1]]
	Predicted=[(864, 864), (865, 865), (866, 866), (868, 878), (879, 915), (940, 943), (946, 946), (947, 949), (

# Evaluate Span Extraction
The predict function is not implemented. Some results are available in the corresponding notebook.

In [38]:
from span_extraction import SponsorSpanExtraction

# Evaluate the span-extraction model on the same test_videos with eps=1.
# NOTE: in the recorded run this cell stops with
# "NameError: name 'start_char_idx' is not defined" (see the output below).
evaluate(
    videos=test_videos,
    model=SponsorSpanExtraction('distilbert-span-extraction-uncased/checkpoint-9000'),
    eps=1
)

  0%|          | 0/19 [00:00<?, ?it/s]

{'input_ids': [101, 7592, 6160, 2067, 2000, 1043, 2290, 2694, 2062, 4919, 6456, 1998, 11218, 6160, 2067, 2000, 2047, 2482, 2154, 1045, 1005, 1049, 2182, 2012, 16099, 28387, 1998, 2651, 1045, 1005, 1049, 9334, 2026, 5585, 2475, 19989, 15386, 1042, 2017, 1005, 2222, 5060, 1999, 1996, 4281, 2182, 2045, 1005, 1055, 2070, 5220, 3765, 2057, 1005, 2310, 2288, 2720, 5226, 1005, 1055, 2938, 1999, 2304, 1049, 2620, 2023, 2003, 2941, 1996, 2034, 2051, 1045, 1005, 2310, 2464, 2009, 2200, 3835, 2000, 2156, 2023, 2482, 2009, 2941, 3504, 3243, 4658, 2005, 13255, 2023, 2003, 3492, 4658, 2061, 2092, 2589, 2032, 1045, 2123, 1005, 1056, 2411, 2507, 2032, 19394, 2015, 2021, 2045, 1005, 1055, 1037, 2235, 2028, 2045, 1998, 2057, 1005, 2310, 2036, 2288, 2026, 2567, 1005, 1055, 2146, 29141, 2846, 13631, 1998, 2036, 2026, 2567, 2004, 2092, 2026, 2567, 2003, 2058, 2045, 1998, 13255, 2003, 2058, 2045, 2004, 2092, 2027, 1005, 2128, 2045, 2521, 2062, 10990, 2135, 2084, 2216, 28781, 2059, 2007, 2023, 2678, 2017, 20

NameError: name 'start_char_idx' is not defined