In [2]:
import json
#See docstring for info 

def extract_segments(path):
    """Extract all usable transcript segments from one episode JSON file.

    Parameters
    ----------
    path : str
        Path to a JSON file holding one episode. The file must contain a
        top-level "results" list whose items each carry an "alternatives"
        list with exactly one dict.

    Returns
    -------
    list of dict
        One dict per usable segment, in file order, with the keys
        "segNum" (running index over the *kept* segments, starting at 0),
        "startTime" (startTime of the segment's first word),
        "endTime" (endTime of the segment's last word) and
        "transcript" (the segment text).

    Raises
    ------
    AssertionError
        If a result does not have exactly one alternative.
    """
    with open(path, "r") as read_file:
        episode = json.load(read_file)

    segments = []
    for segment in episode["results"]:
        # make sure there is only one dict in this list
        # (should be true according to dataset description)
        assert len(segment["alternatives"]) == 1
        segment_dict = segment["alternatives"][0]

        # The "alternatives" dict is sometimes empty or incomplete. Both keys
        # are required, and "words" must be non-empty because its first and
        # last entries are indexed below.
        words = segment_dict.get("words")
        if not words or "transcript" not in segment_dict:
            continue

        segments.append({
            # numbering only counts segments that are actually kept
            "segNum": len(segments),
            "startTime": words[0]["startTime"],
            "endTime": words[-1]["endTime"],
            "transcript": segment_dict["transcript"],
        })

    return segments


In [3]:
from training_data_collection import collect_training_episodes
import os
# Builds the cached segment file if it is not on disk yet: collects every
# episode of the training set, runs extract_segments on each one and stores
# a dict whose keys are episode ids and whose values are the segment lists.
path_to_training_set = '../data/training_sub.json'
if not os.path.exists(path_to_training_set):
    input_file = '../data/podcasts_2020_train.1-8.qrels.txt'
    root_dir = '../data/podcasts-no-audio-13GB'
    training_episodes = collect_training_episodes(root_dir, input_file)

    # episode id = file name of the episode path with the ".json" part cut off
    training_segments = {
        episode.split('/')[-1].split('.json')[0]: extract_segments(episode)
        for episode in training_episodes
    }

    with open(path_to_training_set, 'w+') as fout:
        json.dump(training_segments, fout)

In [4]:
#load episode segments data
import json
# Read the cached episode -> segments mapping back from disk.
with open(path_to_training_set, 'r') as segments_file:
    data = json.loads(segments_file.read())


In [5]:
#fetches topics in training set and puts query and description into dict with key being topic number.
import xml.etree.ElementTree as ET

tree = ET.parse('../data/podcasts_2020_topics_train.xml')
root = tree.getroot()

# Every child of the root is one topic. Its sub-elements are read by position:
# index 0 -> topic number (dict key), index 1 -> query, index 3 -> description.
topics = {
    topic[0].text: {'query': topic[1].text, 'description': topic[3].text}
    for topic in root
}
topics

{'1': {'query': 'coronavirus spread',
  'description': 'What were people saying about the spread of the novel coronavirus NCOV-19 in Wuhan at the end of 2019?'},
 '2': {'query': 'greta thunberg cross atlantic',
  'description': 'What were people saying about Greta Thunberg’s sailing trip across the Atlantic Ocean in the fall of 2019 and its relationship to global climate change?'},
 '3': {'query': 'black hole image',
  'description': 'In May 2019 astronomers released the first-ever picture of a black hole.  I would like to hear some conversations and educational discussion about the science of astronomy, black holes, and of the picture itself.'},
 '4': {'query': 'story about riding a bird',
  'description': 'I remember hearing a podcast that had a story about a kid riding some kind of bird.  I want to find it again.'},
 '5': {'query': 'daniel ek interview',
  'description': 'Someone told me about a podcast interview with Daniel Ek, CEO of Spotify, about the founding and early days of S

In [6]:
#Extract jump-in points from training set.
from collections import defaultdict

def _parse_qrels_line(line):
    """Parse one qrels line into (topic_no, episode_id, jump_in_point).

    The line is split on whitespace. The first field is the topic number, the
    third field has the form "<x>:<y>:<episode_id>_<jump_in_point>" and the
    last field is the rating. All three returned values are strings.

    Returns None for lines that have fewer than four fields (e.g. blank
    lines) and for lines whose rating is "0".
    """
    fields = line.split()
    if len(fields) < 4:
        return None
    # Compare the whole rating field instead of a fixed character position so
    # that a missing trailing newline cannot shift the character being tested.
    if fields[-1] == '0':
        return None
    segment_ref = fields[2].split('_')
    episode_id = segment_ref[0].split(':')[2]
    # fields[0] (not line[0]) keeps topic numbers with more than one digit intact.
    return (fields[0], episode_id, segment_ref[1])


with open('../data/podcasts_2020_train.1-8.qrels.txt','r') as f:
    contents=f.readlines()

#Find topic + episode from training set and extract jump-in point (if rating is better than zero).
temp_list=[parsed for parsed in map(_parse_qrels_line, contents) if parsed is not None]

#Put into dict to make sure we only get one instance of topic + episode. Value is list of jump-in points
#for episode + topic combination. Keys have the form "<topic_no>-<episode_id>".
targets=defaultdict(list)
for topic_no, episode_id, jump_in in temp_list:
    targets[topic_no+'-'+episode_id].append(float(jump_in))


In [7]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
# Sentence-embedding model; the prediction loop below calls embedder.encode on
# both the topic text and the segment transcripts.
embedder = SentenceTransformer('bert-base-nli-mean-tokens')


In [8]:
#get prediction based on single nearest neighbour segment (k=1)

from tqdm import tqdm
#set k
top_k=1
#set true for easier debugging
verbose=True


def does_overlap(a, b):
    """Return True if ranges a=(start_a, end_a) and b=(start_b, end_b) overlap
    by a strictly positive amount (ranges that merely touch do not overlap)."""
    return max(0, min(a[1], b[1]) - max(a[0], b[0])) > 0


#prediction loop
n_correct=0
#remove n_targets once training_sub.json contains all episodes in training set
n_targets=0
#a topic's embedding does not depend on the episode, so compute it once per topic
topic_embeddings={}
#to slice targets use tqdm(list(targets.items())[:])
for key,jump_in_point in tqdm(targets.items()):
    #keys look like "<topic_no>-<episode_id>"; split only on the first dash
    topic_no,episode_id=key.split('-',1)
    #remove this check once training_sub.json contains all episodes in training set
    if episode_id not in data:
        continue
    episode_data=data[episode_id]
    #remove n_targets once training_sub.json contains all episodes in training set
    n_targets+=1
    if not episode_data:
        #an episode without segments yields no prediction; it counts as a miss
        continue

    segment_texts=[item["transcript"] for item in episode_data]
    #timestamps are strings such as "12.3s"; keep the part before the "s"
    segment_timespans=[(float(item["startTime"].split('s')[0]),float(item["endTime"].split('s')[0])) for item in episode_data]
    #create list of tuples with (start_time,start_time+120) for each jump-in point
    target_timespans=[(start_time,start_time+120.0) for start_time in jump_in_point]

    #embed topic using both query and description (could use both or either)
    if topic_no not in topic_embeddings:
        query=topics[topic_no]['query']
        description=topics[topic_no]['description']
        topic_embeddings[topic_no]=embedder.encode(query +' '+description , convert_to_tensor=True)
    query_embedding=topic_embeddings[topic_no]
    segment_embeddings=embedder.encode(segment_texts , convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, segment_embeddings)[0]
    cos_scores = cos_scores.cpu()

    #never ask for more neighbours than there are segments
    k=min(top_k,len(segment_texts))
    #negate the scores so that the highest similarity comes first
    results = np.argpartition(-cos_scores, range(k))[0:k]
    idx_closest=results[0]
    pred_timespan=(segment_timespans[idx_closest][0],segment_timespans[idx_closest][1])
    if verbose:
        print(f"\n Target timespans: {target_timespans}")
        print(f" Predicted timespan: {pred_timespan}")

    #if any target timespan overlap with the predicted timespan the prediction is correct.
    if any(does_overlap(pred_timespan, target_timespan) for target_timespan in target_timespans):
        if verbose:
            print("Correct prediction!")
        n_correct+=1


#accuracy is undefined (NaN) when none of the target episodes were found in data
accuracy=n_correct/n_targets if n_targets else float('nan')

accuracy

  1%|          | 1/100 [00:10<17:05, 10.35s/it]


 Target timespans: [(240.0, 360.0), (300.0, 420.0), (360.0, 480.0)]
 Predicted timespan: (1327.5, 1343.7)


  2%|▏         | 2/100 [00:12<12:41,  7.77s/it]


 Target timespans: [(120.0, 240.0)]
 Predicted timespan: (42.9, 72.6)


  2%|▏         | 2/100 [00:18<15:10,  9.29s/it]


KeyboardInterrupt: 