# Search Testing

In [1]:
# Want to be able to reload changed modules on the fly
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio

import src.search
from src.data import load_metadata, find_paths

import src.utils
src.utils.gpu_setup()

4 actual GPUs, 0 in use.


## Set up the searcher

In [3]:
%%time
# Set up the searcher
searcher = src.search.Searcher("../config.yaml")

CPU times: user 9.1 s, sys: 760 ms, total: 9.86 s
Wall time: 11.8 s


## Query: "near death experiences"

In [5]:
query_title = "near death experiences"
query_desc = "I wonder if people have shared near-death experiences in podcast episodes.  I would like to find and listen to some stories.  I am not interested in the science of near-death experiences."
search_df_0 = searcher.search(query_title, query_desc)

Running Elasticsearch query... returned 100 segments in 0.87 seconds
Getting rerank scores for segments... returned 100 scores in 20.06 seconds
Getting audio scores for 74 segments... 


100%|██████████| 74/74 [17:31<00:00, 14.20s/it]
  return (p/np.sum(p,axis=0)).T


In [6]:
search_ids_0 = searcher.rerank(search_df_0)
id_file = open("near-death-experiences-ids.json", "w")
json.dump(search_ids_0, id_file)
id_file.close()

Only 6 'entertaining' segments found, appending topical rank...


In [None]:
# First highest topical score
print(search_ids_0["topical"][0])
waveform = searcher.get_segment_audio(search_ids_0["topical"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest entertaining score
print(search_ids_0["entertaining"][0])
waveform = searcher.get_segment_audio(search_ids_0["entertaining"][2])
Audio(waveform, rate=44100)

In [None]:
# First highest subjective score
print(search_ids_0["subjective"][0])
waveform = searcher.get_segment_audio(search_ids_0["subjective"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest discussion score
print(search_ids_0["discussion"][0])
waveform = searcher.get_segment_audio(search_ids_0["discussion"][0])
Audio(waveform, rate=44100)

## Query: "black lives matter"

In [77]:
query_title = "black lives matter"
query_desc = "What do people mean when they say “black lives matter”?  I am interested in personal reflections that give context to the phrase “black lives matter” and why it is important to individuals.  News stores about Black Lives Matter protests are relevant as well."
search_df_1 = searcher.search(query_title, query_desc)

Running Elasticsearch query... returned 100 segments in 3.25 seconds
Getting rerank scores for segments... returned 100 scores in 19.40 seconds
Getting audio scores for 49 segments... 


100%|██████████| 49/49 [09:11<00:00, 11.26s/it]
  return (p/np.sum(p,axis=0)).T


In [78]:
search_ids_1 = searcher.rerank(search_df_1)
id_file = open("black-lives-matter-ids.json", "w")
json.dump(search_ids_1, id_file)
id_file.close()

In [None]:
# First highest topical score
print(search_ids_1["topical"][0])
waveform = searcher.get_segment_audio(search_ids_1["topical"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest entertaining score
print(search_ids_1["entertaining"][0])
waveform = searcher.get_segment_audio(search_ids_1["entertaining"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest subjective score
print(search_ids_1["subjective"][0])
waveform = searcher.get_segment_audio(search_ids_1["subjective"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest discussion score
print(search_ids_1["discussion"][0])
waveform = searcher.get_segment_audio(search_ids_1["discussion"][0])
Audio(waveform, rate=44100)

## Query: "workplace diversity"

In [83]:
query_title = "workplace diversity"
query_desc = "What are things companies are doing and could do to promote diversity in the workplace?  Things like workplace programs, initiatives, education, and outreach are relevant.  Discussion about the outcomes of these efforts are relevant as well."
search_df_2 = searcher.search(query_title, query_desc)

Running Elasticsearch query... returned 100 segments in 0.17 seconds
Getting rerank scores for segments... returned 100 scores in 19.43 seconds
Getting audio scores for 43 segments... 


100%|██████████| 43/43 [05:42<00:00,  7.96s/it]


In [84]:
search_ids_2 = searcher.rerank(search_df_2)
id_file = open("workplace-diversity-ids.json", "w")
json.dump(search_ids_2, id_file)
id_file.close()

Only 1 'entertaining' segments found, appending topical rank...
Only 0 'subjective' segments found, appending topical rank...


In [None]:
# First highest topical score
print(search_ids_2["topical"][0])
waveform = searcher.get_segment_audio(search_ids_2["topical"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest entertaining score
print(search_ids_2["entertaining"][0])
waveform = searcher.get_segment_audio(search_ids_2["entertaining"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest subjective score
print(search_ids_2["subjective"][0])
waveform = searcher.get_segment_audio(search_ids_2["subjective"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest discussion score
print(search_ids_2["discussion"][0])
waveform = searcher.get_segment_audio(search_ids_2["discussion"][0])
Audio(waveform, rate=44100)

## Query: "halloween stories and chat"

In [11]:
query_title = "halloween stories and chat"
query_desc = "I love Halloween and I want to hear stories and conversations about things people have done to celebrate it.  I am not looking for information about the history of Halloween or generalities about how it is celebrated, I want specific stories from individuals."
search_df_3 = searcher.search(query_title, query_desc)

Running Elasticsearch query... returned 100 segments in 0.47 seconds
Getting rerank scores for segments... returned 100 scores in 19.79 seconds
Getting audio scores for 44 segments... 


100%|██████████| 44/44 [10:15<00:00, 13.99s/it]
  return (p/np.sum(p,axis=0)).T


In [12]:
search_ids_3 = searcher.rerank(search_df_3)
id_file = open("halloween-stories-and-chat-ids.json", "w")
json.dump(search_ids_3, id_file)
id_file.close()

Only 5 'subjective' segments found, appending topical rank...


In [None]:
# First highest topical score
print(search_ids_3["topical"][0])
waveform = searcher.get_segment_audio(search_ids_3["topical"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest entertaining score
print(search_ids_3["entertaining"][2])
waveform = searcher.get_segment_audio(search_ids_3["entertaining"][2])
Audio(waveform, rate=44100)

In [None]:
# First highest subjective score
print(search_ids_3["subjective"][0])
waveform = searcher.get_segment_audio(search_ids_3["subjective"][0])
Audio(waveform, rate=44100)

In [None]:
# First highest discussion score
print(search_ids_3["discussion"][0])
waveform = searcher.get_segment_audio(search_ids_3["discussion"][0])
Audio(waveform, rate=44100)

## Generating random lists

In [109]:
# Load the id dicts...
a_file = open("near-death-experiences-ids.json", "r")
search_ids_0 = a_file.read()
search_ids_0 = json.loads(search_ids_0)
a_file.close()

a_file = open("black-lives-matter-ids.json", "r")
search_ids_1 = a_file.read()
search_ids_1 = json.loads(search_ids_1)
a_file.close()

a_file = open("workplace-diversity-ids.json", "r")
search_ids_2 = a_file.read()
search_ids_2 = json.loads(search_ids_2)
a_file.close()

a_file = open("halloween-stories-and-chat-ids.json", "r")
search_ids_3 = a_file.read()
search_ids_3 = json.loads(search_ids_3)
a_file.close()

In [110]:
# Add different mood lists together for each
search_ids_0_comb = sum(search_ids_0.values(), [])
search_ids_1_comb = sum(search_ids_1.values(), [])
search_ids_2_comb = sum(search_ids_2.values(), [])
search_ids_3_comb = sum(search_ids_3.values(), [])

In [111]:
# Find unique ids
search_ids_0_set = set(search_ids_0_comb)
search_ids_1_set = set(search_ids_1_comb)
search_ids_2_set = set(search_ids_2_comb)
search_ids_3_set = set(search_ids_3_comb)

In [113]:
print(len(search_ids_0_set))
print(len(search_ids_1_set))
print(len(search_ids_2_set))
print(len(search_ids_3_set))

23
27
13
22


In [112]:
# Print for each
print(search_ids_0_set)
print(search_ids_1_set)
print(search_ids_2_set)
print(search_ids_3_set)

{'3FpXzNnV6p1zGrKntTuBwJ_480', '3FpXzNnV6p1zGrKntTuBwJ_180', '081C1aKHqRm3TLE5HfNFnJ_2820', '3FpXzNnV6p1zGrKntTuBwJ_1920', '2OpWYbK7lrZqo1RIL7aZXN_840', '6ryjkPOTQSD894X8408Gua_780', '3FpXzNnV6p1zGrKntTuBwJ_240', '6ryjkPOTQSD894X8408Gua_840', '3FpXzNnV6p1zGrKntTuBwJ_60', '2OpWYbK7lrZqo1RIL7aZXN_1620', '2OpWYbK7lrZqo1RIL7aZXN_900', '2mAS7kzNeqWYbENsPYti0M_1080', '2OpWYbK7lrZqo1RIL7aZXN_1680', '2OpWYbK7lrZqo1RIL7aZXN_60', '5mDIjsJihGSHJSq5oyPgux_0', '3FpXzNnV6p1zGrKntTuBwJ_1500', '7F5kSbXhBx6LdhfJ09Fnsy_300', '1gncCb6vqP89WL8svVgGBq_240', '5sTWLPTj2kEVTD2Gs0hp16_2640', '5DXiyNJMCJu7ZJTvN6K4zG_60', '2OpWYbK7lrZqo1RIL7aZXN_120', '75a4qxnqCYVPPB00E7rvjh_2100', '3FpXzNnV6p1zGrKntTuBwJ_0'}
{'2eKgAe2W243hPMJpYpD4Sm_3660', '3JoMKEfcvrKfBZ8grFwOgt_180', '1Mv8k3QtmXCbTBZgZatGTw_180', '6NuF9BV3moacBd7u6AxLU1_3240', '4q9AxryFxK0NYSTbnKemaj_2460', '2fmh46haAGGG37B625t4vC_360', '3amq9FA4tFZxXO4654OmFE_600', '0wC0hbvqCxAwa3nPz7HmSf_2460', '1oBaNfo2TZZCYg5T5Lk01J_1200', '7jYoZ6xFGTEmZUQFltBg8u_1560', '

## Analysis

In [3]:
# Load the id dicts...
a_file = open("data/near-death-experiences-ids.json", "r")
nde_ids = a_file.read()
nde_ids = json.loads(nde_ids)
a_file.close()

a_file = open("data/black-lives-matter-ids.json", "r")
blm_ids = a_file.read()
blm_ids = json.loads(blm_ids)
a_file.close()

a_file = open("data/halloween-stories-and-chat-ids.json", "r")
hsac_ids = a_file.read()
hsac_ids = json.loads(hsac_ids)
a_file.close()

In [4]:
# Load the evaluated segments
nde_df = pd.read_csv('./data/nde.csv')  
blm_df = pd.read_csv('./data/blm.csv')  
hsac_df = pd.read_csv('./data/hsac.csv')  

In [5]:
def evaluate(baseline_segs, our_segs, true_segs):
    base_correct = 0
    for seg in baseline_segs:
        if seg in true_segs:
            base_correct += 1
    our_correct = 0
    for seg in our_segs:
        if seg in true_segs:
            our_correct += 1
    
    print("Num true: {}, Num baseline: {}, Num ours: {}".format(len(true_segs), len(baseline_segs), len(our_segs)))
    print("Baseline correct: {}, Ours correct: {}".format(base_correct, our_correct))
    print("Baseline frac: {}, Ours frac: {}".format((base_correct/len(baseline_segs)),(our_correct/len(our_segs))))
    print()

In [8]:
true_funny_segs = []
for index, row in nde_df.iterrows():
    if "funny" in row["entertaining"]:
        true_funny_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
for index, row in blm_df.iterrows():
    if "funny" in row["entertaining"]:
        true_funny_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
for index, row in hsac_df.iterrows():
    if "funny" in row["entertaining"]:
        true_funny_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
        
our_funny_segs = []
our_funny_segs.extend(nde_ids["entertaining"][:6])
our_funny_segs.extend(blm_ids["entertaining"][:])
our_funny_segs.extend(hsac_ids["entertaining"][:])

base_funny_segs = []
base_funny_segs.extend(nde_ids["topical"][:])
base_funny_segs.extend(blm_ids["topical"][:])
base_funny_segs.extend(hsac_ids["topical"][:])

evaluate(base_funny_segs, our_funny_segs, true_funny_segs)

Num true: 26, Num baseline: 30, Num ours: 26
Baseline correct: 6, Ours correct: 20
Baseline frac: 0.2, Ours frac: 0.7692307692307693



In [9]:
true_discussion_segs = []
for index, row in nde_df.iterrows():
    if "conversation" in row["discussion"] or "debate" in row["discussion"]:
        true_discussion_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
for index, row in blm_df.iterrows():
    if "conversation" in row["discussion"] or "debate" in row["discussion"]:
        true_discussion_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
for index, row in hsac_df.iterrows():
    if "conversation" in row["discussion"] or "debate" in row["discussion"]:
        true_discussion_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
        
our_discussion_segs = []
our_discussion_segs.extend(nde_ids["discussion"][:])
our_discussion_segs.extend(blm_ids["discussion"][:])
our_discussion_segs.extend(hsac_ids["discussion"][:])

base_discussion_segs = []
base_discussion_segs.extend(nde_ids["topical"][:])
base_discussion_segs.extend(blm_ids["topical"][:])
base_discussion_segs.extend(hsac_ids["topical"][:])

evaluate(base_discussion_segs, our_discussion_segs, true_discussion_segs)

Num true: 41, Num baseline: 30, Num ours: 30
Baseline correct: 7, Ours correct: 13
Baseline frac: 0.23333333333333334, Ours frac: 0.43333333333333335



In [78]:
true_subjective_segs = []
for index, row in nde_df.iterrows():
    if "Disapproval" in row["subjective"] or "Approval" in row["subjective"]:
        true_subjective_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
for index, row in blm_df.iterrows():
    if "Disapproval" in row["subjective"] or "Approval" in row["subjective"]:
        true_subjective_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
for index, row in hsac_df.iterrows():
    if "Disapproval" in row["subjective"] or "Approval" in row["subjective"]:
        true_subjective_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
        
our_subjective_segs = []
our_subjective_segs.extend(nde_ids["subjective"][:])
our_subjective_segs.extend(blm_ids["subjective"][:])
our_subjective_segs.extend(hsac_ids["subjective"][:5])

base_subjective_segs = []
base_subjective_segs.extend(nde_ids["topical"][:])
base_subjective_segs.extend(blm_ids["topical"][:])
base_subjective_segs.extend(hsac_ids["topical"][:])

evaluate(base_subjective_segs, our_subjective_segs, true_subjective_segs)

Num true: 30, Num baseline: 30, Num ours: 25
Baseline correct: 11, Ours correct: 11
Baseline frac: 0.36666666666666664, Ours frac: 0.44


In [68]:
true_funny_segs = []
for index, row in nde_df.iterrows():
    if "funny" in row["entertaining"]:
        true_funny_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(nde_ids["topical"][:], nde_ids["entertaining"][:6], true_funny_segs)

Num true: 8, Num baseline: 10, Num ours: 6
Baseline correct: 0, Ours correct: 6
Baseline frac: 0.0, Ours frac: 1.0


In [82]:
true_funny_segs = []
for index, row in blm_df.iterrows():
    if "funny" in row["entertaining"]:
        true_funny_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(blm_ids["topical"][:], blm_ids["entertaining"][:], true_funny_segs)

Num true: 9, Num baseline: 10, Num ours: 10
Baseline correct: 3, Ours correct: 7
Baseline frac: 0.3, Ours frac: 0.7


In [83]:
true_funny_segs = []
for index, row in hsac_df.iterrows():
    if "funny" in row["entertaining"]:
        true_funny_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(hsac_ids["topical"][:], hsac_ids["entertaining"][:], true_funny_segs)

Num true: 9, Num baseline: 10, Num ours: 10
Baseline correct: 3, Ours correct: 7
Baseline frac: 0.3, Ours frac: 0.7


In [71]:
true_discussion_segs = []
for index, row in nde_df.iterrows():
    if "conversation" in row["discussion"] or "debate" in row["discussion"]:
        true_discussion_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(nde_ids["topical"][:], nde_ids["discussion"][:], true_discussion_segs)

Num true: 11, Num baseline: 10, Num ours: 10
Baseline correct: 0, Ours correct: 4
Baseline frac: 0.0, Ours frac: 0.4


In [72]:
true_discussion_segs = []
for index, row in blm_df.iterrows():
    if "conversation" in row["discussion"] or "debate" in row["discussion"]:
        true_discussion_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(blm_ids["topical"][:], blm_ids["discussion"][:], true_discussion_segs)

Num true: 18, Num baseline: 10, Num ours: 10
Baseline correct: 5, Ours correct: 8
Baseline frac: 0.5, Ours frac: 0.8


In [73]:
true_discussion_segs = []
for index, row in hsac_df.iterrows():
    if "conversation" in row["discussion"] or "debate" in row["discussion"]:
        true_discussion_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(hsac_ids["topical"][:], hsac_ids["discussion"][:], true_discussion_segs)

Num true: 12, Num baseline: 10, Num ours: 10
Baseline correct: 2, Ours correct: 1
Baseline frac: 0.2, Ours frac: 0.1


In [74]:
true_subjective_segs = []
for index, row in nde_df.iterrows():
    if "Disapproval" in row["subjective"] or "Approval" in row["subjective"]:
        true_subjective_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(nde_ids["topical"][:], nde_ids["subjective"][:], true_subjective_segs)

Num true: 8, Num baseline: 10, Num ours: 10
Baseline correct: 3, Ours correct: 4
Baseline frac: 0.3, Ours frac: 0.4


In [75]:
true_subjective_segs = []
for index, row in blm_df.iterrows():
    if "Disapproval" in row["subjective"] or "Approval" in row["subjective"]:
        true_subjective_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(blm_ids["topical"][:], blm_ids["subjective"][:], true_subjective_segs)

Num true: 17, Num baseline: 10, Num ours: 10
Baseline correct: 5, Ours correct: 5
Baseline frac: 0.5, Ours frac: 0.5


In [84]:
true_subjective_segs = []
for index, row in hsac_df.iterrows():
    if "Disapproval" in row["subjective"] or "Approval" in row["subjective"]:
        true_subjective_segs.append(row["uri"].strip("spotify:episode:") + "_" + str(row["timestamp"]))
evaluate(hsac_ids["topical"][:], hsac_ids["subjective"][:5], true_subjective_segs)

Num true: 5, Num baseline: 10, Num ours: 5
Baseline correct: 3, Ours correct: 2
Baseline frac: 0.3, Ours frac: 0.4
