# Setup

In [None]:
import os
import numpy as np
import pandas as pd
import soundfile as sf

In [None]:
from src.data import load_metadata, find_paths
from omegaconf import OmegaConf
from IPython.display import Audio
    

# Project configuration, read from the YAML file one directory above the notebook.
conf = OmegaConf.load("../config.yaml")
# Directory searched for audio files: the "podcasts-audio" folder under the configured dataset path.
DATA_AUDIO = os.path.join(conf.dataset_path, "podcasts-audio")

In [None]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas warnings about assignments that may not take effect.
warnings.simplefilter("ignore")

# Flipped to True by the final check once every row carries a label.
done = False

In [None]:
# Text file listing the segments to label; each entry is split on "_" into an
# episode id and a timestamp when the label table is first created.
uri_file = "blm.txt" # TODO maybe you need to change this

In [None]:
# Load the metadata table for the dataset at conf.dataset_path.
# Later cells filter it by `episode_uri` and read the show/episode name and description columns.
metadata = load_metadata(conf.dataset_path)

In [None]:
def audio_snippet(uri, starttime, metadata, duration=120):
    """Read a fixed-length excerpt from an episode's audio file.

    Args:
        uri: Episode URI, matched against ``metadata.episode_uri``.
        starttime: Offset of the excerpt from the start of the file, in seconds.
        metadata: DataFrame with an ``episode_uri`` column; the matching rows are
            handed to ``find_paths`` to locate the ``.ogg`` file under ``DATA_AUDIO``.
        duration: Length of the excerpt in seconds (default 120).

    Returns:
        Tuple ``(snippet, sr)``: the int16 samples read by ``sf.read`` and the
        sample rate of the file.
    """
    episode = metadata[(metadata.episode_uri == uri).values]
    audio_file = find_paths(episode, DATA_AUDIO, ".ogg")
    # Convert seconds to frames using the file's own sample rate instead of assuming
    # 44.1 kHz, so the excerpt starts and ends at the right place for any file.
    sample_rate = sf.info(audio_file[0]).samplerate
    first_frame = int(starttime * sample_rate)
    last_frame = int((starttime + duration) * sample_rate)
    snippet, sr = sf.read(audio_file[0], start=first_frame, stop=last_frame, dtype=np.int16)
    return snippet, sr

In [None]:
# Resume from the saved label table when it exists; otherwise build a fresh,
# unlabeled table from the segment list in `uri_file`.
try:
    df = pd.read_csv("uri_set_labeled_blm.csv")
    df.set_index("Unnamed: 0", inplace=True)
except FileNotFoundError:
    # Only a missing file means "start fresh". Any other failure (malformed CSV,
    # missing index column, interrupt) now surfaces instead of silently discarding labels.
    print("saved file doesn't exist yet")
    # np.atleast_1d keeps a file with a single entry iterable (loadtxt returns a 0-d array for it).
    entries = np.atleast_1d(np.loadtxt(uri_file, dtype=str))
    segments = np.array([entry.split('_') for entry in entries])
    df = pd.DataFrame(segments, columns=['uri', 'timestamp'])
    df['uri'] = "spotify:episode:" + df['uri']
    # -99 is the "not labeled yet" sentinel checked by the later cells.
    for label_column in ("entertaining", "discussion", "subjective"):
        df[label_column] = np.ones(len(df)) * -99

In [None]:
# finding where you left off: position of the first row whose "entertaining"
# label still holds the -99 sentinel.
# When every row is labeled (or df is empty) `i` is len(df), matching the
# "done" condition `i == len(df)` used by the progress cell.
unlabeled_positions = np.flatnonzero((df["entertaining"] == -99).values)
i = int(unlabeled_positions[0]) if unlabeled_positions.size else len(df)

In [None]:
# Show the row position at which the search in the previous cell stopped.
print(i)

# Select Topic

In [None]:
# Query id to label; the next cell keeps only the `qrels` rows whose "query" column equals it.
topic = 34

In [None]:
# Keep only the relevance judgements that belong to the selected topic.
# NOTE(review): `qrels` is not created in any visible cell; it must be loaded before this runs.
topic_mask = (qrels["query"] == topic).values
subset = qrels.iloc[topic_mask]

In [None]:
# Draw up to five random rows per relevance grade (0-4) from the topic's judgements.
# Note: this rebinds `df`, replacing the table built from `uri_file` earlier.
per_grade_samples = []
for grade in range(5):
    relevant = subset[subset.relevance == grade]
    order = np.random.permutation(len(relevant))
    per_grade_samples.append(relevant.iloc[order].iloc[:5])
# DataFrame.append was removed in pandas 2.0; concatenating once also avoids
# re-copying the growing frame on every iteration.
df = pd.concat(per_grade_samples)
# -99 is the "not labeled yet" sentinel checked by the later cells.
for label_column in ("entertaining", "discussion", "subjective"):
    df[label_column] = np.ones(len(df)) * -99
# Shuffle so the labeling order does not reveal the relevance grade.
df = df.sample(frac=1, random_state=42)

In [None]:
# Report how many rows of the topic subset fall into each relevance grade.
relevance_grades = range(5)
for i in relevance_grades:
    n_with_grade = sum(subset.relevance == i)
    print(f"There are {n_with_grade:>4} episodes with relevancy {i}")

In [None]:
# Row position in `df` of the segment currently being labeled; start at the first row.
i = 0

# Loop this

In [None]:
# Look up the metadata of the episode currently being labeled and print
# its show and episode details (first matching row).
current_uri = df.uri.iloc[i]
ep = metadata[metadata.episode_uri == current_uri]
for heading, column in (("Show name: \n\t", "show_name"), ("Show description: \n\t", "show_description")):
    print(heading, ep[column].iloc[0])
print()
for heading, column in (("Episode name:\n\t", "episode_name"), ("Episode description: \n\t", "episode_description")):
    print(heading, ep[column].iloc[0])

Repeat the following cells for each segment: listen to the audio, then fill in the entertaining, discussion, and subjective labels.

In [None]:
# Load the excerpt for the current row; the timestamp is parsed via float first
# and then truncated to whole seconds.
audio, sr = audio_snippet(df.uri.iloc[i], int(float(df.timestamp.iloc[i])), metadata)
# Down-mix multi-channel audio to mono. A mono file comes back as a 1-D array,
# which has no axis 1 to average over, so it is played as-is.
mono = np.mean(audio, axis=1) if audio.ndim > 1 else audio
Audio(mono, rate=sr)

__Select multiple from "storytelling", "excitement", "funny", "angry", "sad"__

In [None]:
# Write through df.iat in a single step: the chained form df["entertaining"].iloc[i] = ...
# may assign into a temporary copy and leave df unchanged (it never updates df under
# pandas Copy-on-Write). The column must be object dtype to hold a list in one cell.
df["entertaining"] = df["entertaining"].astype(object)
df.iat[i, df.columns.get_loc("entertaining")] = ["funny"]

__Select multiple from "narration/monologue", "conversation", "debate", "interview"__

In [None]:
# Write through df.iat in a single step: the chained form df["discussion"].iloc[i] = ...
# may assign into a temporary copy and leave df unchanged (it never updates df under
# pandas Copy-on-Write). The column must be object dtype to hold a list in one cell.
df["discussion"] = df["discussion"].astype(object)
df.iat[i, df.columns.get_loc("discussion")] = ["debate"]

__Select multiple from "Disapproval", "Approval"__

In [None]:
# Write through df.iat in a single step: the chained form df["subjective"].iloc[i] = ...
# may assign into a temporary copy and leave df unchanged (it never updates df under
# pandas Copy-on-Write). The column must be object dtype to hold a list in one cell.
df["subjective"] = df["subjective"].astype(object)
df.iat[i, df.columns.get_loc("subjective")] = ["Disapproval"]

In [None]:
# Advance to the next row and report overall progress.
i += 1
n_rows = len(df)
status = 'YES!' if i == n_rows else 'NO!'
percent_done = i / n_rows * 100
print(f"Are you done? {status}      ({percent_done:.0f}% done)")

# Final Check + Save

In [None]:
# A row is incomplete while any of the three label columns still holds the -99 sentinel.
# "subjective" is checked as well, since it is initialized and labeled like the other two.
missing = (df.entertaining == -99) | (df.discussion == -99) | (df.subjective == -99)
if not missing.any():
    print("All entries rated")
    done = True
else:
    # Positions (not index labels) of the rows that still need a label.
    print(f"Missing ratings for i= {np.where(missing)[0].tolist()}")
    done = False

In [None]:
# Persist the labels only after the final check has confirmed that nothing is missing.
if not done:
    print("Please first finish the labeling")
else:
    df.to_csv("uri_set_labeled_blm.csv")