# Setup

In [1]:
import os
import numpy as np
import pandas as pd
import soundfile as sf

In [2]:
from src.data import load_metadata, find_paths
from omegaconf import OmegaConf
from IPython.display import Audio
    

# Project configuration; its `dataset_path` entry is used below as the dataset root.
conf = OmegaConf.load("../config.yaml")
# Directory handed to `find_paths` when looking up an episode's audio file.
DATA_AUDIO = os.path.join(conf.dataset_path, "podcasts-audio")

In [3]:
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas warnings about chained assignment in the labelling cells
# below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Completion flag; the "Final Check" cell sets it again.
done = False

# Select your URI list

In [4]:
# Number of the URI set to label; it selects both the input URI list and the
# name of the labelled CSV file.
YOUR_NUMBER = 1

In [5]:
# Text file listing the segments to label, read with np.loadtxt further down.
uri_file = f"../data/uri_set_{YOUR_NUMBER}.txt" # TODO: adjust this path if your URI list is stored somewhere else

In [6]:
# Load the episode metadata from the dataset root given in the config; it is
# filtered by `episode_uri` below to find an episode's audio file and descriptions.
metadata = load_metadata(conf.dataset_path)

In [7]:
def audio_snippet(uri, starttime, metadata, duration=120):
    """Read a fixed-length excerpt from one episode's audio file.

    Args:
        uri: Episode URI, matched against ``metadata.episode_uri``.
        starttime: Offset of the excerpt from the start of the file, in seconds.
        metadata: Metadata table with an ``episode_uri`` column.
        duration: Length of the excerpt in seconds.

    Returns:
        A ``(snippet, sr)`` tuple: the int16 samples returned by
        ``soundfile.read`` and the sample rate of the file.
    """
    episode = metadata[(metadata.episode_uri == uri).values]
    audio_path = find_paths(episode, DATA_AUDIO, ".ogg")[0]

    # Convert seconds to frames with the file's real sample rate instead of
    # assuming 44.1 kHz, so the excerpt is correct for any file.
    sample_rate = sf.info(audio_path).samplerate
    first_frame = int(starttime * sample_rate)
    last_frame = int((starttime + duration) * sample_rate)

    snippet, sr = sf.read(audio_path, start=first_frame, stop=last_frame, dtype=np.int16)
    return snippet, sr

# Load partially finished file OR create new dataframe:

In [8]:
# Resume from a previously saved label file if there is one; otherwise build a
# fresh table from the URI list with every label set to the -99 "unrated" sentinel.
try:
    df = pd.read_csv(f"uri_set_labeled_{YOUR_NUMBER}.csv", index_col=0)
except FileNotFoundError:
    # Only a missing file means "start from scratch"; any other problem
    # (corrupt CSV, interrupt, ...) should surface instead of being hidden.
    print("saved file doesn't exist yet")
    # Each entry of the URI file is "<episode id>_<timestamp>".
    # np.atleast_1d keeps a single-line file iterable (loadtxt returns a 0-d array then).
    segments = np.array([entry.split('_') for entry in np.atleast_1d(np.loadtxt(uri_file, dtype=str))])
    df = pd.DataFrame(segments, columns=['uri', 'timestamp'])
    df['uri'] = "spotify:episode:" + df['uri']
    for label_column in ("entertaining", "discussion", "subjective"):
        df[label_column] = np.ones(len(df)) * -99
else:
    for label_column in ("entertaining", "discussion", "subjective"):
        if label_column not in df.columns:
            # A saved file that lacks this label column: treat every row as unrated.
            df[label_column] = np.ones(len(df)) * -99
            continue
        # A column that mixes saved labels with the sentinel is read back as
        # text ("-99.0"), which never equals -99. Restore the numeric sentinel
        # so the resume search and the final check keep working.
        restored = df[label_column].astype(object)
        restored[pd.to_numeric(restored, errors="coerce") == -99] = -99
        df[label_column] = restored

In [9]:
# Find where you left off: the position of the first row whose "entertaining"
# label is still the -99 sentinel. If every row is already rated (or the table
# is empty) `i` becomes len(df), so no existing label is overwritten by accident.
i = next(
    (position for position, value in enumerate(df.entertaining) if value == -99),
    len(df),
)

In [10]:
# Show the position of the row that will be labelled next.
print(i)

0


# Loop this

In [11]:
# Look up the metadata of the episode at position `i` and print its show and
# episode descriptions as context for labelling.
ep = metadata[metadata.episode_uri == df.uri.iloc[i]]
info_sections = (
    ("Show name: \n\t", "show_name"),
    ("Show description: \n\t", "show_description"),
    (None, None),  # blank line between the show and the episode block
    ("Episode name:\n\t", "episode_name"),
    ("Episode description: \n\t", "episode_description"),
)
for heading, column in info_sections:
    if column is None:
        print()
    else:
        print(heading, ep[column].iloc[0])

Show name: 
	 The LashBase Podcast
Show description: 
	 The latest lash industry news, interviews and hot topics. Hosted by LashBase Jamie.

Episode name:
	 Hand Tested Tweezers. Are they worth it?
Episode description: 
	 Episode 1 is finally here. We have spent a long time trying to decide what to talk about, what format to use and how long each episode should be. But do you know what? None of that matters! What matters is getting started. So with that in mind, we would like to introduce to you our first (of many) podcast episodes that digs deep into some of the most talked-about and controversial subject in the industry. Hosted by @LashBase_Jamie and @LashBase_Amy. We want to be able to open up a discussion about taboo subjects and pain points. We will give positive and negatives sides to each topic in the hope that it helps you make up your own mind. Because at the end of the day, there is no wrong or right when it comes to most things in the lash industry and it is most definitely 

Loop the following section: listen to the audio, then fill in the labels in the cells below, starting with how entertaining it was.

In [12]:
# Load the excerpt for row `i`; the timestamp may be stored as text such as
# "120.0", hence the float -> int conversion.
audio, sr = audio_snippet(df.uri.iloc[i], int(float(df.timestamp.iloc[i])), metadata)
# Multi-channel audio is averaged down to one channel for playback; mono files
# already come back as a 1-D array and have no channel axis to average over.
Audio(audio if audio.ndim == 1 else np.mean(audio, axis=1), rate=sr)

__Select multiple from "storytelling", "excitement", "funny", "angry", "sad"__

In [13]:
# The label is a list, so the column has to be object dtype before one cell can hold it.
df["entertaining"] = df["entertaining"].astype(object)
# Write through df.at: the chained form df[...].iloc[i] = ... may modify a
# temporary copy and leave `df` unchanged.
df.at[df.index[i], "entertaining"] = ["storytelling"] #["storytelling", "excitement", "funny", "angry", "sad"]

__Select multiple from "narration/monologue", "conversation", "debate", "interview"__

In [14]:
# The label is a list, so the column has to be object dtype before one cell can hold it.
df["discussion"] = df["discussion"].astype(object)
# Write through df.at: the chained form df[...].iloc[i] = ... may modify a
# temporary copy and leave `df` unchanged.
df.at[df.index[i], "discussion"] = ["narration/monologue"] #["narration/monologue", "conversation", "debate", "interview"]

__Select multiple from "Disapproval", "Approval"__

In [16]:
# The label is a list, so the column has to be object dtype before one cell can hold it.
df["subjective"] = df["subjective"].astype(object)
# Write through df.at: the chained form df[...].iloc[i] = ... may modify a
# temporary copy and leave `df` unchanged.
df.at[df.index[i], "subjective"] = [""]#["Disapproval", "Approval"]

In [17]:
# Move on to the next row and report how far through the list you are.
# Note: re-running this cell advances `i` again.
i += 1
total_rows = len(df)
answer = "YES!" if i == total_rows else "NO!"
percent_done = i / total_rows * 100
print(f"Are you done? {answer}      ({percent_done:.0f}% done)")

Are you done? NO!      (2% done)


# Final Check + Save

In [18]:
# A row counts as unfinished while any of its three label columns still holds
# the -99 sentinel ("subjective" included, so a skipped cell is caught as well).
missing = (df.entertaining == -99) | (df.discussion == -99) | (df.subjective == -99)
if not missing.any():
    print("All entries rated")
    done = True
else:
    # .tolist() yields plain Python ints, so the positions print as 1, 2, 3, ...
    # on every NumPy version.
    print(f"Missing ratings for i= {np.flatnonzero(missing.to_numpy()).tolist()}")
    done = False

Missing ratings for i= [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]


In [19]:
# Save the labels (complete or partial); the "Load partially finished file"
# cell reads this file back to resume.
df.to_csv(f"uri_set_labeled_{YOUR_NUMBER}.csv")
