In [12]:
import pandas as pd
import jsonlines
import numpy as np
import re

In [46]:
def rnd(num):
    """Snap ``num`` to the nearest multiple of 0.5.

    The value is doubled, passed through the built-in ``round`` (whose
    tie-breaking rule therefore applies), and halved again.
    """
    half_steps = round(num * 2)
    return half_steps / 2

def get_df(tv_show: str, base_path: str = "../data") -> pd.DataFrame:
    """Align diarization and transcription of one show on a 0.5-second grid.

    Reads two files:

    * ``{base_path}/diarized/{tv_show}.csv`` -- headerless CSV whose three
      columns are segment start, segment end and a speaker label. The
      numeric speaker id is the third ``_``-separated field of the label.
    * ``{base_path}/transcriptions/{tv_show}.jsonl`` -- one JSON object per
      line with a ``timestamp`` pair ``(start, end)`` and a ``text`` string.
      A missing ``end`` is replaced by the end of the time grid.

    Args:
        tv_show: File stem shared by the diarization and transcription files.
        base_path: Directory containing the ``diarized`` and
            ``transcriptions`` folders.

    Returns:
        A DataFrame with one row per 0.5-second step and the columns
        ``time``, ``speaker`` (speaker id, ``-1`` when unknown) and ``text``
        (``""`` when nothing is transcribed at that step).

    Note:
        The speaker of a text is decided by majority vote over every time
        step carrying that exact text, so identical utterances occurring at
        different times share a single speaker.
    """
    speaker_path = f"{base_path}/diarized/{tv_show}.csv"
    transcr_path = f"{base_path}/transcriptions/{tv_show}.jsonl"

    df = pd.read_csv(speaker_path, header=None)
    df.columns = ["start", "end", "speaker"]
    df["speaker"] = df["speaker"].apply(lambda label: int(label.split("_")[2]))

    # Snap segment bounds to the 0.5-second grid.
    df["start"] = df["start"].apply(rnd)
    df["end"] = df["end"].apply(rnd)

    # Use the latest end time of any segment: the last row is not guaranteed
    # to be the one that ends last (e.g. overlapping speech).
    max_time = df["end"].max()
    times = np.arange(0, max_time + 0.5, 0.5)

    # DIARIZATION
    # Mark every grid step covered by any segment of a speaker. Each segment
    # only writes the steps it covers, so earlier segments of the same
    # speaker are preserved. Speakers are visited in order of first
    # appearance; on overlap the speaker visited later wins.
    diarized = np.full(len(times), -1, dtype=np.int64)
    for speaker_id, segments in df.groupby("speaker", sort=False):
        for start, end in zip(segments["start"], segments["end"]):
            diarized[(times >= start) & (times <= end)] = speaker_id

    # TRANSCRIPTIONS
    parsed = []
    with jsonlines.open(transcr_path) as reader:
        for obj in reader:
            start, end = obj["timestamp"]
            parsed.append({"start": start, "end": end, "text": obj["text"]})
    # Explicit columns keep the frame usable when the file has no entries.
    transcript_df = pd.DataFrame(parsed, columns=["start", "end", "text"])
    # replace a missing end with the end of the grid
    transcript_df["end"] = transcript_df["end"].fillna(max_time)
    # round all times to 0.5
    transcript_df["start"] = transcript_df["start"].apply(rnd)
    transcript_df["end"] = transcript_df["end"].apply(rnd)

    number_list = re.compile(r"\d+(?:,\d+)+")
    # [\W_] is Unicode-aware for str patterns, so leading punctuation,
    # whitespace and underscores are dropped while non-ASCII letters
    # (e.g. "Å", "Ø", "Æ") at the start of a sentence are kept.
    leading_junk = re.compile(r"^[\W_]+")

    def clean_text(sent):
        sent = number_list.sub("", sent.strip())
        return leading_junk.sub("", sent)

    transcript_df["text"] = transcript_df["text"].apply(clean_text)

    # Spread each transcript entry over the grid steps in [start, end);
    # later entries overwrite earlier ones where they overlap.
    texts = np.full(len(times), "", dtype=object)
    for start, end, text in zip(
        transcript_df["start"], transcript_df["end"], transcript_df["text"]
    ):
        texts[(times >= start) & (times < end)] = text

    # Count, per text, how many grid steps each speaker covers.
    votes_by_text = {}
    for speaker, text in zip(diarized.tolist(), texts.tolist()):
        if speaker == -1 or text == "":
            continue
        votes = votes_by_text.setdefault(text, {})
        votes[speaker] = votes.get(speaker, 0) + 1

    # Majority vote; on a tie the speaker seen first in time order wins.
    text_to_speaker = {
        text: max(votes, key=votes.get) for text, votes in votes_by_text.items()
    }

    return pd.DataFrame({
        "time": times,
        "speaker": [text_to_speaker.get(text, -1) for text in texts],
        "text": texts,
    })

# Build the aligned time / speaker / text frame for the show "6899tilegdør".
df = get_df("6899tilegdør")

Unnamed: 0,time,speaker,text
3682,1841.0,-1,
3683,1841.5,-1,
3684,1842.0,-1,
3685,1842.5,-1,
3686,1843.0,-1,
3687,1843.5,-1,
3688,1844.0,-1,
3689,1844.5,-1,"Nå som kameraet er skrudd av, så teller vi det."
3690,1845.0,-1,"Nå som kameraet er skrudd av, så teller vi det."
3691,1845.5,-1,"Nå som kameraet er skrudd av, så teller vi det."


In [44]:
# Inspect the last 50 grid steps of the frame returned by get_df.
# (`new_df` only exists inside get_df, so referencing it here fails on a
# fresh kernel.)
df.tail(50)

Unnamed: 0,time,speaker,text
3692,1846.0,-1,"Nå som kameraet er skrudd av, så teller vi det."
3693,1846.5,-1,"Nå som kameraet er skrudd av, så teller vi det."
3694,1847.0,-1,"Nå som kameraet er skrudd av, så teller vi det."
3695,1847.5,-1,"Nå som kameraet er skrudd av, så teller vi det."
3696,1848.0,-1,"Nå som kameraet er skrudd av, så teller vi det."
3697,1848.5,-1,"Nå som kameraet er skrudd av, så teller vi det."
3698,1849.0,-1,Nå flytter vi. Tilbake til Bergen.
3699,1849.5,-1,Nå flytter vi. Tilbake til Bergen.
3700,1850.0,-1,Nå flytter vi. Tilbake til Bergen.
3701,1850.5,-1,Nå flytter vi. Tilbake til Bergen.
