In [2]:
import jsonlines
from pydub.playback import play
from pydub import AudioSegment

# Locations of the recording and of its JSON-lines transcription.
audio_path = "../nrk/Debatten12okt.mp3"
transcription_path = "../first_tests/output/Debatten12okt.jsonl"

audio = AudioSegment.from_mp3(audio_path)

# Read every transcription record into memory, in file order.
with jsonlines.open(transcription_path) as reader:
    data = list(reader)

In [3]:
import numpy as np

def segment_sentences(start, end, txt):
    """Split the text of one transcription chunk into sentence segments.

    The text is split on "." and every non-empty sentence becomes its own
    segment. Per-sentence times are NOT estimated: each segment carries the
    ``start``/``end`` of the whole chunk, so all sentences coming from the
    same chunk share one time span.

    Args:
        start: Start timestamp of the chunk; copied unchanged into every
            segment.
        end: End timestamp of the chunk; copied unchanged into every segment.
        txt: Text of the chunk.

    Returns:
        A list of ``{"start": ..., "end": ..., "text": ...}`` dicts. Text
        containing no "." yields exactly one segment holding the stripped
        text; otherwise there is one segment per non-empty piece between
        the "." separators (the "." itself is dropped).
    """
    if "." not in txt:
        return [{"start": start, "end": end, "text": txt.strip()}]
    # No arithmetic is done on the timestamps, so values that do not support
    # it (e.g. a missing end time given as None) pass straight through.
    pieces = (piece.strip() for piece in txt.split("."))
    return [{"start": start, "end": end, "text": piece} for piece in pieces if piece]

# Quick manual check of the splitter: a three-sentence text first, then a
# single-sentence one.
start, end = 4, 15
txt = "This is a sentence. This is another sentence. This is a third sentence."

demo_inputs = [
    (start, end, txt),
    (0, 3, "This is a sentence."),
]
for demo_args in demo_inputs:
    for ss in segment_sentences(*demo_args):
        print(ss)

{'start': 4, 'end': 15, 'text': 'This is a sentence'}
{'start': 4, 'end': 15, 'text': 'This is another sentence'}
{'start': 4, 'end': 15, 'text': 'This is a third sentence'}
{'start': 0, 'end': 3, 'text': 'This is a sentence'}


In [4]:
import numpy as np

# Flatten every transcription record into sentence segments, keeping only
# sentences with more than two whitespace-separated words.
parsed = []
for record in data:
    span, record_text = record["timestamp"], record["text"]
    span_start, span_end = span
    parsed += [
        seg
        for seg in segment_sentences(span_start, span_end, record_text)
        if len(seg["text"].split()) > 2
    ]

In [5]:
import pandas as pd
# Tabulate the parsed segments and show the first rows as a sanity check.
df = pd.DataFrame(data=parsed)
df.head()

Unnamed: 0,start,end,text
0,3.96,7.28,Denne debatten skal handle om dette
1,7.28,15.72,Målet er at flest mulig skal jobbe
2,7.28,15.72,Derfor må det lønne seg å stå opp om morgenen
3,17.52,21.88,Det er litt forstemmende at vi ofte får en dis...
4,21.96,26.96,"nivået på ytelser, og altfor lite om hvordan v..."


In [6]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, loaded by its model identifier.
model_name = "NbAiLab/nb-sbert-base"
model = SentenceTransformer(model_name)

In [7]:
# Embed the text of every row in `df`.
segment_texts = df["text"].tolist()
embeddings = model.encode(segment_texts, show_progress_bar=True)

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

In [8]:
from autofaiss import build_index
# Build the search index over the segment embeddings (not saved to disk).
index, index_infos = build_index(
    embeddings,
    save_on_disk=False,
    verbose=0,
)

2023-10-27 14:00:39,079 [INFO]: Using 10 omp threads (processes), consider increasing --nb_cores if you have more
2023-10-27 14:00:39,083 [INFO]: Launching the whole pipeline 10/27/2023, 14:00:39
2023-10-27 14:00:39,084 [INFO]: Reading total number of vectors and dimension 10/27/2023, 14:00:39
2023-10-27 14:00:39,089 [DEBUG]: open file: /var/folders/0k/1bg63zt532nb9d86g5tk_6vh0000gn/T/tmpqawb_b2z/emb.npy
100%|██████████| 1/1 [00:00<00:00, 28532.68it/s]
2023-10-27 14:00:39,093 [INFO]: There are 661 embeddings of dim 768
2023-10-27 14:00:39,093 [INFO]: >>> Finished "Reading total number of vectors and dimension" in 0.0091 secs
2023-10-27 14:00:39,093 [INFO]: 	Compute estimated construction time of the index 10/27/2023, 14:00:39
2023-10-27 14:00:39,093 [INFO]: 		-> Train: 16.7 minutes
2023-10-27 14:00:39,094 [INFO]: 		-> Add: 0.0 seconds
2023-10-27 14:00:39,094 [INFO]: 		Total: 16.7 minutes
2023-10-27 14:00:39,094 [INFO]: 	>>> Finished "Compute estimated construction time of the index" in

In [9]:
def query(q, K=10):
    """Return the rows of ``df`` that the index reports as matches for ``q``.

    Args:
        q: Query text; it is embedded with the global ``model``.
        K: Maximum number of matches requested from the index.

    Returns:
        The matching rows of ``df`` (at most ``K``), in the order the index
        returned them.
    """
    emb = model.encode([q], show_progress_bar=False)
    _, matches = index.search(emb, K)
    hit_ids = matches[0]
    # Faiss pads the result with -1 when it finds fewer than K neighbours;
    # handed to ``iloc`` a -1 would silently select the LAST row of ``df``
    # and present it as a match, so drop those placeholders.
    return df.iloc[hit_ids[hit_ids >= 0]]

def play_result(result, tail_ms=300):
    """Play the part of the global ``audio`` covered by one search result.

    Args:
        result: Row-like object with "start" and "end" entries. Both are
            multiplied by 1000 before slicing ``audio`` (presumably
            seconds -> milliseconds, the unit pydub slices in; confirm
            against the transcription format).
        tail_ms: Extra amount added to the end of the slice so the clip is
            not cut off abruptly. Defaults to 300, the value that used to be
            hard-coded.
    """
    start = result["start"] * 1000
    end = result["end"] * 1000
    play(audio[start:end + tail_ms])

def query_and_play(q):
    """Print the three best matches for ``q`` and play each one in turn."""
    top_hits = query(q, K=3)
    print(top_hits)
    # iterrows() yields (index label, row) pairs; only the row is played.
    for _, row in top_hits.iterrows():
        play_result(row)

# Example search: print the matches and listen to them.
example_query = "utfordringer rundt arbeidsledighet"
query_and_play(example_query)

       start      end                                               text
338  1372.76  1377.68  Men det er også et spørsmål om hvordan vi tilp...
102   423.88   426.04      Det er en utfordring for den enkelte,ge uføre
377  1510.88  1517.92  Ser du på uføretrygden kontra en lavlønnsyrke,...
