In [15]:
import os
import whisper
import torch
from pyannote.audio import Pipeline
from pyannote.core import Segment, Annotation, Timeline
import datetime
from transformers import BartTokenizer, BartModel, pipeline

In [16]:
def remove_suffix(filename):
    """Return ``filename`` with a trailing ``.wav`` extension removed.

    Only a suffix at the very end of the name is stripped; any other
    occurrence of ``.wav`` inside the name is preserved. Names that do not
    end in ``.wav`` are returned unchanged.

    Args:
        filename: File name (or path) as a string.

    Returns:
        The name without its final ``.wav`` extension.
    """
    suffix = ".wav"
    if filename.endswith(suffix):
        # Slice instead of str.replace so inner ".wav" substrings survive.
        return filename[:-len(suffix)]
    return filename

In [17]:
# Taken from https://github.com/yinruiqing/pyannote-whisper
class PyanWhisper:
    """Namespace of helpers that combine a transcription with a diarization.

    The methods hold no state and are declared as static methods, so they can
    be called either on the class (``PyanWhisper.diarize_text(...)``) or on an
    instance.
    """

    # A segment whose text ends with one of these characters closes a sentence.
    PUNC_SENT_END = ['.', '?', '!']

    @staticmethod
    def diarize_text(transcribe_res, diarization_result):
        """Label transcript segments with speakers and merge them into sentences.

        Args:
            transcribe_res: Mapping with a ``'segments'`` list whose items have
                ``'start'``, ``'end'`` and ``'text'`` keys.
            diarization_result: Object offering ``crop(segment).argmax()``
                (presumably a pyannote ``Annotation`` - verify against caller).

        Returns:
            List of ``(Segment, speaker, sentence)`` tuples.
        """
        timestamp_texts = PyanWhisper.get_text_with_timestamp(transcribe_res)
        spk_text = PyanWhisper.add_speaker_info_to_text(timestamp_texts, diarization_result)
        return PyanWhisper.merge_sentence(spk_text)

    @staticmethod
    def get_text_with_timestamp(transcribe_res):
        """Turn ``transcribe_res['segments']`` into ``(Segment, text)`` pairs."""
        return [
            (Segment(item['start'], item['end']), item['text'])
            for item in transcribe_res['segments']
        ]

    @staticmethod
    def add_speaker_info_to_text(timestamp_texts, ann):
        """Attach a speaker label to every ``(segment, text)`` pair.

        The label is whatever ``ann.crop(seg).argmax()`` returns for the
        segment.

        Returns:
            List of ``(segment, speaker, text)`` tuples in input order.
        """
        return [
            (seg, ann.crop(seg).argmax(), text)
            for seg, text in timestamp_texts
        ]

    @staticmethod
    def merge_cache(text_cache):
        """Collapse a non-empty run of ``(segment, speaker, text)`` items into one.

        The texts are concatenated, the speaker of the first item is kept, and
        the resulting segment spans from the first start to the last end.
        """
        sentence = ''.join([item[-1] for item in text_cache])
        spk = text_cache[0][1]
        start = text_cache[0][0].start
        end = text_cache[-1][0].end
        return Segment(start, end), spk, sentence

    @staticmethod
    def merge_sentence(spk_text):
        """Group consecutive segments into sentence-level entries.

        Pending segments are flushed when the speaker changes or when a
        segment's text ends with a character from ``PUNC_SENT_END``; anything
        still pending after the last segment is flushed as well.

        Args:
            spk_text: Iterable of ``(segment, speaker, text)`` tuples.

        Returns:
            List of merged ``(Segment, speaker, sentence)`` tuples.
        """
        merged_spk_text = []
        pre_spk = None
        text_cache = []
        for seg, spk, text in spk_text:
            if spk != pre_spk and pre_spk is not None and len(text_cache) > 0:
                # Speaker changed: emit what the previous speaker said and
                # start a fresh cache with the current segment.
                merged_spk_text.append(PyanWhisper.merge_cache(text_cache))
                text_cache = [(seg, spk, text)]
            elif text and text[-1] in PyanWhisper.PUNC_SENT_END:
                # `text and` guards against empty segment text, which would
                # otherwise raise IndexError on text[-1].
                text_cache.append((seg, spk, text))
                merged_spk_text.append(PyanWhisper.merge_cache(text_cache))
                text_cache = []
            else:
                text_cache.append((seg, spk, text))
            pre_spk = spk
        if len(text_cache) > 0:
            merged_spk_text.append(PyanWhisper.merge_cache(text_cache))
        return merged_spk_text

    @staticmethod
    def write_to_txt(spk_sent, file):
        """Write one ``start end speaker sentence`` line per entry to ``file``.

        Start and end are formatted with two decimals. The file is written as
        UTF-8 so non-ASCII transcript text does not depend on the platform's
        default encoding.
        """
        with open(file, 'w', encoding='utf-8') as fp:
            for seg, spk, sentence in spk_sent:
                line = f'{seg.start:.2f} {seg.end:.2f} {spk} {sentence}\n'
                fp.write(line)

In [18]:
# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)
# Diarization is currently disabled. When re-enabling it, read the Hugging Face
# token from the environment rather than hard-coding it in the notebook.
# NOTE(review): a literal token used to be committed on this line; treat it as
# compromised and revoke it. Also note that the name `pipeline` below shadows
# `transformers.pipeline` imported at the top of the notebook.
# pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
#                                     use_auth_token=os.environ["HF_TOKEN"])

In [19]:
# Relative locations used by the transcription loop: the directory that is
# listed for input files, and the base directory for the output paths.
input_directory, output_directory = (
    "../../../Pipeline/sample_input2/",
    "../../../Pipeline/output_pure_whisper/",
)

In [20]:
# Transcribe every regular file in the input directory. os.listdir returns
# entries in arbitrary order, so sort them to make the output reproducible.
for filename in sorted(os.listdir(input_directory)):
    audio = os.path.join(input_directory, filename)
    # listdir also yields sub-directories (e.g. .ipynb_checkpoints); those are
    # not audio inputs, so skip them instead of handing them to the model.
    if not os.path.isfile(audio):
        continue
    text = os.path.join(output_directory, remove_suffix(filename))
    print("Input:" + audio)
    print("Output:" + text)
    result = model.transcribe(audio)
    #diarization = pipeline(audio)
    #final_result = PyanWhisper.diarize_text(result, diarization)
    print(result["text"])

#     with open(text + '.txt', 'w+') as f:
#         for seg, spk, sent in final_result:
#             start = str(datetime.timedelta(seconds=int(seg.start)))
#             line = start + f'\t{spk}\n{sent}\n\n'
#             f.write(line)
    

Input:../../../Pipeline/sample_input2/Bed015.interaction.wav
Output:../../../Pipeline/output_pure_whisper/Bed015.interaction
 what things to talk about. Really? Oh, it was horrible. It was incentive. You recorded it? Hello? Hello? Yeah. Which one? All right, good. OK. OK. Did you do something? OK. And I guess I'm doing something. So basically, the result of much thinking since the last time we met, but not as much writing, is a sheet that I have a lot of thoughts and justification to comment on. But I'll just pass out as is right now. So here's what's around. And there's two things. So one on one side is a revised, updated semantics specification. The other side is revised. This is just one sheet, right? No, just one sheet. OK. So this thing also. And it's very similar to the reverse. If you change it, it's always more. But before I don't think everyone here is seen all of this. So sure, here, begin. As usual, the disclaimers are there. All these things are, it's only slightly worse pe