In [80]:
import os
import json
import whisper
import torch
from pyannote.audio import Pipeline
from pyannote.core import Segment, Annotation, Timeline
import datetime

# Directory that holds the audio files. Set the SPEECH_DATA_DIR environment
# variable to point somewhere else (for example
# "/content/drive/MyDrive/audio_data/" on Colab); when it is unset the
# original local checkout is used.
PATH = os.environ.get(
    "SPEECH_DATA_DIR",
    "/home/yonglong/project/speech_to_text_for_conversations/",
)

# For initial testing purposes
DUMMY_LONG = os.path.join(PATH, "ES2002a.Mix-Headset.wav")
# basename() already drops the directory part, so no extra splitting is needed.
filename = os.path.basename(DUMMY_LONG)

def remove_suffix(filename):
    """Return ``filename`` without a trailing ".Mix-Headset.wav".

    Only a marker at the very end of the name is stripped. A name that does
    not end with the marker is returned unchanged.
    """
    suffix = ".Mix-Headset.wav"
    # str.replace() would also delete the marker from the middle of a name,
    # so test for a real suffix and slice it off instead.
    if filename.endswith(suffix):
        return filename[:-len(suffix)]
    return filename
# Derive the output stem from the audio file that is actually being processed
# (`filename`, taken from DUMMY_LONG) rather than from a hard-coded copy of
# its name, so the two cannot drift apart.
new_filename = remove_suffix(filename)
print(new_filename)

ES2002a


In [81]:
# Taken from https://github.com/yinruiqing/pyannote-whisper
class PyanWhisper:
    """Helpers that attach speaker labels to a Whisper transcription.

    The class is used purely as a namespace: every helper is a static method
    and is called as ``PyanWhisper.<name>(...)``. Helpers refer to each other
    through the class name, because names defined in a class body are not
    visible as bare globals from inside its functions.
    """

    # Characters that close a sentence when they are the last character of a
    # segment's text.
    PUNC_SENT_END = ['.', '?', '!']

    @staticmethod
    def get_text_with_timestamp(transcribe_res):
        """Turn a transcription result into ``(Segment, text)`` pairs.

        Args:
            transcribe_res: mapping with a ``'segments'`` list whose items
                provide ``'start'``, ``'end'`` and ``'text'``.

        Returns:
            A list of ``(Segment(start, end), text)`` tuples in input order.
        """
        return [
            (Segment(item['start'], item['end']), item['text'])
            for item in transcribe_res['segments']
        ]

    @staticmethod
    def add_speaker_info_to_text(timestamp_texts, ann):
        """Attach a speaker label to every ``(segment, text)`` pair.

        Args:
            timestamp_texts: iterable of ``(segment, text)`` tuples.
            ann: diarization result; must support ``ann.crop(seg).argmax()``.

        Returns:
            A list of ``(segment, speaker, text)`` tuples.
        """
        # NOTE(review): argmax() presumably yields the label that speaks the
        # longest inside `seg`, and may be None when nobody does - confirm
        # against the pyannote.core documentation.
        return [(seg, ann.crop(seg).argmax(), text)
                for seg, text in timestamp_texts]

    @staticmethod
    def merge_cache(text_cache):
        """Collapse consecutive ``(segment, speaker, text)`` items into one.

        The merged item spans from the first segment's start to the last
        segment's end, keeps the first item's speaker and concatenates the
        texts without a separator.

        Args:
            text_cache: non-empty list of ``(segment, speaker, text)`` tuples.

        Returns:
            A single ``(Segment, speaker, sentence)`` tuple.
        """
        sentence = ''.join(item[-1] for item in text_cache)
        first_seg, spk = text_cache[0][0], text_cache[0][1]
        last_seg = text_cache[-1][0]
        return Segment(first_seg.start, last_seg.end), spk, sentence

    @staticmethod
    def merge_sentence(spk_text):
        """Group speaker-labelled segments into sentence-level items.

        Pending segments are flushed when the speaker changes or when a
        segment's text ends with one of ``PUNC_SENT_END``.

        Args:
            spk_text: iterable of ``(segment, speaker, text)`` tuples.

        Returns:
            A list of merged ``(Segment, speaker, sentence)`` tuples.
        """
        merged_spk_text = []
        pre_spk = None
        text_cache = []
        for seg, spk, text in spk_text:
            if spk != pre_spk and pre_spk is not None and len(text_cache) > 0:
                # Speaker changed: emit what the previous speaker said and
                # start a fresh cache with the current segment. The current
                # segment is not checked for sentence-ending punctuation here.
                merged_spk_text.append(PyanWhisper.merge_cache(text_cache))
                text_cache = [(seg, spk, text)]
            elif text and text[-1] in PyanWhisper.PUNC_SENT_END:
                # Sentence finished: emit the cache including this segment.
                # The `text and` guard keeps an empty text from raising
                # IndexError on text[-1].
                text_cache.append((seg, spk, text))
                merged_spk_text.append(PyanWhisper.merge_cache(text_cache))
                text_cache = []
            else:
                text_cache.append((seg, spk, text))
            pre_spk = spk
        # Emit whatever is left after the last segment.
        if len(text_cache) > 0:
            merged_spk_text.append(PyanWhisper.merge_cache(text_cache))
        return merged_spk_text

    @staticmethod
    def diarize_text(transcribe_res, diarization_result):
        """Combine a transcription and a diarization into labelled sentences.

        Args:
            transcribe_res: mapping with a ``'segments'`` list (see
                ``get_text_with_timestamp``).
            diarization_result: object passed as ``ann`` to
                ``add_speaker_info_to_text``.

        Returns:
            A list of ``(Segment, speaker, sentence)`` tuples.
        """
        timestamp_texts = PyanWhisper.get_text_with_timestamp(transcribe_res)
        spk_text = PyanWhisper.add_speaker_info_to_text(timestamp_texts,
                                                        diarization_result)
        return PyanWhisper.merge_sentence(spk_text)

    @staticmethod
    def write_to_txt(spk_sent, file):
        """Write one ``"<start> <end> <speaker> <sentence>"`` line per item.

        Start and end are formatted with two decimals. The file is
        overwritten and written as UTF-8 so non-ASCII transcript text does
        not depend on the platform's default encoding.

        Args:
            spk_sent: iterable of ``(segment, speaker, sentence)`` tuples.
            file: path of the text file to write.
        """
        with open(file, 'w', encoding='utf-8') as fp:
            for seg, spk, sentence in spk_sent:
                fp.write(f'{seg.start:.2f} {seg.end:.2f} {spk} {sentence}\n')

In [82]:
# Run Whisper on the GPU when torch can see one; otherwise fall back to the
# CPU so the cell still works on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)
result = model.transcribe(DUMMY_LONG)
print(result['text'])

 I think it's already produced a PowerPoint version. I think it's already done, actually. Go ahead and listen, work. I'll put it in the back. Okay, right. Okay. Right. Well, that's the kickoff meeting for our project. And this is just what we're going to be doing over the next 25 minutes. So, first of all, just to kind of make sure that we all know each other. I'm Laura and I'm the project manager. Great. Do you want to introduce yourself again? I'm David and I'm supposed to be a industrial designer. Okay. I'm Andrew and I'm a marketing expert. I'm Greg and I'm a hugeurant face. Great. Okay. So we're designing a new remote control. And I have to record he's here, actually. So that's David, Andrew and Craig, isn't it? And you're alive, don't you? Yes, we designed a new remote control. As you can see, it's supposed to be original, trendy and user-friendly. So that's kind of our brief. And so there are three different stages to the design. I'm not really sure what you guys have already re

In [83]:
# Read the Hugging Face access token from the HF_TOKEN environment variable
# instead of embedding it in the notebook. The token that used to be
# hard-coded here was readable by anyone with access to this file and should
# be revoked.
hf_token = os.environ.get("HF_TOKEN")
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=hf_token)
# NOTE(review): pyannote.audio appears to return None (after printing a hint)
# when the gated pipeline cannot be downloaded - confirm. Failing here gives a
# clearer error than "'NoneType' object is not callable" on the next line.
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization@2.1'. Set HF_TOKEN to a "
        "valid Hugging Face access token and accept the model's user conditions."
    )
diarization = pipeline(DUMMY_LONG)

# Persist the diarization in RTTM format.
with open("audio.rttm", "w") as rttm:
    diarization.write_rttm(rttm)

In [84]:
final_result = PyanWhisper.diarize_text(result, diarization)

# Write one entry per merged utterance: "<start time>\t<speaker>", the text on
# the next line, then a blank line. The file is only written, so plain 'w' is
# enough, and UTF-8 keeps non-ASCII transcript text independent of the
# platform's default encoding.
with open(new_filename + '.txt', 'w', encoding='utf-8') as f:
    for seg, spk, sent in final_result:
        # str(timedelta) renders the offset in seconds as H:MM:SS[.ffffff].
        start = str(datetime.timedelta(seconds=seg.start))
        f.write(f'{start}\t{spk}\n{sent}\n\n')