In [7]:
# Test GPU Compatibility
import tensorflow as tf
import json

# Google Drive folder that holds the audio files. The trailing slash matters:
# the file paths below are built by plain string concatenation.
PATH = "/content/drive/MyDrive/audio_data/"

# Sample recordings used for initial testing
DUMMY = PATH + "anjali_zoom_test.m4a"
DUMMY_LONG = PATH + "50_min_test.wav"

# Stop right away unless TensorFlow reports the first GPU device
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

# Utility functions
def run_with_gpu(function, *extra_args, gpu=True, which_gpu="/GPU:0", **extra_kwargs):
    """Call ``function`` and return its result, optionally under ``tf.device``.

    Args:
        function: Callable to invoke.
        *extra_args: Positional arguments forwarded to ``function``.
        gpu: When true, the call is made inside ``with tf.device(which_gpu)``;
            when false, ``function`` is called directly.
        which_gpu: Device string handed to ``tf.device``.
        **extra_kwargs: Keyword arguments forwarded to ``function``. The names
            ``gpu`` and ``which_gpu`` are consumed by this wrapper itself and
            therefore cannot be forwarded.

    Returns:
        Whatever ``function`` returns.

    NOTE(review): ``tf.device`` scopes TensorFlow op placement; callables that
    do their work in another framework are presumably not moved to the GPU by
    this wrapper -- confirm their device placement separately.
    """
    if not gpu:
        return function(*extra_args, **extra_kwargs)
    with tf.device(which_gpu):
        return function(*extra_args, **extra_kwargs)




Found GPU at: /device:GPU:0


In [10]:
# Taken from https://github.com/yinruiqing/pyannote-whisper
from pyannote.core import Segment, Annotation, Timeline


def get_text_with_timestamp(transcribe_res):
    """Pair each transcription segment's text with its time span.

    Reads ``transcribe_res['segments']`` and, for every entry, builds a
    ``(Segment(start, end), text)`` tuple from the entry's ``'start'``,
    ``'end'`` and ``'text'`` keys. The tuples are returned as a list in the
    same order as the input segments.
    """
    return [
        (Segment(entry['start'], entry['end']), entry['text'])
        for entry in transcribe_res['segments']
    ]


def add_speaker_info_to_text(timestamp_texts, ann):
    """Attach a speaker to every ``(segment, text)`` pair.

    For each pair the speaker is ``ann.crop(segment).argmax()``. Returns a
    list of ``(segment, speaker, text)`` tuples in input order.
    """
    return [
        (span, ann.crop(span).argmax(), words)
        for span, words in timestamp_texts
    ]


def merge_cache(text_cache):
    """Collapse cached ``(segment, speaker, text)`` items into a single tuple.

    The merged segment runs from the first item's ``start`` to the last
    item's ``end``, the speaker is taken from the first item, and the text is
    every item's last element concatenated with no separator.
    ``text_cache`` must be non-empty because its first and last items are
    indexed.
    """
    joined = ''.join(entry[-1] for entry in text_cache)
    first, last = text_cache[0], text_cache[-1]
    return Segment(first[0].start, last[0].end), first[1], joined


PUNC_SENT_END = ['.', '?', '!']


def merge_sentence(spk_text):
    """Group ``(segment, speaker, text)`` items into sentence-level entries.

    Items are buffered until one of two things happens:

    * the speaker differs from the previous item's speaker while the buffer
      is non-empty: the buffer is merged and emitted, and the current item
      starts a new buffer;
    * the item's text ends with a character from ``PUNC_SENT_END``: the item
      is added to the buffer, which is then merged and emitted.

    Whatever remains buffered after the last item is emitted as well.

    Args:
        spk_text: Iterable of ``(segment, speaker, text)`` tuples.

    Returns:
        List of the tuples produced by ``merge_cache`` for each emitted
        buffer, in order.
    """
    merged_spk_text = []
    pre_spk = None
    text_cache = []
    for seg, spk, text in spk_text:
        if spk != pre_spk and pre_spk is not None and len(text_cache) > 0:
            # Speaker changed with text still buffered: emit the previous
            # speaker's text and start over with the current item.
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = [(seg, spk, text)]
        elif text and text[-1] in PUNC_SENT_END:
            # The `text and` guard keeps an empty text from raising
            # IndexError; empty text is simply buffered by the branch below.
            text_cache.append((seg, spk, text))
            merged_spk_text.append(merge_cache(text_cache))
            text_cache = []
        else:
            text_cache.append((seg, spk, text))
        # Every branch records the current speaker for the next comparison.
        pre_spk = spk
    if len(text_cache) > 0:
        merged_spk_text.append(merge_cache(text_cache))
    return merged_spk_text


def diarize_text(transcribe_res, diarization_result):
    """Combine a transcription with a diarization into speaker-labelled sentences.

    Chains the three helpers: ``get_text_with_timestamp`` on
    ``transcribe_res``, then ``add_speaker_info_to_text`` with
    ``diarization_result``, then ``merge_sentence``. Returns whatever
    ``merge_sentence`` returns.
    """
    return merge_sentence(
        add_speaker_info_to_text(
            get_text_with_timestamp(transcribe_res),
            diarization_result,
        )
    )


def write_to_txt(spk_sent, file):
    """Write one text line per ``(segment, speaker, sentence)`` tuple.

    Each line has the form ``"<start> <end> <speaker> <sentence>"`` with the
    segment's ``start`` and ``end`` formatted to two decimals.

    Args:
        spk_sent: Iterable of ``(segment, speaker, sentence)`` tuples; each
            segment must expose ``start`` and ``end``.
        file: Path of the output file. It is created or truncated.
    """
    # Pin the encoding so non-ASCII transcript text does not depend on the
    # platform's default encoding.
    with open(file, 'w', encoding='utf-8') as fp:
        fp.writelines(
            f'{seg.start:.2f} {seg.end:.2f} {spk} {sentence}\n'
            for seg, spk, sentence in spk_sent
        )

In [11]:
import whisper
# Load the Whisper checkpoint named "base".
model = whisper.load_model("base")

# Transcribe the DUMMY clip through the run_with_gpu wrapper.
result = run_with_gpu(model.transcribe, DUMMY)

# Print only the transcript stored under the "text" key.
print(result["text"])


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 50.8MiB/s]


 Hi, I'm Anjali Sebastian and for my research presentation, I have selected the paper, Predictibility of Web Server Traffic Conjession written by Y. Barashnikov et al. Written in 2005. The location to the reference in the textbook is on page 417, and which discusses dynamically creating replicas when demand goes up. There are often large swings in demand for web server content, and these bursts of traffic are usually called hotspots or flash crowds. So the paper actually looks at investigating the potential for predicting these hotspots in advance. The idea here is to assess the feasibility of a prediction algorithm, not specifically deliver a finished product. The approach taken is predictive versus reactive. So when you're talking about a reactive approach, you are already experiencing the hotspot and are trying to mitigate it by either throttling requests or you know, farming new servers. With this predictive approach, however, you could also use it in conjunction with a reactive ap

In [None]:
# Display the transcript string held by the current `result`.
result["text"]

" Hi, I'm Anjali Sebastian and for my research presentation, I have selected the paper, Predictibility of Web Server Traffic Conjession written by Y. Barashnikov et al. Written in 2005. The location to the reference in the textbook is on page 417, and which discusses dynamically creating replicas when demand goes up. There are often large swings in demand for web server content, and these bursts of traffic are usually called hotspots or flash crowds. So the paper actually looks at investigating the potential for predicting these hotspots in advance. The idea here is to assess the feasibility of a prediction algorithm, not specifically deliver a finished product. The approach taken is predictive versus reactive. So when you're talking about a reactive approach, you are already experiencing the hotspot and are trying to mitigate it by either throttling requests or you know, farming new servers. With this predictive approach, however, you could also use it in conjunction with a reactive a

In [12]:
# Transcribe the long recording. This rebinds `result`, so the transcription
# of DUMMY is no longer available under that name afterwards.
result = run_with_gpu(model.transcribe, DUMMY_LONG)

In [None]:
# Display the transcript string held by the current `result`.
result["text"]

" Okay. Okay. Does anyone want to see Steve's feedback from the specification? Is there much more in it than he said yesterday? Not really. Just what he's talking about, duplication of effort. Like duplication of effort and stuff. Yeah, seeing that we should maybe think about having a prototype or a weak sex, which is necessary. So, this is probably prioritized or... Yeah, I'd say if for the prototype we feature like wherever possible, chunking the stuff that we have pre-anotated and stuff. And for the stuff that we don't have pre-anotated, right, like a stupid baseline, then we should probably be able to... Basically, that means we focus on the interface first, so that we take the ready-made parts and just see how we get them work together in the interface the way we want them. And then we have a working prototype and then we can go back and replace pieces either by our own components or by more sophisticated components of our own. So, it's probably feasible. I think it's a way this w

In [13]:
import getpass
import os

from pyannote.audio import Pipeline

# Read the Hugging Face access token from the HF_TOKEN environment variable,
# prompting for it as a fallback, so the secret is never stored in the
# notebook. Any token that was previously saved in this cell should be
# revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Hugging Face access token: ")

pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token=hf_token)

# Run the diarization pipeline on the long recording via run_with_gpu.
diarization = run_with_gpu(pipeline, DUMMY_LONG)

# Save the diarization to "audio.rttm" using its write_rttm method.
with open("audio.rttm", "w") as rttm:
    diarization.write_rttm(rttm)

In [14]:
# Merge the transcription in `result` with `diarization` into
# speaker-labelled sentences.
final_result = diarize_text(result, diarization)

# One "start end speaker sentence" row per merged sentence.
for seg, spk, sent in final_result:
    print(f'{seg.start:.2f} {seg.end:.2f} {spk} {sent}')

0.00 7.00 SPEAKER_05  Okay.
7.00 17.00 SPEAKER_04  Okay. Does anyone want to see Steve's feedback from the specification?
17.00 21.00 SPEAKER_00  Is there much more in it than he said yesterday?
21.00 28.00 SPEAKER_04  Not really. Just what he's talking about, duplication of effort.
28.00 33.00 SPEAKER_04  Like duplication of effort and stuff.
33.00 42.00 SPEAKER_04  Yeah, seeing that we should maybe think about having a prototype or a weak sex, which is necessary.
42.00 46.00 SPEAKER_04  So, this is probably prioritized or...
46.00 56.00 SPEAKER_00  Yeah, I'd say if for the prototype we feature like wherever possible, chunking the stuff that we have pre-anotated and stuff.
56.00 61.00 SPEAKER_00  And for the stuff that we don't have pre-anotated, right, like a stupid baseline, then we should probably be able to...
61.00 69.00 SPEAKER_00  Basically, that means we focus on the interface first, so that we take the ready-made parts and just see how we get them work together in the interfa