In [1]:
import whisper
import datetime

import subprocess

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Speaker-embedding model. Use the GPU when torch can see one and fall back
# to the CPU otherwise, so this cell also runs on machines without CUDA.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Input recording and the number of clusters (speakers) to split it into.
path = 'afjiv.wav'
num_speakers = 4

print ('__________________________________________________________')
# Fail before the expensive model load and transcription: the duration probe
# further down reads the file with the `wave` module, so the input has to be
# a WAV file.
if not path.lower().endswith('.wav'):
  raise ValueError('error !!!! expected a .wav file, got: ' + repr(path))

model = whisper.load_model("large")

result = model.transcribe(path)
segments = result["segments"]

# Length of the recording in seconds (frame count / frame rate).
with wave.open(path, 'rb') as wav_file:
  frames = wav_file.getnframes()
  rate = wav_file.getframerate()
  duration = frames / float(rate)

audio = Audio()

def segment_embedding(segment):
  """Return the output of `embedding_model` for one transcribed segment.

  The file at `path` is cropped to the segment's time span and the cropped
  waveform is passed to `embedding_model` with a leading axis added.

  `segment` is a mapping with "start" and "end" entries.
  """
  # Whisper overshoots the end timestamp in the last segment, so the end
  # is capped at the recording's duration.
  clip = Segment(segment["start"], min(duration, segment["end"]))
  waveform, _ = audio.crop(path, clip)
  return embedding_model(waveform[None])

# One 192-wide row per transcribed segment, filled with its embedding.
embeddings = np.zeros(shape=(len(segments), 192))
for row, seg in enumerate(segments):
  embeddings[row] = segment_embedding(seg)

# Replace NaN entries before clustering.
embeddings = np.nan_to_num(embeddings)

# Cluster the rows into `num_speakers` groups and tag every segment with a
# 1-based speaker label derived from its cluster index.
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
for seg, label in zip(segments, labels):
  seg["speaker"] = 'SPEAKER ' + str(label + 1)

def time(secs):
  """Convert a number of seconds into a `datetime.timedelta`.

  The value is rounded to a whole number of seconds with the built-in
  `round` before the timedelta is built.
  """
  whole_seconds = round(secs)
  return datetime.timedelta(seconds=whole_seconds)

# Write the transcript: a "<speaker> <h:mm:ss>" header line each time the
# speaker changes, followed by the text of that speaker's segments.
# `with` closes the file even if a write raises, and UTF-8 is requested
# explicitly so non-ASCII text does not depend on the platform's default
# encoding.
with open("transcript.txt", "w", encoding="utf-8") as f:
  for (i, segment) in enumerate(segments):
    if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
      f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
    text = segment["text"]
    # Drop the single leading space if there is one; never eat a real
    # first character when the text has no leading space.
    if text.startswith(' '):
      text = text[1:]
    f.write(text + ' ')

__________________________________________________________


In [11]:
# For every speaker label, remember the [start, end] times of the first
# segment attributed to that speaker.
identity_speaker = {}
for segment in segments:
  speaker = segment["speaker"]
  if speaker in identity_speaker:
    continue
  identity_speaker[speaker] = [time(segment["start"]), time(segment["end"])]

print(identity_speaker)
# Re-order the entries by the integer that follows the first 8 characters
# of each label.
by_number = sorted(identity_speaker.items(), key=lambda item: int(item[0][8:]))
identity_speaker = dict(by_number)
print(identity_speaker)

{'SPEAKER 2': [datetime.timedelta(0), datetime.timedelta(seconds=9)], 'SPEAKER 3': [datetime.timedelta(seconds=38), datetime.timedelta(seconds=43)], 'SPEAKER 1': [datetime.timedelta(seconds=80), datetime.timedelta(seconds=86)], 'SPEAKER 4': [datetime.timedelta(seconds=118), datetime.timedelta(seconds=124)]}
{'SPEAKER 1': [datetime.timedelta(seconds=80), datetime.timedelta(seconds=86)], 'SPEAKER 2': [datetime.timedelta(0), datetime.timedelta(seconds=9)], 'SPEAKER 3': [datetime.timedelta(seconds=38), datetime.timedelta(seconds=43)], 'SPEAKER 4': [datetime.timedelta(seconds=118), datetime.timedelta(seconds=124)]}


In [12]:
# Print every segment dict together with its position in `segments`.
for idx, seg in enumerate(segments):
    print('i:', idx)
    print(seg, '\n')

i: 0
{'id': 0, 'seek': 0, 'start': 0.0, 'end': 9.0, 'text': " I think if you're a leader and you don't understand the terms that you're using,", 'tokens': [50364, 286, 519, 498, 291, 434, 257, 5263, 293, 291, 500, 380, 1223, 264, 2115, 300, 291, 434, 1228, 11, 50814], 'temperature': 0.0, 'avg_logprob': -0.30656783077694955, 'compression_ratio': 1.821011673151751, 'no_speech_prob': 0.021781528368592262, 'speaker': 'SPEAKER 2'} 

i: 1
{'id': 1, 'seek': 0, 'start': 9.0, 'end': 13.200000000000001, 'text': " that's probably the first start. It's really important that as a leader in the", 'tokens': [50814, 300, 311, 1391, 264, 700, 722, 13, 467, 311, 534, 1021, 300, 382, 257, 5263, 294, 264, 51024], 'temperature': 0.0, 'avg_logprob': -0.30656783077694955, 'compression_ratio': 1.821011673151751, 'no_speech_prob': 0.021781528368592262, 'speaker': 'SPEAKER 2'} 

i: 2
{'id': 2, 'seek': 0, 'start': 13.200000000000001, 'end': 17.16, 'text': ' organization you understand what digitization means, yo