In [10]:
import wave
import librosa
import numpy as np
import soundfile as sf
import webrtcvad
from tqdm import tqdm
import pandas as pd
import glob
import os

In [11]:
rttm_path = "/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev"
wav_path = "/home/tuyendv/projects/speaker-diazation/diarization-data/wav"

rttm_files = glob.glob(os.path.join(rttm_path, "*.rttm"))
print(rttm_files)
wav_files = glob.glob(os.path.join(wav_path, "*/audio/*.wav"))
print(wav_files)

rttm2wav = {}
for rttm_file in rttm_files:
    rttm_name = rttm_file.split("/")[-1].split(".")[0]
    for wav_file in wav_files:
        if rttm_name == wav_file.split("/")[-3]:
            rttm2wav[rttm_name] = wav_file
print(rttm2wav)

['/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/TS3004a.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IS1008c.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IS1008d.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/ES2011a.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4011.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IS1008a.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/TS3004b.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4010.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4004.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/TS3004c.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4003.rttm', '/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4001.rtt

In [12]:
def load_wave(path):
    with wave.open(path, 'rb') as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
    return pcm_data, sample_rate
    
class Frame(object):
  def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(vad, frames, sample_rate):
    is_speech = []
    for frame in frames:
        is_speech.append(vad.is_speech(frame.bytes, sample_rate))
    return is_speech

def vad(file):
    audio, sample_rate = load_wave(file)
    vad = webrtcvad.Vad(3)
    frames = frame_generator(10, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(vad, frames, sample_rate)
    return segments

def speech(file):
  dummy = 0
  data = []
  segments = vad(file)
  audio, sr = librosa.load(file)
  for i in segments:
    if i == True:
      data.append(audio[dummy:dummy + 160])
      dummy = dummy + 160
    else:
      dummy = dummy + 160
  data = np.ravel(np.asarray(data))

  return data

In [13]:
def vad(audio, sample_rate):
    vad = webrtcvad.Vad(2)
    frames = frame_generator(10, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(vad, frames, sample_rate)
    return segments

In [14]:
def resegment(audio, framerate, start, segments):
    speech_in_frame, non_speech_in_frame = [], []
    timestamp_in_frame = int(start * framerate)
    duration_in_frame = 160
    new_segments = []
    
    total_nonspeech = 0
    prev = False
    for idx, curr in enumerate(segments):
        if curr == True or idx == 0:
            if prev == False:
                start = timestamp_in_frame
        elif curr == False:
            # print("False")
            if prev == True:
                end = timestamp_in_frame
                # print(f"start : {start} - end : {end}")
                new_segments.append((start/framerate, end/framerate))
                # speech_in_frame.append(list(audio[start:end]))
                speech_in_frame+=list(audio[start:end])
            total_nonspeech += duration_in_frame
        timestamp_in_frame = timestamp_in_frame + duration_in_frame
        prev = curr
    # speech_in_frame = np.ravel(np.asarray(speech_in_frame))
    return new_segments, total_nonspeech

In [15]:
def do_retaglabel(element, segments):
    new_segment = []
    temp = {}
    for start, end in segments:
        temp = dict(element)
        temp["st"] = round(start,2)
        temp["et"] = round(end, 2)
        temp["dur"] = round(end - start,2)
        new_segment.append(temp)
    return new_segment

In [16]:
def save_file(path, df):
    with open(path, "w", encoding="utf-8")as tmp:
        temp = df.to_csv(index=False, sep=" ", na_rep="<NA>", header=None)
        tmp.write(temp)
    print("saved: ", path)

In [17]:
def retaglabel(rttm_in):
    print(rttm_in)
    rttm_out = "/home/tuyendv/projects/speaker-diazation/data/dev"
    
    label = pd.read_csv(rttm_in, sep=" ", header=None)
    label.columns = ["utt","rec","sth_1","st","dur","sth_2","sth_3","spk_id","sth_4","sth_5"]
    label["et"]= label['st'] + label['dur']
    
    rttm_name = rttm_in.split("/")[-1].split(".")[0]
    wavs, sample_rate = load_wave(rttm2wav[rttm_name])
    audio, _ = librosa.load(rttm2wav[rttm_name], sr=sample_rate)
    
    total_nonspeech = 0
    new_segments = []
    for index in label.index:
        start = label['st'][index]
        end = label['et'][index]
        wav = wavs[int(start*sample_rate*2): int(end*sample_rate*2)]
        
        segments = vad(wav, sample_rate)
        segments, nonspeech = resegment(audio, sample_rate, start, segments)
        total_nonspeech += nonspeech
        segments = do_retaglabel(label.iloc[index], segments)
        new_segments+=segments
    print(total_nonspeech/16000)
    label = pd.DataFrame(new_segments)
    out_path = os.path.join(rttm_out, rttm_name+".rttm")
    save_file(out_path, label)

In [18]:
for rttm_file in tqdm(rttm_files):
    retaglabel(rttm_file)

  0%|          | 0/18 [00:00<?, ?it/s]

/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/TS3004a.rttm


  6%|▌         | 1/18 [00:00<00:15,  1.12it/s]

374.36
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/TS3004a.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IS1008c.rttm


 11%|█         | 2/18 [00:02<00:18,  1.18s/it]

128.8
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IS1008c.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IS1008d.rttm


 17%|█▋        | 3/18 [00:03<00:18,  1.25s/it]

119.28
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IS1008d.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/ES2011a.rttm


 22%|██▏       | 4/18 [00:04<00:15,  1.11s/it]

222.76
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/ES2011a.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4011.rttm


 28%|██▊       | 5/18 [00:06<00:20,  1.56s/it]

152.95
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IB4011.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IS1008a.rttm


 33%|███▎      | 6/18 [00:07<00:15,  1.29s/it]

82.35
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IS1008a.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/TS3004b.rttm


 39%|███▉      | 7/18 [00:09<00:16,  1.48s/it]

697.47
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/TS3004b.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4010.rttm


 44%|████▍     | 8/18 [00:12<00:19,  1.92s/it]

187.04
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IB4010.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4004.rttm


 50%|█████     | 9/18 [00:14<00:19,  2.11s/it]

157.52
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IB4004.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/TS3004c.rttm


 56%|█████▌    | 10/18 [00:16<00:16,  2.09s/it]

385.29
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/TS3004c.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4003.rttm


 61%|██████    | 11/18 [00:19<00:14,  2.13s/it]

125.65
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IB4003.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4001.rttm


 67%|██████▋   | 12/18 [00:20<00:11,  1.92s/it]

177.88
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IB4001.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IS1008b.rttm


 72%|███████▏  | 13/18 [00:21<00:08,  1.74s/it]

127.58
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IS1008b.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/ES2011d.rttm


 78%|███████▊  | 14/18 [00:23<00:06,  1.68s/it]

352.24
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/ES2011d.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/IB4002.rttm


 83%|████████▎ | 15/18 [00:25<00:04,  1.66s/it]

281.47
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/IB4002.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/TS3004d.rttm


 89%|████████▉ | 16/18 [00:26<00:03,  1.73s/it]

378.73
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/TS3004d.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/ES2011c.rttm


 94%|█████████▍| 17/18 [00:28<00:01,  1.66s/it]

242.53
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/ES2011c.rttm
/home/tuyendv/projects/speaker-diazation/diarization-data/rttms/dev/ES2011b.rttm


100%|██████████| 18/18 [00:29<00:00,  1.66s/it]

236.55
saved:  /home/tuyendv/projects/speaker-diazation/data/dev/ES2011b.rttm



