In [27]:
import collections
import contextlib
import sys
import wave
import os
import pandas as pd
import numpy as np
from pydub import AudioSegment

import webrtcvad


def read_wave(path):
    """Reads a .wav file.

    Takes the path, and returns (PCM audio data, sample rate).
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate


def write_wave(path, audio, sample_rate):
    """Writes a .wav file.

    Takes path, PCM audio data, and sample rate.
    """
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)


class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.

    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.

    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Filters out non-voiced audio frames.

    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.

    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.

    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.

    Arguments:

    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).

    Returns: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        # sys.stdout.write('1' if is_speech else '0')
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    # if triggered:
        # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    # sys.stdout.write('\n')
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])


In [2]:
# noisered_path = "/data/zaki/UASpeech/audio/noisereduce"
# aggresiveness = 3

# failed = []
# for p in os.listdir(noisered_path):
#     output_dir = f"preprocess/{p}"
#     os.makedirs(output_dir, exist_ok=True)
#     for f in os.listdir(f"{noisered_path}/{p}"):
#         if not f.endswith('.wav') or f.startswith("._"):
#             continue
        
#         vad = webrtcvad.Vad(aggresiveness)
#         input_path = f"{noisered_path}/{p}/{f}"
#         audio, sample_rate = read_wave(input_path)
#         frames = frame_generator(30, audio, sample_rate)
#         frames = list(frames)
#         segment = list(vad_collector(sample_rate, 30, 300, vad, frames))
        
#         if len(segment) != 1:
#             failed.append((input_path, len(segment)))
#             # print(input_path, len(segment))
#             continue

#         output_path = f"{output_dir}/{f}"
#         write_wave(output_path, segment[0], sample_rate)
#     print(f"{output_dir} DONE")

preprocess/M12 DONE
preprocess/CM12 DONE
preprocess/F03 DONE
preprocess/CF02 DONE
preprocess/CM05 DONE
preprocess/CM10 DONE
preprocess/M11 DONE
preprocess/M14 DONE
preprocess/M07 DONE
preprocess/CM13 DONE
preprocess/M16 DONE
preprocess/F02 DONE
preprocess/CM01 DONE
preprocess/CF04 DONE
preprocess/M10 DONE
preprocess/F05 DONE
preprocess/CM08 DONE
preprocess/CM09 DONE
preprocess/M04 DONE
preprocess/CF05 DONE
preprocess/M08 DONE
preprocess/F04 DONE
preprocess/CM06 DONE
preprocess/CM04 DONE
preprocess/M01 DONE
preprocess/CF03 DONE
preprocess/M09 DONE
preprocess/M05 DONE


In [None]:
# with open("failed_2.csv", "w") as ff:
#     for x in sorted(failed):
#         f, sc = x
#         ff.write(f"{f},{sc}\n")

# with open("failed_2_cnt.csv", "w") as ff:
#     for k,v in cnt.items():
#         ff.write(f"{k},{v}\n")

In [87]:
def process(input_path, output_path, aggresiveness):
    vad = webrtcvad.Vad(aggresiveness)
    audio, sample_rate = read_wave(input_path)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = list(vad_collector(sample_rate, 30, 300, vad, frames))
    print(len(segments))
    # print(segments)
    return segments
    # if len(segment) != 1:
    #     print(input_path)
    #     return
    # for i, segment in enumerate(segments):
    #     path = output_path
    #     print(' Writing %s' % (path,))
    #     write_wave(path, segment, sample_rate)

file_path = "/data/zaki/UASpeech/audio/noisereduce/F04/F04_B3_CW30_M4.wav"
output_path = "chunk6-00.wav"
aggresiveness = 3


segments = process(file_path, output_path, aggresiveness)

for s in segments:
    audio = AudioSegment(data=s, sample_width=2, frame_rate=16000, channels=1)
    r = audio.get_array_of_samples()
    d = audio.duration_seconds
    print(len(r), d)
    
# audio = AudioSegment(data=segments[0], sample_width=2, frame_rate=16000, channels=1)
audio = AudioSegment(data=segments[1], sample_width=2, frame_rate=16000, channels=1)
# audio = AudioSegment(data=segments[2], sample_width=2, frame_rate=16000, channels=1)
# audio = AudioSegment(data=segments[3], sample_width=2, frame_rate=16000, channels=1)
# audio = AudioSegment.from_file(file_path)
# print(len(audio.get_array_of_samples()))
# print(len(audio))
# print(audio.duration_seconds)
audio

2
15360 0.96
12000 0.75


In [798]:
pc = {}
sc = {}
pwc = {}
for f, c in sorted(dict(failed).items()):
    split = f.replace(noisered_path, "").split("/")

    p = split[1]
    sc[c] = 1 if sc.get(c, None) is None else sc[c] + 1
    pc[p] = 1 if pc.get(p, None) is None else pc[p] + 1
    
print(sorted(pc.items()))
print(sc)

[('CF02', 115), ('CF03', 136), ('CF04', 340), ('CF05', 43), ('CM01', 440), ('CM04', 616), ('CM05', 85), ('CM06', 32), ('CM08', 12), ('CM09', 43), ('CM10', 152), ('CM12', 102), ('CM13', 12), ('F02', 1906), ('F03', 544), ('F04', 302), ('F05', 157), ('M01', 1176), ('M04', 74), ('M05', 383), ('M07', 402), ('M08', 211), ('M09', 81), ('M10', 22), ('M11', 255), ('M12', 1476), ('M14', 151), ('M16', 303)]
{2: 7376, 0: 331, 3: 1094, 7: 53, 5: 153, 6: 73, 4: 353, 8: 63, 9: 43, 11: 6, 10: 10, 17: 4, 18: 4, 16: 1, 12: 1, 19: 2, 15: 2, 21: 1, 20: 1}


In [None]:
pwc = {}
for f, c in sorted(dict(failed).items()):
    # if "/M12_B3_UW56" in f:
    #     print(f, c)
    
    if  c != 2:
        continue
    p, x = f.replace(noisered_path, "").split("/")[1:3]
    
    ss = x.replace(p, "").split("_")
    pw = f"{ss[1]}_{ss[2]}"
    if pwc.get(p, None) is None:
        pwc[p] = {
            pw: [c]
        }
    else:
        if pwc[p].get(pw, None) is None:
            pwc[p][pw] = [c]
        else:
            pwc[p][pw].append(c)

for k, v in pwc.items():
    for kk, vv in v.items():
        # if len(vv) > 1:
            print(k + "_" + kk, len(vv))

In [12]:
len(os.listdir("/data/zaki/UASpeech/audio/noisereduce/M12"))

4590

Exclusion 1: Segments more than 3

In [None]:
# with open("exclude_path.csv", "w") as ff:
#     excluded = [f for f, c in sorted(dict(failed).items()) if c >= 10]
#     ff.write("\n".join(excluded))

Preprocess 0, 2, 3 segments:
- Try 10 times for each aggressiveness 3,2,1
- If produce 1 segment, return the segment
- If produce 0 segment

In [None]:
import shutil

def process_with_retry(input_path):
    aggresiveness = 3
    rep = 10

    segment_map = {}
    while aggresiveness > 0:
        while rep > 0:
            vad = webrtcvad.Vad(aggresiveness)
            audio, sample_rate = read_wave(input_path)
            frames = frame_generator(30, audio, sample_rate)
            frames = list(frames)
            segment = list(vad_collector(sample_rate, 30, 300, vad, frames))

            if len(segment) == 1:
                return segment, sample_rate
            if len(segment) not in segment_map.keys():
                segment_map[len(segment)] = segment

            rep -= 1
        aggresiveness -= 1
            
    min_len = min(segment_map.keys())
    return segment_map[min_len], sample_rate
            
failed_retry = []
for f, c in sorted(dict(failed).items()):
    if c > 3:
        continue

    segment, sample_rate = process_with_retry(f)
    
    split = f.replace(noisered_path, "").split("/")
    p = split[1]
    output_dir = f"preprocess-failed/{p}"
    os.makedirs(output_dir, exist_ok=True)


    if len(segment) == 1:
        output_path = f"{output_dir}/{split[2]}"
        write_wave(output_path, segment[0], sample_rate)
    elif len(segment) == 0:
        try:
            shutil.copy2(f, output_dir)
        except Exception as e:
            failed_retry.append((f, len(segment)))
    elif len(segment) == 2:
        failed_retry.append((f, len(segment)))
        audio1 = AudioSegment(data=segment[0], sample_width=2, frame_rate=16000, channels=1)
        audio2 = AudioSegment(data=segment[1], sample_width=2, frame_rate=16000, channels=1)

        os.makedirs(f"preprocess-failed-2/{p}", exist_ok=True)
        output_path = f"preprocess-failed-2/{p}/{split[2]}"
        if audio1.duration_seconds > audio2.duration_seconds:
            write_wave(output_path, segment[0], sample_rate)
        else:
            write_wave(output_path, segment[1], sample_rate)
    elif len(segment) > 2:
        failed_retry.append((f, len(segment)))

    print(f"Processed {split[2]}: {len(segment)}")

In [97]:
file_path = "/data/zaki/UASpeech/preprocess-failed-2/M11/M11_B1_CW89_M7.wav"
audio1 = AudioSegment.from_file(file_path)
# print(audio1.duration_seconds)
# file_path = "/data/zaki/UASpeech/preprocess-failed-2/F04/F04_B3_CW30_M3.wav"
# audio2 = AudioSegment.from_file(file_path)
# print(audio2.duration_seconds)
# file_path = "/data/zaki/UASpeech/preprocess-failed-2/F04/F04_B3_CW30_M5.wav"
# audio3 = AudioSegment.from_file(file_path)
# print(audio3.duration_seconds)
# file_path = "/data/zaki/UASpeech/preprocess-failed-2/F04/F04_B3_CW30_M7.wav"
# audio4 = AudioSegment.from_file(file_path)
# print(audio4.duration_seconds)
# file_path = "/data/zaki/UASpeech/preprocess-failed-2/F04/F04_B3_CW30_M8.wav"
# audio5 = AudioSegment.from_file(file_path)
# print(audio5.duration_seconds)
audio1

In [43]:
dir = "preprocess-failed-2"
files = []
durations = []
for p in os.listdir(dir):
    for f in os.listdir(f"{dir}/{p}"):
        durations.append(AudioSegment.from_file(f"{dir}/{p}/{f}").duration_seconds)
        files.append(f)

In [44]:
np.min(durations), np.max(durations), files[np.argmin(durations)], files[np.argmax(durations)]

(0.6, 10.38, 'CM01_B2_CW51_M4.wav', 'M05_B1_UW41_M7.wav')

In [45]:
np.mean(durations)

1.6166648590021693

In [46]:
np.histogram(durations, bins=[0,1,2,3,4,5,10,20], range=None)

(array([1826, 3963, 1083,  302,   93,  105,    4]),
 array([ 0,  1,  2,  3,  4,  5, 10, 20]))

In [47]:
len(files)

7376

In [40]:
dir = "preprocess"
files = []
durations = []
for p in os.listdir(dir):
    for f in os.listdir(f"{dir}/{p}"):
        durations.append(AudioSegment.from_file(f"{dir}/{p}/{f}").duration_seconds)
        files.append(f)
np.histogram(durations, bins=[0,1,2,3,4,5,10,20], range=None)

(array([59465, 67521,  5238,  1042,   291,   149,    13]),
 array([ 0,  1,  2,  3,  4,  5, 10, 20]))

In [41]:
np.mean(durations), np.median(durations)

(1.1647109236533326, 1.05)

In [42]:
len(files)

133719

In [48]:
dir = "preprocess-failed"
files = []
durations = []
for p in os.listdir(dir):
    for f in os.listdir(f"{dir}/{p}"):
        durations.append(AudioSegment.from_file(f"{dir}/{p}/{f}").duration_seconds)
        files.append(f)
np.histogram(durations, bins=[0,1,2,3,4,5,10,20], range=None)

(array([  0, 238,  70,  11,   1,   8,   3]),
 array([ 0,  1,  2,  3,  4,  5, 10, 20]))

In [49]:
len(files)

331

In [56]:
import shutil

target_dir = "preprocess"
source_dir = "preprocess-failed-2"
for p in os.listdir(source_dir):
    for f in os.listdir(f"{source_dir}/{p}"):
        print(f"Copying {source_dir}/{p}/{f} to {target_dir}/{p}/{f}")
        shutil.copy2(f"{source_dir}/{p}/{f}", f"{target_dir}/{p}")

Copying preprocess-failed-2/M12/M12_B1_UW21_M3.wav to preprocess/M12/M12_B1_UW21_M3.wav
Copying preprocess-failed-2/M12/M12_B1_UW73_M5.wav to preprocess/M12/M12_B1_UW73_M5.wav
Copying preprocess-failed-2/M12/M12_B2_UW96_M3.wav to preprocess/M12/M12_B2_UW96_M3.wav
Copying preprocess-failed-2/M12/M12_B1_CW57_M8.wav to preprocess/M12/M12_B1_CW57_M8.wav
Copying preprocess-failed-2/M12/M12_B1_CW94_M3.wav to preprocess/M12/M12_B1_CW94_M3.wav
Copying preprocess-failed-2/M12/M12_B1_UW100_M5.wav to preprocess/M12/M12_B1_UW100_M5.wav
Copying preprocess-failed-2/M12/M12_B2_C19_M5.wav to preprocess/M12/M12_B2_C19_M5.wav
Copying preprocess-failed-2/M12/M12_B1_UW3_M6.wav to preprocess/M12/M12_B1_UW3_M6.wav
Copying preprocess-failed-2/M12/M12_B3_LU_M7.wav to preprocess/M12/M12_B3_LU_M7.wav
Copying preprocess-failed-2/M12/M12_B1_CW49_M4.wav to preprocess/M12/M12_B1_CW49_M4.wav
Copying preprocess-failed-2/M12/M12_B3_UW44_M5.wav to preprocess/M12/M12_B3_UW44_M5.wav
Copying preprocess-failed-2/M12/M12_B3

In [57]:
dir = "preprocess"
files = []
durations = []
for p in os.listdir(dir):
    for f in os.listdir(f"{dir}/{p}"):
        durations.append(AudioSegment.from_file(f"{dir}/{p}/{f}").duration_seconds)
        files.append(f)
np.histogram(durations, bins=[0,1,2,3,4,5,10,20], range=None)

(array([61291, 71722,  6391,  1355,   385,   262,    20]),
 array([ 0,  1,  2,  3,  4,  5, 10, 20]))

In [58]:
len(files)

141426

In [59]:
dir = "audio/noisereduce"
files = []
durations = []
for p in os.listdir(dir):
    for f in os.listdir(f"{dir}/{p}"):
        durations.append(AudioSegment.from_file(f"{dir}/{p}/{f}").duration_seconds)
        files.append(f)
np.histogram(durations, bins=[0,1,2,3,4,5,10,20], range=None)

CouldntDecodeError: Decoding failed. ffmpeg returned error code: 1

Output from ffmpeg/avlib:

ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/tmp/build/80754af9/ffmpeg_1587154242452/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placeho --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100
  libavdevice    58.  8.100 / 58.  8.100
  libavfilter     7. 57.100 /  7. 57.100
  libavresample   4.  0.  0 /  4.  0.  0
  libswscale      5.  5.100 /  5.  5.100
  libswresample   3.  5.100 /  3.  5.100
  libpostproc    55.  5.100 / 55.  5.100
audio/noisereduce/CM12/._CM12_B1_C2_M8.wav: Invalid data found when processing input
