In [1]:
import os
import sys
from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
from sklearn import metrics
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
from math import ceil
# Restrict TensorFlow's (v1-compat) logger to ERROR-level messages to keep cell output quiet.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [43]:
@tf.function
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a float, single-channel waveform at 16 kHz.

    Args:
        filename: Path of the WAV file; handed to ``tf.io.read_file``.

    Returns:
        The decoded audio with its channel axis squeezed away, after being
        passed through ``tfio.audio.resample`` with ``rate_out=16000``.
    """
    raw_bytes = tf.io.read_file(filename)
    audio, source_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    # desired_channels=1 still leaves a trailing channel axis; remove it.
    mono = tf.squeeze(audio, axis=-1)
    # The resampler is given the file's own rate, cast to int64.
    source_rate = tf.cast(source_rate, dtype=tf.int64)
    return tfio.audio.resample(mono, rate_in=source_rate, rate_out=16000)

def speech_predicted_time(results, hop=None):
    """Pair every per-frame model output with the start time of its frame.

    Args:
        results: 1-D sequence of per-frame outputs. Either an object exposing
            ``.numpy()`` (e.g. the tensor returned by the model) or anything
            ``np.asarray`` accepts.
        hop: Time step between consecutive frames. When omitted, the
            notebook-level ``hop_length`` is used, so existing calls of the
            form ``speech_predicted_time(results)`` behave exactly as before.

    Returns:
        ``np.ndarray`` of shape ``(len(results), 2)``; column 0 holds the frame
        start times ``0, hop, 2*hop, ...`` and column 1 holds ``results``.
    """
    if hop is None:
        # Fall back to the global defined in the configuration cell.
        hop = hop_length
    values = results.numpy() if hasattr(results, "numpy") else np.asarray(results)
    n_frames = len(values)
    # endpoint=False yields the n_frames start times directly, instead of
    # generating n_frames + 1 points and discarding the last one.
    time = np.linspace(0, n_frames * hop, n_frames, endpoint=False)
    return np.stack([time, values], axis=1)

def speech_second2minute(speech_seconds, frames_per_minute=125):
    """Collapse per-frame speech labels into one 0/1 flag per minute.

    The input is cut into consecutive chunks of ``frames_per_minute`` frames
    and each chunk is scored with ``evaluate_minute``.

    Args:
        speech_seconds: 1-D sequence of per-frame labels (despite the name,
            one entry per model frame, not per second).
        frames_per_minute: Number of frames that make up one minute. The
            default of 125 is the value the original code hard-coded.

    Returns:
        Float ``np.ndarray`` of length ``ceil(len(speech_seconds) / frames_per_minute)``
        holding 1 for minutes in which ``evaluate_minute`` reports speech, else 0.
        A trailing partial minute is scored like any other chunk.
    """
    speech_num_minutes = ceil(len(speech_seconds) / frames_per_minute)
    speech_minute = np.zeros(speech_num_minutes)
    # Iterate over every minute, including the last one, and take the full
    # frames_per_minute frames of each (the slice end is exclusive).
    for i in range(speech_num_minutes):
        start_t = i * frames_per_minute
        this_min = speech_seconds[start_t:start_t + frames_per_minute]
        speech_minute[i] = evaluate_minute(this_min)
    return speech_minute

def evaluate_minute(speech, window_size=2):
    """Report whether a chunk contains a run of consecutive speech frames.

    Every sliding window of ``window_size`` frames is reduced to the product
    of its values; the chunk counts as speech when the sum of those products
    is positive. For 0/1 labels this means at least ``window_size``
    consecutive frames are 1.

    Args:
        speech: 1-D sequence of per-frame labels.
        window_size: Number of consecutive frames that must all be set.

    Returns:
        1 if the chunk qualifies as speech, otherwise 0. Chunks shorter than
        ``window_size`` cannot contain such a run and yield 0.
    """
    frames = np.asarray(speech)
    if len(frames) < window_size:
        # sliding_window_view would raise on a window larger than the input.
        return 0
    windows = np.lib.stride_tricks.sliding_window_view(frames, window_size)
    # Multiply across the whole window so window_size is honoured, not just
    # the first two columns.
    score = np.prod(windows, axis=1).sum()
    return 1 if score > 0 else 0

def see_specific_min(speech, timestamp, hop=0.48):
    """Return the per-frame labels of the minute that starts at ``timestamp``.

    Args:
        speech: 1-D sequence of per-frame labels.
        timestamp: Offset into the recording in minutes; it is multiplied by
            60 and divided by ``hop`` to obtain the first frame index.
        hop: Time step between consecutive frames. Defaults to 0.48, the
            value the original code hard-coded.

    Returns:
        The slice of ``speech`` covering one minute of frames
        (``round(60 / hop)``, i.e. 125 at the default hop) from that offset.
        The slice is shorter if the recording ends first.
    """
    # One minute of frames, matching the 125-frame minute used by
    # speech_second2minute at the default hop (previously 149 frames).
    frames_per_minute = int(round(60 / hop))
    start = int((timestamp * 60) / hop)
    return speech[start:start + frames_per_minute]

In [7]:
# Directory of the exported SavedModel that is loaded below.
saved_model_path = './my_models'
model = tf.saved_model.load(saved_model_path)
# Class labels and the label -> integer id mapping.
my_classes = ['not speech', 'speech']
map_class_to_id = {'not speech':0, 'speech':1}
# Time step between consecutive model outputs; read as a global by
# speech_predicted_time. Presumably seconds per frame (see_specific_min
# hard-codes the same 0.48) -- TODO confirm against the model's framing.
hop_length = 0.48

In [9]:
# Load one recording as 16 kHz mono audio and run the loaded model on it.
test_audio = load_wav_16k_mono('20210916T022746+1000_sensor_-27.4766+153.029.wav')
# NOTE(review): downstream code treats `results` as a 1-D sequence with one
# value per frame -- confirm this matches the saved model's output signature.
results = model(test_audio)

In [34]:
# Two-column array: column 0 = frame start time, column 1 = model output.
speech = speech_predicted_time(results)
# One 0/1 flag per minute, derived from the model-output column.
sMin = speech_second2minute(speech[:, 1])

In [45]:
# Per-frame model outputs starting at minute 36, for manual inspection.
thisMin = see_specific_min(speech[:,1], 36)

In [35]:
# Indices (minutes) whose per-minute flag equals 1.
np.argwhere(sMin ==1)

array([[10],
       [15],
       [32],
       [36],
       [37],
       [41],
       [47],
       [49],
       [50],
       [55]], dtype=int64)