In [1]:
import IPython
from pvrecorder import PvRecorder
import wave
import struct
import time
import sounddevice as sd
import numpy as np
import matplotlib.pyplot as plt
import wandb

In [2]:
# Record a few seconds from input device 0 into `sound` (a flat list of samples).
fs = 16000           # sample rate (Hz) used when the samples are written/played back
record_seconds = 4   # wall-clock length of the test recording

recorder = PvRecorder(device_index=0, frame_length=512)
sound = []

try:
    recorder.start()
    print('recording')
    try:
        t_0 = time.time()
        while time.time() - t_0 < record_seconds:
            # Each read() returns one frame (frame_length samples) as a list
            frame = recorder.read()
            sound.extend(frame)
    finally:
        # Stop capturing even if read() raised or the cell was interrupted
        recorder.stop()
finally:
    # Always release the recorder so the audio device is not left locked
    recorder.delete()

In [3]:
path = 'test.wav'
# Write the recorded samples as a mono, 16-bit, uncompressed WAV file at `fs` Hz.
with wave.open(path, 'w') as wav_file:
    wav_file.setnchannels(1)
    wav_file.setsampwidth(2)
    wav_file.setframerate(fs)
    # Provisional frame count; writeframes() corrects the header to the real length
    wav_file.setnframes(512)
    wav_file.setcomptype("NONE", "NONE")
    packed_samples = struct.pack("h" * len(sound), *sound)
    wav_file.writeframes(packed_samples)

In [4]:
# Repeatedly attenuate the spectrum, transform back, and renormalise to 2**12.
data = np.array(sound).astype(np.int16)
for i in range(100):
    fft = np.fft.fft(data)*0.1
    # The spectrum comes from a real signal, so the inverse transform is real up to
    # rounding noise; take .real explicitly instead of relying on the implicit
    # complex -> int16 cast (which raises a ComplexWarning).
    data = np.fft.ifft(fft).real
    peak = data.max()
    if peak == 0:
        # A silent signal cannot be normalised; stop instead of dividing by zero
        data = data.astype(np.int16)
        break
    data = ((2**(16-4)) * data/peak).astype(np.int16)
   

In [5]:
data = np.array(sound).astype(np.int16)
fft = np.fft.fft(data)
fft

array([-16757737.             +0.j        ,
         -135839.30295385-147497.34922677j,
         -164858.91537085-291262.32565593j, ...,
          -41693.57867003+236245.17605371j,
         -164858.91537085+291262.32565593j,
         -135839.30295385+147497.34922677j])

In [6]:
plt.plot(data)

[<matplotlib.lines.Line2D at 0x117cdc2e0>]

In [7]:
plt.plot(fft)
  

[<matplotlib.lines.Line2D at 0x1300a27f0>]

In [8]:
roll = np.roll(fft,100)
plt.plot(roll)

[<matplotlib.lines.Line2D at 0x13015e940>]

In [9]:
ifft = np.fft.ifft(roll)
plt.plot(ifft)

[<matplotlib.lines.Line2D at 0x1301e0520>]

In [10]:
fft

array([-16757737.             +0.j        ,
         -135839.30295385-147497.34922677j,
         -164858.91537085-291262.32565593j, ...,
          -41693.57867003+236245.17605371j,
         -164858.91537085+291262.32565593j,
         -135839.30295385+147497.34922677j])

In [11]:
# Circularly shift the FFT bins by 10 positions and transform back to the time domain.
# NOTE(review): shifting the full two-sided spectrum presumably breaks its conjugate
# symmetry, so the ifft result would be genuinely complex — confirm this is intended.
roll = np.roll(fft,10)
data = np.fft.ifft(roll)
# Scale so the element returned by max() maps to 2**12, then cast to int16.
# NOTE(review): `data` is complex here; the int16 cast keeps only the real part
# (NumPy warns about this) — consider taking .real or np.abs explicitly.
data = ((2**(16-4)) * data/data.max()).astype(np.int16)

In [12]:
plt.plot(data*0.9)

[<matplotlib.lines.Line2D at 0x1301ff640>]

In [13]:
!env | grep env

In [14]:
from scipy.io.wavfile import read
import wave
import struct
import ipywidgets as widgets
import IPython
from pvrecorder import PvRecorder
import wave
import struct
import time
import sounddevice as sd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [15]:
# Start a single-channel recording of duration * fs frames at fs Hz via sounddevice.
# NOTE(review): duration * fs is 1.6e10 frames; the displayed result is float32 with one
# column, so a fully populated buffer would be roughly 64 GB. Presumably the intent is
# "keep recording in the background until stopped" — confirm, and consider a bounded
# duration or a streaming API instead.
duration = 1000000  # seconds
fs = 16000
myrecording = sd.rec(duration * fs, samplerate=fs, channels=1)

In [16]:
myrecording

array([[-0.00162337],
       [-0.00236033],
       [-0.00132801],
       ...,
       [ 0.        ],
       [ 0.        ],
       [ 0.        ]], dtype=float32)

In [17]:
class Response:
    """ipywidgets panel for recording, playing, deleting and saving labelled audio clips.

    Clips are held in memory in ``self.classes``: one dict per sound class with the
    keys ``'class'`` (integer label), ``'count'`` (running clip number) and one
    ``'record N'`` entry per clip recorded in this session. ``save`` writes those
    clips as WAV files under ``<root>/data/<class>/``.
    """

    def __init__(self,frame):
        """Store the recorder frame length and the default settings.

        Args:
            frame: number of samples requested from PvRecorder per ``read()`` call.
        """
        self.frame_length=frame
        self.path =  './test.wav'
        self.t_delta=1      # recording length in seconds
        self.fs = 16000     # sample rate (Hz) used for playback and for saved files
        self.root = Path('./')

    def set_time(self,delta_sec:int):
        """Set how long (in seconds) each button press records for."""
        self.t_delta=delta_sec

    def createdirs(self):
        """Create ``self.dir`` and one sub-directory per sound class if missing."""
        self.dir.mkdir(parents=True, exist_ok=True)
        for sound_class in self.class_dirs:
            (self.dir/sound_class).mkdir(exist_ok=True)

    def get_classes(self,sound_classes:tuple[str, ...]):
        """Register the sound classes, create their folders and show the widgets.

        ``'count'`` starts at the number of entries already present in the class
        folder, so clips recorded now continue the numbering of earlier sessions.

        Args:
            sound_classes: class names; each becomes a folder and a record button.
        """
        self.dir = self.root/'data'
        self.class_dirs = sound_classes
        self.createdirs()
        self.classes = {
            sound: {'class': idx, 'count': len(list((self.dir/sound).iterdir()))}
            for idx, sound in enumerate(sound_classes)
        }
        print(self.classes)
        self.set_widgets()

    def set_widgets(self):
        """Build one record button per class plus play/save/delete and display them."""
        actions = [widgets.Button(description=f'record {name}') for name in self.classes]
        for act in actions:
            act.on_click(self.record)

        save = widgets.Button(description='save \U0001F4BE')
        play = widgets.Button(description='play ▶️')
        delete = widgets.Button(description='delete \U0000274C')
        save.on_click(self.save)
        play.on_click(self.play)
        delete.on_click(self.drop_recording)

        # Shared output area: everything the callbacks print or raise is shown here
        self.out = widgets.Output()
        actions += [play, save, delete, self.out]
        self.vbox = widgets.VBox(children=tuple(actions))
        display(self.vbox)

    # The annotation is quoted so the class can be defined without evaluating `widgets`.
    def record(self,button:"widgets.Button"):
        """Record ``self.t_delta`` seconds for the class named on the pressed button.

        The clip is stored as an int16 array under ``'record N'`` and ``'count'`` is
        incremented only after the recording succeeded.
        """
        with self.out:
            # 'record <name>' -> '<name>'; maxsplit keeps names that contain spaces intact
            key = button.description.split(' ', 1)[1]
            self.state = key
            samples = []
            self.recorder = PvRecorder(device_index=0, frame_length=self.frame_length)
            try:
                self.recorder.start()
                try:
                    t_0 = time.time()
                    while time.time()-t_0 < self.t_delta:
                        samples.extend(self.recorder.read())
                finally:
                    self.recorder.stop()
            finally:
                # Release the audio device even when start()/read() fails
                self.recorder.delete()
            entry = self.classes[key]
            entry['count'] += 1
            # Build the array once instead of re-allocating it for every frame
            entry[f'record {entry["count"]}'] = np.array(samples, dtype=np.int16)

    def save(self,_):
        """Write every in-memory clip to ``<root>/data/<class>/<class>_record N.wav``."""
        path = self.root/'data'
        with self.out:
            for state, entry in self.classes.items():
                class_dir = path/state
                class_dir.mkdir(parents=True, exist_ok=True)
                for key, rec in entry.items():
                    if 'record' not in key:
                        continue
                    fid = class_dir/f'{state}_{key}.wav'
                    with wave.open(str(fid), 'w') as f:
                        # Mono, 16-bit, uncompressed; writeframes fixes the frame count
                        f.setparams((1, 2, self.fs, len(rec), "NONE", "NONE"))
                        f.writeframes(np.asarray(rec, dtype=np.int16).tobytes())

    def play(self,_):
        """Play all in-memory clips of the most recently recorded class."""
        with self.out:
            try:
                entry = self.classes[self.state]
                print(entry[f'record {entry["count"]}'])
                for key in entry:
                    if 'record' in key:
                        sd.play(entry[key], self.fs)
                        sd.wait()
            except (KeyError, AttributeError):
                # No class selected yet, or the latest clip is not in memory
                print('no recordings')

    def drop_recording(self,_):
        """Discard the newest in-memory clip of the most recently recorded class.

        Only the ``'record N'`` entry matching the current count is removed; the
        ``'class'`` and ``'count'`` bookkeeping keys are never dropped, even when the
        count comes from files on disk and nothing has been recorded this session.
        """
        with self.out:
            try:
                entry = self.classes[self.state]
            except (KeyError, AttributeError):
                print('no recordings')
                return
            latest = f'record {entry["count"]}'
            if latest in entry:
                del entry[latest]
                entry['count'] -= 1
            else:
                print('no recordings')

In [18]:
sample = Response(frame=512)
sample.set_time(delta_sec=1)
sample.get_classes(('yes','no','background'))

In [19]:
!tree data

In [20]:
import wave
import numpy as np
import matplotlib.pyplot as plt
def read_wav(fid:str):
    """Read every frame of a WAV file and return the samples as an int16 array.

    The raw bytes are reinterpreted as int16, so the file is assumed to hold 16-bit
    PCM; for multi-channel files the channels stay interleaved in the flat result.

    Args:
        fid: path of the WAV file to read.

    Returns:
        One-dimensional ``np.int16`` array (read-only view of the frame buffer).
    """
    # The context manager closes the file handle, which the bare open() never did
    with wave.open(fid) as ifile:
        audio = ifile.readframes(ifile.getnframes())

    # Reinterpret the byte buffer as int16 samples (no scaling to float is done here)
    return np.frombuffer(audio, dtype=np.int16)

In [21]:
!pwd

In [22]:
 !mkdir datasets
 !wget 'https://github.com/karoldvl/ESC-50/archive/master.zip' -P ~/pico/audio-classifier/datasets
 !unzip -q ~/datasets/master.zip -d ./datasets/

In [23]:
#  !mkdir datasets
#  !wget 'https://github.com/karoldvl/ESC-50/archive/master.zip' -P ~/pico/audio-classifier/datasets
#  !unzip -q ~/datasets/master.zip -d ./datasets/

In [24]:
import pandas as pd
esc50_csv = './datasets/ESC-50-master/meta/esc50.csv'
base_data_path = './datasets/ESC-50-master/audio/'

df = pd.read_csv(esc50_csv)
df.head(100)

             filename  fold  target        category  esc10  src_file take
0    1-100032-A-0.wav     1       0             dog   True    100032    A
1   1-100038-A-14.wav     1      14  chirping_birds  False    100038    A
2   1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A
3   1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B
4   1-101296-A-19.wav     1      19    thunderstorm  False    101296    A
..                ...   ...     ...             ...    ...       ...  ...
95   1-20133-A-39.wav     1      39  glass_breaking  False     20133    A
96   1-202111-A-3.wav     1       3             cow  False    202111    A
97   1-20545-A-28.wav     1      28         snoring  False     20545    A
98   1-20736-A-18.wav     1      18    toilet_flush  False     20736    A
99   1-208757-A-2.wav     1       2             pig  False    208757    A

[100 rows x 7 columns]

In [25]:
import tensorflow as tf
import tensorflow_io as tfio

In [26]:
sample_rate = 16000
chans = 1
sound = read_wav('./data/yes/yes_record 3.wav')
sound = sound.astype(np.float32, order='F') / 32768.0

In [27]:
file_contents = tf.io.read_file('./data/yes/yes_record 5.wav')
wav, sample_rate = tf.audio.decode_wav(file_contents, desired_channels=chans)
squeeze_wav = tf.squeeze(wav, axis=-1)
spect = tf.signal.stft(squeeze_wav, frame_length=512, frame_step=128)

In [28]:
wav

<tf.Tensor: shape=(16384, 1), dtype=float32, numpy=
array([[ 0.        ],
       [ 0.        ],
       [ 0.        ],
       ...,
       [-0.00704956],
       [-0.00631714],
       [-0.00570679]], dtype=float32)>

In [29]:
import scipy

In [30]:
import cmsisdsp
from numpy import pi as PI

# Spectrogram framing: each FFT window covers window_size samples and consecutive
# windows start step_size samples apart.
window_size = 512
step_size = 64

# Build the window 0.5 * (1 - cos(2*pi*i/window_size)) in float using the CMSIS-DSP
# cosine, then convert it to q15 so it can be multiplied with q15 audio.
hanning_window_f32 = np.zeros(window_size)
for i in range(window_size):
  hanning_window_f32[i] = 0.5 * (1 - cmsisdsp.arm_cos_f32(2 * PI * i / window_size ))
hanning_window_q15 = cmsisdsp.arm_float_to_q15(hanning_window_f32)
# Create and initialise the q15 real-FFT instance for window_size points.
# NOTE(review): the trailing 0 and 1 are presumably the inverse-FFT flag and the
# bit-reversal flag — confirm against the CMSIS-DSP docs. `status` is never checked.
rfftq15 = cmsisdsp.arm_rfft_instance_q15()
status = cmsisdsp.arm_rfft_init_q15(rfftq15, window_size, 0, 1)

def get_arm_spectrogram(waveform):
  """Compute a magnitude spectrogram of `waveform` with the CMSIS-DSP q15 routines.

  Uses the notebook-level `window_size`, `step_size`, `hanning_window_q15` and
  `rfftq15` defined alongside this function.

  Args:
    waveform: float audio samples passed to `arm_float_to_q15`.
      NOTE(review): presumably a 1-D array with values in [-1, 1) — confirm; the
      windowing below slices along the first axis only.

  Returns:
    Float array of shape (num_frames, window_size // 2 + 1), one row per window.
  """
  # Number of full windows that fit when advancing step_size samples each time
  num_frames = int(1 + (len(waveform) - window_size) // step_size)
  # Number of FFT bins kept per window
  fft_size = int(window_size // 2 + 1)
  # Convert the audio to q15
  waveform_q15 = cmsisdsp.arm_float_to_q15(waveform)
  # Create empty spectrogram array
  spectrogram_q15 = np.empty((num_frames, fft_size), dtype = np.int16)
  start_index = 0
  for index in range(num_frames):
    # Take the window from the waveform.
    window = waveform_q15[start_index:start_index + window_size]
    # Apply the Hanning Window.
    window = cmsisdsp.arm_mult_q15(window, hanning_window_q15)
    # Calculate the real FFT.
    # NOTE(review): an earlier comment here said "shift by 7 according to docs", but no
    # shift is applied in this code; the `* 512` on the return line presumably
    # compensates for the FFT's internal scaling — confirm against the CMSIS-DSP docs.
    window = cmsisdsp.arm_rfft_q15(rfftq15, window)
    # Take the absolute value of the FFT and add to the Spectrogram.
    spectrogram_q15[index] = cmsisdsp.arm_cmplx_mag_q15(window)[:fft_size]
    # Advance the window start by step_size samples (the hop between windows).
    start_index += step_size
  # Convert to numpy output ready for keras
  return cmsisdsp.arm_q15_to_float(spectrogram_q15).reshape(num_frames,fft_size) * 512

In [31]:
spct = get_arm_spectrogram(wav)

In [32]:
spect

<tf.Tensor: shape=(125, 257), dtype=complex64, numpy=
array([[-1.1825697e+00+0.0000000e+00j,  7.9305696e-01-3.3905774e-02j,
        -2.6453486e-01-1.7655689e-01j, ...,
         8.3236247e-03-5.0675496e-03j, -7.6637864e-03+3.2727597e-03j,
         5.6102276e-03+0.0000000e+00j],
       [-1.1571851e+00+0.0000000e+00j,  5.1088202e-01+7.2440378e-02j,
         8.9413024e-02+5.3528726e-02j, ...,
        -2.6634298e-03-8.3412975e-05j,  4.6035647e-03+5.5335462e-03j,
        -3.3004284e-03+0.0000000e+00j],
       [-1.1590379e+00+0.0000000e+00j,  5.0499785e-01-6.7214325e-02j,
         3.9693721e-02-8.3785877e-02j, ...,
        -2.0413008e-03+1.2983121e-03j,  4.9695969e-03-2.0664409e-03j,
        -7.0039034e-03+0.0000000e+00j],
       ...,
       [-1.9084167e+00+0.0000000e+00j,  1.0758590e+00+9.9544629e-02j,
        -1.5023670e-01-7.5578287e-02j, ...,
         5.2796602e-03+1.4986992e-03j, -6.4264536e-03-7.1737394e-03j,
         4.9974322e-03+0.0000000e+00j],
       [-1.7120256e+00+0.0000000e+00j,

In [33]:
plt.plot(spct)

[<matplotlib.lines.Line2D at 0x28d4a2820>,
 <matplotlib.lines.Line2D at 0x28d4a2880>,
 <matplotlib.lines.Line2D at 0x28d4927c0>,
 <matplotlib.lines.Line2D at 0x28d4927f0>,
 <matplotlib.lines.Line2D at 0x28d492820>,
 <matplotlib.lines.Line2D at 0x28d4a29a0>,
 <matplotlib.lines.Line2D at 0x28d4a2b50>,
 <matplotlib.lines.Line2D at 0x28d4a2c40>,
 <matplotlib.lines.Line2D at 0x28d4a2d30>,
 <matplotlib.lines.Line2D at 0x28d4a2e20>,
 <matplotlib.lines.Line2D at 0x28d4a2f10>,
 <matplotlib.lines.Line2D at 0x28d4a2850>,
 <matplotlib.lines.Line2D at 0x28d4b1040>,
 <matplotlib.lines.Line2D at 0x28d4b11f0>,
 <matplotlib.lines.Line2D at 0x28ca18fd0>,
 <matplotlib.lines.Line2D at 0x117b8e550>,
 <matplotlib.lines.Line2D at 0x117b8e520>,
 <matplotlib.lines.Line2D at 0x28d4b14c0>,
 <matplotlib.lines.Line2D at 0x28d4b15b0>,
 <matplotlib.lines.Line2D at 0x28d4b16a0>,
 <matplotlib.lines.Line2D at 0x28d4b1790>,
 <matplotlib.lines.Line2D at 0x28d4b1880>,
 <matplotlib.lines.Line2D at 0x28d4b1970>,
 <matplotli

In [34]:
!pip install pydub

In [35]:
fid = "./datasets/ESC-50-master/audio/1-100032-A-0.wav"
# times between which to extract the wave from
start = 0 # seconds
end = 1 # seconds

def segment(fid:str, chunk:int, n_segments:int=4):
    """Cut the start of a WAV file into consecutive, equally long segments.

    Segment ``k`` starts ``k * chunk`` seconds into the file, so segments are
    back-to-back for any ``chunk`` (previously the start ignored ``chunk`` and
    segments overlapped whenever ``chunk != 1``).

    Args:
        fid: path of the WAV file to read.
        chunk: length of each segment in seconds.
        n_segments: number of segments to extract (default 4, the previous
            hard-coded value).

    Returns:
        ``(data, (nchannels, sampwidth, framerate))`` where ``data`` is a list of
        ``n_segments`` raw frame byte strings. A segment that runs past the end of
        the file is shorter than requested.

    Raises:
        wave.Error: if a segment would start beyond the end of the file.
    """
    data = []
    with wave.open(fid, "rb") as infile:
        # Format information the caller needs to write the segments back out
        nchannels = infile.getnchannels()
        sampwidth = infile.getsampwidth()
        framerate = infile.getframerate()
        frames_per_segment = int(chunk * framerate)
        for seg in range(n_segments):
            # Jump to the first frame of this segment, then read one chunk's worth
            infile.setpos(seg * frames_per_segment)
            data.append(infile.readframes(frames_per_segment))
    return data, (nchannels, sampwidth, framerate)
        

In [36]:
# Split every ESC-50 clip into one-second WAV files, grouped by category:
#   ESC-50/<category>/<segment index>_<original filename>
# all_out collects, per source clip, the list of segment paths that were written.
in_paranet = Path('./datasets/ESC-50-master/audio/')
out_data_dir = Path('ESC-50')
out_data_dir.mkdir(exist_ok=True)
all_out = [ ]
for row in df.itertuples(index=False):
    out_dir = out_data_dir/row.category
    out_dir.mkdir(exist_ok=True)
    data, vals = segment(fid = str(in_paranet/row.filename),
       chunk=1)
    # Loop-local names are kept distinct from the notebook-level `sound`/`chans`
    # so running this cell does not overwrite them.
    n_channels, samp_width, rate = vals
    out_fids = [ ]
    for seg_idx, seg_bytes in enumerate(data):
        out_fid = out_dir/f'{seg_idx}_{row.filename}'
        out_fids.append(out_fid)
        with wave.open(str(out_fid), 'w') as outfile:
            # Copy the source format so the segment plays back unchanged
            outfile.setnchannels(n_channels)
            outfile.setsampwidth(samp_width)
            outfile.setframerate(rate)
            outfile.setnframes(int(len(seg_bytes) /  samp_width))
            outfile.writeframes(seg_bytes)
    all_out.append(out_fids)
        
    
    
    

In [37]:
sec_files = np.array(all_out).astype(str)
for files in range(sec_files.shape[-1]):
    df[f'{files}_{files+1}_sec']=sec_files[...,files].astype(str)

In [38]:
columns = list(df.columns)

In [39]:
table = wandb.Table(data=df,columns=columns)

In [40]:
for col in df.columns[-4:]:
    sounds = [wandb.Audio(fid) for fid in df[col].values]
    table.add_column(name=f'sound_{col}',data=sounds)

In [41]:
run = wandb.init(entity='tiny-ml',project = 'wake_word_detection')

In [42]:
run.log({'all_sound_table':table})

In [43]:
run.finish()