### The following steps are data preprocessing; to inspect the autoencoder model directly, skip ahead to the section that loads the pickle file

In [1]:
import librosa
import numpy as np
import pandas as pd
import os
import soundfile as sf
import wave

## Data Processing

### Read the original sound files

In [3]:
# Read the original metadata CSV shipped under the UrbanSound8K training folder into a DataFrame.
meta_data = pd.read_csv('../training/UrbanSound8K/metadata/metadata.csv')

1. Combines all noise files into one
```
merge-noise.py
```

In [10]:


# Concatenate every source noise recording found in `noise_dir` into a single
# WAV file written next to them.
noise_dir = "../noise/"
outfile = noise_dir + "combined-noise.wav"

# Sorted for a deterministic order. The previous output file is excluded so
# that re-running this cell does not fold the old combined file back into the
# new one, and non-WAV entries (e.g. '.DS_Store') are skipped.
infiles = sorted(
    name for name in os.listdir(noise_dir)
    if name.lower().endswith(".wav") and name != os.path.basename(outfile)
)
if not infiles:
    raise FileNotFoundError("no .wav noise files found in " + noise_dir)

# Each entry is [wave params, raw frame bytes] for one input file.
combined_data = []
for infile in infiles:
    with wave.open(noise_dir + infile, 'rb') as w:
        combined_data.append([w.getparams(), w.readframes(w.getnframes())])

# The header parameters are taken from the first file; this assumes all inputs
# share the same channel count, sample width and frame rate -- TODO confirm.
with wave.open(outfile, 'wb') as output:
    output.setparams(combined_data[0][0])
    # Write every input, not just the first two.
    for _params, frames in combined_data:
        output.writeframes(frames)

2. Takes in the noise files and spits them out at the specified sample rate
```
downsample-noise.py
```

In [11]:
#!/usr/bin/env python
# coding: utf-8

import os
import librosa
import soundfile as sf

# Re-save every WAV file under ../noise into `target`, loaded as mono at
# 44100 Hz and written as 16-bit PCM.
target = '../noise_downsampled'

# sf.write does not create missing directories, so make sure the output
# folder exists before the loop starts.
os.makedirs(target, exist_ok=True)

files = librosa.util.find_files('../noise', ext='wav')

for file in files:
    basename = os.path.basename(file)
    print(file)
    print(basename)
    # librosa resamples to the requested rate and down-mixes to mono on load.
    y, sr = librosa.load(file, sr=44100, mono = True)
    sf.write((target + "/" + basename), y, sr, subtype = 'PCM_16')

C:\Users\kailf\Python_code\UChicago\Capstone\noise\combined-noise.wav
combined-noise.wav
C:\Users\kailf\Python_code\UChicago\Capstone\noise\noise (1).wav
noise (1).wav
C:\Users\kailf\Python_code\UChicago\Capstone\noise\noise (2).wav
noise (2).wav


3. Cuts the merged noise file into n-second slices
```
slice-noise.py
```



In [12]:
from pydub import AudioSegment
import numpy as np

# Cut the combined noise recording into consecutive 4-second WAV chunks.
noise_downsampled_dir = "../noise_downsampled/"

audio_file = noise_downsampled_dir + "combined-noise.wav"
audio = AudioSegment.from_wav(audio_file)
# Chunk end times in seconds: 4, 8, ..., 116. np.arange excludes the stop
# value, so no chunk ending at 120 s is produced.
# NOTE(review): the 120 s upper bound is hard-coded; confirm it matches the
# actual length of combined-noise.wav.
list_of_timestamps = list(np.arange(4,120,4))

start = 0
# The original `if idx == len(list_of_timestamps): break` guard was removed:
# enumerate() never yields an index equal to the list length, so it could
# never fire.
for t in list_of_timestamps:
    end = t * 1000 #pydub works in millisec
    print("split at [{}:{}] s".format(start/1000, end/1000))
    audio_chunk = audio[start:end]
    audio_chunk.export(noise_downsampled_dir + "noise_chunk_{}.wav".format(end/1000), format="wav")

    # The next chunk begins where this one ended.
    start = end


split at [0.0:4.0] s
split at [4.0:8.0] s
split at [8.0:12.0] s
split at [12.0:16.0] s
split at [16.0:20.0] s
split at [20.0:24.0] s
split at [24.0:28.0] s
split at [28.0:32.0] s
split at [32.0:36.0] s
split at [36.0:40.0] s
split at [40.0:44.0] s
split at [44.0:48.0] s
split at [48.0:52.0] s
split at [52.0:56.0] s
split at [56.0:60.0] s
split at [60.0:64.0] s
split at [64.0:68.0] s
split at [68.0:72.0] s
split at [72.0:76.0] s
split at [76.0:80.0] s
split at [80.0:84.0] s
split at [84.0:88.0] s
split at [88.0:92.0] s
split at [92.0:96.0] s
split at [96.0:100.0] s
split at [100.0:104.0] s
split at [104.0:108.0] s
split at [108.0:112.0] s
split at [112.0:116.0] s




4. Take in training files and spit them out in a single directory at the specified sample rate
```
downsample-training.py
```



Use 44100 as the target sample rate

In [16]:
import os
import librosa
import soundfile as sf

# Flatten the per-fold training clips into one directory, loaded as mono at
# the target sample rate and written as 16-bit PCM.
sr = 44100  # target sample rate in Hz
audio_class = []  # not populated in this cell
current_dir = '../training/UrbanSound8K/audio'
target_dir = "../training_downsampled"

# sf.write does not create missing directories.
os.makedirs(target_dir, exist_ok=True)

for folder in os.listdir(current_dir):
    path = os.path.join(current_dir, folder)
    # Only sub-directories hold clips; skip '.DS_Store' and any other stray
    # file, which would otherwise make os.listdir(path) raise.
    if folder == '.DS_Store' or not os.path.isdir(path):
        continue

    for filename in os.listdir(path):
        if filename == '.DS_Store':
            continue
        # librosa resamples to `sr` and down-mixes to mono on load.
        samples, rate = librosa.load(os.path.join(path, filename), sr=sr, mono=True)
        sf.write((target_dir + "/" + filename), samples, rate, subtype = 'PCM_16')



5. Randomly mix noise samples with training samples
```
mix-noises-training.py
```


In [27]:
import os
import pandas as pd
from pydub import AudioSegment
from random import seed
from random import randint 
import glob
import shutil

# Overlay a randomly chosen noise chunk on every downsampled training clip,
# `n_iterations` times, and write the mixes plus an extended metadata CSV
# into `target_dir`.
n_iterations = 2  # number of noisy copies generated per training clip

noise_downsampled_dir = '../noise_downsampled/'
down_sampled_training_dir = '../training_downsampled/'
target_dir = '../mixed/'

os.makedirs(target_dir, exist_ok=True)

# delete all existing contents (files only: os.remove cannot delete directories)
for f in glob.glob(target_dir + '*'):
    if os.path.isfile(f):
        os.remove(f)

# copy clean downsampled files over
for filename in glob.glob(os.path.join(down_sampled_training_dir, '*.*')):
    shutil.copy(filename, target_dir)

mixed_meta_data = pd.read_csv('../training/UrbanSound8K/metadata/metadata.csv')

# read in noise files; sorted so that seed(i) below yields the same
# clip/noise pairing on every run regardless of directory listing order
noise_chunks = sorted(
    name for name in os.listdir(noise_downsampled_dir)
    if os.path.isfile(os.path.join(noise_downsampled_dir, name)) and 'noise_chunk' in name
)
if not noise_chunks:
    raise FileNotFoundError("no noise_chunk files found in " + noise_downsampled_dir)

training_files = sorted(
    name for name in os.listdir(down_sampled_training_dir) if name != ".DS_Store"
)

# Metadata rows for the mixed files are collected here and concatenated once
# at the end. Growing the DataFrame inside the loop is quadratic, and
# DataFrame.append no longer exists in pandas 2.x.
mixed_rows = []

# depends on the number of times you want to randomly mix each file
for i in range(n_iterations):

    seed(i)

    # for one iteration, mix each file in the training directory
    for filename in training_files:

        # get the downsampled training clip
        file_path1 = os.path.join(down_sampled_training_dir, filename)
        sound1 = AudioSegment.from_file(file_path1)

        # random pick a noise chunk
        random_int = randint(0, (len(noise_chunks)-1))
        noise_file = noise_chunks[random_int]
        file_path2 = os.path.join(noise_downsampled_dir, noise_file)
        sound2 = AudioSegment.from_file(file_path2)

        # combine both sound files
        combined = sound1.overlay(sound2)

        # export resulting wav to target dir
        newname = "mixed_%s_%s" % (str(i), filename)
        combined.export(target_dir + newname, format="wav")

        # update metadata: copy the clean clip's row(s) under the new file
        # name. A clip without a metadata row contributes nothing, instead of
        # renaming whichever row happened to be last in the table.
        row = mixed_meta_data[mixed_meta_data['slice_file_name'] == filename]
        if not row.empty:
            mixed_rows.append(row.assign(slice_file_name=newname))

mixed_meta_data = pd.concat([mixed_meta_data] + mixed_rows, ignore_index=True)
mixed_meta_data.to_csv(target_dir+'mixed_metadata.csv', index = False)


In [28]:
# Marker output printed once this cell runs ('down' is presumably meant as "done").
print('down')

down


## Load the mixed sound files and labels

In [1]:
import os
import pandas as pd
import librosa 
import scipy


# Read every WAV in the mixed folder together with its sample rate and the
# class label recorded for it in the mixed metadata CSV.

# `import scipy` alone does not guarantee that the scipy.io.wavfile submodule
# is loaded, so import it explicitly where it is used.
from scipy.io import wavfile

meta_data = pd.read_csv('../mixed/mixed_metadata.csv')

# File name -> classID, built once instead of scanning the whole table for
# every file. setdefault keeps the first row for a given name, matching a
# first-match lookup.
class_by_file = {}
for name, class_id in zip(meta_data['slice_file_name'], meta_data['classID']):
    class_by_file.setdefault(name, class_id)

x = []            # raw sample arrays, one per file
sr = []           # sample rate of each file
audio_class = []  # classID of each file, aligned with x and sr
folder_name = '../mixed/'

# Sorted so the sample order is the same on every run and platform.
for filename in sorted(os.listdir(folder_name)):
    if filename == "mixed_metadata.csv":
        continue

    # Check the label first so x, sr and audio_class can never get out of
    # step when a file has no metadata row.
    if filename not in class_by_file:
        raise KeyError("no metadata row for file: " + filename)

    file = os.path.join(folder_name, filename)
    temp_sr, temp_x = wavfile.read(file)

    x.append(temp_x)
    sr.append(temp_sr)
    audio_class.append(class_by_file[filename])


In [2]:
# Marker output printed once this cell runs ('down' is presumably meant as "done").
print('down')

down


In [3]:
# Sanity check: number of class labels collected; should equal len(x) and len(sr).
len(audio_class)

26196

In [4]:
# Sanity check: number of sample arrays read from the mixed folder.
len(x)

26196

In [5]:
# Sanity check: number of sample rates recorded, one per file read.
len(sr)

26196

f: array
Array of sample frequencies.

t: array
Array of segment times.

Zxx: array
STFT of x. By default, the last axis of Zxx corresponds to the segment times.

In [7]:
# Compute the STFT of every clip and keep only those whose transform has the
# expected number of time frames, so the results can be stacked into one array.

# `signal` is only imported in a later cell of this notebook; import it here
# so this cell also runs on a fresh kernel.
from scipy import signal

# Required number of STFT time frames per clip.
# presumably the frame count of a full-length 4 s clip at 44.1 kHz with the
# default STFT settings -- TODO confirm.
EXPECTED_FRAMES = 1380

stft_sf = []     # sample-frequency axis of each kept clip
stft_t = []      # segment-time axis of each kept clip
stft_x = []      # STFT matrix of each kept clip
stft_class = []  # class label of each kept clip, aligned with stft_x

for samples, rate, label in zip(x, sr, audio_class):

    f, t, Zx = signal.stft(samples, rate)
    if Zx.shape[1] == EXPECTED_FRAMES:
        stft_sf.append(f)
        stft_t.append(t)
        stft_x.append(Zx)
        # Filter the label together with the sample; audio_class itself still
        # has one entry per *unfiltered* clip.
        stft_class.append(label)

In [8]:
# Number of clips that passed the frame-count filter; the three lists are appended together, so the counts match.
len(stft_x), len(stft_sf), len(stft_t)

(21975, 21975, 21975)

In [49]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# use the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the list of STFT matrices so preprocessing does not have to be re-run.
filename = 'stft_x.sav'
joblib.dump(stft_x, filename)



['stft_x.sav']

In [51]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# use the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): `train_x` is not assigned by any cell above this one; it is
# only created further down, after stft_x is reloaded. On a fresh top-to-bottom
# run this cell raises NameError -- move it below that assignment or drop it.
filename = 'train_x.sav'
joblib.dump(train_x, filename)

['train_x.sav']

# Load the pickle file

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.io.wavfile as wavfile
import os

from python_speech_features import mfcc, logfbank
import tensorflow as tf
from scipy import signal

In [5]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# use the standalone joblib package and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Reload the cached STFT matrices. joblib files are pickle-based, so only load
# files you produced yourself.
stft_x = joblib.load('stft_x.sav')

In [6]:
# Stack the per-clip STFT matrices into a single array.
# NOTE(review): the STFT output is presumably complex-valued; the warning
# emitted during fit() suggests the imaginary part is dropped when the data is
# cast for the model. Consider training on np.abs(...) magnitudes -- confirm
# before changing, since existing checkpoints were trained on this input.
train_x = np.array(stft_x)

In [7]:
# Display the shape of the stacked STFT array.
train_x.shape

(21975, 129, 1380)

In [12]:
#del X_train_split_temp,  #X_train_split,  X_valid_split, X_test_split, 

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the samples as the test set; the remainder is kept in
# X_train_split_temp. The fixed random_state makes the split repeatable.
X_train_split_temp, X_test_split = train_test_split(
    train_x,
    test_size=0.05,
    random_state=220,
)

In [9]:
# Split the non-test samples 80/20 into training and validation sets, again
# with a fixed random_state.
X_train_split, X_valid_split = train_test_split(
    X_train_split_temp,
    test_size=0.2,
    random_state=22,
)

In [11]:
# Display the shapes of the train, validation and test splits.
X_train_split.shape, X_valid_split.shape, X_test_split.shape

((16700, 129, 1380), (4176, 129, 1380), (1099, 129, 1380))

In [None]:
# train_len = train_x.shape[0]
# train_len*0.8

In [12]:
# X_train_split = train_x[0:17580, :, :]
# X_train_split.shape

In [346]:
# X_valid_split = train_x[17851:, :, :]
# X_valid_split.shape

(4124, 129, 1380)

In [2]:
import matplotlib.pyplot as plt
from scipy import stats
import tensorflow as tf
import seaborn as sns
from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
import h5py
from keras.models import Model, load_model
from keras.layers import Input, Dense, BatchNormalization, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [14]:
# Shape of one sample: axes 1 and 2 of the training array (the sample axis is
# dropped).
input_dim = train_x.shape[1:3]

# Widths of the three hidden encoder stages, mirrored by the decoder.
layer1_dim, layer2_dim, layer3_dim = 256, 64, 32

# Width of the bottleneck layer.
encoder_dim = 8

In [15]:
# Display the per-sample input shape used for the model's Input layer.
input_dim

(129, 1380)

In [16]:
# Build the autoencoder graph. Each Dense layer transforms only the last axis
# of its input (see the printed tensor shapes), so the first per-sample axis
# is carried through unchanged from input to output.
input_layer = Input(shape=(input_dim))

# --- Encoder: Dense -> BatchNormalization -> Dropout, four times ---
# Only the first layer carries an L1 activity penalty (10e-5 == 1e-4).
encoder1 = Dense(layer1_dim, activation="tanh",
                activity_regularizer=regularizers.l1(10e-5))(input_layer)
normalization1 = BatchNormalization()(encoder1)
drop1 = Dropout(rate = 0.1)(normalization1)

encoder2 = Dense(layer2_dim, activation="relu")(drop1)
normalization2 = BatchNormalization()(encoder2)
drop2 = Dropout(rate = 0.1)(normalization2)

encoder3 = Dense(layer3_dim, activation="relu")(drop2)
normalization3 = BatchNormalization()(encoder3)
drop3 = Dropout(rate = 0.1)(normalization3)

# Bottleneck of width encoder_dim.
encoder4 = Dense(encoder_dim, activation="relu")(drop3)
normalization4 = BatchNormalization()(encoder4)
drop4 = Dropout(rate = 0.1)(normalization4)

# --- Decoder: mirrors the encoder widths, without normalization or dropout ---
decoder1 = Dense(layer3_dim, activation='tanh')(drop4)

decoder2 = Dense(layer2_dim, activation='tanh')(decoder1)

decoder3 = Dense(layer1_dim, activation='tanh')(decoder2)

# The reconstruction must have the same last-axis width as the input; derive
# it from input_dim instead of hard-coding 1380 so the model follows the data.
decoder4 = Dense(input_dim[-1], activation='linear')(decoder3)

# Print the main tensors so their shapes can be checked.
for label, tensor in (
    ('input_layer: ', input_layer),
    ('encoder1', encoder1),
    ('encoder2', encoder2),
    ('encoder3', encoder3),
    ('encoder4', encoder4),
    ('decoder1', decoder1),
    ('decoder2', decoder2),
    ('decoder3', decoder3),
    ('decoder4', decoder4),
):
    print(label, tensor)

input_layer:  Tensor("input_1:0", shape=(None, 129, 1380), dtype=float32)
encoder1 Tensor("dense_1/Tanh:0", shape=(None, 129, 256), dtype=float32)
encoder2 Tensor("dense_2/Relu:0", shape=(None, 129, 64), dtype=float32)
encoder3 Tensor("dense_3/Relu:0", shape=(None, 129, 32), dtype=float32)
encoder4 Tensor("dense_4/Relu:0", shape=(None, 129, 8), dtype=float32)
decoder1 Tensor("dense_5/Tanh:0", shape=(None, 129, 32), dtype=float32)
decoder2 Tensor("dense_6/Tanh:0", shape=(None, 129, 64), dtype=float32)
decoder3 Tensor("dense_7/Tanh:0", shape=(None, 129, 256), dtype=float32)
decoder4 Tensor("dense_8/add:0", shape=(None, 129, 1380), dtype=float32)


In [17]:
# Wrap the layer graph from the input tensor to the final decoder output in a Model.
autoencoder = Model(inputs=input_layer, outputs=decoder4)
# Print the layer-by-layer summary (output shapes and parameter counts).
autoencoder.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 129, 1380)         0         
_________________________________________________________________
dense_1 (Dense)              (None, 129, 256)          353536    
_________________________________________________________________
batch_normalization_1 (Batch (None, 129, 256)          1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 129, 256)          0         
_________________________________________________________________
dense_2 (Dense)              (None, 129, 64)           16448     
_________________________________________________________________
batch_normalization_2 (Batch (None, 129, 64)           256       
_________________________________________________________________
dropout_2 (Dropout)          (None, 129, 64)           0   

In [20]:
# Marker output printed once this cell runs ('down' is presumably meant as "done").
print('down')

down


In [None]:
# Training configuration.
nb_epoch = 150
batch_size = 32

# Reconstruction objective: mean squared error between input and output.
autoencoder.compile(loss='mean_squared_error', optimizer='adam')

# Write the model to disk only when the monitored quantity improves.
checkpointer = ModelCheckpoint(
    filepath="4_6_autoencoder.h5",
    save_best_only=True,
    verbose=0,
)

# Stop once val_loss has failed to improve for `patience` consecutive epochs.
earlystopping = EarlyStopping(monitor='val_loss', patience=2, verbose=0)

# The autoencoder is fitted to reproduce its own input, so the same array is
# passed as both features and targets (for training and for validation).
training_callbacks = [checkpointer, earlystopping]
fit_result = autoencoder.fit(
    X_train_split,
    X_train_split,
    validation_data=(X_valid_split, X_valid_split),
    epochs=nb_epoch,
    batch_size=batch_size,
    shuffle=True,
    verbose=1,
    callbacks=training_callbacks,
)

# Keep only the per-epoch metrics dictionary.
history = fit_result.history

Train on 16700 samples, validate on 4176 samples
Epoch 1/150


  return ops.EagerTensor(value, ctx.device_name, dtype)


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 7

Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150

## Model training is not finished: it has not converged yet, and val_loss was still decreasing after 150 epochs

In [3]:
# Reload the model from the file the training cell's ModelCheckpoint writes to (same path, save_best_only=True).
autoencoder = load_model('4_6_autoencoder.h5')