In [0]:
import csv
import errno
import os
import os.path
import sys

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io.wavfile as wav
from pydub import AudioSegment
from pydub.playback import play
from pydub.silence import split_on_silence
from pydub.utils import make_chunks
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [0]:
# Load the cleaned csv files that list the audio files carrying an accent
# label, one dataframe per split (dev / train / test).

# Column names applied to every csv via `names=`
header = ['filename', 'text', 'up_votes', 'down_votes', 'age', 'gender', 'accent']

# Read order is dev, train, test; each path is a placeholder to be filled in.
df_accent_dev, df_accent_train, df_accent_test = (
  pd.read_csv(csv_path, names=header, delimiter=",")
  for csv_path in (
    '/path/datafolder/file.csv',
    '/path/datafolder/file.csv',
    '/path/datafolder/file.csv',
  )
)

# Row / column counts of each split
row_acc_dev, col_acc_dev = len(df_accent_dev.index), len(df_accent_dev.columns)
row_acc_train, col_acc_train = len(df_accent_train.index), len(df_accent_train.columns)
row_acc_test, col_acc_test = len(df_accent_test.index), len(df_accent_test.columns)

print("There are %d accented audiofiles in dev directory" % row_acc_dev)
print("There are %d accented audiofiles in training directory" % row_acc_train)
print("There are %d accented audiofiles in test directory" % row_acc_test)

In [0]:
# df_accent_dev.head(5)
# df_accent_train.head(5)
# df_accent_test.head(5)

In [0]:
# Walk the audio folder and, for every .wav file that appears in the accented
# file list, plot its waveform, read its raw samples and compute its MFCCs.
#
# NOTE: audio_plot, read_audio and mfcc_log are defined in later cells of this
# notebook, so those cells must be executed before this one.

datadir = '/path/datafolder' #Insert folder path of the audio dataset

# Insert directory you'd like to save in
savedir = '/path/savefolder/'

# Resampling target. 22050 Hz can represent frequencies up to ~11 kHz
# (half the sampling rate).
SAMPLE_RATE = 22050
# chop at 5 seconds so the differences in the plots are more perceptible and consistent
duration = 5 #try with and without and save in separate directories

for i, audiofile in enumerate(os.listdir(datadir)):
  if not audiofile.endswith('.wav'):
    continue

  # put the dataframe relevant to the files we are running to check
  # str.contains returns a boolean Series, so it has to be reduced with .any();
  # regex=False keeps the '.' in the file name literal, na=False ignores
  # missing filenames.
  is_accented = df_accent_dev['filename'].str.contains(audiofile, regex=False, na=False).any()
  if not is_accented:
    # Skips audiofiles that are not found in the accented audiofiles list
    continue

  # os.listdir yields bare names; build the real location for direct reads
  audiopath = os.path.join(datadir, audiofile)

  y, sr = audio_plot(audiofile, SAMPLE_RATE, duration, datadir)

  sig, sampling_rate = read_audio(audiopath)

  # The original call referenced an undefined MelPS_log; presumably a log-scaled
  # mel power spectrogram of the loaded clip was intended - TODO confirm the
  # mel parameters wanted here.
  MelPS = librosa.feature.melspectrogram(y=y, sr=sr)
  MelPS_log = librosa.power_to_db(MelPS, ref=np.max)

  mfcc, delta_mfcc, delta2_mfcc = mfcc_log(sr, MelPS_log, datadir, audiofile, i)




In [0]:
# Load a wav file from disk
def read_audio(audiofile):
  """Read the wav file at `audiofile`.

  Returns a `(sig, sampling_rate)` pair: the sample data first, the
  sampling rate second (the reverse of the order `wav.read` yields them).
  """
  rate, samples = wav.read(audiofile)
  return samples, rate

In [0]:
# plot audio file wave form
def audio_plot(audiofile, SAMPLE_RATE, duration, input_dir, index=None):
  """Load an audio clip, plot its waveform and save the figure as a PNG.

  Parameters:
    audiofile   -- file name of the clip (a bare name is resolved against
                   `input_dir`; an absolute path is used as given)
    SAMPLE_RATE -- sampling rate the clip is resampled to on load
    duration    -- number of seconds to load from the start of the clip
    input_dir   -- directory that holds the audio files
    index       -- optional position of this clip in the directory listing;
                   when given, "(index of total)" is appended to the title

  Returns:
    (y, sr) as returned by `librosa.load`.
  """
  # os.path.join keeps `audiofile` untouched when it is already absolute
  audiopath = os.path.join(input_dir, audiofile)
  y, sr = librosa.load(audiopath, sr = SAMPLE_RATE, duration = duration)

  savedir = '/path/savefolder/'
  label = os.path.basename(audiofile)

  title = "Audio sample at {} for {}".format(SAMPLE_RATE, label)
  if index is not None:
    title += " ({} of {})".format(index, len(os.listdir(input_dir)))

  # A single figure: a second plt.figure() call would discard the figsize
  plt.figure(figsize=(14,4))
  plt.title(title)

  # Newer librosa releases provide waveshow; fall back to waveplot otherwise
  draw_wave = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
  draw_wave(y, sr = sr)
  plt.grid(True)

  # savefig picks the image format from the extension, so the audio
  # extension is swapped for .png
  stem = os.path.splitext(label)[0]
  plt.savefig(os.path.join(savedir, 'audio_at_{}_{}.png'.format(SAMPLE_RATE, stem)))
  return y, sr

In [0]:
# chunk up audio into separate words
# a stretch of audio counts as silence when it stays quieter than
# `silence_thresh` (an absolute level in dBFS) for at least `min_silence_len` ms
# whenever such a stretch is found we assume the word ended, and the next
# word starts when the threshold is exceeded again

def word_chunk(audio, audiofile, min_silence_len=100, silence_thresh=-16,
               output_dir="/path/to/output/words/"):
  """Split `audio` on silence and export every chunk as its own wav file.

  Parameters:
    audio           -- the audio segment passed to `split_on_silence`
    audiofile       -- name of the source file; its base name prefixes every
                       exported chunk file
    min_silence_len -- minimum silence between words in ms (100 ms = 0.1 s)
    silence_thresh  -- silence level in dBFS; to make it relative to the clip's
                       average loudness pass e.g. `audio.dBFS - 16`
    output_dir      -- directory the chunk files are written to

  Returns:
    List of the paths that were written, in chunk order.
  """
  # NOTE(review): keep_silence is left at the library default, so chunks may
  # still carry some silence padding at their edges - confirm this is wanted.
  chunks = split_on_silence(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

  label = os.path.basename(audiofile)
  written = []

  # export each word separately
  for i, chunk in enumerate(chunks):
    # output files are <name>chunk0.wav, ..., <name>chunk#.wav
    target = os.path.join(output_dir, "{}chunk{}.wav".format(label, i))
    # export() hands back the open output file; close it so handles don't pile up
    chunk.export(target, format="wav").close()
    written.append(target)

  return written

In [0]:
# MFCC (Mel Frequency Cepstral Coefficients)
# MFCC decorrelates features
# Using Librosa python package
def mfcc_log(sample_rate, MelPS_log, input_dir, audiofile, i):
  """Compute MFCCs and their first/second deltas, plot them and save a PNG.

  Parameters:
    sample_rate -- sampling rate used for the time axis and the output name
    MelPS_log   -- spectrogram passed to `librosa.feature.mfcc` as `S`
                   (presumably a log-power mel spectrogram - confirm with caller)
    input_dir   -- directory of the audio files; its entry count is shown
                   in the figure title
    audiofile   -- name of the audio file being processed
    i           -- position of this file in the directory listing (title only)

  Returns:
    (mfcc, delta_mfcc, delta2_mfcc)
  """
  # n_mfcc = 13 to extract top 13 MFCCs
  mfcc = librosa.feature.mfcc(S=MelPS_log, n_mfcc=13)

  # first and second order deltas of the MFCCs
  delta_mfcc = librosa.feature.delta(mfcc)
  delta2_mfcc = librosa.feature.delta(mfcc, order=2)

  savedir = '/path/savefolder/'
  label = os.path.basename(audiofile)

  # Visualization
  plt.figure(figsize=(12,4))

  plt.subplot(3,1,1)
  librosa.display.specshow(mfcc)
  plt.title("MFCC for {} ({} of {}) ".format(label, i, len(os.listdir(input_dir))))
  plt.ylabel("MFCC Coefficients")
  plt.colorbar()

  plt.subplot(3,1,2)
  librosa.display.specshow(delta_mfcc)
  # raw strings: "\D" is not a valid escape sequence in a normal string
  plt.ylabel(r"MFCC-$\Delta$ Coefficients")
  plt.colorbar()

  plt.subplot(3,1,3)
  # librosa axis type names are lower case
  librosa.display.specshow(delta2_mfcc, sr=sample_rate, x_axis="time")
  plt.ylabel(r"MFCC-$\Delta^2$ Coefficients")
  plt.xlabel("Time")
  plt.colorbar()

  plt.tight_layout()
  # savefig picks the image format from the extension, so the audio
  # extension is swapped for .png; the rate comes from the argument rather
  # than a notebook-level global
  stem = os.path.splitext(label)[0]
  plt.savefig(os.path.join(savedir, 'MFCC_{}_{}.png'.format(sample_rate, stem)))

  return mfcc, delta_mfcc, delta2_mfcc