In [1]:
!pip3 install librosa

Collecting librosa
Collecting decorator>=3.0.0 (from librosa)
  Using cached https://files.pythonhosted.org/packages/5f/88/0075e461560a1e750a0dcbf77f1d9de775028c37a19a346a6c565a257399/decorator-4.4.0-py2.py3-none-any.whl
Collecting audioread>=2.0.0 (from librosa)
Collecting resampy>=0.2.0 (from librosa)
Collecting six>=1.3 (from librosa)
  Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl
Collecting joblib>=0.12 (from librosa)
  Using cached https://files.pythonhosted.org/packages/cd/c1/50a758e8247561e58cb87305b1e90b171b8c767b15b12a1734001f41d356/joblib-0.13.2-py2.py3-none-any.whl
Collecting numba>=0.38.0 (from librosa)
  Using cached https://files.pythonhosted.org/packages/84/8e/18e74153e6bddda68a6ff9382b9d347d5da8599ea2326b34ded099df5216/numba-0.44.1-cp35-cp35m-manylinux1_x86_64.whl
Collecting numpy>=1.8.0 (from librosa)
  Using cached https://files.pythonhosted.org/packages/bb/ef/d5

In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot
%matplotlib inline
import librosa

import tensorflow as tf
import lib.util as ut

In [5]:
# directory that holds the audio files and CSV file with the per-file labels
audio_path = "./data/audiodata"
labels_file = "./data/conflictlevel.csv"

# read as globals by create_dataset / create_tf_records when chunking the audio
intervals_seconds = 3
sample_rate = 8000

In [None]:
# the labels file is read without a header row, so its columns are named 0, 1, ...
labels_df = pd.read_csv(labels_file, header=None)
labels_df.head()

In [None]:
# distribution of the values stored in column 1 of the labels file
labels_df[1].hist(bins=20)

## Randomly split the files into train, test and validation sets

In [None]:
def train_test_val_split(df, test_percentage, validation_percentage, seed=None):
    """
    Shuffle the rows of `df` and divide them into train, test and validation dataframes.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split. Any index is accepted (it does not have to be 0..n-1).
    test_percentage : float
        Fraction (between 0 and 1) of the rows assigned to the test set.
    validation_percentage : float
        Fraction (between 0 and 1) of the rows assigned to the validation set.
    seed : int, optional
        When given, the shuffle uses its own `np.random.RandomState(seed)` so the
        split is reproducible. When None, the global numpy random state is used.

    Returns
    -------
    (train, test, validation) : tuple of pandas.DataFrame
        Disjoint subsets of `df` that together contain every row exactly once.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions add up to more than 1.
    """
    if test_percentage < 0 or validation_percentage < 0:
        raise ValueError("test_percentage and validation_percentage must be non-negative")
    if test_percentage + validation_percentage > 1:
        raise ValueError("test_percentage + validation_percentage must not exceed 1")

    n_rows = df.shape[0]
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Shuffle row *positions*, not index labels: .iloc is positional, so permuting
    # df.index only works when the index happens to be exactly 0..n-1.
    positions = rng.permutation(n_rows)

    test_size = int(n_rows * test_percentage)
    val_size = int(n_rows * validation_percentage)

    test = df.iloc[positions[:test_size]]
    validation = df.iloc[positions[test_size:test_size + val_size]]
    train = df.iloc[positions[test_size + val_size:]]

    # sanity check: the three subsets must account for every row
    assert (test.shape[0] + validation.shape[0] + train.shape[0]) == n_rows

    return train, test, validation

In [None]:
# Seed numpy's global RNG so the 65/20/15 split (and the CSV files written from it)
# is identical on every Restart & Run All.
np.random.seed(42)
train_df, test_df, validation_df = train_test_val_split(labels_df, 0.2, 0.15)

In [None]:
# Persist the three splits. index=False drops the row index; the column names are
# still written as a header line in each file.
train_df.to_csv("./data/train_files.csv", index=False)
test_df.to_csv("./data/test_files.csv", index=False)
validation_df.to_csv("./data/validation_files.csv", index=False)

In [None]:
# quick look at the first rows of the training split
train_df.head()

In [None]:
# The split dataframes inherit the integer column names of labels_df (read with
# header=None), so the first column is 0, not the string "0".
train_df[0].values

In [14]:
# Reload the splits from disk with the column names the dataset builders expect
# (they read column 0 and column "class"). skiprows=1 drops the header line that
# to_csv wrote. All three splits are loaded the same way so that validation_df and
# test_df also expose the "class" column.
split_columns = [0, 1, "class"]
train_df = pd.read_csv("./data/train_files.csv", header=None, names=split_columns, skiprows=1)
test_df = pd.read_csv("./data/test_files.csv", header=None, names=split_columns, skiprows=1)
validation_df = pd.read_csv("./data/validation_files.csv", header=None, names=split_columns, skiprows=1)



## Prepare data for training (convert the sample rate, divide files into N-second intervals, create the dataset)

In [7]:
def divide_audio_file(path, intervals_seconds, interval_step, sample_rate=8000):
    """
    Load an audio file and cut it into fixed-length chunks.

    The file is loaded with librosa at `sample_rate`. A chunk of
    `sample_rate * intervals_seconds` samples is taken every `interval_step`
    samples, so chunks overlap when `interval_step` is smaller than the chunk
    length. Chunks shorter than the full length (the tail of the file) are
    discarded, and chunks flagged by `ut.is_silence(..., thresold_samples=0.70)`
    are skipped with a printed message.

    Parameters
    ----------
    path : str
        Path of the audio file to load.
    intervals_seconds : int or float
        Length of every chunk, in seconds.
    interval_step : int
        Distance, in samples, between the starts of consecutive chunks.
    sample_rate : int, optional
        Sample rate the audio is converted to when loading (default 8000).

    Returns
    -------
    numpy.ndarray
        The kept chunks combined with `np.array`; an empty array when no chunk
        is kept.
    """

    # loads file and converts to the specified sample rate
    # (`sr` is passed by keyword: recent librosa releases reject it positionally)
    audio, _ = librosa.load(path, sr=sample_rate)

    # int() keeps slicing valid when intervals_seconds is fractional (e.g. 1.5)
    chunk_length = int(sample_rate * intervals_seconds)

    audio_array = []
    for start in range(0, audio.shape[0], interval_step):
        interval = audio[start:start + chunk_length]

        # once a chunk comes out shorter than the requested length every later one
        # will too, so the remainder of the file is ignored
        if interval.shape[0] < chunk_length:
            break

        if ut.is_silence(interval, thresold_samples=0.70):
            print("Omitting chunk with silences in file {}".format(path))
        else:
            audio_array.append(interval)

    return np.array(audio_array)

In [8]:
def create_dataset(audio_path, file_names_df):
    """
    Build in-memory arrays of audio chunks and labels for the files listed in a dataframe.

    Every file in `audio_path` whose name (text before the first ".") appears in
    column 0 of `file_names_df` is split with `divide_audio_file`, using the
    module-level `intervals_seconds` and `sample_rate` settings. Each chunk gets
    the value of the file's "class" column as its label. Files that are not
    listed are reported and skipped.

    Parameters
    ----------
    audio_path : str
        Directory containing the audio files.
    file_names_df : pandas.DataFrame
        Must provide column 0 (file names without extension) and column "class".

    Returns
    -------
    (x_data, labels) : tuple of numpy.ndarray
        The chunks of all matching files and one label per chunk.
    """
    x_data = []
    labels = []

    # os.listdir returns entries in arbitrary order; sort so the dataset order is
    # the same on every run
    for file in sorted(os.listdir(audio_path)):

        file_path = os.path.join(audio_path, file)

        short_name = file.split(".")[0]

        # if the file is in the dataframe with the file names(train or test) we divide it, if not we ignore
        if short_name in file_names_df[0].values:
            print("reading file {}".format(file))

            # Pass the rate by keyword: positionally it would land in the
            # `interval_step` slot only, leaving the loader on its default rate.
            # The step equals one second of audio.
            divided_file = divide_audio_file(
                file_path,
                intervals_seconds,
                interval_step=sample_rate,
                sample_rate=sample_rate,
            )

            # first matching row provides the label for every chunk of this file
            file_label = file_names_df[file_names_df[0] == short_name]["class"].values[0]
            labels_array = np.ones(divided_file.shape[0]) * file_label

            x_data.extend(divided_file)
            labels.extend(labels_array)

        else:
            print("file {} not in the dataframe".format(file))

    return np.array(x_data), np.array(labels)


# Work with TFRecords

In [9]:
def _float_feature(value):
    """Wrap a single value in a `tf.train.Feature` backed by a one-element `FloatList`."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _bytes_feature(value):
    """Wrap a single value in a `tf.train.Feature` backed by a one-element `BytesList`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

In [10]:
def convert_and_save_records(x_data, labels, save_path, tf_record_files):
    """
    Serialize audio intervals and their labels into one TFRecord file.

    The file is written to
    '<save_path>/<number of intervals>_<tf_record_files>.tfrecords'. Each record
    is a `tf.train.Example` with two features: 'sound', the raw bytes of the
    interval array (the dtype is not stored, so readers must know it), and
    'label', the matching entry of `labels` as a float.

    Parameters
    ----------
    x_data : sequence of numpy.ndarray
        Audio intervals to store.
    labels : sequence of float
        One label per interval; `labels[i]` belongs to `x_data[i]`.
    save_path : str
        Output directory; it is created when missing.
    tf_record_files : int
        Running number of the file, used in the file name.
    """
    # the record writer does not create missing directories
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    filename = '{}/{}_{}.tfrecords'.format(save_path, len(x_data), tf_record_files)

    # the context manager closes the file even if serialization fails midway
    with tf.python_io.TFRecordWriter(filename) as writer:
        for i, interval in enumerate(x_data):
            # tobytes() replaces the deprecated ndarray.tostring(); same bytes
            feature = {'sound':  _bytes_feature(tf.compat.as_bytes(interval.tobytes())),
                       'label':  _float_feature(labels[i])}

            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())
    

In [11]:
def create_tf_records(audio_path, file_names_df, save_path, max_records_per_file=2500):
    """
    Split the listed audio files into chunks and write them out as TFRecord files.

    Every file in `audio_path` whose name (text before the first ".") appears in
    column 0 of `file_names_df` is split with `divide_audio_file`, using the
    module-level `intervals_seconds` and `sample_rate` settings. Each chunk gets
    the value of the file's "class" column as its label. Chunks are buffered and
    handed to `convert_and_save_records` whenever the buffer holds more than
    `max_records_per_file` chunks; the remainder is written at the end.

    Parameters
    ----------
    audio_path : str
        Directory containing the audio files.
    file_names_df : pandas.DataFrame
        Must provide column 0 (file names without extension) and column "class".
    save_path : str
        Directory the TFRecord files are written to.
    max_records_per_file : int, optional
        Buffer size that triggers a flush (default 2500). The check runs after a
        whole audio file has been added, so a file can hold more than this many
        records.

    Returns
    -------
    int
        Always 0.
    """
    x_data = []
    labels = []
    tf_record_files = 1

    # loop-invariant lookup of the file names that belong to this split
    known_names = file_names_df[0].values

    # os.listdir returns entries in arbitrary order; sort so the same chunks end
    # up in the same record file on every run
    for file in sorted(os.listdir(audio_path)):

        file_path = os.path.join(audio_path, file)
        short_name = file.split(".")[0]

        # if the file is in the dataframe with the file names(train or test) we divide it, if not we ignore
        if short_name in known_names:
            print("reading file {}".format(file))

            # Pass the rate by keyword: positionally it would land in the
            # `interval_step` slot only, leaving the loader on its default rate.
            # The step equals one second of audio.
            divided_file = divide_audio_file(
                file_path,
                intervals_seconds,
                interval_step=sample_rate,
                sample_rate=sample_rate,
            )

            # first matching row provides the label for every chunk of this file
            file_label = file_names_df[file_names_df[0] == short_name]["class"].values[0]
            labels_array = np.ones(divided_file.shape[0]) * file_label

            x_data.extend(divided_file)
            labels.extend(labels_array)

            # once the buffer holds more than max_records_per_file chunks, dump
            # them into a tf records file and start a new buffer
            if len(x_data) > max_records_per_file:
                convert_and_save_records(x_data, labels, save_path, tf_record_files)

                x_data = []
                labels = []
                tf_record_files += 1

        else:
            print("file {} not in the dataframe".format(file))

    # write whatever is left in the buffer
    if len(x_data) > 0:
        convert_and_save_records(x_data, labels, save_path, tf_record_files)

    return 0


In [None]:
# write the training split; train_df must expose column 0 and column "class"
create_tf_records(audio_path, train_df,"./data/tf_data/train_regression" )

In [None]:
# write the validation split; validation_df must expose column 0 and column "class",
# which create_tf_records reads
create_tf_records(audio_path, validation_df,"./data/tf_data/validation_regression" )

In [None]:
# x, y = create_dataset(audio_path, train_df)

In [None]:
# save_dataset(x, y, "./data/train_dataset.npy")