
# 1_0 Preprocessing Raw `*.m4a` Files into Temporal Chunks

**Input** Raw audio files in .m4a format

**Output** DataFrame and .csv data file of sample extractions





---



# Library Imports

In [0]:
import pandas as pd
import numpy as no
import librosa
import matplotlib.pyplot as plt
import random
import glob
import os
import shutil
from os import listdir
from os.path import isfile, join


# Global Parameters

In [0]:
# --- Locations on the mounted Drive -----------------------------------------
# Glob pattern matching every raw recording that still has to be processed.
raw_path = '/content/drive/My Drive/CAPSTONE/raw_data/*.m4a'
# Folder that raw recordings are moved into once they have been chopped.
processed_path = '/content/drive/My Drive/CAPSTONE/processed_files/'
# Folder that receives the generated CSV.
data_path = '/content/drive/My Drive/CAPSTONE/data/'

# --- Chopping settings --------------------------------------------------------
# Length of one chop; multiplied by the sampling rate to get samples per chop.
chop_size = 10
# How many random chops are drawn from each song.
num_of_chops = 5
# Samples per second used to size a chop.
# NOTE(review): load_song does not read this value; it requests 1600 on its own.
# NOTE(review): 1600 is unusually low for music audio - confirm it is not a typo for 16000.
sampling_rate = 1600

# --- Derived values -----------------------------------------------------------
# Counted once, here; files moved away later do not change the CSV name.
num_of_songs = len(glob.glob(raw_path))
csv_name = f'{num_of_songs}_songs_{num_of_chops}_chops_{chop_size}_seconds.csv'

# Samples Extraction

In [0]:
def load_song(song, sr=1600):
  """Decode one audio file with librosa as a mono signal.

  Args:
    song: Path of the audio file to read.
    sr: Sampling rate requested from librosa. Defaults to 1600, the value
      that used to be hard-coded here, so existing calls behave as before.
      NOTE(review): 1600 is very low for music audio - confirm it is not
      meant to be 16000.

  Returns:
    A tuple (samples, sampling_rate, song): the two values returned by
    librosa.load, followed by the unchanged input path so callers can keep
    the file name next to its samples.
  """
  # offset=0.0 and duration=None ask librosa for the whole file from its start.
  samples, sampling_rate = librosa.load(song,
                                        sr = sr,
                                        mono = True,
                                        offset = 0.0,
                                        duration = None)
  return samples, sampling_rate, song

# Label Extraction

In [0]:
def extract_label(song):
  """Read the composer and era labels out of a song's file name.

  The file name (directories and extension removed) is split on hyphens;
  the last field is the era and the one before it is the composer, e.g.
  '/raw/Title-Bach-Baroque.m4a' -> ('Bach', 'Baroque').

  Args:
    song: Path (or bare file name) of the recording.

  Returns:
    A tuple (composer, era) of strings.

  Raises:
    ValueError: If the file name has fewer than two hyphen-separated fields.
  """
  # STEP 1: keep only the file name, so hyphens or dots in directory names
  # can never end up inside a label
  file_name = os.path.basename(song)

  # STEP 2: drop the extension only (a dot inside the era text is preserved)
  stem, _extension = os.path.splitext(file_name)

  # STEP 3: split on hyphen
  labels = stem.split('-')
  if len(labels) < 2:
    raise ValueError(
        "Cannot extract labels from {!r}: expected a file name like "
        "'<title>-<composer>-<era>.m4a'".format(song))

  # STEP 4: select composer and era labels (the last two fields)
  composer = labels[-2]
  era = labels[-1]

  return composer, era

# Chop Function

In [0]:
def chop_song(samples, sampling_rate, song, composer, era, chop_size, num_of_chops):
  """Cut random fixed-length excerpts ("chops") out of one song.

  Args:
    samples: One-dimensional sequence of amplitude samples.
    sampling_rate: Samples per unit of chop_size; with chop_size it fixes the
      number of samples in a chop.
    song: Identifier of the recording, stored as text in the 'song' column.
    composer: Label stored in the 'composer' column.
    era: Label stored in the 'era' column.
    chop_size: Chop length; int(sampling_rate * chop_size) samples are taken.
    num_of_chops: Number of excerpts to draw. Start points are drawn
      independently with the `random` module, so excerpts may overlap.

  Returns:
    A DataFrame with one row per chop: columns 0 .. window-1 hold the samples,
    followed by 'song', 'sampling_rate', 'composer' and 'era'. Every row keeps
    index label 0. An empty DataFrame is returned when num_of_chops is 0.

  Raises:
    ValueError: If the window is not positive, or the song holds fewer samples
      than one window (previously this silently produced truncated chops).
  """
  # STEP 1: make dataframe from samples
  df_amps = pd.DataFrame(samples)
  total_samples = len(df_amps)

  # STEP 2: make time window by multiplying sampling rate by chop size (number of samples)
  time_window = int(sampling_rate * chop_size)
  if time_window <= 0:
    raise ValueError(
        'sampling_rate * chop_size must be positive, got {}'.format(time_window))
  if total_samples < time_window:
    raise ValueError(
        '{!r} has {} samples but one chop needs {}'.format(
            str(song), total_samples, time_window))

  # STEP 3: last start point that still leaves room for a full window
  # (computed once: it does not change between chops)
  last_start = total_samples - time_window

  # STEP 4: for number of chops, extract random sample
  chops = []
  for _ in range(num_of_chops):
    # randomly select start point (integer between 0 and last_start, inclusive)
    start_point = random.randrange(last_start + 1)
    # set end point as start point plus time window
    end_point = start_point + time_window

    # STEP 5: chop sample and lay it out as a single row with columns 0..window-1
    df_chop = df_amps.iloc[start_point:end_point].reset_index(drop=True).transpose()

    # STEP 6: attach the metadata columns
    df_chop['song'] = str(song)
    df_chop['sampling_rate'] = sampling_rate
    df_chop['composer'] = composer
    df_chop['era'] = era

    chops.append(df_chop)

  # STEP 7: combine all chops in one go (DataFrame.append no longer exists in
  # pandas 2.x, and a single concat avoids re-copying the frame for every chop)
  if not chops:
    return pd.DataFrame()
  return pd.concat(chops)

# Building CSV - Optional

In [0]:
def instantiate_csv():
  """Write an empty, header-only CSV laid out like the chop table.

  Reads the module-level sampling_rate, chop_size, data_path and csv_name.
  The header has one integer column per sample of a chop
  (0 .. sampling_rate * chop_size - 1) followed by the four metadata columns.

  Returns:
    The empty DataFrame that was written to data_path + csv_name.
  """
  samples_per_chop = sampling_rate * chop_size
  header = [*range(samples_per_chop), 'song', 'sampling_rate', 'composer', 'era']

  # No rows yet: only the column layout is persisted.
  csv_structure = pd.DataFrame(columns=header)
  csv_structure.to_csv(data_path + csv_name)

  return csv_structure

# Building DataFrame to CSV

In [0]:
# Chop every raw recording, accumulate the chops in df_local, and checkpoint the
# CSV after each song so the file on disk always matches the songs already moved.

# Both targets must exist before to_csv / shutil.move can write into them.
os.makedirs(data_path, exist_ok=True)
os.makedirs(processed_path, exist_ok=True)

# df_local has to exist before the loop (it was previously never initialised,
# which raised NameError on the first song). It stays empty if nothing matches.
df_local = pd.DataFrame()
chop_frames = []  # one DataFrame of chops per processed song
c = 0
for i in glob.glob(raw_path):
  # song_sampling_rate is kept separate so the notebook-level sampling_rate
  # setting is not overwritten by the loop.
  samples, song_sampling_rate, song = load_song(i)
  composer, era = extract_label(song)
  df_chops = chop_song(samples=samples,
                       sampling_rate=song_sampling_rate,
                       song=song,
                       composer=composer,
                       era=era,
                       chop_size=chop_size,
                       num_of_chops=num_of_chops)

  print(df_chops.head(1))

  # DataFrame.append no longer exists in pandas 2.x; rebuild from the list instead.
  chop_frames.append(df_chops)
  df_local = pd.concat(chop_frames)
  df_local.to_csv(data_path + csv_name)
  c += 1

  # Move the raw file only after its chops are safely on disk.
  new_path = processed_path + os.path.basename(i)
  shutil.move(i, new_path)

  print('Song Processed: '+i)
  print('Total Processed: '+str(c))

print('Done Processing All Songs')