# Data Extraction

## Import Libraries

In [9]:
# import libraries
import os
import random
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns   
from music21 import *
from IPython.display import Audio
from intervaltree import Interval,IntervalTree
from collections import Counter
import librosa
import librosa.display
from sklearn.model_selection import train_test_split

- The dataset is 21 gigabytes, which is too large to upload to GitHub. We uploaded it to Google Drive and mounted it in a Google Colab notebook in order to extract the data features.

- The raw file location references the google drive file locations. 

- Once the run was complete, the extracted features were then exported to csv files for further processing.

In [None]:
#Consolidate every per-recording training label csv into a single dataframe

#Folder on the mounted Google Drive that holds the training label files
train_path = '/content/drive/My Drive/w207_dataset/train_labels'

#Build the glob pattern with os.path.join so the concatenation stays OS independent
train_label_pattern = os.path.join(train_path, "*.csv")
all_train_labels = glob.glob(train_label_pattern)

#Lazily read each file and tag its rows with the recording id
#(the part of the file name before the first '.')
df_from_each_train_file = (
    pd.read_csv(label_csv).assign(file_name=os.path.basename(label_csv).split('.')[0])
    for label_csv in all_train_labels
)

#Stack all files into one frame with a fresh 0..n-1 index
df_train_labels = pd.concat(df_from_each_train_file, ignore_index=True)

#Save the consolidated labels as a csv
df_train_labels.to_csv('/content/drive/My Drive/w207_dataset/df_train_labels_consol.csv')

In [None]:
# Preview the first 5 rows of the consolidated training labels
df_train_labels.head(5)

In [None]:
#Consolidate every per-recording test label csv into a single dataframe

#Folder on the mounted Google Drive that holds the test label files
test_path = '/content/drive/My Drive/w207_dataset/test_labels'

#Build the glob pattern with os.path.join so the concatenation stays OS independent
test_label_pattern = os.path.join(test_path, "*.csv")
all_test_labels = glob.glob(test_label_pattern)

#Lazily read each file and tag its rows with the recording id
#(the part of the file name before the first '.')
df_from_each_test_file = (
    pd.read_csv(label_csv).assign(file_name=os.path.basename(label_csv).split('.')[0])
    for label_csv in all_test_labels
)

#Stack all files into one frame with a fresh 0..n-1 index
df_test_labels = pd.concat(df_from_each_test_file, ignore_index=True)

#Save the consolidated labels as a csv
df_test_labels.to_csv('/content/drive/My Drive/w207_dataset/df_test_labels_consol.csv')

In [None]:
#Preview the first 5 rows of the consolidated test labels
df_test_labels.head(5)

### Extract Spectral Features from Training Wav Data and Save to csv. 
#### For the training set the extraction was done in three parts because of the data size; the whole process took 6 hours.

Brief Summary of the features being extracted:
- Harmonic or Percussive Instrument : Harmonic sounds are perceived as pitched sounds, while percussive sounds are consonant-like (e.g. drums)

-   Mel-Frequency Cepstral Coefficients  : Representation of the short-term power spectrum of a sound

- Chroma Features : Tonal content of a musical audio signal in a condensed form

-  Spectral Contrast : The level difference between peaks and valleys in the sound spectrum.

-  Mel-Scaled Spectrogram  : Represents signal loudness as it varies over time at different frequencies

In [None]:
def feature_extract(file):
    """Extract time-averaged spectral features from one audio file with librosa.

    Parameters
    ----------
    file
        Path of the audio file; passed unchanged to ``librosa.load``.

    Returns
    -------
    list
        ``[harmonic, mfcc, spectrogram, chroma, contrast]`` where

        * ``harmonic`` is 1 when the harmonic component of the signal has a larger
          mean absolute amplitude than the percussive component, else 0;
        * ``mfcc`` holds the 13 MFCCs, averaged over time;
        * ``spectrogram`` holds the 128 mel bands (up to 8000 Hz), averaged over time;
        * ``chroma`` holds the CENS chroma bins, averaged over time;
        * ``contrast`` holds the spectral contrast bands, averaged over time.

        A list (not a tuple) is returned because callers append the file name to it.
    """
    #wave representation (librosa.load's default sample rate is used)
    y, sr = librosa.load(file)

    #determine if instrument is harmonic or percussive
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    #Compare mean absolute amplitude rather than the raw sample mean: an audio
    #waveform oscillates around zero, so the raw means of both components sit
    #near zero and their comparison would be decided by noise.
    harmonic_level = np.mean(np.abs(y_harmonic))
    percussive_level = np.mean(np.abs(y_percussive))
    harmonic = 1 if harmonic_level > percussive_level else 0

    #Mel-frequency cepstral coefficients, averaged over time (axis 1 = frames)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc = np.mean(mfcc, axis=1)

    #mel-scaled spectrogram, averaged over time
    spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, fmax=8000)
    spectrogram = np.mean(spectrogram, axis=1)

    #chroma energy (CENS variant), averaged over time
    chroma = librosa.feature.chroma_cens(y=y, sr=sr)
    chroma = np.mean(chroma, axis=1)

    #spectral contrast, averaged over time
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    contrast = np.mean(contrast, axis=1)

    return [harmonic, mfcc, spectrogram, chroma, contrast]

In [None]:
#Folder on the mounted Google Drive that holds the training wav files
train_wav_path = '/content/drive/My Drive/w207_dataset/train_data'

# use os.path.join as this makes concatenation OS independent.
# glob does not guarantee any ordering (it depends on the file system), and this list
# is later cut into chunks by position, so sort it to make every chunk contain the
# same files on every run.
all_train_wav = sorted(glob.glob(os.path.join(train_wav_path, "*.wav")))

In [None]:
#Extract the training features in 3 parts. Each part is written to its own csv as
#soon as it finishes, so a long run leaves usable partial results, and the three
#parts are finally consolidated into one csv.

def _train_wav_feature_frame(wav_paths):
    """Return one row of flattened spectral features per wav file in ``wav_paths``.

    Every file is passed through ``feature_extract``. The scalar ``harmonic`` flag
    and the recording id (file name up to the first '.') become the ``harmonic``
    and ``filename`` columns; each feature vector is spread over numbered columns
    named ``mfcc_<i>``, ``spectro_<i>``, ``chroma_<i>`` and ``contrast_<i>``.

    NOTE: earlier versions built the contrast columns from the chroma frame, so
    they were a re-prefixed copy of chroma (``contrast_chroma_<i>``) and the real
    spectral contrast was discarded. The contrast columns now come from the
    contrast vector, which changes their names and possibly their count; anything
    that selects columns of the exported csv by position must be re-checked after
    re-running this extraction.
    """
    id_rows = []
    vector_rows = {'mfcc_': [], 'spectro_': [], 'chroma_': [], 'contrast_': []}
    for wav_path in wav_paths:
        harmonic, mfcc, spectrogram, chroma, contrast = feature_extract(wav_path)
        id_rows.append([harmonic, os.path.basename(wav_path).split('.')[0]])
        vector_rows['mfcc_'].append(mfcc)
        vector_rows['spectro_'].append(spectrogram)
        vector_rows['chroma_'].append(chroma)
        vector_rows['contrast_'].append(contrast)

    #Build every frame once from the collected rows instead of growing a
    #dataframe one row at a time
    frames = [pd.DataFrame(id_rows, columns=["harmonic", "filename"])]
    for prefix, rows in vector_rows.items():
        frames.append(pd.DataFrame(rows).add_prefix(prefix))
    return pd.concat(frames, axis=1)


#part 1 : first 100
df_train_wav_fin1 = _train_wav_feature_frame(all_train_wav[0:100])
df_train_wav_fin1.to_csv('/content/drive/My Drive/w207_dataset/df_train_wav_finc1.csv')

#part 2 : second 100
df_train_wav_fin2 = _train_wav_feature_frame(all_train_wav[100:200])
df_train_wav_fin2.to_csv('/content/drive/My Drive/w207_dataset/df_train_wav_finc2.csv')

#part 3 : everything after the first 200 (the last 120 of the 320 files covered by
#the original slice); an open-ended slice avoids silently skipping files if the
#folder ever holds more than 320
df_train_wav_fin3 = _train_wav_feature_frame(all_train_wav[200:])
df_train_wav_fin3.to_csv('/content/drive/My Drive/w207_dataset/df_train_wav_finc3.csv')

#Concatenate the three training sample dfs and save to a consolidated csv
train_frames = [df_train_wav_fin1, df_train_wav_fin2, df_train_wav_fin3]
df_train_wav_consol = pd.concat(train_frames, ignore_index=True)
df_train_wav_consol.to_csv('/content/drive/My Drive/w207_dataset/df_train_wav_consolidated.csv')

### Extract Spectral Features from Test Wav File Data and Save to csv

In [None]:
#Folder on the mounted Google Drive that holds the test wav files
test_wav_path = '/content/drive/My Drive/w207_dataset/test_data'

# use os.path.join as this makes concatenation OS independent.
# glob does not guarantee any ordering (it depends on the file system); sort so the
# rows of the exported feature csv come out in the same order on every run.
all_test_wav = sorted(glob.glob(os.path.join(test_wav_path, "*.wav")))

In [None]:
#Create Dataframe for Test Wav data: one row per file holding the harmonic flag,
#the recording id (file name up to the first '.') and every feature vector spread
#over numbered columns.
#Rows are collected in plain lists and turned into dataframes once, instead of
#growing a dataframe one row at a time.
test_id_rows = []
mfcc_rows, spectro_rows, chroma_rows, contrast_rows = [], [], [], []
for filename in all_test_wav:
    harmonic_flag, mfcc_vec, spectro_vec, chroma_vec, contrast_vec = feature_extract(filename)
    test_id_rows.append([harmonic_flag, os.path.basename(filename).split('.')[0]])
    mfcc_rows.append(mfcc_vec)
    spectro_rows.append(spectro_vec)
    chroma_rows.append(chroma_vec)
    contrast_rows.append(contrast_vec)

df_test_wav = pd.DataFrame(test_id_rows, columns=["harmonic", "filename"])

#extract mfccs
mfcc_test = pd.DataFrame(mfcc_rows, index=df_test_wav.index).add_prefix('mfcc_')

#extract spectro
spectro_test = pd.DataFrame(spectro_rows, index=df_test_wav.index).add_prefix('spectro_')

#extract chroma
chroma_test = pd.DataFrame(chroma_rows, index=df_test_wav.index).add_prefix('chroma_')

#extract contrast
#NOTE: this used to be built from chroma_test, which made the contrast columns a
#re-prefixed copy of chroma ('contrast_chroma_<i>') and discarded the real spectral
#contrast. The column names (and possibly their count) change with this fix, so
#anything selecting columns of the exported csv by position must be re-checked.
contrast_test = pd.DataFrame(contrast_rows, index=df_test_wav.index).add_prefix('contrast_')

#concatenate
df_test_wav_fin = pd.concat([df_test_wav, mfcc_test, spectro_test, chroma_test, contrast_test],
                            axis=1, join='inner')

#Save to csv
df_test_wav_fin.to_csv('/content/drive/My Drive/w207_dataset/df_test_wav_finc.csv')

### For the following section of the notebook, the exported csvs were committed to GitHub, and the file locations referenced are those of the csvs in the GitHub repository.


## Import csv features which were extracted from earlier Data PreProcessing notebook

In [10]:
#Load the inputs for this section from the repository's data folder

#Consolidated csvs holding only the WAV spectral features; the first csv column is
#the saved dataframe index
train_wav = pd.read_csv('../data/df_train_wav_consolidated.csv', index_col=0)
test_wav = pd.read_csv('../data/df_test_wav_finc.csv', index_col=0)

#Recording-level metadata
metadata = pd.read_csv('../data/musicnet_metadata.csv')


## Import Wav Data for Spectral Features only

In [11]:
#Create X and y for Wav Data only

#Make a copy of the metadata. reset_index() keeps the old row index as an 'index'
#column (dropped again after the merges), and 'id' is renamed so it can be used as
#the merge key against the wav feature tables.
meta_data_copy = metadata.copy(deep=True).reset_index()
meta_data_copy = meta_data_copy.rename(columns={'id': 'filename'})

#Metadata columns that are not used as features or labels
unused_meta_columns = ["composer", "composition", "movement", "source",
                       "transcriber", "catalog_name", "index"]

#Merge Metadata and Wav Data Only
merged_train_data_w = pd.merge(train_wav, meta_data_copy, on="filename")
merged_train_data_w = merged_train_data_w.drop(columns=unused_meta_columns)

merged_test_data_w = pd.merge(test_wav, meta_data_copy, on="filename")
merged_test_data_w = merged_test_data_w.drop(columns=unused_meta_columns)

#Checked that unique ensembles in test are a subset of train
#Get list of unique ensembles
ens_list = merged_train_data_w['ensemble'].unique()

#Map list of unique ensemble names to integer (the position of the name in ens_list).
#A test ensemble missing from train raises KeyError here instead of passing silently.
mapping = {item: i for i, item in enumerate(ens_list)}
merged_train_data_w["ensemble"] = merged_train_data_w["ensemble"].apply(lambda x: mapping[x])
merged_test_data_w["ensemble"] = merged_test_data_w["ensemble"].apply(lambda x: mapping[x])

#This is the original train test split given in kaggle.
#y is the ensemble column only; X is every other column. Selecting by column name
#rather than by position keeps this correct if the number of feature columns changes.
X_original_train = merged_train_data_w.drop(columns='ensemble')
X_original_test = merged_test_data_w.drop(columns='ensemble')

y_original_train = merged_train_data_w[['ensemble']]
y_original_test = merged_test_data_w[['ensemble']]

#Concatenate the training and test data
x_frames = [X_original_train, X_original_test]
X_wav = pd.concat(x_frames, ignore_index=True)

y_frames = [y_original_train, y_original_test]
y_wav = pd.concat(y_frames, ignore_index=True)

#Find the index position of viola quintet and drop it since there is only one ensemble of that type.
#The integer code is looked up by name because it depends on the order in which the
#ensembles first appear in the training data. If the name is absent the lookup gives
#None, which matches no row, so nothing is dropped.
viola_quintet_code = mapping.get('Viola Quintet')
index_violaquintet_wav_only = y_wav[y_wav['ensemble'] == viola_quintet_code].index
y_wav = y_wav.drop(index=index_violaquintet_wav_only)
X_wav = X_wav.drop(index=index_violaquintet_wav_only)

#This is the train test split for Spectral Wav Data only
X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(
    X_wav, y_wav, train_size=0.80, test_size=0.20, stratify=y_wav, random_state=101)

#Get list of file names in train and test to apply split consistently on other datasets
filenames_in_train = list(X_train_w['filename'])
filenames_in_test = list(X_test_w['filename'])

#Remove File name from features. Rebinding to the result of drop() (instead of
#inplace=True on the frames returned by train_test_split) avoids pandas'
#SettingWithCopyWarning.
X_train_w = X_train_w.drop(columns='filename')
X_test_w = X_test_w.drop(columns='filename')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


## Print list of ensembles for reference

In [12]:
#Display the array of unique ensemble names; an ensemble's position in this
#array is the integer code assigned to it in `mapping`
ens_list

array(['Piano Quintet', 'Solo Piano', 'Piano Trio', 'Viola Quintet',
       'String Quartet', 'Clarinet Quintet',
       'Pairs Clarinet-Horn-Bassoon', 'Wind Quintet', 'Accompanied Cello',
       'Accompanied Clarinet', 'Wind and Strings Octet', 'String Sextet',
       'Piano Quartet', 'Horn Piano Trio', 'Solo Violin', 'Solo Flute',
       'Solo Cello', 'Violin and Harpsichord',
       'Clarinet-Cello-Piano Trio', 'Accompanied Violin', 'Wind Octet'],
      dtype=object)

## Import MIDI Features for Engineered Features from Granular Data



In [13]:
#Reload the inputs used for the MIDI section

#Recording-level metadata
metadata = pd.read_csv('../data/musicnet_metadata.csv')

#Consolidated saved csvs of wav features; the first csv column is the saved dataframe index
train_wav = pd.read_csv('../data/df_train_wav_consolidated.csv', index_col=0)
test_wav = pd.read_csv('../data/df_test_wav_finc.csv', index_col=0)

In [14]:
#Copy the metadata, keep its old row index as an 'index' column, and rename 'id'
#so it can serve as the merge key against the wav feature tables
meta_data_copy = (
    metadata.copy(deep=True)
    .reset_index()
    .rename(columns={'id': 'filename'})
)

#Attach the metadata to the train wav features, then discard the metadata columns
#that are not needed
merged_train_data = train_wav.merge(meta_data_copy, on="filename")
merged_train_data = merged_train_data.drop(
    columns=["composer", "composition", "movement", "source", "transcriber", "catalog_name", "index"])

#Same for the test wav features
merged_test_data = test_wav.merge(meta_data_copy, on="filename")
merged_test_data = merged_test_data.drop(
    columns=["composer", "composition", "movement", "source", "transcriber", "catalog_name", "index"])

### Feature Engineering by calculating derived features from granular MIDI Data

In [15]:
#import midi file tables
train_labels = pd.read_csv('../data/df_train_labels_consol.csv')
test_labels = pd.read_csv('../data/df_test_labels_consol.csv')

#combine train and test data
labels_frames = [train_labels, test_labels]
labels = pd.concat(labels_frames, ignore_index=True)

#Build one row of engineered midi features per recording (file_name).
#Only the column each feature needs is aggregated, rather than aggregating every
#column of the label table and then picking one.
notes_by_file = labels.groupby('file_name')['note']

#notes played by each instrument within each recording
notes_per_instrument = labels.groupby(['file_name', 'instrument'])['note'].count()

midi_features = pd.DataFrame({
    #number of unique instruments
    'midi_nunique_inst': labels.groupby('file_name')['instrument'].nunique(),
    #number of unique notes
    'midi_nunique_note': notes_by_file.nunique(),
    #total number of notes. NOTE: this is now a count of label rows; it used to be
    #the sum of the 'note' column, i.e. pitch values added together.
    'midi_num_notes': notes_by_file.count(),
    #distribution of note values (pitches): min, quartiles and max. The column
    #names say "quintile" but q=0.25 / q=0.75 are the first and third quartiles;
    #the names are kept so existing consumers of the csv keep working.
    'midi_min_note': notes_by_file.min(),
    'midi_second_quintile_note': notes_by_file.quantile(q=0.25, interpolation='nearest'),
    'midi_median_note': notes_by_file.quantile(q=0.5, interpolation='nearest'),
    'midi_fourth_quintile_note': notes_by_file.quantile(q=0.75, interpolation='nearest'),
    'midi_max_note': notes_by_file.max(),
    #average number of notes per instrument (same sum -> count correction as above)
    'midi_avg_notes_inst': notes_per_instrument.groupby(level='file_name').mean(),
})

#file_name becomes the first column again, followed by the features in the order above
midi_features = midi_features.rename_axis('file_name').reset_index()

midi_features.to_csv('../data/midi_features.csv')

In [16]:
#Get midi features only (the first csv column is the saved dataframe index)
midi_features = pd.read_csv('../data/midi_features.csv', index_col=0)

#Create X and y for Engineered MIDI Data Attributes only

#Make a copy of the midi features. reset_index() adds the old row index as an
#'index' column, which collides with the metadata's own 'index' column in the merge
#below and is therefore suffixed to 'index_x' / 'index_y'.
midi_features_copy = midi_features.copy(deep=True).reset_index()

#Rename column name in both the copy and the original
midi_features_copy = midi_features_copy.rename(columns={'file_name': 'filename'})
midi_features = midi_features.rename(columns={'file_name': 'filename'})


#Merge Metadata and MIDI Data only

#For original midi
merged_midi_data = pd.merge(midi_features, meta_data_copy, on="filename")
merged_midi_data = merged_midi_data.drop(
    columns=["composer", "composition", "movement", "source", "transcriber", "catalog_name", "index"])

#For midi copy ('index_y' and 'seconds' are deliberately left in this frame)
merged_midi_data_copy = pd.merge(midi_features_copy, meta_data_copy, on="filename")
merged_midi_data_copy = merged_midi_data_copy.drop(
    columns=["composer", "composition", "movement", "source", "transcriber", "catalog_name", "index_x"])

#Map list of unique ensemble names to integer

#For original midi
merged_midi_data["ensemble"] = merged_midi_data["ensemble"].apply(lambda x: mapping[x])

#For midi copy
merged_midi_data_copy["ensemble"] = merged_midi_data_copy["ensemble"].apply(lambda x: mapping[x])

#Select the midi train and test rows using the file names from the wav split, so
#every dataset shares the same split. The viola quintet recording is in neither file
#name list, so it is excluded here as well.
#X is every column except the file name and the label; y is the label column only.
#Columns are selected by name rather than by position so the selection does not
#depend on how many feature columns there are.
midi_train = merged_midi_data.loc[merged_midi_data['filename'].isin(filenames_in_train)]
X_train_m = midi_train.drop(columns=['filename', 'ensemble'])
y_train_m = midi_train[['ensemble']]

midi_test = merged_midi_data.loc[merged_midi_data['filename'].isin(filenames_in_test)]
X_test_m = midi_test.drop(columns=['filename', 'ensemble'])
y_test_m = midi_test[['ensemble']]


## Create Merged Dataset comprising Engineered Features from MIDI Data and Spectral Features from Wav Data

In [17]:
# Create Dataset with MIDI and WAV Spectral Data

#Drop the columns that would duplicate what the wav frame already carries.
#Rebinding with errors='ignore' (instead of an in-place drop) lets this cell be
#re-run without a KeyError once the columns are already gone.
merged_midi_data_copy = merged_midi_data_copy.drop(columns=['seconds', 'index_y'], errors='ignore')

#Find the index position of viola quintet and drop it since there is only one ensemble of that type.
#The integer code is looked up by name because it depends on the order in which the
#ensembles first appear in the training data. If the name is absent the lookup gives
#None, which matches no row, so nothing is dropped.
viola_quintet_code = mapping.get('Viola Quintet')
index_violaquintet_midi_only = merged_midi_data_copy[
    merged_midi_data_copy['ensemble'] == viola_quintet_code].index
merged_midi_data_copy = merged_midi_data_copy.drop(index=index_violaquintet_midi_only)

#Merge Midi and Wav Data
merged_data_c = pd.merge(X_wav, merged_midi_data_copy, on="filename")

#Select the combined train and test rows using the file names from the wav split,
#so every dataset shares the same split.
#X is every column except the file name and the label; y is the label column only.
#Columns are selected by name rather than by position so the selection does not
#depend on how many feature columns there are.
comb_train = merged_data_c.loc[merged_data_c['filename'].isin(filenames_in_train)]
X_train_c = comb_train.drop(columns=['filename', 'ensemble'])
y_train_c = comb_train[['ensemble']]

comb_test = merged_data_c.loc[merged_data_c['filename'].isin(filenames_in_test)]
X_test_c = comb_test.drop(columns=['filename', 'ensemble'])
y_test_c = comb_test[['ensemble']]


In [18]:
#Report the shape of each wav split in the order X train, y train, X test, y test
wav_shapes = [split.shape for split in (X_train_w, y_train_w, X_test_w, y_test_w)]
print('Shape of Wav Data :')
print(*wav_shapes)

Shape of Wav Data :
(263, 167) (263, 1) (66, 167) (66, 1)


In [19]:
#Report the shape of each midi split in the order X train, y train, X test, y test
midi_shapes = [split.shape for split in (X_train_m, y_train_m, X_test_m, y_test_m)]
print('Shape of MIDI Data :')
print(*midi_shapes)

Shape of MIDI Data :
(263, 10) (263, 1) (66, 10) (66, 1)


In [20]:
#Report the shape of each combined split in the order X train, y train, X test, y test
combined_shapes = [split.shape for split in (X_train_c, y_train_c, X_test_c, y_test_c)]
print('Shape of Combined Wav and MIDI Data :')
print(*combined_shapes)

Shape of Combined Wav and MIDI Data :
(263, 176) (263, 1) (66, 176) (66, 1)
