# Data Cleaning

## Numeric Feature Data

In [1]:
# Imports
import pandas as pd

In [2]:
# Load the numeric feature table.
# The directory is spelled 'data' (lowercase) so that it agrees with every other
# read/write path in this notebook; a mixed 'Data'/'data' spelling cannot resolve
# on a case-sensitive file system unless both directories exist.
# NOTE(review): confirm the on-disk directory name is really lowercase 'data'.
genre = pd.read_csv('../data/genre.csv')

In [3]:
# Preview the first five rows to see how the columns were parsed
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels
0,<DirEntry 'blues.00000.wav'>,0.083045,1784.122641,3805.72303,-113.59882,121.57067,-19.162262,42.363937,-6.362266,18.621931,-13.699734,15.339802,-12.274304,10.970946,-8.326061,8.802088,-3.669941,<DirEntry 'blues
1,<DirEntry 'blues.00001.wav'>,0.05604,1530.261767,3550.713616,-207.52383,123.98514,8.947019,35.86715,2.909594,21.519472,-8.556514,23.370676,-10.103608,11.899242,-5.558824,5.377876,-2.234492,<DirEntry 'blues
2,<DirEntry 'blues.00002.wav'>,0.076291,1552.832481,3042.410115,-90.757164,140.44087,-29.084547,31.686693,-13.976547,25.753752,-13.66499,11.634442,-11.778322,9.714757,-13.125314,5.791247,-8.901967,<DirEntry 'blues
3,<DirEntry 'blues.00003.wav'>,0.033309,1070.153418,2184.879029,-199.57513,150.0861,5.663404,26.855282,1.770071,14.232647,-4.827845,9.286853,-0.75612,8.134435,-3.200026,6.078081,-2.478445,<DirEntry 'blues
4,<DirEntry 'blues.00004.wav'>,0.101461,1835.128513,3579.957471,-160.35417,126.20948,-35.581394,22.139256,-32.47355,10.850701,-23.35007,0.493249,-11.796539,1.203519,-13.084959,-2.810499,-6.934471,<DirEntry 'blues


### Creating Labels

In [4]:
# Fixing the file names and labels.
# 'files' values look like "<DirEntry 'blues.00000.wav'>" and 'labels' values look
# like "<DirEntry 'blues"; strip that wrapper so only the bare name remains.
# Anchored regexes are used instead of fixed-position slices so the cell is
# idempotent: values that are already clean do not match and are left untouched,
# whereas re-running a slice such as x[11:-2] would keep truncating them.
genre['files'] = genre['files'].str.replace(r"^<DirEntry '(.*)'>$", r"\1", regex=True)
genre['labels'] = genre['labels'].str.replace(r"^<DirEntry '", "", regex=True)

In [5]:
# Numeric codes for the genre names: the names are listed alphabetically and
# numbered 1 through 10 in that order.
label_map = dict(zip(
    ['blues', 'classical', 'country', 'disco', 'hiphop',
     'jazz', 'metal', 'pop', 'reggae', 'rock'],
    range(1, 11),
))

# Add the numeric target column 'y' by looking each text label up in label_map.
genre['y'] = genre['labels'].map(label_map)

# Series.map yields NaN for any label that is not a key of label_map; stop here
# rather than carrying rows without a target into the exported file.
assert genre['y'].notna().all(), (
    "labels missing from label_map: "
    f"{genre.loc[genre['y'].isna(), 'labels'].unique().tolist()}"
)

In [6]:
# Preview the first five rows after cleaning 'files'/'labels' and adding 'y'
genre.head(n=5)

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels,y
0,blues.00000.wav,0.083045,1784.122641,3805.72303,-113.59882,121.57067,-19.162262,42.363937,-6.362266,18.621931,-13.699734,15.339802,-12.274304,10.970946,-8.326061,8.802088,-3.669941,blues,1
1,blues.00001.wav,0.05604,1530.261767,3550.713616,-207.52383,123.98514,8.947019,35.86715,2.909594,21.519472,-8.556514,23.370676,-10.103608,11.899242,-5.558824,5.377876,-2.234492,blues,1
2,blues.00002.wav,0.076291,1552.832481,3042.410115,-90.757164,140.44087,-29.084547,31.686693,-13.976547,25.753752,-13.66499,11.634442,-11.778322,9.714757,-13.125314,5.791247,-8.901967,blues,1
3,blues.00003.wav,0.033309,1070.153418,2184.879029,-199.57513,150.0861,5.663404,26.855282,1.770071,14.232647,-4.827845,9.286853,-0.75612,8.134435,-3.200026,6.078081,-2.478445,blues,1
4,blues.00004.wav,0.101461,1835.128513,3579.957471,-160.35417,126.20948,-35.581394,22.139256,-32.47355,10.850701,-23.35007,0.493249,-11.796539,1.203519,-13.084959,-2.810499,-6.934471,blues,1


#### Export

In [7]:
# Write the cleaned feature table to disk without the DataFrame index.
# NOTE: the 'Unnamed: 0' column that came in with the input is still part of the
# frame at this point, so it is written out as a regular column.
genre.to_csv(path_or_buf='../data/genre_clean.csv', index=False)

## Mel Spectrogram Data

In [8]:
# Load the mel spectrogram table.
# NOTE(review): encoding='gbk' is kept exactly as written; confirm it is actually
# required for this file.
mel_specs = pd.read_csv(filepath_or_buffer='../data/genre_mel_specs.csv', encoding='gbk')

In [9]:
# Preview the first five rows of the mel spectrogram table
mel_specs.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,84471,84472,84473,84474,84475,84476,84477,84478,84479,label
0,-33.144764,-34.706394,-30.291473,-31.983377,-34.712517,-18.91896,-20.786,-28.10627,-35.85604,-35.90428,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,blues
1,-33.348503,-17.828934,-16.082321,-21.42917,-23.70478,-29.580908,-28.251745,-31.71809,-36.745407,-38.221283,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,blues
2,-33.07819,-38.951023,-38.105515,-36.680084,-33.89211,-34.302193,-28.007008,-34.43401,-39.12656,-44.014328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,blues
3,-28.873878,-38.427704,-44.643795,-43.34305,-49.78785,-49.41929,-45.321747,-45.88495,-48.977386,-47.32661,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,blues
4,-57.91653,-53.033882,-52.625343,-54.882698,-45.236115,-44.02924,-50.053173,-53.21808,-59.152702,-54.893375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,blues


### Creating Labels

In [12]:
# Add a numeric target column 'y' by mapping the text genre in 'label' through the
# same label_map used for the numeric feature data. Nothing is renamed: the
# original 'label' column is kept alongside the new 'y' column.
mel_specs['y'] = mel_specs['label'].map(label_map)

# Series.map yields NaN for any label that is not a key of label_map; stop here
# rather than exporting rows without a target.
assert mel_specs['y'].notna().all(), (
    "labels missing from label_map: "
    f"{mel_specs.loc[mel_specs['y'].isna(), 'label'].unique().tolist()}"
)

#### Export

In [13]:
# Write the labelled mel spectrogram table to disk without the DataFrame index.
# NOTE: the 'Unnamed: 0' column that came in with the input is still part of the
# frame at this point, so it is written out as a regular column.
mel_specs.to_csv(path_or_buf='../data/genre_mel_specs_clean.csv', index=False)