In [112]:
from pathlib import Path
import pandas as pd
import re

# Preprocessing

## Label data

In [113]:
data_path = Path.cwd()/'..'/'data'

# Read metadata file gtzan
feature_path = data_path/'gtzan'/'features_30_sec.csv'
gtzan_metadata_df = pd.read_csv(feature_path)

# Read metadata file mtat
annotations_path = data_path/'mtat'/'annotations_final.csv'
mtat_metadata_df = pd.read_csv(annotations_path, quotechar='"', delim_whitespace=True)

# Specify cols to keep
gtzan_metadata_df = gtzan_metadata_df[['filename', 'label']]
mtat_metadata_df = mtat_metadata_df.drop('clip_id', axis=1)

# Add source dataset col
gtzan_metadata_df['data_origin'] = 0
mtat_metadata_df['data_origin'] = 1

In [114]:
gtzan_metadata_df.head()


Unnamed: 0,filename,label,data_origin
0,blues.00000.wav,blues,0
1,blues.00001.wav,blues,0
2,blues.00002.wav,blues,0
3,blues.00003.wav,blues,0
4,blues.00004.wav,blues,0


In [115]:
# Define a function to change filename to match our folder structure
def apply_regex(filename):
    match = re.match(r'([^\.]+)\.\d+\.wav', filename)
    if match:
        genre = match.group(1)  
        return f"gtzan/genres_original/{genre}/{filename}"
    return filename

gtzan_metadata_df['filename'] = gtzan_metadata_df['filename'].apply(apply_regex)

In [116]:
mtat_metadata_df.head()

Unnamed: 0,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,clasical,...,metal,hip hop,quick,water,baroque,women,fiddle,english,mp3_path,data_origin
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...,1


In [117]:
# adjust path name to be consistent with other df before concatenating
mtat_metadata_df['mp3_path'] = ["mtat/"]*len(mtat_metadata_df)+mtat_metadata_df['mp3_path'] 

In [118]:
gtzan_metadata_df.label.value_counts()

label
blues        100
classical    100
country      100
disco        100
hiphop       100
jazz         100
metal        100
pop          100
reggae       100
rock         100
Name: count, dtype: int64

In [119]:
# get dummies for labels
dummies = pd.get_dummies(gtzan_metadata_df['label'])
dummies

Unnamed: 0,blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock
0,True,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,True
996,False,False,False,False,False,False,False,False,False,True
997,False,False,False,False,False,False,False,False,False,True
998,False,False,False,False,False,False,False,False,False,True


In [120]:
# join dummies with original and drop label cols (one hot encoding)
gtzan_metadata_df.drop('label', axis=1, inplace=True)
gtzan_metadata_df = gtzan_metadata_df.join(dummies)

In [121]:
# check if all columns of gtzan are in mtat
gtzan_metadata_df.columns.tolist() in mtat_metadata_df.columns.tolist()

False

In [122]:
# Making column lists
mtat_col = set(mtat_metadata_df.columns.tolist())
gtzan_col = set(gtzan_metadata_df.columns.tolist())

In [123]:
# Finding cols that are not in mtat
gtzan_col - mtat_col

{'filename', 'hiphop'}

In [124]:
mtat_metadata_df.columns

Index(['no voice', 'singer', 'duet', 'plucking', 'hard rock', 'world',
       'bongos', 'harpsichord', 'female singing', 'clasical',
       ...
       'metal', 'hip hop', 'quick', 'water', 'baroque', 'women', 'fiddle',
       'english', 'mp3_path', 'data_origin'],
      dtype='object', length=190)

In [125]:
# Rename cols so that they match other df
mtat_metadata_df.rename(columns={'hip hop': 'hiphop', 'mp3_path':'filename'}, inplace=True)

In [126]:
# check if problem solved
mtat_col = set(mtat_metadata_df.columns.tolist())
gtzan_col = set(gtzan_metadata_df.columns.tolist())
gtzan_col - mtat_col

set()

In [127]:
# Concatenate dataframes together
df_join = pd.concat([mtat_metadata_df, gtzan_metadata_df], axis=0, join='outer', ignore_index=True)

In [128]:
# Checking if gtzan data is merged nicely into mtat
print(df_join.iloc[25890, :].dropna())

classical                                                0
jazz                                                     0
country                                                  0
reggae                                                   0
disco                                                    0
pop                                                      0
blues                                                    1
rock                                                     0
metal                                                    0
hiphop                                                   0
filename       gtzan/genres_original/blues/blues.00027.wav
data_origin                                              0
Name: 25890, dtype: object


In [129]:
df_join.tail()

Unnamed: 0,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,clasical,...,metal,hiphop,quick,water,baroque,women,fiddle,english,filename,data_origin
26858,,,,,,,,,,,...,0,0,,,,,,,gtzan/genres_original/rock/rock.00095.wav,0
26859,,,,,,,,,,,...,0,0,,,,,,,gtzan/genres_original/rock/rock.00096.wav,0
26860,,,,,,,,,,,...,0,0,,,,,,,gtzan/genres_original/rock/rock.00097.wav,0
26861,,,,,,,,,,,...,0,0,,,,,,,gtzan/genres_original/rock/rock.00098.wav,0
26862,,,,,,,,,,,...,0,0,,,,,,,gtzan/genres_original/rock/rock.00099.wav,0


In [130]:
# fill NaNs with 0
df_join.fillna(0, inplace=True)

In [131]:
# checking if all NaNs have been handled
df_join.isnull().sum().sum()

0

In [132]:
for col in df_join.select_dtypes(include=['float']).columns:
    df_join[col] = df_join[col].astype('int64') 

In [133]:
df_join.dtypes


no voice        int64
singer          int64
duet            int64
plucking        int64
hard rock       int64
                ...  
women           int64
fiddle          int64
english         int64
filename       object
data_origin     int64
Length: 190, dtype: object

In [134]:
df_join.to_csv('../data/label.csv')

## Audio data