In [1]:
from pathlib import Path
import pandas as pd
import re

# Preprocessing

## Label data

In [2]:
# Root folder holding both datasets, resolved relative to the notebook's working directory
data_path = Path.cwd()/'..'/'data'

# Read metadata file gtzan
feature_path = data_path/'gtzan'/'features_30_sec.csv'
gtzan_metadata_df = pd.read_csv(feature_path)

# Read metadata file mtat
# `delim_whitespace=True` is deprecated since pandas 2.2; `sep=r'\s+'` is the
# documented equivalent. Tags that contain a space (e.g. "no voice") stay in
# one field because they are wrapped in the quote character.
annotations_path = data_path/'mtat'/'annotations_final.csv'
mtat_metadata_df = pd.read_csv(annotations_path, quotechar='"', sep=r'\s+')

# Specify cols to keep: gtzan only needs the file name and its genre label,
# mtat keeps every column except the clip id
gtzan_metadata_df = gtzan_metadata_df[['filename', 'label']]
mtat_metadata_df = mtat_metadata_df.drop(columns='clip_id')


In [3]:
# Preview the first rows of the gtzan metadata (only filename and label are kept)
gtzan_metadata_df.head()

Unnamed: 0,filename,label
0,blues.00000.wav,blues
1,blues.00001.wav,blues
2,blues.00002.wav,blues
3,blues.00003.wav,blues
4,blues.00004.wav,blues


In [4]:
# Define a function to change filename to match our folder structure
def apply_regex(filename):
    """Turn a bare GTZAN clip name into its path inside our data folder.

    A name that starts with ``<genre>.<digits>.wav`` is returned as
    ``gtzan/genres_original/<genre>/<filename>``. Any other value is returned
    unchanged.

    The genre part may not contain a dot or a path separator, so a value that
    already carries a directory prefix does not match and is left alone. This
    makes the function safe to apply more than once (e.g. when the cell is
    re-run).
    """
    # re.match anchors at the start only; '/' and '\' are excluded from the
    # genre group so an already-prefixed path cannot be prefixed again.
    match = re.match(r'([^./\\]+)\.\d+\.wav', filename)
    if match is None:
        return filename
    genre = match.group(1)
    return f"gtzan/genres_original/{genre}/{filename}"

# Rewrite every gtzan file name as a path relative to the data folder
gtzan_metadata_df['filename'] = gtzan_metadata_df['filename'].apply(apply_regex)

In [5]:
# Preview the first rows of the mtat annotations (tag columns plus mp3_path)
mtat_metadata_df.head()

Unnamed: 0,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,clasical,...,rap,metal,hip hop,quick,water,baroque,women,fiddle,english,mp3_path
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...


In [6]:
# Prefix each mtat path with its dataset folder so both frames use the same
# path layout before they are concatenated
mtat_metadata_df['mp3_path'] = "mtat/" + mtat_metadata_df['mp3_path']

In [7]:
# Count how many clips carry each genre label
gtzan_metadata_df.label.value_counts()

label
blues        100
classical    100
country      100
disco        100
hiphop       100
jazz         100
metal        100
pop          100
reggae       100
rock         100
Name: count, dtype: int64

In [8]:
# One-hot encode the genre label: one indicator column per genre
dummies = pd.get_dummies(gtzan_metadata_df['label'])
dummies

Unnamed: 0,blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock
0,True,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,False,False,False,False,False,False,False,False,False,True
996,False,False,False,False,False,False,False,False,False,True
997,False,False,False,False,False,False,False,False,False,True
998,False,False,False,False,False,False,False,False,False,True


In [9]:
# One-hot encoding: replace the single label column with the dummy columns
gtzan_metadata_df = gtzan_metadata_df.drop(columns='label').join(dummies)

In [10]:
# Count missing values per column in the gtzan frame
gtzan_metadata_df.isnull().sum()

filename     0
blues        0
classical    0
country      0
disco        0
hiphop       0
jazz         0
metal        0
pop          0
reggae       0
rock         0
dtype: int64

In [11]:
# Total number of missing values over all columns of the mtat frame
mtat_metadata_df.isnull().sum().sum()

0

In [12]:
# Check whether every gtzan column also exists in mtat, to ensure consistency.
# A subset test is required here: `list_a in list_b` only asks whether list_a
# is a single *element* of list_b, which is always False for column lists.
set(gtzan_metadata_df.columns).issubset(mtat_metadata_df.columns)

False

In [13]:
# Collect the column names of each frame as sets so they can be compared
gtzan_col = set(gtzan_metadata_df.columns)
mtat_col = set(mtat_metadata_df.columns)

In [14]:
# gtzan columns that have no identically named column in mtat
gtzan_col.difference(mtat_col)

{'filename', 'hiphop'}

In [15]:
# List the mtat column names to look up how the unmatched gtzan columns are spelled there
mtat_metadata_df.columns

Index(['no voice', 'singer', 'duet', 'plucking', 'hard rock', 'world',
       'bongos', 'harpsichord', 'female singing', 'clasical',
       ...
       'rap', 'metal', 'hip hop', 'quick', 'water', 'baroque', 'women',
       'fiddle', 'english', 'mp3_path'],
      dtype='object', length=189)

In [16]:
# Give both frames the same column names: the genre tag is spelled 'hiphop'
# and the path column is called 'filepath' in each of them
mtat_metadata_df = mtat_metadata_df.rename(
    columns={'hip hop': 'hiphop', 'mp3_path': 'filepath'}
)
gtzan_metadata_df = gtzan_metadata_df.rename(columns={'filename': 'filepath'})

In [17]:
# Rebuild both column sets after the rename; an empty result means every
# gtzan column now exists in mtat
gtzan_col = set(gtzan_metadata_df.columns)
mtat_col = set(mtat_metadata_df.columns)
gtzan_col.difference(mtat_col)

set()

In [18]:
# Cast every float column of the mtat frame to int64 in a single astype call
float_cols = mtat_metadata_df.select_dtypes(include=['float']).columns
mtat_metadata_df = mtat_metadata_df.astype({name: 'int64' for name in float_cols})

In [19]:
# Inspect the column dtypes of the mtat frame after the integer cast
mtat_metadata_df.dtypes

no voice      int64
singer        int64
duet          int64
plucking      int64
hard rock     int64
              ...  
baroque       int64
women         int64
fiddle        int64
english       int64
filepath     object
Length: 189, dtype: object

In [20]:
# Inspect the column dtypes of the gtzan frame for comparison with mtat
gtzan_metadata_df.dtypes

filepath     object
blues          bool
classical      bool
country        bool
disco          bool
hiphop         bool
jazz           bool
metal          bool
pop            bool
reggae         bool
rock           bool
dtype: object

Handling overlapping labels and typos in mtat data
- "female singing" and "female singer", "female voice", "woman singing", "female", "female vocal", "woman", "female vocals", "women"
- "male vocal" and "male vocals", "men", "male voice", "male singer", "male singing", "man singing", "man", "male"
- "female opera" and "female operatic"
- "no voice" and "no voices"
- "harpsichord" and "harpsicord"
- "classical" and "clasical"
- "orchestra" and "orchestral"

In [21]:
# Groups of tags that mean the same thing. The FIRST entry of each group is
# the label that is kept; the remaining entries are merged into it and dropped.
# A gtzan genre name must therefore never appear after the first position,
# otherwise its column would be removed from the mtat frame.
synonyms = [
    ['beat', 'beats'],
    ['chant', 'chanting'],
    ['choir', 'choral'],
    ['classical', 'clasical', 'classic'],
    ['drum', 'drums'],
    ['electro', 'electronic', 'electronica', 'electric'],
    ['fast', 'fast beat', 'quick'],
    ['female', 'female singer', 'female singing', 'female vocals', 'female vocal', 'female voice', 'woman', 'woman singing', 'women'],
    ['flute', 'flutes'],
    ['guitar', 'guitars'],
    ['hard', 'hard rock'],
    ['harpsichord', 'harpsicord'],
    # 'metal' is kept as the base label because it is also a gtzan genre column
    ['metal', 'heavy', 'heavy metal'],
    ['horn', 'horns'],
    ['india', 'indian'],
    ['jazz', 'jazzy'],
    ['male', 'male singer', 'male vocal', 'male vocals', 'male voice', 'man', 'man singing', 'men'],
    ['no beat', 'no drums'],
    ['no singer', 'no singing', 'no vocal', 'no vocals', 'no voice', 'no voices', 'instrumental'],
    ['opera', 'operatic'],
    ['orchestra', 'orchestral'],
    ['quiet', 'silence'],
    ['singer', 'singing'],
    ['space', 'spacey'],
    ['string', 'strings'],
    ['synth', 'synthesizer'],
    ['violin', 'violins'],
    ['vocal', 'vocals', 'voice', 'voices'],
    ['strange', 'weird'],
]

In [22]:
def consolidate_labels(df, synonyms):
    """Merge synonymous label columns of ``df`` into one column per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one indicator column per label. It is modified in place.
    synonyms : list of list of str
        Synonym groups. The first entry of a group is the label that is kept;
        the remaining entries are merged into it.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. For every group with at least one synonym
        column present, the base column is true wherever the base or any
        synonym was true, and the synonym columns are removed. If the base
        column did not exist yet it is created from the synonyms alone.
    """
    for synonym_group in synonyms:
        if not synonym_group:
            continue  # an empty group has no base label to merge into
        base_label, *additional_labels = synonym_group
        # Only synonyms that exist as columns can be merged; the base label is
        # skipped in case a group repeats it, so it is never dropped below.
        present_labels = [
            label for label in additional_labels
            if label != base_label and label in df.columns
        ]
        if not present_labels:
            continue
        any_synonym = df[present_labels].any(axis=1)
        if base_label in df.columns:
            df[base_label] = df[base_label] | any_synonym
        else:
            # Base label missing from the data: build it from its synonyms
            # instead of failing with a KeyError.
            df[base_label] = any_synonym
        df.drop(columns=present_labels, inplace=True)
    return df

# Merge the synonym columns of the mtat frame, then summarise the resulting
# columns and dtypes
mtat_metadata_df = consolidate_labels(mtat_metadata_df, synonyms)
mtat_metadata_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25863 entries, 0 to 25862
Columns: 135 entries, singer to filepath
dtypes: bool(29), int64(105), object(1)
memory usage: 21.6+ MB


In [23]:
# Confirm the number of rows and columns left after merging synonyms
mtat_metadata_df.shape

(25863, 135)

In [24]:
# Save the cleaned mtat labels; to_csv writes the index as the first, unnamed
# column by default
mtat_metadata_df.to_csv('../data/mtat_label.csv')

In [25]:
# Save the one-hot encoded gtzan labels; to_csv writes the index as the first,
# unnamed column by default
gtzan_metadata_df.to_csv('../data/gtzan_label.csv')