# Audio music genre classifier

Train a classifier with maximum accuracy measured as Correctly Classified Instances percentage (C.C.I.%) using 10-fold cross-validation applying different Machine Learning algorithms and given data of music genre in a .arff file.

## Setup / import libraries and data loading

In [27]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.metrics import accuracy_score, precision_score
from scipy.io import arff
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    train_test_split
)

#genre music classification using LR and SVM
data = arff.loadarff('./assets/genre.arff')
dataframe = pd.DataFrame(data[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Columns: 241 entries, highlevel_danceability_value to genre
dtypes: float64(237), object(4)
memory usage: 1.5+ MB


Unnamed: 0,highlevel_danceability_value,highlevel_equalization_profile_value,highlevel_excitement_value,highlevel_intensity_value,highlevel_speech_music_value,highlevel_voice_instrumental_value,loudness_dynamic_complexity_dvar,loudness_dynamic_complexity_mean,loudness_dynamic_complexity_var,loudness_larm_dvar,...,tonal_chords_strength_dvar,tonal_chords_strength_mean,tonal_chords_strength_var,tonal_dissonance_dvar,tonal_dissonance_mean,tonal_dissonance_var,tonal_key_mode_value,tonal_key_strength_value,tonal_tuning_equal_tempered_deviation_value,genre
0,0.206472,0.372766,0.029087,0.093526,b'music',b'voice',-0.411385,0.024215,-0.201421,-0.532833,...,-0.255337,1.140774,-1.490395,0.494393,0.149725,-0.013634,b'minor',0.526779,-0.205793,b'blu'
1,-0.368479,0.372766,1.341977,0.093526,b'music',b'voice',-0.349921,-0.015672,-0.173045,-0.452769,...,-0.121114,0.497749,-0.901662,0.347729,-0.51764,0.184709,b'major',-0.129683,-0.667979,b'blu'
2,-0.631747,0.372766,-1.283803,-1.324366,b'speech',b'instrumental',4.235612,5.779127,3.973258,1.713613,...,-0.190639,1.408974,-0.502029,1.174496,-1.45416,2.144489,b'major',-1.022781,0.937795,b'blu'
3,-0.190116,0.372766,1.341977,0.093526,b'music',b'voice',-0.307576,0.068818,-0.216986,-0.519222,...,0.257633,0.622749,1.399033,0.791524,-0.601227,0.873673,b'major',0.074454,-0.406373,b'blu'
4,0.588278,0.372766,1.341977,0.802473,b'music',b'instrumental',-0.332032,-1.078238,0.35068,-0.765884,...,-0.309481,1.301291,-0.708195,-0.093155,0.287901,-0.644858,b'major',0.919307,-0.917709,b'blu'


## Pre-processing
Converting feature type and scaling values to be usable by the model

In [39]:
genre = dataframe['genre']
unique_genres = genre.unique()
print('Unique genres labels:', unique_genres)
print('Unique genres number:', len(unique_genres))
X = dataframe.drop(['genre'], axis=1)
print('Number of features:', len(X.columns))

# Numerical and categorical features data splitting
categorical_features = X.select_dtypes(include='object')
numeric_features = X.select_dtypes(include='float64')

# Column transformers definition
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")),
           ("scaler", StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, selector(dtype_exclude="object")),
        ("cat", categorical_transformer, selector(dtype_include="object")),
    ]
)

label_encoder = LabelEncoder()
encoded_y = label_encoder.fit_transform(y)
label_encoder_name_mapping = dict(zip(label_encoder.classes_,
                                         label_encoder.fit_transform(label_encoder.classes_)))

Unique genres labels: [b'blu' b'cla' b'cou' b'dis' b'hip' b'jaz' b'met' b'pop' b'reg' b'roc']
Unique genres number: 10
Number of features: 240


## Logistic Regression

In [38]:
clf = Pipeline(
    
    steps=[("preprocessor", preprocessor),
           ("classifier", LogisticRegression(max_iter = 1000))]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, encoded_y, test_size = 0.2, random_state = 42)

clf.fit(X_train, y_train)
scores = cross_val_score(clf, X, encoded_y, cv = 10)
scores_mean = scores.mean() * 100 # Accuracy
scores_std = scores.std() * 100 # Deviation
print(f"{scores_mean} % of accuracy with standard deviation of {scores_std} %")

81.37500000000001 % of accuracy with standard deviation of 5.137180647008628 %


In [40]:
# Get the features and coefficients
num_names = clf.named_steps['preprocessor'].transformers_[0][1].get_feature_names_out()
cat_names = clf.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out()
feature_names = np.concatenate([num_names, cat_names])
coef = clf.named_steps['classifier'].coef_

# Create a pandas dataframe with the feature names and coefficients and sort it 
coef_df = pd.DataFrame({"feature": feature_names, "coef": coef[0]})
coef_df["abs_coef"] = np.abs(coef_df["coef"])
coef_df = coef_df.sort_values("abs_coef", ascending=False)

# Print the top 5 most important features
print("Logistic Regression top 5 most important features:")
print(coef_df.head(5))

Logistic Regression top 5 most important features:
                               feature      coef  abs_coef
10          loudness_replay_gain_value -1.087578  1.087578
85          spectral_barkbands_21_mean  0.722917  0.722917
123  spectral_energybandratio_low_dvar  0.556905  0.556905
151              spectral_mfcc_02_mean  0.541051  0.541051
13                 loudness_vicker_var -0.536735  0.536735


## Support Vector Machines with KFold cross-validation

In [35]:
# Pipeline building and k-fold cross-validation splitter
clf = Pipeline(
    steps=[("preprocessor", preprocessor),
           ("classifier", SVC(kernel='linear', C=1))]
)
X_train, X_test, y_train, y_test = train_test_split(
    X, encoded_y, test_size=0.2, random_state=0)
k = 10 # Ten-fold cross-validation as required from the assignment
kf = KFold(n_splits=k, shuffle = True, random_state = 42)
scores = []
kf.get_n_splits(X)
for i, (train_index, test_index) in enumerate(kf.split(X)):
    clf.fit(X_train, y_train)
    
    score = cross_val_score(clf, X, encoded_y, cv = 10)
    scores.append(score)
scores_mean = np.mean(scores) * 100 # Accuracy
scores_std = np.std(scores) * 100 # Deviation
print(f"{scores_mean} % accuracy with a standard deviation of {scores_std} %")

81.24999999999996 % accuracy with a standard deviation of 4.031128874149275 %


In [41]:
# Get the features and coefficients
coef = clf.named_steps['classifier'].coef_
coef_df = pd.DataFrame({"feature": feature_names, "coef": coef[0]})
# Create a pandas dataframe with the feature names and coefficients and sort it 
coef_df["abs_coef"] = np.abs(coef_df["coef"])
coef_df = coef_df.sort_values("abs_coef", ascending=False)
# Print the top 5 most important features
print("Support vector machines top 5 most important features::")
print(coef_df.head(5))

Support vector machines top 5 most important features::
                               feature      coef  abs_coef
10          loudness_replay_gain_value -1.087578  1.087578
85          spectral_barkbands_21_mean  0.722917  0.722917
123  spectral_energybandratio_low_dvar  0.556905  0.556905
151              spectral_mfcc_02_mean  0.541051  0.541051
13                 loudness_vicker_var -0.536735  0.536735
