## Testing different implementations with sklearn



## Setup

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.io import arff

from sklearn import preprocessing
from sklearn.model_selection import train_test_split


## Loading Data

In [15]:
data = scipy.io.arff.loadarff('./assets/genre.arff')
df = pd.DataFrame(data[0])

df.head()
df.describe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Columns: 241 entries, highlevel_danceability_value to genre
dtypes: float64(237), object(4)
memory usage: 1.5+ MB


## Preprocessing
Converting feature type and scaling values to be usable by the model

In [16]:
df_target = df['genre']
unique_genres = df_target.unique()
print('Unique genres labels:', unique_genres)

df_features = df.drop(['genre'], axis=1)

df_numerical = df_features.select_dtypes(include='float64')
df_categorical = df_features.select_dtypes(include='object')

numerical_features = df_numerical.columns.to_list()
categorical_features = df_categorical.columns.to_list()

features_list = numerical_features + categorical_features

print('Number of features:', len(features_list))
print('Features:', features_list)

Unique genres labels: [b'blu' b'cla' b'cou' b'dis' b'hip' b'jaz' b'met' b'pop' b'reg' b'roc']
Number of features: 240
Features: ['highlevel_danceability_value', 'highlevel_equalization_profile_value', 'highlevel_excitement_value', 'highlevel_intensity_value', 'loudness_dynamic_complexity_dvar', 'loudness_dynamic_complexity_mean', 'loudness_dynamic_complexity_var', 'loudness_larm_dvar', 'loudness_larm_mean', 'loudness_larm_var', 'loudness_replay_gain_value', 'loudness_vicker_dvar', 'loudness_vicker_mean', 'loudness_vicker_var', 'rhythm_beats_loudness_bass_dvar', 'rhythm_beats_loudness_bass_mean', 'rhythm_beats_loudness_bass_var', 'rhythm_beats_loudness_dvar', 'rhythm_beats_loudness_mean', 'rhythm_beats_loudness_var', 'rhythm_onset_rate_value', 'spectral_barkbands_00_dvar', 'spectral_barkbands_00_mean', 'spectral_barkbands_00_var', 'spectral_barkbands_01_dvar', 'spectral_barkbands_01_mean', 'spectral_barkbands_01_var', 'spectral_barkbands_02_dvar', 'spectral_barkbands_02_mean', 'spectral

In [17]:
encoder = preprocessing.OneHotEncoder()
encoded_categorical = encoder.fit_transform(df_categorical)

scaler = preprocessing.StandardScaler()
scaled_numerical = scaler.fit_transform(df_numerical)

processed_features = np.concatenate((scaled_numerical, encoded_categorical.toarray()), axis=1)
print('Processed features shape:', processed_features.shape)

label_encoder = preprocessing.LabelEncoder()
encoded_target = label_encoder.fit_transform(df_target)

y_one_hot = to_categorical(encoded_target, num_classes=10)

X_train, X_test, y_train_one_hot, y_test_one_hot = train_test_split(processed_features, y_one_hot, test_size=0.2, random_state=42)

print('X_train shape:', X_train.shape)
print('y_train_one_hot shape:', y_train_one_hot.shape)
print('X_test shape:', X_test.shape)
print('y_test_one_hot shape:', y_test_one_hot.shape)

Processed features shape: (800, 243)
X_train shape: (640, 243)
y_train_one_hot shape: (640, 10)
X_test shape: (160, 243)
y_test_one_hot shape: (160, 10)


## K-fold cross-validation with Support Vector Machines

In [18]:
# Initialize an empty list to store the accuracy scores
scores = []
coefficients = np.zeros((45, 243))

# Define the number of folds for k-fold cross-validation
k = 10

# Initialize a k-fold cross-validation splitter
kf = KFold(n_splits=k, shuffle=True, random_state=42)
# clf = SVC(gamma='auto')
clf = SVC(kernel='linear')

# Iterate over the folds and train/test the model
for train_index, test_index in kf.split(processed_features):
    X_train, X_test = processed_features[train_index], processed_features[test_index]
    y_train, y_test = encoded_target[train_index], encoded_target[test_index]

    # Fit the model to the training data for this fold
    clf.fit(X_train, y_train)

    coefficients += np.abs(clf.coef_)
    # print('Coefficients shape:', clf.coef_.shape)

    # Evaluate the model on the test data for this fold and compute the accuracy score
    y_pred = clf.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    scores.append(score)

num = df_numerical.columns.to_list()  
cat = df_categorical.columns.to_list()
feat_list = num + cat

most_used_feat = np.argsort(coefficients, axis=1)
counter = []

for classifier in most_used_feat:
    for top_feature_classifier in classifier[::-1][:5]:
        counter.append(feat_list[top_feature_classifier])

counter = collections.Counter(counter)
most_common = counter.most_common(5)
print('Most common features:', most_common)

mean_score = np.mean(scores)
std_score = np.std(scores)

print('Mean Accuracy score:', mean_score * 100, '%')
print('Standard deviation of Accuracy score:', std_score * 100, '%')

Most common features: [('loudness_vicker_dvar', 20), ('loudness_vicker_var', 15), ('loudness_replay_gain_value', 14), ('tonal_chords_strength_mean', 10), ('tonal_key_strength_value', 8)]
Mean Accuracy score: 82.0 %
Standard deviation of Accuracy score: 2.968585521759479 %


## K-fold cross-validation with scikit-learn MLPClassifier

In [22]:
# Initialize an empty list to store the accuracy scores
scores = []

# Define the number of folds for k-fold cross-validation
k = 10

# Initialize a k-fold cross-validation splitter
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Define the neural network model
model = MLPClassifier(hidden_layer_sizes=(64,32), max_iter=500, solver='adam', activation='relu', random_state=0)

# Iterate over the folds and train/test the model
for train_index, test_index in kf.split(processed_features):
    # Split the data into training and test sets for this fold
    X_train, X_test = processed_features[train_index], processed_features[test_index]
    y_train, y_test = y_one_hot[train_index], y_one_hot[test_index]
    
    # Fit the model to the training data for this fold
    model.fit(X_train, y_train)
    
    # Evaluate the model on the test data for this fold and compute the accuracy score
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    score = accuracy_score(np.argmax(y_test, axis=1), y_pred_classes)
    scores.append(score)

# Compute the mean and standard deviation of the accuracy scores
mean_score = np.mean(scores)
std_score = np.std(scores)

# Compute the 95% confidence interval for the mean accuracy score
ci = t.interval(0.95, k-1, loc=mean_score, scale=std_score/np.sqrt(k))

print('Mean C.C.I.%:', mean_score * 100, '%')
print('Standard deviation of C.C.I.%:', std_score * 100, '%')
print('Confidence interval:', ci)

Mean C.C.I.%: 76.0 %
Standard deviation of C.C.I.%: 6.749999999999999 %
Confidence interval: (0.7117134088482013, 0.8082865911517987)


## K-fold cross-validation with Keras

In [23]:
# Initialize an empty list to store the accuracy scores
scores = []

# Define the number of folds for k-fold cross-validation
k = 10

# Initialize a k-fold cross-validation splitter
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Define the model architecture outside the loop
model = keras.Sequential([
    keras.layers.Dense(64, input_shape=(processed_features.shape[1],), activation='relu'),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(10, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

@tf.function
def predict(model, X):
    return model(X)

# Iterate over the folds and train/test the model
for train_index, test_index in kf.split(processed_features):
    # Split the data into training and test sets for this fold
    X_train, X_test = processed_features[train_index], processed_features[test_index]
    y_train, y_test = encoded_target[train_index], encoded_target[test_index]
    
    # Convert the encoded target labels to one-hot encoded format
    y_train_one_hot = to_categorical(y_train, num_classes=10)
    y_test_one_hot = to_categorical(y_test, num_classes=10)
    
    # Fit the model to the training data for this fold
    model.fit(X_train, y_train_one_hot, batch_size=32, epochs=25, verbose=0)
    
    # Evaluate the model on the test data for this fold and compute the accuracy score
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    score = accuracy_score(y_test, y_pred_classes)
    scores.append(score)

mean_score = np.mean(scores)
std_score = np.std(scores)

# Compute the 95% confidence interval of the C.C.I.% (percentage of correctly classified instances)
ci = t.interval(confidence=0.95, df=k-1, loc=mean_score, scale=std_score/np.sqrt(k))

print('Mean C.C.I.%:', mean_score)
print('Standard deviation of C.C.I.%:', std_score)
print('Confidence interval:', ci)

Mean C.C.I.%: 0.9775
Standard deviation of C.C.I.%: 0.06344288770224761
Confidence interval: (0.9321156921486237, 1.0228843078513763)


## LinearSVM 

In [32]:
# Initialize an empty list to store the accuracy scores
scores = []
coefficients = np.zeros((10, 243))

# Define the number of folds for k-fold cross-validation
k = 10

# Initialize a k-fold cross-validation splitter
kf = KFold(n_splits=k, shuffle=True, random_state=42)
model = LinearSVC(C=0.1, max_iter=1000, random_state=42, dual=False)

# Iterate over the folds and train/test the model
for train_index, test_index in kf.split(processed_features):
    X_train, X_test = processed_features[train_index], processed_features[test_index]
    y_train, y_test = encoded_target[train_index], encoded_target[test_index]

    # Fit the model to the training data for this fold
    model.fit(X_train, y_train)

    # Evaluate the model on the test data for this fold and compute the accuracy score
    y_pred = model.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    scores.append(score)

coefficients = model.coef_

top_indices = np.argsort(coefficients.mean(axis=0))[::-1][:5]

print("Top 5 most important features:")
for i in top_indices:
    print('Feature name:', features_list[i], 'Coefficient:', coefficients.mean(axis=0)[i])

mean_score = np.mean(scores)
std_score = np.std(scores)

print('Mean Accuracy score:', mean_score * 100, '%')
print('Standard deviation of Accuracy score:', std_score * 100, '%')

Top 5 most important features:
Feature name: spectral_energybandratio_middle_high_var Coefficient: 0.057474102911797983
Feature name: spectral_flux_mean Coefficient: 0.05740200509376051
Feature name: spectral_mfcc_01_mean Coefficient: 0.053137279287320295
Feature name: spectral_mfcc_02_var Coefficient: 0.04598652450763889
Feature name: spectral_barkbands_14_mean Coefficient: 0.04405639559329348
Mean Accuracy score: 82.75 %
Standard deviation of Accuracy score: 4.286607049870561 %
