# Setup Cell

In [1]:
%run 'Setup.py'

The task has 8 genre classes: Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop, and Rock.
Objective 1: construct a classifier that predicts a song's genre from its features.
Objective 2: estimate the classifier's generalisation error under the 0–1 loss.
Features are real-valued, correspond to summary statistics (mean, sd, skewness, kurtosis, median, min, max) of 
time series of various music features, such as the chromagram or the Mel-frequency cepstrum.
Feature description: 
chroma_cens: Chroma Energy Normalized (CENS, 12 chroma) - 84 features
chroma_cqt: Constant-Q chromagram (12 chroma) - 84 features
chroma_stft: Chromagram (12 chroma) - 84 features
mfcc: Mel-frequency cepstrum (20 coefficients) - 140 features
rmse: Root-mean-square - 7 features
spectral_bandwidth: Spectral bandwidth - 7 features
spectral_centroid: Spectral centroid - 7 features
spectral_contrast: Spectral contrast (7 frequency bands) - 49 features
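spectral_rolloff: Roll-off frequency - 7 features
tonnetz: Tonal centroid features (6 dimensions) - 42 features
zcr: Zero-crossing rate - 7 features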

# Data Splitting, Scaling

In [2]:
# Encode the class labels as consecutive integer ids.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_np.ravel())

# Two-stage hold-out: 60% of the labelled data is kept for training, and the
# remaining 40% is halved into a validation split (20%) and an internal test
# split (20%). Fixed seeds keep the partition identical across runs.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    x_train, y_train_encoded, test_size=0.4, random_state=42
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.5, random_state=42
)

# Standardise using statistics learned from the training split only, then
# apply that same fitted transform to every other split and to x_test_np.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled, X_real_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test, x_test_np)
)




# Function Split Features

In [3]:
def split_features_by_type(X, feature_structure):
    """
    Cut a 2-D feature matrix into consecutive column blocks, one per feature type.

    The blocks are taken left to right in the iteration order of
    ``feature_structure``: the first entry gets the first ``count`` columns,
    the next entry the following ones, and so on.

    :param X: numpy array, the dataset to be split (features only)
    :param feature_structure: dict, keys are feature names and values are the number of features of that type
    :return: dict mapping each feature name to its column slice of ``X``
    """
    # boundaries[i] is the first column of block i; boundaries[i + 1] is one
    # past its last column.
    boundaries = [0]
    for width in feature_structure.values():
        boundaries.append(boundaries[-1] + width)

    return {
        name: X[:, first:last]
        for name, first, last in zip(feature_structure, boundaries, boundaries[1:])
    }

# Column layout of the feature matrix: each entry maps a feature family to the
# number of consecutive columns it occupies. The order of the entries is
# assumed to match the column order of the data, because the splitter slices
# the columns strictly left to right.
feature_structure = dict(
    chroma_cens=84,
    chroma_cqt=84,
    chroma_stft=84,
    mfcc=140,
    rmse=7,
    spectral_bandwidth=7,
    spectral_centroid=7,
    spectral_contrast=49,
    spectral_rolloff=7,
    tonnetz=42,
    zcr=7,
)

# Slice the scaled training matrix into one array per feature family, so that
# e.g. feature_subsets['mfcc'] holds only the MFCC columns of the training data.
feature_subsets = split_features_by_type(X_train_scaled, feature_structure)


# Random Forest Feature Subset Fit

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# One default-parameter random forest per feature family, fitted on the
# training split only (fit() returns the estimator itself).
best_rf_models = {
    feature_name: RandomForestClassifier(random_state=42).fit(X_train_subset, Y_train)
    for feature_name, X_train_subset in feature_subsets.items()
}

# Score every forest on the validation split; these accuracies become the
# voting weights of the ensemble.
val_feature_subsets = split_features_by_type(X_val_scaled, feature_structure)
val_accuracies = []
for feature_name, model in best_rf_models.items():
    val_accuracy = model.score(val_feature_subsets[feature_name], Y_val)
    val_accuracies.append(val_accuracy)
    print(f"Validation accuracy for {feature_name} features: {val_accuracy}")

# Rescale the accuracies so the weights sum to one.
weights = np.array(val_accuracies) / np.sum(val_accuracies)

# Weighted soft vote on the internal test split: accumulate each forest's
# class-probability matrix scaled by its weight. `weights` follows the
# iteration order of best_rf_models, so zip pairs them correctly.
test_feature_subsets = split_features_by_type(X_test_scaled, feature_structure)
n_classes = len(np.unique(Y_train))
weighted_test_predictions = np.zeros((X_test_scaled.shape[0], n_classes), dtype=float)
for weight, (feature_name, model) in zip(weights, best_rf_models.items()):
    weighted_test_predictions += weight * model.predict_proba(test_feature_subsets[feature_name])

# The predicted class is the column with the largest accumulated score.
combined_test_predictions = np.argmax(weighted_test_predictions, axis=1)

test_accuracy = np.mean(combined_test_predictions == Y_test)
print(f"Test accuracy with combined Random Forest models using weighted voting: {test_accuracy}")

Validation accuracy for chroma_cens features: 0.29
Validation accuracy for chroma_cqt features: 0.33
Validation accuracy for chroma_stft features: 0.3883333333333333
Validation accuracy for mfcc features: 0.5133333333333333
Validation accuracy for rmse features: 0.26666666666666666
Validation accuracy for spectral_bandwidth features: 0.33166666666666667
Validation accuracy for spectral_centroid features: 0.3525
Validation accuracy for spectral_contrast features: 0.4608333333333333
Validation accuracy for spectral_rolloff features: 0.3425
Validation accuracy for tonnetz features: 0.2966666666666667
Validation accuracy for zcr features: 0.3325
Test accuracy with combined Random Forest models using weighted voting: 0.5408333333333334


# KNN Feature Subset Fit (from KNN.ipynb)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from scipy.stats import mode

# NOTE: knn_models is never populated in this cell; the tuned models are
# stored in best_knn_models (e.g. best_knn_models['mfcc']).
knn_models = {}

# Tuned KNN classifier for each feature family.
best_knn_models = {}

for feature_name, X_subset in feature_subsets.items():
    # Pick k in 1..15 by mean 5-fold cross-validation accuracy on the training
    # split. The strict '>' keeps the smallest k among equal scores.
    best_k, best_score = 1, 0
    for k in range(1, 16):
        mean_score = cross_val_score(
            KNeighborsClassifier(n_neighbors=k), X_subset, Y_train, cv=5
        ).mean()
        if mean_score > best_score:
            best_k, best_score = k, mean_score

    # Refit on the whole training split with the selected k.
    best_knn_models[feature_name] = KNeighborsClassifier(n_neighbors=best_k).fit(X_subset, Y_train)
    print(f"Best K for {feature_name} features: {best_k} with cross-validation score: {best_score}")

# Validation data sliced into the same feature families as the training data.
val_feature_subsets = split_features_by_type(X_val_scaled, feature_structure)

# Hard label predictions of every tuned model on its own feature family.
val_predictions = [
    model.predict(val_feature_subsets[feature_name])
    for feature_name, model in best_knn_models.items()
]

# Majority vote across the models for each validation sample.
combined_val_predictions = mode(val_predictions, axis=0).mode

# ravel() flattens the mode result before comparing it with the labels.
val_accuracy = np.mean(combined_val_predictions.ravel() == Y_val)
print(f"Validation accuracy with combined KNN models: {val_accuracy}")



Best K for chroma_cens features: 9 with cross-validation score: 0.2663888888888889
Best K for chroma_cqt features: 13 with cross-validation score: 0.27361111111111114
Best K for chroma_stft features: 13 with cross-validation score: 0.30194444444444446
Best K for mfcc features: 15 with cross-validation score: 0.46611111111111103
Best K for rmse features: 15 with cross-validation score: 0.23555555555555552
Best K for spectral_bandwidth features: 15 with cross-validation score: 0.29083333333333333
Best K for spectral_centroid features: 11 with cross-validation score: 0.32166666666666666
Best K for spectral_contrast features: 12 with cross-validation score: 0.40861111111111115
Best K for spectral_rolloff features: 12 with cross-validation score: 0.3061111111111111
Best K for tonnetz features: 6 with cross-validation score: 0.2725000000000001
Best K for zcr features: 14 with cross-validation score: 0.30583333333333335
Validation accuracy with combined KNN models: 0.4725


# Stacked Models

## RF and KNN Stacked

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np


def _rf_knn_meta_features(subsets):
    """Build meta-model inputs from the per-family RF and KNN models.

    For each feature family, the RF and KNN class-probability matrices are
    averaged element-wise; the per-family averages are then concatenated
    column-wise.

    :param subsets: dict of feature-family name -> data slice to predict on
    :return: 2-D array with one row per sample
    """
    per_family = []
    for feature_name in feature_subsets.keys():
        family_data = subsets[feature_name]
        rf_proba = best_rf_models[feature_name].predict_proba(family_data)
        knn_proba = best_knn_models[feature_name].predict_proba(family_data)
        # Stack along a new trailing axis and average over it.
        per_family.append(np.stack([rf_proba, knn_proba], axis=2).mean(axis=2))
    return np.hstack(per_family)


# The meta-model is trained on validation-split predictions, i.e. on data the
# base models were not fitted on.
X_meta_train = _rf_knn_meta_features(val_feature_subsets).reshape(len(Y_val), -1)

meta_model = LogisticRegression(random_state=42)
meta_model.fit(X_meta_train, Y_val)

# The internal test split is transformed the same way and used only for scoring.
X_meta_test = _rf_knn_meta_features(test_feature_subsets).reshape(len(Y_test), -1)

final_predictions = meta_model.predict(X_meta_test)

test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF and KNN models: {test_accuracy}")


Test accuracy with stacked RF and KNN models: 0.5716666666666667


## RF, KNN, and LR Stacked

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# One logistic regression per feature family, fitted on the training split.
best_lr_models = {}
for feature_name, X_train_subset in feature_subsets.items():
    lr_model = LogisticRegression(random_state=42, max_iter=1000)
    lr_model.fit(X_train_subset, Y_train)
    best_lr_models[feature_name] = lr_model


def _rf_knn_lr_meta_features(subsets):
    """Build meta-model inputs from the per-family RF, KNN and LR models.

    For each feature family, the three class-probability matrices are averaged
    element-wise; the per-family averages are then concatenated column-wise.
    hstack of 2-D blocks already yields a 2-D array, so no reshape is needed.

    :param subsets: dict of feature-family name -> data slice to predict on
    :return: 2-D array with one row per sample
    """
    base_model_sets = (best_rf_models, best_knn_models, best_lr_models)
    per_family = []
    for feature_name in feature_subsets.keys():
        family_data = subsets[feature_name]
        probabilities = [
            models[feature_name].predict_proba(family_data) for models in base_model_sets
        ]
        per_family.append(np.stack(probabilities, axis=2).mean(axis=2))
    return np.hstack(per_family)


# Meta-features from the validation split (unseen by the base models).
X_meta_train = _rf_knn_lr_meta_features(val_feature_subsets)

# Use a fresh estimator instead of refitting one created in another cell: the
# cell then runs on its own and does not overwrite an earlier fitted model in
# place. Without warm_start a refit starts from scratch, so results are the same.
meta_model = LogisticRegression(random_state=42)
meta_model.fit(X_meta_train, Y_val)

# Same transformation for the internal test split, used only for scoring.
X_meta_test = _rf_knn_lr_meta_features(test_feature_subsets)

final_predictions = meta_model.predict(X_meta_test)

test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF, KNN, and LR models: {test_accuracy}")


Test accuracy with stacked RF, KNN, and LR models: 0.5983333333333334



## Stacked Bootstrap

In [11]:
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Number of bootstrap replicates trained per (model type, feature family).
n_bootstrap_samples = 10

# A single seeded generator shared by every resample() call. Each call advances
# it, so replicates differ from one another, while re-running the cell
# reproduces the same draws and therefore the same reported accuracy.
bootstrap_rng = np.random.RandomState(42)


def _fit_bootstrap_models(make_model):
    """Fit n_bootstrap_samples models per feature family on bootstrap resamples.

    :param make_model: zero-argument callable returning a new, unfitted estimator
    :return: dict of feature-family name -> list of fitted estimators
    """
    fitted = {feature_name: [] for feature_name in feature_subsets.keys()}
    for feature_name, X_train_subset in feature_subsets.items():
        for _ in range(n_bootstrap_samples):
            # Rows and labels are resampled together, with replacement.
            X_boot, Y_boot = resample(X_train_subset, Y_train, random_state=bootstrap_rng)
            fitted[feature_name].append(make_model().fit(X_boot, Y_boot))
    return fitted


# Same training order as before: all RF replicates, then KNN, then LR.
bootstrap_models_rf = _fit_bootstrap_models(lambda: RandomForestClassifier(random_state=42))
bootstrap_models_knn = _fit_bootstrap_models(KNeighborsClassifier)
bootstrap_models_lr = _fit_bootstrap_models(
    lambda: LogisticRegression(random_state=42, max_iter=1000)
)


def _bootstrap_meta_features(subsets):
    """Build meta-model inputs from the bootstrap ensembles.

    Per feature family: average the class probabilities over the replicates of
    each model type, then average those three results; the per-family averages
    are concatenated column-wise into a 2-D array.

    :param subsets: dict of feature-family name -> data slice to predict on
    :return: 2-D array with one row per sample
    """
    model_families = (bootstrap_models_rf, bootstrap_models_knn, bootstrap_models_lr)
    return np.hstack([
        np.mean([
            np.mean(
                [model.predict_proba(subsets[feature_name]) for model in family[feature_name]],
                axis=0,
            )
            for family in model_families
        ], axis=0)
        for feature_name in feature_subsets.keys()
    ])


# Meta-features from the validation split (unseen by the base models).
X_meta_train = _bootstrap_meta_features(val_feature_subsets)

# Fresh meta-model so this cell does not depend on, or overwrite in place, an
# estimator created by an earlier cell.
meta_model = LogisticRegression(random_state=42)
meta_model.fit(X_meta_train, Y_val)

# Same transformation for the internal test split, used only for scoring.
X_meta_test = _bootstrap_meta_features(test_feature_subsets)

final_predictions = meta_model.predict(X_meta_test)

test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with bootstrapped and stacked RF, KNN, and LR models: {test_accuracy}")


Test accuracy with bootstrapped and stacked RF, KNN, and LR models: 0.5933333333333334


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Candidate model types, in the order they are tried for every feature family.
# Each factory returns a new, unfitted estimator.
candidate_factories = {
    'KNN': KNeighborsClassifier,
    'RF': lambda: RandomForestClassifier(random_state=42),
    'LR': lambda: LogisticRegression(random_state=42, max_iter=1000),
}
model_types = list(candidate_factories)

# feature-family name -> (model type label, fitted estimator)
best_models = {}

# Steps 1 and 2: fit every candidate on each feature family and keep the one
# with the highest validation accuracy.
for feature_name, X_train_subset in feature_subsets.items():
    X_val_subset = val_feature_subsets[feature_name]

    best_model = None
    # Start below any attainable accuracy so a candidate is always selected,
    # even if every one of them scores 0 (previously this left best_model as
    # None and the unpacking in step 3 failed).
    best_accuracy = -1.0

    for model_type in model_types:
        candidate = candidate_factories[model_type]().fit(X_train_subset, Y_train)
        candidate_accuracy = candidate.score(X_val_subset, Y_val)
        # Strict '>' keeps the earlier candidate when accuracies tie.
        if candidate_accuracy > best_accuracy:
            best_accuracy = candidate_accuracy
            best_model = (model_type, candidate)

    best_models[feature_name] = best_model

# Step 3: unweighted soft vote of the selected models on the internal test
# split, summing their class-probability matrices.
ensemble_predictions = np.zeros((len(Y_test), len(np.unique(Y_train))))

for feature_name, (model_type, model) in best_models.items():
    ensemble_predictions += model.predict_proba(test_feature_subsets[feature_name])

# The predicted class is the column with the largest summed probability.
final_predictions = np.argmax(ensemble_predictions, axis=1)

test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with the best model from each feature subset ensemble: {test_accuracy}")


Test accuracy with the best model from each feature subset ensemble: 0.5333333333333333
