# Setup Cell

In [32]:
%run 'Setup.py'

8 different classes: Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock.
objective 1: construct a classifier which, based on the features of a song, predicts its genre
objective 2: estimate its generalisation error under the 0–1 loss.
Features are real-valued, correspond to summary statistics (mean, sd, skewness, kurtosis, median, min, max) of 
time series of various music features, such as the chromagram or the Mel-frequency cepstrum.
Feature description: 

Feature description: 
chroma_cens: Chroma Energy Normalized (CENS, 12 chroma) - 84 features
chroma_cqt: Constant-Q chromagram (12 chroma) - 84 features
chroma_stft: Chromagram (12 chroma) - 84 features
mfcc: Mel-frequency cepstrum (20 coefficients) - 140 features
rmse: Root-mean-square - 7 features
spectral_bandwidth: Spectral bandwidth - 7 features
spectral_centroid: Spectral centroid - 7 features
spectral_contrast: Spectral contrast (7 frequency bands) - 49 features
spectral_rolloff: Roll-off freque

# Data Splitting, Scaling

In [33]:
# Prepare data
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_np.ravel()) #

# Split training data into training and temporary validation sets
X_train, X_temp, Y_train, Y_temp = train_test_split(x_train, y_train_encoded, test_size=0.4, random_state=42)

# Split the temporary validation set into validation and test sets
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
X_real_test_scaled = scaler.transform(x_test_np)




# Function Split Features

In [34]:
def split_features_by_type(X, feature_structure):
    """
    Splits the dataset into subsets based on the feature structure provided.

    :param X: numpy array, the dataset to be split (features only)
    :param feature_structure: dict, keys are feature names and values are the number of features of that type
    :return: dict of feature subsets
    """
    feature_subsets = {}
    start_idx = 0

    for feature_name, feature_count in feature_structure.items():
        end_idx = start_idx + feature_count
        feature_subsets[feature_name] = X[:, start_idx:end_idx]
        start_idx = end_idx

    return feature_subsets

# Define the structure of your features based on the information you've provided
feature_structure = {
    'chroma_cens': 84,
    'chroma_cqt': 84,
    'chroma_stft': 84,
    'mfcc': 140,
    'rmse': 7,
    'spectral_bandwidth': 7,
    'spectral_centroid': 7,
    'spectral_contrast': 49,
    'spectral_rolloff': 7,
    'tonnetz': 42,
    'zcr': 7
}

# Example usage with a hypothetical dataset X_train_scaled
# This would be your preprocessed and scaled training data as a NumPy array
feature_subsets = split_features_by_type(X_train_scaled, feature_structure)

# Now feature_subsets is a dictionary where, for example,
# feature_subsets['mfcc'] contains only the MFCC features of the dataset.


# Subset Fits for Ensemble Learners

## Random Forest Feature Subset Fit

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming the preprocessing and data splitting is already done as per your provided code

# Initialize a dictionary to store your best Random Forest models for each feature subset
best_rf_models = {}

# Train a Random Forest model for each feature subset
for feature_name, X_train_subset in feature_subsets.items():
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_subset, Y_train)
    best_rf_models[feature_name] = rf_model

### RF Cross Validation

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Initialize a dictionary to store your best Random Forest models for each feature subset
best_rf_models = {}

# Define a parameter grid to search for best parameters for Random Forest
rf_param_grid = {
    'n_estimators': [100, 200],  # Reduced from 3 to 2 options
    'max_depth': [None, 20],  # Reduced from 4 to 2 options
    'min_samples_split': [2, 10],  # Reduced from 3 to 2 options
    'min_samples_leaf': [1, 4]  # Reduced from 3 to 2 options
}


for feature_name, X_train_subset in feature_subsets.items():
    print(f"Starting GridSearchCV for Random Forest on {feature_name}...")

    rf_model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf_model, param_grid=rf_param_grid, cv=3, scoring='accuracy', verbose=1)
    grid_search.fit(X_train_subset, Y_train)

    best_rf_models[feature_name] = grid_search.best_estimator_
    print(f"Best parameters for {feature_name}: {grid_search.best_params_}")


Starting GridSearchCV for Random Forest on chroma_cens...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters for chroma_cens: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Starting GridSearchCV for Random Forest on chroma_cqt...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters for chroma_cqt: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Starting GridSearchCV for Random Forest on chroma_stft...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters for chroma_stft: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Starting GridSearchCV for Random Forest on mfcc...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters for mfcc: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Starting GridSearchCV for Random Forest on rmse...
Fitti

In [4]:

# Evaluate each model on the validation set and calculate weights
weights = []
val_feature_subsets = split_features_by_type(X_val_scaled, feature_structure)

for feature_name, model in best_rf_models.items():
    X_val_subset = val_feature_subsets[feature_name]
    val_accuracy = model.score(X_val_subset, Y_val)
    weights.append(val_accuracy)
    print(f"Validation accuracy for {feature_name} features: {val_accuracy}")

# Normalize weights
weights = np.array(weights) / np.sum(weights)

# Split the test set using the same feature structure and prepare for weighted predictions
test_feature_subsets = split_features_by_type(X_test_scaled, feature_structure)
weighted_test_predictions = np.zeros((X_test_scaled.shape[0], len(np.unique(Y_train))), dtype=float)

for i, (feature_name, model) in enumerate(best_rf_models.items()):
    X_test_subset = test_feature_subsets[feature_name]
    predictions = model.predict_proba(X_test_subset)
    weighted_predictions = predictions * weights[i]
    weighted_test_predictions += weighted_predictions

# Combine weighted predictions
combined_test_predictions = np.argmax(weighted_test_predictions, axis=1)

# Calculate and print test accuracy
test_accuracy = np.mean(combined_test_predictions == Y_test)
print(f"Test accuracy with combined Random Forest models using weighted voting: {test_accuracy}")

Validation accuracy for chroma_cens features: 0.29
Validation accuracy for chroma_cqt features: 0.33
Validation accuracy for chroma_stft features: 0.3883333333333333
Validation accuracy for mfcc features: 0.5133333333333333
Validation accuracy for rmse features: 0.26666666666666666
Validation accuracy for spectral_bandwidth features: 0.33166666666666667
Validation accuracy for spectral_centroid features: 0.3525
Validation accuracy for spectral_contrast features: 0.4608333333333333
Validation accuracy for spectral_rolloff features: 0.3425
Validation accuracy for tonnetz features: 0.2966666666666667
Validation accuracy for zcr features: 0.3325
Test accuracy with combined Random Forest models using weighted voting: 0.5408333333333334


## KNN Feature Subset Fit (from KNN.ipynb)

In [36]:
from sklearn.neighbors import KNeighborsClassifier

# Dictionary to store the trained KNN models for each feature subset
knn_models = {}

# Train a KNN model for each feature subset
from sklearn.model_selection import cross_val_score

# Dictionary to store the best KNN models for each feature subset
best_knn_models = {}

# Train a KNN model for each feature subset and find the best k using cross-validation
for feature_name, X_subset in feature_subsets.items():
    best_score = 0
    best_k = 1
    # Try different values of k
    for k in range(1, 16):  # Let's try k from 1 to 15 as an example
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X_subset, Y_train, cv=5)
        mean_score = scores.mean()
        if mean_score > best_score:
            best_score = mean_score
            best_k = k

    # Train a new KNN model on the full training set with the best k
    best_knn = KNeighborsClassifier(n_neighbors=best_k)
    best_knn.fit(X_subset, Y_train)
    best_knn_models[feature_name] = best_knn
    print(f"Best K for {feature_name} features: {best_k} with cross-validation score: {best_score}")

# Now best_knn_models dictionary contains the best KNN model for each feature subset


# Now knn_models dictionary contains a trained KNN model for each feature subset
# For example, knn_models['mfcc'] is the KNN model trained on the MFCC features

# To make predictions, use the corresponding model for each feature subset
# For instance, for MFCC features:
# predictions_mfcc = knn_models['mfcc'].predict(feature_subsets['mfcc'])

from scipy.stats import mode

# Assume we have a validation set X_val_scaled
# Split it using the same function we defined earlier
val_feature_subsets = split_features_by_type(X_val_scaled, feature_structure)

# Gather predictions from all models on the validation set
val_predictions = []
for feature_name, model in best_knn_models.items():
    # Ensure that we predict on the correct feature subset
    X_val_subset = val_feature_subsets[feature_name]
    predictions = model.predict(X_val_subset)
    val_predictions.append(predictions)

# Combine predictions using majority voting
combined_val_predictions = mode(val_predictions, axis=0).mode

# Calculate accuracy or any other metric based on the combined predictions
val_accuracy = np.mean(combined_val_predictions.ravel() == Y_val)
print(f"Validation accuracy with combined KNN models: {val_accuracy}")



Best K for chroma_cens features: 9 with cross-validation score: 0.2663888888888889
Best K for chroma_cqt features: 13 with cross-validation score: 0.27361111111111114
Best K for chroma_stft features: 13 with cross-validation score: 0.30194444444444446
Best K for mfcc features: 15 with cross-validation score: 0.46611111111111103
Best K for rmse features: 15 with cross-validation score: 0.23555555555555552
Best K for spectral_bandwidth features: 15 with cross-validation score: 0.29083333333333333
Best K for spectral_centroid features: 11 with cross-validation score: 0.32166666666666666
Best K for spectral_contrast features: 12 with cross-validation score: 0.40861111111111115
Best K for spectral_rolloff features: 12 with cross-validation score: 0.3061111111111111
Best K for tonnetz features: 6 with cross-validation score: 0.2725000000000001
Best K for zcr features: 14 with cross-validation score: 0.30583333333333335
Validation accuracy with combined KNN models: 0.4725


## Logistic Regression Feature Subset Fit

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize a dictionary to store the logistic regression models for each feature subset
best_lr_models = {}

# Train a logistic regression model for each feature subset
for feature_name, X_train_subset in feature_subsets.items():
    lr_model = LogisticRegression(random_state=42, max_iter=1000)
    lr_model.fit(X_train_subset, Y_train)
    best_lr_models[feature_name] = lr_model

## SVM Feature Subset Fit

In [None]:
from sklearn.svm import SVC

# Initialize a dictionary to store the SVM models for each feature subset
best_svm_models = {}

# Train a polynomial kernel SVM model for each feature subset
for feature_name, X_train_subset in feature_subsets.items():
    # Specify the polynomial kernel using the `kernel` parameter
    svm_model = SVC(probability=True, kernel='poly', degree=3, random_state=42)
    svm_model.fit(X_train_subset, Y_train)
    best_svm_models[feature_name] = svm_model


In [45]:
print(best_svm_models)

{'chroma_cens': SVC(probability=True, random_state=42), 'chroma_cqt': SVC(probability=True, random_state=42), 'chroma_stft': SVC(probability=True, random_state=42), 'mfcc': SVC(probability=True, random_state=42), 'rmse': SVC(probability=True, random_state=42), 'spectral_bandwidth': SVC(probability=True, random_state=42), 'spectral_centroid': SVC(probability=True, random_state=42), 'spectral_contrast': SVC(probability=True, random_state=42), 'spectral_rolloff': SVC(probability=True, random_state=42), 'tonnetz': SVC(probability=True, random_state=42), 'zcr': SVC(probability=True, random_state=42)}


In [44]:
# Calculate the validation accuracy for each feature subset and use it as weight for voting
svm_weights = []
for feature_name, model in best_svm_models.items():
    X_val_subset = val_feature_subsets[feature_name]
    val_accuracy = model.score(X_val_subset, Y_val)
    svm_weights.append(val_accuracy)
    print(f"Validation accuracy for {feature_name} features: {val_accuracy:.2f}")

# Normalize weights so they sum up to 1
svm_weights = np.array(svm_weights) / np.sum(svm_weights)

# Predict on the test set with each SVM model and weight the predictions
weighted_test_predictions_svm = np.zeros((X_test_scaled.shape[0], len(np.unique(Y_train))), dtype=float)  # Adjust the shape according to your number of classes

for i, (feature_name, model) in enumerate(best_svm_models.items()):
    X_test_subset = test_feature_subsets[feature_name]
    predictions = model.predict_proba(X_test_subset)
    weighted_predictions = predictions * svm_weights[i]
    weighted_test_predictions_svm += weighted_predictions

# Combine weighted predictions by taking the argmax to get final predictions
combined_test_predictions_svm = np.argmax(weighted_test_predictions_svm, axis=1)

# Calculate accuracy based on the combined weighted predictions
test_accuracy_svm = np.mean(combined_test_predictions_svm == Y_test)
print(f"Test accuracy with combined SVM models using weighted voting: {test_accuracy_svm:.2f}")


Validation accuracy for chroma_cens features: 0.33
Validation accuracy for chroma_cqt features: 0.35
Validation accuracy for chroma_stft features: 0.38
Validation accuracy for mfcc features: 0.56
Validation accuracy for rmse features: 0.27
Validation accuracy for spectral_bandwidth features: 0.33
Validation accuracy for spectral_centroid features: 0.38
Validation accuracy for spectral_contrast features: 0.47
Validation accuracy for spectral_rolloff features: 0.37
Validation accuracy for tonnetz features: 0.32
Validation accuracy for zcr features: 0.34
Test accuracy with combined SVM models using weighted voting: 0.57


## XGBoost Feature Subset Fit

In [55]:
import xgboost as xgb

# Initialize a dictionary to store the XGBoost models for each feature subset
best_xgb_models = {}

# Train an XGBoost model for each feature subset
for feature_name, X_train_subset in feature_subsets.items():
    xgb_model = xgb.XGBClassifier(use_label_encoder=False, objective='multi:softprob', num_class=8, eval_metric='mlogloss', random_state=42)
    xgb_model.fit(X_train_subset, Y_train)
    best_xgb_models[feature_name] = xgb_model

### Cross Validation Optimal Model for each Subset

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Assuming X_train, Y_train are defined and feature_subsets is a dictionary with your subsets

best_xgb_models = {}

# Define a parameter grid to search for best parameters for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0]
}

for feature_name, X_train_subset in feature_subsets.items():
    print(f"Starting GridSearchCV for XGBoost on {feature_name}...")

    xgb_model = xgb.XGBClassifier(use_label_encoder=False, objective='multi:softprob', num_class=8, eval_metric='mlogloss', random_state=42)
    grid_search = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=3, scoring='accuracy', verbose=1)
    grid_search.fit(X_train_subset, Y_train)

    best_xgb_models[feature_name] = grid_search.best_estimator_
    print(f"Best parameters for {feature_name}: {grid_search.best_params_}")


Starting GridSearchCV for XGBoost on chroma_cens...
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best parameters for chroma_cens: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Starting GridSearchCV for XGBoost on chroma_cqt...
Fitting 3 folds for each of 16 candidates, totalling 48 fits


## NN

In [90]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import numpy as np

def train_neural_network(X_train, Y_train_encoded):
    model = Sequential(name='Feature_Subset_NN')
    model.add(Dense(128, input_shape=(X_train.shape[1],), activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(Y_train_encoded.shape[1], activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Assuming a modest number of epochs for demonstration; adjust as necessary
    model.fit(X_train, Y_train_encoded, epochs=20, batch_size=32, verbose=0)

    return model

In [91]:
best_nn_models = {}

for feature_name, X_train_subset in feature_subsets.items():
    Y_train_encoded = to_categorical(Y_train)
    nn_model = train_neural_network(X_train_subset, Y_train_encoded)
    best_nn_models[feature_name] = nn_model


# Stacked Models

## Meta Models

### Default Meta Model

In [121]:
meta_model = LogisticRegression(random_state=42)

### LR Meta Model

In [117]:
# Train the meta-model
meta_model_lr = LogisticRegression(random_state=42)

### XGBoost Meta Model

In [119]:
import xgboost as xgb

# Initialize the XGBoost classifier as the meta-model
meta_model_xgb = xgb.XGBClassifier(
    use_label_encoder=False,
    n_estimators=500,  # Increased from a default value, which might be around 100
    learning_rate=0.05,  # Adjust learning rate to control the pace of boosting and prevent overfitting
    max_depth=10,  # Consider setting max_depth to regulate tree complexity
    objective='multi:softprob',
    num_class=8,
    eval_metric='mlogloss',
    random_state=42
)

### XGBoost Meta Model

In [114]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier as the meta-model
meta_model_rf = RandomForestClassifier(
    n_estimators=500,  # Number of trees in the forest
    max_depth=10,  # Maximum depth of the trees to limit complexity and prevent overfitting
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    criterion='gini',  # Function to measure the quality of a split. 'entropy' can also be used for information gain
    random_state=42,  # Ensures a deterministic outcome for reproducible results
    n_jobs=-1  # Use all available cores for faster training
)

# Note: Adjust the above hyperparameters based on your dataset characteristics and computational resources


### SVM Meta Model

In [79]:
# Initialize the SVM classifier as the meta-model
# 'probability=True' enables the predict_proba method, which is useful for getting predicted probabilities
meta_model = SVC(probability=True, kernel='rbf', random_state=42)




### NN Meta Model

In [88]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Convert target variable to categorical (one-hot encoding) for neural network
Y_val_encoded = to_categorical(Y_val)

# Define the neural network architecture with wider and more layers
meta_model_nn = Sequential(name='Enhanced_MetaModel_NN')
meta_model_nn.add(Dense(128, input_shape=(X_meta_train.shape[1],), activation='relu', name='Dense_Layer_1'))
meta_model_nn.add(Dropout(0.2, name='Dropout_1'))  # Add dropout for regularization
meta_model_nn.add(Dense(64, activation='relu', name='Dense_Layer_2'))
meta_model_nn.add(Dropout(0.2, name='Dropout_2'))  # Add dropout for regularization
meta_model_nn.add(Dense(64, activation='relu', name='Dense_Layer_3'))  # Additional layer
meta_model_nn.add(Dense(Y_val_encoded.shape[1], activation='softmax', name='Output_Layer'))

# Compile the meta-model
meta_model_nn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


## RF and KNN Stacked

In [130]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Prepare the data for the meta-model
# Generate predictions from RF and KNN models on the validation set
X_meta_train = np.hstack([
    np.concatenate([
        model.predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis]  # Add a new axis for stacking
        for model in (best_rf_models[feature_name], best_knn_models[feature_name])
    ], axis=2).mean(axis=2)  # Average predictions from RF and KNN models for each feature subset
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_val), -1)  # Reshape to have a standard 2D array


meta_model.fit(X_meta_train, Y_val)

# Prepare test data in a similar manner
X_meta_test = np.hstack([
    np.concatenate([
        model.predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis]
        for model in (best_rf_models[feature_name], best_knn_models[feature_name])
    ], axis=2).mean(axis=2)
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_test), -1)

# Make final predictions with the meta-model
final_predictions = meta_model.predict(X_meta_test)

# Calculate and print the test accuracy
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF and KNN models: {test_accuracy}")


Test accuracy with stacked RF and KNN models: 0.5758333333333333


## RF, KNN, and LR Stacked

In [125]:

# Update the preparation of the data for the meta-model to include logistic regression predictions
X_meta_train = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis]
    ], axis=2).mean(axis=2)  # Average predictions from RF, KNN, and LR models for each feature subset
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_val), -1)  # Reshape to have a standard 2D array

# Train the meta-model with the updated training data
meta_model.fit(X_meta_train, Y_val)

# Update the preparation of the test data in a similar manner to include logistic regression predictions
X_meta_test = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis]
    ], axis=2).mean(axis=2)
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_test), -1)

# Make final predictions with the updated meta-model
final_predictions = meta_model.predict(X_meta_test)

# Calculate and print the test accuracy with the inclusion of logistic regression models
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF, KNN, and LR models: {test_accuracy}")


Test accuracy with stacked RF, KNN, and LR models: 0.5941666666666666


### Grid search

In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['poly'],  # Kernel type
    'degree': [2, 3, 4],  # Degree of the polynomial kernel function
    'gamma': ['scale', 'auto'],  # Kernel coefficient
}

# Initialize a dictionary to store the best SVM models for each feature subset
best_svm_models = {}

# Perform grid search to find the best parameters for the SVM model for each feature subset
for feature_name, X_train_subset in feature_subsets.items():
    # Initialize the SVM model
    svm = SVC(probability=True, random_state=42)

    # Set up the grid search with cross-validation
    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=1)

    # Fit the grid search to the data
    grid_search.fit(X_train_subset, Y_train)

    # Retrieve the best model
    best_svm_models[feature_name] = grid_search.best_estimator_

    # Optionally, print the best parameters for each feature subset
    print(f"Best parameters for {feature_name}: {grid_search.best_params_}")


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for chroma_cens: {'C': 10, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for chroma_cqt: {'C': 10, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for chroma_stft: {'C': 10, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'}
Fitting 5 folds for each of 24 candidates, totalling 120 fits



KeyboardInterrupt



## RF, KNN, LR, and SVM models

In [124]:

# Update the preparation of the data for the meta-model to include SVM predictions
X_meta_train = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_svm_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis]  # Add SVM predictions
    ], axis=2).mean(axis=2)  # Average predictions from RF, KNN, LR, and now SVM models for each feature subset
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_val), -1)  # Reshape to have a standard 2D array

# Re-train the meta-model with the updated training data including SVM predictions
meta_model.fit(X_meta_train, Y_val)

# Prepare the test data in a similar manner to include SVM predictions
X_meta_test = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_svm_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis]  # Add SVM predictions
    ], axis=2).mean(axis=2)
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_test), -1)

# Make final predictions with the updated meta-model including SVM
final_predictions = meta_model.predict(X_meta_test)

# Calculate and print the test accuracy with the inclusion of SVM models
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF, KNN, LR, and SVM models: {test_accuracy}")


KeyboardInterrupt: 

## RF, KNN, LR, SVM, and XGBoost models

In [123]:
X_meta_train = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_svm_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],  # Existing SVM predictions
        best_xgb_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis]  # Add XGBoost predictions
    ], axis=2).mean(axis=2)  # Average predictions from all models for each feature subset
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_val), -1)  # Reshape to have a standard 2D array

# Re-train the meta-model with the updated training data including XGBoost predictions
meta_model.fit(X_meta_train, Y_val)

# Prepare the test data in a similar manner to include XGBoost predictions
X_meta_test = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_svm_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],  # Existing SVM predictions
        best_xgb_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis]  # Add XGBoost predictions
    ], axis=2).mean(axis=2)
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_test), -1)

# Make final predictions with the updated meta-model including XGBoost
final_predictions = meta_model.predict(X_meta_test)

# Calculate and print the test accuracy with the inclusion of XGBoost models
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF, KNN, LR, SVM, and XGBoost models: {test_accuracy}")


Test accuracy with stacked RF, KNN, LR, SVM, and XGBoost models: 0.6116666666666667


##  Previous with NN

In [118]:
X_meta_train = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_svm_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        best_xgb_models[feature_name].predict_proba(val_feature_subsets[feature_name])[:, :, np.newaxis],
        # Generate predictions from the NN model for the validation set and add them
        best_nn_models[feature_name].predict(val_feature_subsets[feature_name])[:, :, np.newaxis]
    ], axis=2).mean(axis=2)
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_val), -1)

meta_model.fit(X_meta_train, Y_val)

# Modification for X_meta_test to include neural network predictions
X_meta_test = np.hstack([
    np.concatenate([
        best_rf_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_knn_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_lr_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_svm_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        best_xgb_models[feature_name].predict_proba(test_feature_subsets[feature_name])[:, :, np.newaxis],
        # Add neural network predictions for the test set
        best_nn_models[feature_name].predict(test_feature_subsets[feature_name])[:, :, np.newaxis]
    ], axis=2).mean(axis=2)  # Average predictions from all models for each feature subset
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_test), -1)

# Make final predictions with the updated meta-model including all models
final_predictions = meta_model.predict(X_meta_test)

# Calculate and print the test accuracy with the inclusion of all models
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF, KNN, LR, SVM, XGBoost, and Neural Network models: {test_accuracy}")


Test accuracy with stacked RF, KNN, LR, SVM, XGBoost, and Neural Network models: 0.6166666666666667


# Meta Learners

## Log Reg Meta learner

In [None]:
meta_model_lr.fit(X_meta_train, Y_val)
# Make final predictions with the updated meta-model including all models
final_predictions = meta_model_lr.predict(X_meta_test)

# Calculate and print the test accuracy with the inclusion of all models
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with stacked RF, KNN, LR, SVM, XGBoost, and Neural Network models: {test_accuracy}")


## Random Forest Meta Learner

In [126]:
# Train the meta-model on the meta-training dataset
meta_model_rf.fit(X_meta_train, Y_val)

# Make predictions on the meta-test dataset
final_predictions_rf = meta_model_rf.predict(X_meta_test)

# Evaluate and print the test accuracy
from sklearn.metrics import accuracy_score
test_accuracy_rf = accuracy_score(Y_test, final_predictions_rf)
print(f"Test accuracy with Random Forest as meta-model: {test_accuracy_rf}")


Test accuracy with Random Forest as meta-model: 0.5658333333333333


## XGBoost Meta Learner

In [127]:
# Train the meta-model on the meta-training dataset
meta_model_xgb.fit(X_meta_train, Y_val)

# Make predictions on the meta-test dataset
final_predictions_xgb = meta_model_xgb.predict(X_meta_test)

# Evaluate and print the test accuracy
from sklearn.metrics import accuracy_score
test_accuracy_xgb = accuracy_score(Y_test, final_predictions_rf)
print(f"Test accuracy with Random Forest as meta-model: {test_accuracy_rf}")



Test accuracy with Random Forest as meta-model: 0.5658333333333333


## NN as Meta Learner

In [128]:
from tensorflow.keras.callbacks import EarlyStopping

# Initialize early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the meta-model with early stopping
history = meta_model_nn.fit(X_meta_train, Y_val_encoded, epochs=100, batch_size=32,
                            validation_split=0.2, verbose=1, callbacks=[early_stopping])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100


In [129]:
Y_test_encoded = to_categorical(Y_test)
test_loss, test_accuracy = meta_model_nn.evaluate(X_meta_test, Y_test_encoded, verbose=1)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


Test Loss: 2.302931785583496
Test Accuracy: 0.534166693687439



# Stacked Bootstrap

In [11]:
from sklearn.utils import resample

# Define the number of bootstrap samples and models
n_bootstrap_samples = 10

# Placeholder for trained bootstrap models
bootstrap_models_rf = {feature_name: [] for feature_name in feature_subsets.keys()}
bootstrap_models_knn = {feature_name: [] for feature_name in feature_subsets.keys()}
bootstrap_models_lr = {feature_name: [] for feature_name in feature_subsets.keys()}

# Train bootstrap models for RandomForest
for feature_name, X_train_subset in feature_subsets.items():
    for _ in range(n_bootstrap_samples):
        # Create a bootstrap sample
        X_boot, Y_boot = resample(X_train_subset, Y_train)
        # Initialize and train the model on the bootstrap sample
        model = RandomForestClassifier(random_state=42)
        model.fit(X_boot, Y_boot)
        bootstrap_models_rf[feature_name].append(model)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample


# Train bootstrap models for KNN
for feature_name, X_train_subset in feature_subsets.items():
    for _ in range(n_bootstrap_samples):
        X_boot, Y_boot = resample(X_train_subset, Y_train)
        knn_model = KNeighborsClassifier()
        knn_model.fit(X_boot, Y_boot)
        bootstrap_models_knn[feature_name].append(knn_model)

# Train bootstrap models for Logistic Regression
for feature_name, X_train_subset in feature_subsets.items():
    for _ in range(n_bootstrap_samples):
        X_boot, Y_boot = resample(X_train_subset, Y_train)
        lr_model = LogisticRegression(random_state=42, max_iter=1000)
        lr_model.fit(X_boot, Y_boot)
        bootstrap_models_lr[feature_name].append(lr_model)

# Generate averaged predictions for the validation set
# This assumes `val_feature_subsets` is already prepared similarly to `feature_subsets`
X_meta_train = np.hstack([
    np.mean([
        np.mean([model.predict_proba(val_feature_subsets[feature_name]) for model in models], axis=0)
        for models in (bootstrap_models_rf[feature_name], bootstrap_models_knn[feature_name], bootstrap_models_lr[feature_name])
    ], axis=0)
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_val), -1)

# Train the meta-model
meta_model.fit(X_meta_train, Y_val)

# Assuming `test_feature_subsets` is prepared, generate predictions for the test set
X_meta_test = np.hstack([
    np.mean([
        np.mean([model.predict_proba(test_feature_subsets[feature_name]) for model in models], axis=0)
        for models in (bootstrap_models_rf[feature_name], bootstrap_models_knn[feature_name], bootstrap_models_lr[feature_name])
    ], axis=0)
    for feature_name in feature_subsets.keys()
]).reshape(len(Y_test), -1)

# Final predictions with the meta-model
final_predictions = meta_model.predict(X_meta_test)

# Calculate and print the test accuracy
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with bootstrapped and stacked RF, KNN, and LR models: {test_accuracy}")


Test accuracy with bootstrapped and stacked RF, KNN, and LR models: 0.5933333333333334


# Best Model per Feature Subset

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Assuming `feature_subsets`, `X_val_scaled`, and `Y_val` are already defined

# Step 1: Train models on each feature subset
model_types = ['KNN', 'RF', 'LR', 'SVM']
best_models = {}

for feature_name, X_train_subset in feature_subsets.items():
    # Track the best model for this subset
    best_model = None
    best_accuracy = 0
    X_val_subset = val_feature_subsets[feature_name]

    # KNN Model
    knn_model = KNeighborsClassifier()
    knn_model.fit(X_train_subset, Y_train)
    knn_accuracy = knn_model.score(X_val_subset, Y_val)

    if knn_accuracy > best_accuracy:
        best_accuracy = knn_accuracy
        best_model = ('KNN', knn_model)

    # RF Model
    rf_model = RandomForestClassifier(random_state=42)
    rf_model.fit(X_train_subset, Y_train)
    rf_accuracy = rf_model.score(X_val_subset, Y_val)

    if rf_accuracy > best_accuracy:
        best_accuracy = rf_accuracy
        best_model = ('RF', rf_model)

    # LR Model
    lr_model = LogisticRegression(random_state=42, max_iter=1000)
    lr_model.fit(X_train_subset, Y_train)
    lr_accuracy = lr_model.score(X_val_subset, Y_val)

    if lr_accuracy > best_accuracy:
        best_accuracy = lr_accuracy
        best_model = ('LR', lr_model)

# Step 2: Evaluate and select the best model per subset (already done in the loop)

# Step 3: Ensemble selected models for final prediction
# Generate predictions for each selected model on the test set
ensemble_predictions = np.zeros((len(Y_test), len(np.unique(Y_train))))

for feature_name, (model_type, model) in best_models.items():
    X_test_subset = test_feature_subsets[feature_name]
    predictions = model.predict_proba(X_test_subset)
    ensemble_predictions += predictions

# Final ensemble prediction
final_predictions = np.argmax(ensemble_predictions, axis=1)

# Calculate and print the test accuracy
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with the best model from each feature subset ensemble: {test_accuracy}")


Test accuracy with the best model from each feature subset ensemble: 0.5333333333333333


In [14]:
# Assuming `best_models` and their validation accuracies are already calculated

# Placeholder for models, their weights, and validation feature subsets
selected_models = []
model_weights = []
val_feature_subsets = split_features_by_type(X_val_scaled, feature_structure)  # Ensure this is defined

# Calculate the validation accuracy for each selected model and use it as the weight
for feature_name, (model_type, model) in best_models.items():
    X_val_subset = val_feature_subsets[feature_name]
    val_accuracy = model.score(X_val_subset, Y_val)
    selected_models.append(model)
    model_weights.append(val_accuracy)

# Normalize the weights so they sum up to 1
total_weight = sum(model_weights)
normalized_weights = [weight / total_weight for weight in model_weights]

# Apply weighted averaging for the final ensemble predictions
weighted_ensemble_predictions = np.zeros((len(Y_test), len(np.unique(Y_train))))

for model, weight, feature_name in zip(selected_models, normalized_weights, best_models.keys()):
    X_test_subset = test_feature_subsets[feature_name]
    prob_predictions = model.predict_proba(X_test_subset)
    weighted_ensemble_predictions += prob_predictions * weight

# The final prediction is the class with the highest weighted sum across all models
final_predictions_weighted = np.argmax(weighted_ensemble_predictions, axis=1)

# Calculate and print the test accuracy with weighted averaging
test_accuracy_weighted = accuracy_score(Y_test, final_predictions_weighted)
print(f"Test accuracy with weighted averaging ensemble: {test_accuracy_weighted}")


Test accuracy with weighted averaging ensemble: 0.5408333333333334


In [47]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  # For SVM with RBF kernel
from xgboost import XGBClassifier  # For XGBoost
from sklearn.metrics import accuracy_score
import numpy as np

best_models = {}

for feature_name, X_train_subset in feature_subsets.items():
    # Track the best model for this subset
    best_model = None
    best_accuracy = 0
    X_val_subset = val_feature_subsets[feature_name]

    # Define your models here, including SVM and XGBoost
    models = {
        'KNN': KNeighborsClassifier(),
        'RF': RandomForestClassifier(random_state=42),
        'LR': LogisticRegression(random_state=42, max_iter=1000),
        'SVM': SVC(probability=True, kernel='rbf', random_state=42),  # SVM with RBF kernel
        'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)  # XGBoost
    }

    for model_name, model in models.items():
        model.fit(X_train_subset, Y_train)
        accuracy = model.score(X_val_subset, Y_val)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = (model_name, model)

    best_models[feature_name] = best_model

# Ensemble selected models for final prediction
ensemble_predictions = np.zeros((len(Y_test), len(np.unique(Y_train))))

for feature_name, (model_type, model) in best_models.items():
    X_test_subset = test_feature_subsets[feature_name]
    if hasattr(model, "predict_proba"):
        predictions = model.predict_proba(X_test_subset)
    else:  # For models like SVM that might not support predict_proba by default
        predictions = model.decision_function(X_test_subset)
        predictions = (predictions - predictions.min()) / (predictions.max() - predictions.min())  # Simple scaling to [0,1]
    ensemble_predictions += predictions

# Final ensemble prediction
final_predictions = np.argmax(ensemble_predictions, axis=1)

# Calculate and print the test accuracy
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with the best model from each feature subset ensemble: {test_accuracy}")


Test accuracy with the best model from each feature subset ensemble: 0.565


In [48]:
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize an empty dictionary to store validation accuracies
validation_accuracies = {}

best_models = {}

for feature_name, X_train_subset in feature_subsets.items():
    # Track the best model for this subset
    best_model = None
    best_accuracy = 0
    X_val_subset = val_feature_subsets[feature_name]

    # Define your models here, including SVM and XGBoost
    models = {
        'KNN': KNeighborsClassifier(),
        'RF': RandomForestClassifier(random_state=42),
        'LR': LogisticRegression(random_state=42, max_iter=1000),
        'SVM': SVC(probability=True, kernel='rbf', random_state=42),  # SVM with RBF kernel
        'XGB': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)  # XGBoost
    }

    for model_name, model in models.items():
        model.fit(X_train_subset, Y_train)
        accuracy = model.score(X_val_subset, Y_val)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = (model_name, model)

    # Store the best model and its validation accuracy
    best_models[feature_name] = best_model
    validation_accuracies[feature_name] = best_accuracy

# Ensemble selected models for final prediction with weighted predictions
ensemble_predictions = np.zeros((len(Y_test), len(np.unique(Y_train))))

for feature_name, (model_type, model) in best_models.items():
    X_test_subset = test_feature_subsets[feature_name]
    weight = validation_accuracies[feature_name]

    if hasattr(model, "predict_proba"):
        predictions = model.predict_proba(X_test_subset) * weight
    else:  # For models like SVM that might not support predict_proba by default
        # Normalize decision_function output to [0,1] and apply weight
        decision_values = model.decision_function(X_test_subset)
        predictions = (decision_values - decision_values.min()) / (decision_values.max() - decision_values.min())
        predictions *= weight

    # Sum the weighted predictions
    ensemble_predictions += predictions

# Normalize the ensemble predictions to ensure they form a valid probability distribution
ensemble_predictions /= ensemble_predictions.sum(axis=1, keepdims=True)

# Make final predictions based on the weighted ensemble
final_predictions = np.argmax(ensemble_predictions, axis=1)

# Calculate and print the test accuracy
test_accuracy = accuracy_score(Y_test, final_predictions)
print(f"Test accuracy with weighted ensemble: {test_accuracy}")


Test accuracy with weighted ensemble: 0.57
