In [25]:
%run 'Setup.py'

8 different classes: Electronic, Experimental, Folk, Hip-Hop, Instrumental, International, Pop or Rock.
objective 1: construct a classifier which, based on the features of a song, predicts its genre
objective 2: estimate its generalisation error under the 0–1 loss.
Features are real-valued, correspond to summary statistics (mean, sd, skewness, kurtosis, median, min, max) of 
time series of various music features, such as the chromagram or the Mel-frequency cepstrum.
Feature description: 

Feature description: 
chroma_cens: Chroma Energy Normalized (CENS, 12 chroma) - 84 features
chroma_cqt: Constant-Q chromagram (12 chroma) - 84 features
chroma_stft: Chromagram (12 chroma) - 84 features
mfcc: Mel-frequency cepstrum (20 coefficients) - 140 features
rmse: Root-mean-square - 7 features
spectral_bandwidth: Spectral bandwidth - 7 features
spectral_centroid: Spectral centroid - 7 features
spectral_contrast: Spectral contrast (7 frequency bands) - 49 features
spectral_rolloff: Roll-off freque

In [26]:
# Prepare data
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_np.ravel()) #

# Split training data into training and temporary validation sets
X_train, X_temp, Y_train, Y_temp = train_test_split(x_train, y_train_encoded, test_size=0.4, random_state=42)

# Split the temporary validation set into validation and fake test set
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)  
X_real_test_scaled = scaler.transform(x_test) # real test set we don't have labels for

In [3]:
def split_features_by_type(X, feature_structure):
    """
    Splits the dataset into subsets based on the feature structure provided.

    :param X: numpy array, the dataset to be split (features only)
    :param feature_structure: dict, keys are feature names and values are the number of features of that type
    :return: dict of feature subsets
    """
    feature_subsets = {}
    start_idx = 0
    
    for feature_name, feature_count in feature_structure.items():
        end_idx = start_idx + feature_count
        feature_subsets[feature_name] = X[:, start_idx:end_idx]
        start_idx = end_idx
    
    return feature_subsets

# Define the feature structure
feature_structure = {
    'chroma_cens': 84,
    'chroma_cqt': 84,
    'chroma_stft': 84,
    'mfcc': 140,
    'rmse': 7,
    'spectral_bandwidth': 7,
    'spectral_centroid': 7,
    'spectral_contrast': 49,
    'spectral_rolloff': 7,
    'tonnetz': 42,
    'zcr': 7
}

## Boosting

In [13]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score

def objective(trial):
    # Hyperparameters to be tuned
    params = {
        'objective': 'multi:softmax',
        'num_class': 8,
        'max_depth': trial.suggest_int('max_depth', 3, 100),
        'eta': trial.suggest_float('eta', 0.01, 0.4),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }

    # Convert the dataset into DMatrix form
    dtrain = xgb.DMatrix(X_train_scaled, label=Y_train)
    dval = xgb.DMatrix(X_val_scaled, label=Y_val)

    # List to hold the validation sets
    evals = [(dtrain, 'train'), (dval, 'validation')]
    model = xgb.train(params, dtrain, num_boost_round=5_000, evals=evals,
                      early_stopping_rounds=25, verbose_eval=False)

    # Predictions on the validation set
    preds = model.predict(dval)
    accuracy = accuracy_score(Y_val, preds)

    return accuracy

In [14]:
study = optuna.create_study(direction='maximize', study_name="XGB")
study.optimize(objective, n_trials=100)

[I 2024-03-13 16:27:57,113] A new study created in memory with name: XGB
[I 2024-03-13 16:28:14,380] Trial 0 finished with value: 0.5658333333333333 and parameters: {'max_depth': 49, 'eta': 0.22468942740980324, 'subsample': 0.8795250041220241, 'colsample_bytree': 0.9351923036898544}. Best is trial 0 with value: 0.5658333333333333.
[I 2024-03-13 16:32:12,545] Trial 1 finished with value: 0.5691666666666667 and parameters: {'max_depth': 73, 'eta': 0.011364781099888269, 'subsample': 0.9167484610898501, 'colsample_bytree': 0.8059351842647398}. Best is trial 1 with value: 0.5691666666666667.
[I 2024-03-13 16:32:31,146] Trial 2 finished with value: 0.5783333333333334 and parameters: {'max_depth': 68, 'eta': 0.17474434186893512, 'subsample': 0.7662150483698524, 'colsample_bytree': 0.8148551507175649}. Best is trial 2 with value: 0.5783333333333334.
[I 2024-03-13 16:35:57,233] Trial 3 finished with value: 0.5716666666666667 and parameters: {'max_depth': 65, 'eta': 0.014772979634822636, 'subsam

In [16]:
best_params = study.best_trial.params
print('Best trial:', study.best_trial.params)
params = {
        'objective': 'multi:softmax',
        'num_class': 8,
    }

# Update model parameters
params.update(best_params)

# Merge train and val set to retrain on maximal amount of data possible
X_train_val_combined = np.vstack((X_train_scaled, X_val_scaled))
Y_train_val_combined = np.concatenate((Y_train, Y_val))

# Convert the combined dataset into DMatrix form for XGBoost
dtrain_val_combined = xgb.DMatrix(X_train_val_combined, label=Y_train_val_combined)

# Retrain the model on the full dataset with the best parameters
final_model = xgb.train(params, dtrain_val_combined, num_boost_round=10_000) # 5,000

# Evaluate on the fake test set
dtest = xgb.DMatrix(X_test_scaled)
test_preds = final_model.predict(dtest)
test_accuracy = accuracy_score(Y_test, test_preds)
print(f"Test set accuracy: {test_accuracy}")

Best trial: {'max_depth': 7, 'eta': 0.04842404379213268, 'subsample': 0.6319894057470088, 'colsample_bytree': 0.839924916573817}
Test set accuracy: 0.64


In [27]:
# If worht it, save model:
final_model.save_model('Models/xgboost-64%-all-data')

In [28]:
# Load Model again, important to use same data prepro pipeline
best_current_booster = xgb.Booster()
best_current_booster.load_model('Models/xgboost-64%-all-data')

In [15]:
print(study.best_trial)

FrozenTrial(number=86, state=TrialState.COMPLETE, values=[0.5875], datetime_start=datetime.datetime(2024, 3, 13, 18, 10, 9, 715945), datetime_complete=datetime.datetime(2024, 3, 13, 18, 10, 55, 634526), params={'max_depth': 7, 'eta': 0.04842404379213268, 'subsample': 0.6319894057470088, 'colsample_bytree': 0.839924916573817}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=100, log=False, low=3, step=1), 'eta': FloatDistribution(high=0.4, log=False, low=0.01, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.6, step=None)}, trial_id=86, value=None)


# First attempt

In [ ]:
# Best trial: {'max_depth': 41, 'eta': 0.06469690136778568, 'subsample': 0.6691108267215701, 'colsample_bytree': 0.6187924973029665}
# Test set accuracy: 0.6383333333333333
# using 5k final boosting rounds, 1000 in study with 80 max tree depth

# Second attempt

In [ ]:
# Best trial: {'max_depth': 7, 'eta': 0.04842404379213268, 'subsample': 0.6319894057470088, 'colsample_bytree': 0.839924916573817}
# Test set accuracy: 0.64
# Using 10 final boosting rounds, 5k in study with tree depth between 3 and 100

# Boosting separately on feature subsets

Load Data and Convert to DMatrix

In [29]:
train_subsets = split_features_by_type(X_train_scaled, feature_structure)
val_subsets = split_features_by_type(X_val_scaled, feature_structure)
test_subsets = split_features_by_type(X_test_scaled, feature_structure)

In [32]:
def objective(trial, X_sub, Y_sub):
    # Hyperparameters
    params = {
        'objective': 'multi:softmax',
        'num_class': 8,
        'max_depth': trial.suggest_int('max_depth', 3, 60),
        'eta': trial.suggest_float('eta', 0.01, 0.4),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'eval_metric': 'merror'  # Multiclass Classification Error
    }

    # Convert the subset dataset into DMatrix form
    dmatrix = xgb.DMatrix(X_sub, label=Y_sub)

    # Perform cross-validation
    cv_results = xgb.cv(params, dmatrix, num_boost_round=5000, nfold=5, stratified=True,
                        early_stopping_rounds=25, seed=42, verbose_eval=False)

    # Extract the minimum mean merror from the CV results
    min_mean_merror = cv_results['test-merror-mean'].min()

    return min_mean_merror

In [ ]:
best_params_subsets = {}
validation_accuracies = {}

for feature_name, feature_count in feature_structure.items():
    print(f"Running study for feature subset: {feature_name}")
    
    # Prepare the data for this subset
    X_sub_train = train_subsets[feature_name]
    Y_sub_train = Y_train  # Y_train should be defined in your context

    def subset_objective(trial):
        return objective(trial, X_sub_train, Y_sub_train)

    study = optuna.create_study(direction='minimize', study_name=f"XGB_{feature_name}")
    study.optimize(subset_objective, n_trials=100)

    best_params_subsets[feature_name] = study.best_trial.params

    # # Merge train and validation subsets for final model retraining
    # X_sub_train_val_combined = np.vstack((X_sub_train, val_subsets[feature_name]))
    # Y_train_val_combined = np.concatenate((Y_train, Y_val))  # Assuming Y_val is defined
    # 
    # # Retrain the model on the combined training and validation set with the best parameters
    # dtrain_val_combined = xgb.DMatrix(X_sub_train_val_combined, label=Y_train_val_combined)
    # Retrain model with optimal parameters with more boosting rounds
    params = {
        'objective': 'multi:softmax',
        'num_class': 8,
        **best_params_subsets[feature_name],  # Unpack the best parameters
    }
    final_model = xgb.train(params, X_sub_train, num_boost_round=10_000) # dtrain_val_combined

    # Evaluate the final model on the validation set
    dval = xgb.DMatrix(val_subsets[feature_name], label=Y_val)
    preds = final_model.predict(dval)
    val_accuracy = accuracy_score(Y_val, preds)
    validation_accuracies[feature_name] = val_accuracy

    # Save the final model
    model_name = f'Models/XGBoost-Feature-Subsets/xgboost_{feature_name}_final.model'
    final_model.save_model(model_name)

    print(f"Validation accuracy for {feature_name}: {val_accuracy}")

## Save Best Parameters 

In [ ]:
import json
from datetime import datetime
# Format the current date and time as a string
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Filenames with date and time
filename_best_params = f"best_params_subsets_{timestamp}.json"
filename_validation_accuracies = f"validation_accuracies_{timestamp}.json"

# Save best_params_subsets
with open(filename_best_params, 'w') as file:
    json.dump(best_params_subsets, file, indent=4)

# Save validation_accuracies
with open(filename_validation_accuracies, 'w') as file:
    json.dump(validation_accuracies, file, indent=4)
