# Split data

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import collections

# Load the dataset
data_dir = 'data'

# data = pd.read_csv(f'{data_dir}/audio_features_no_duplicates_new_normalized.csv')
data = pd.read_csv(f'{data_dir}/audio_features_no_duplicates_new.csv')

data_normalized = pd.read_csv(f'{data_dir}/audio_features_no_duplicates_new_normalized.csv')

In [2]:
# Exclude the first column (filename) and assume the last column is the label
X = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

X_normalized = data_normalized.iloc[:, 1:-1].values
y_normalized = data_normalized.iloc[:, -1].values

# Check the distribution of labels
label_distribution = collections.Counter(y)
print(f"Label distribution: {label_distribution}")

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Label distribution: Counter({0: 1752, 1: 1678})


# Random Forrest Model

In [121]:
from sklearn.ensemble import RandomForestClassifier

# Train the classifier
clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)

In [120]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 86.30%
Precision: 0.86
Recall: 0.86
F1 Score: 0.86
Confusion Matrix:
[[430  73]
 [ 68 458]]
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.86       503
           1       0.86      0.87      0.87       526

    accuracy                           0.86      1029
   macro avg       0.86      0.86      0.86      1029
weighted avg       0.86      0.86      0.86      1029



In [None]:
import os
import librosa
import numpy as np
from sklearn.preprocessing import MinMaxScaler

import librosa
import numpy as np

def extract_features(file_path):
    
    # Load the audio file
    y, sr = librosa.load(file_path)

    # Extract features
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0)
    spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr).T, axis=0)
    zero_crossings = np.mean(librosa.feature.zero_crossing_rate(y))
    spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr))
    rms_energy = np.mean(librosa.feature.rms(y=y))

    # Combine all features into a single array
    features = np.hstack([mfccs, chroma, spectral_contrast, zero_crossings, spectral_bandwidth, rms_energy])

    return features

def predict_track(file_path, model):

    # Extract features from the audio file
    features = extract_features(file_path)
    
    # Convert features to a DataFrame with the same columns as used in training
    feature_df = pd.DataFrame([features])

    
    # Make a prediction and get probabilities
    label = model.predict(feature_df)
    probabilities = model.predict_proba(feature_df)
    
    return label,probabilities


## Getting predictions on unseen track

In [114]:
import os

track_directory = 'data/'

# Loop through all files in the directory
for track_name in os.listdir(track_directory):
    if track_name.endswith('.mp3'):
        track_path = os.path.join(track_directory, track_name)

        label, prediction = predict_track(track_path, clf)

        if label == 0:
            print(f"{track_name} is not proper! Label is {label[0]} with a probability of {prediction[0][label[0]]}")
        else:
            print(f"{track_name} is proper! Label is {label[0]} with a probability of {prediction[0][label[0]]}")

Rene Wise - Lakota Fox.mp3 is proper! Label is 1 with a probability of 0.885
Kike Pravda - Main Control [tZcM4ytxSGc].mp3 is proper! Label is 1 with a probability of 0.865
Beatrice - I'm Not From Here.mp3 is proper! Label is 1 with a probability of 0.76
Lowerzone - Verknipt People.mp3 is not proper! Label is 0 with a probability of 0.675
Mark Dekoda - Rave Harder Techno Bass.mp3 is not proper! Label is 0 with a probability of 0.62
09 - Step To Enchantment (Stringent).mp3 is proper! Label is 1 with a probability of 0.685
Kaiser (K S R) - Driving In A Fast Tool.mp3 is proper! Label is 1 with a probability of 0.905
Jeff Mills - Step to enchantment [eLtnhsLwSZE].mp3 is not proper! Label is 0 with a probability of 0.595
Voorman - Surface Scatter.mp3 is proper! Label is 1 with a probability of 0.87
Stef Mendesidis - Crime and Punishment.mp3 is proper! Label is 1 with a probability of 0.775
Reinier Zonneveld - Mom Was On Tequila.mp3 is not proper! Label is 0 with a probability of 0.97
Alexand

In [110]:
# Get prediction over one track
track_name = "09 - Step To Enchantment (Stringent).mp3"
track_path = os.path.join(track_directory, track_name)
label, prediction = predict_track(track_path, clf)

print(f"{track_name} is {label[0]} with a probability of {prediction[0][label[0]]}")

09 - Step To Enchantment (Stringent).mp3 is 1 with a probability of 0.73


## Notes

Looks like the model's predictions are only reliable when the quality of the track is not totally messed up. Needs to be decent quality. Not noisy vinyl rips.

When we deploy the model as a web app, and users can submit a Youtube link to a track, downloading the audio in high quality will be problematic. 

We need to find a way to download the audio in high quality. Or, we should downgrade the training data to match the quality of the audio that users can submit from Youtube.



## Saving the model

In [96]:
import pickle

# Save the model
with open('output/random_forest_model.pkl', 'wb') as file:
    pickle.dump(clf, file)

# # Load the model
# with open('random_forest_model.pkl', 'rb') as file:
#     clf_loaded = pickle.load(file)

# XGBoost Model

In [123]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)

# Evaluate the model
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred))
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.8775510204081632
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.87       503
           1       0.88      0.88      0.88       526

    accuracy                           0.88      1029
   macro avg       0.88      0.88      0.88      1029
weighted avg       0.88      0.88      0.88      1029



### Grid Search for XGBoost

In [131]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2],
    'min_child_weight': [1, 3, 5]
}

# Initialize the XGBClassifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Set up the grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, 
                           scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

# Best parameters found:  {'colsample_bytree': 0.9, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.9}
# Best accuracy score:  0.8656201628642574

Fitting 3 folds for each of 2187 candidates, totalling 6561 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters found:  {'colsample_bytree': 0.9, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.9}
Best accuracy score:  0.8656201628642574


In [3]:
#Train the model with the best parameters
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', **grid_search.best_params_)
xgb_model.fit(X, y)

NameError: name 'XGBClassifier' is not defined

In [4]:
from xgboost import XGBClassifier
import pickle

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', colsample_bytree=0.9, gamma=0.2, learning_rate=0.1, max_depth=7, min_child_weight=3, n_estimators=100, subsample=0.9)
xgb_model.fit(X, y)

# Save model
with open('output/xgb_model.pkl', 'wb') as file:
    pickle.dump(xgb_model, file)

Parameters: { "use_label_encoder" } are not used.



# Logistic Regression, SVM, KNN

### Logistic Regression default

In [125]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Train Logistic Regression
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

y_pred_logistic = logistic_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_logistic)
precision = precision_score(y_test, y_pred_logistic, average='weighted')
recall = recall_score(y_test, y_pred_logistic, average='weighted')
f1 = f1_score(y_test, y_pred_logistic, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_logistic)
class_report = classification_report(y_test, y_pred_logistic)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 84.74%
Precision: 0.85
Recall: 0.85
F1 Score: 0.85
Confusion Matrix:
[[428  75]
 [ 82 444]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.85      0.85       503
           1       0.86      0.84      0.85       526

    accuracy                           0.85      1029
   macro avg       0.85      0.85      0.85      1029
weighted avg       0.85      0.85      0.85      1029



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Logistic Regression Grid Search

In [138]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression model
logistic = LogisticRegression(max_iter=1000)

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],  # Penalty type
    'solver': ['saga']  # Solver to use; 'saga' supports l1, l2, and elasticnet
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_normalized, y_normalized)

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score:", grid_search.best_score_)

# Not normalized
# Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
# Best Cross-validation Score: 0.8230320699708455

# Normalized
# Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
# Best Cross-validation Score: 0.839067055393586




Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}
Best Cross-validation Score: 0.839067055393586


20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/zein/opt/miniconda3/envs/proper-classifier/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/zein/opt/miniconda3/envs/proper-classifier/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1101, in fit
    raise ValueError(
ValueError: l1_ratio must be between 0 and 1; got (l1_ratio=None)

        nan 0.85626822 0.85626822 0.85597668        nan 0.85626822
 0.85597668 0.85568513        nan 0.85626822]


In [135]:
# Train the model with the best parameters
logistic_model = LogisticRegression(max_iter=1000, **grid_search.best_params_)
logistic_model.fit(X, y)



## SVM

In [129]:
# Train SVM
svm_model = SVC(probability=True)
svm_model.fit(X_train, y_train)

y_pred_svm = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_svm)
precision = precision_score(y_test, y_pred_svm, average='weighted')
recall = recall_score(y_test, y_pred_svm, average='weighted')
f1 = f1_score(y_test, y_pred_svm, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_svm)
class_report = classification_report(y_test, y_pred_svm)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 78.33%
Precision: 0.79
Recall: 0.78
F1 Score: 0.78
Confusion Matrix:
[[432  71]
 [152 374]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.86      0.79       503
           1       0.84      0.71      0.77       526

    accuracy                           0.78      1029
   macro avg       0.79      0.78      0.78      1029
weighted avg       0.79      0.78      0.78      1029



### SVM Grid Search

In [139]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Kernel type
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'degree': [3, 4, 5]  # Degree of the polynomial kernel function (only for 'poly')
}

# Initialize the SVM model
svm = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score:", grid_search.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END ...........C=0.1, degree=3, gamma=scale, kernel=rbf; total time=   0.6s
[CV] END ...........C=0.1, degree=3, gamma=scale, kernel=rbf; total time=   0.6s
[CV] END ...........C=0.1, degree=3, gamma=scale, kernel=rbf; total time=   0.6s
[CV] END ..........C=0.1, degree=3, gamma=scale, kernel=poly; total time=   0.4s
[CV] END ...........C=0.1, degree=3, gamma=scale, kernel=rbf; total time=   0.7s
[CV] END ...........C=0.1, degree=3, gamma=scale, kernel=rbf; total time=   0.8s
[CV] END ..........C=0.1, degree=3, gamma=scale, kernel=poly; total time=   0.3s
[CV] END ..........C=0.1, degree=3, gamma=scale, kernel=poly; total time=   0.4s
[CV] END ..........C=0.1, degree=3, gamma=scale, kernel=poly; total time=   0.3s
[CV] END ..........C=0.1, degree=3, gamma=scale, kernel=poly; total time=   0.4s
[CV] END .......C=0.1, degree=3, gamma=scale, kernel=sigmoid; total time=   0.4s
[CV] END .......C=0.1, degree=3, gamma=scale, k

KeyboardInterrupt: 

## KNN

In [127]:
# Train KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

y_pred_knn = knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_knn)
precision = precision_score(y_test, y_pred_knn, average='weighted')
recall = recall_score(y_test, y_pred_knn, average='weighted')
f1 = f1_score(y_test, y_pred_knn, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred_knn)
class_report = classification_report(y_test, y_pred_knn)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 81.73%
Precision: 0.82
Recall: 0.82
F1 Score: 0.82
Confusion Matrix:
[[424  79]
 [109 417]]
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       503
           1       0.84      0.79      0.82       526

    accuracy                           0.82      1029
   macro avg       0.82      0.82      0.82      1029
weighted avg       0.82      0.82      0.82      1029



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### KNN Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Define the parameter grid for KNN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors to use
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'metric': ['euclidean', 'manhattan', 'minkowski']  # Distance metric
}

# Initialize the KNN model
knn = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Print the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-validation Score:", grid_search.best_score_)

## Evaluate all models w/ Grid Search params on unseen tracks

In [130]:
import os

track_directory = 'data/'

for model in [logistic_model, svm_model, knn_model]:
    print(f"Model: {model}")

    # Loop through all files in the directory
    for track_name in os.listdir(track_directory):
        if track_name.endswith('.mp3'):
            track_path = os.path.join(track_directory, track_name)

            label, prediction = predict_track(track_path, model)

            if label == 0:
                print(f"{track_name} is not proper! Label is {label[0]} with a probability of {prediction[0][label[0]]}")
            else:
                print(f"{track_name} is proper! Label is {label[0]} with a probability of {prediction[0][label[0]]}")

Model: LogisticRegression()
Rene Wise - Lakota Fox.mp3 is proper! Label is 1 with a probability of 0.8934672488502142
Kike Pravda - Main Control [tZcM4ytxSGc].mp3 is proper! Label is 1 with a probability of 0.8806780022623224
Beatrice - I'm Not From Here.mp3 is proper! Label is 1 with a probability of 0.9493046722020121
Lowerzone - Verknipt People.mp3 is not proper! Label is 0 with a probability of 0.5532544823335399
Mark Dekoda - Rave Harder Techno Bass.mp3 is not proper! Label is 0 with a probability of 0.7003384048900274
09 - Step To Enchantment (Stringent).mp3 is proper! Label is 1 with a probability of 0.8382499301230181
Kaiser (K S R) - Driving In A Fast Tool.mp3 is proper! Label is 1 with a probability of 0.888100460451353
Jeff Mills - Step to enchantment [eLtnhsLwSZE].mp3 is proper! Label is 1 with a probability of 0.5456870861280316
Voorman - Surface Scatter.mp3 is proper! Label is 1 with a probability of 0.9614647768970976
Stef Mendesidis - Crime and Punishment.mp3 is proper!

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Rene Wise - Lakota Fox.mp3 is proper! Label is 1 with a probability of 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Kike Pravda - Main Control [tZcM4ytxSGc].mp3 is proper! Label is 1 with a probability of 0.8


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Beatrice - I'm Not From Here.mp3 is not proper! Label is 0 with a probability of 0.6


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Lowerzone - Verknipt People.mp3 is not proper! Label is 0 with a probability of 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Mark Dekoda - Rave Harder Techno Bass.mp3 is not proper! Label is 0 with a probability of 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


09 - Step To Enchantment (Stringent).mp3 is proper! Label is 1 with a probability of 0.8


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Kaiser (K S R) - Driving In A Fast Tool.mp3 is proper! Label is 1 with a probability of 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Jeff Mills - Step to enchantment [eLtnhsLwSZE].mp3 is not proper! Label is 0 with a probability of 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Voorman - Surface Scatter.mp3 is proper! Label is 1 with a probability of 1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Stef Mendesidis - Crime and Punishment.mp3 is proper! Label is 1 with a probability of 0.8


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Reinier Zonneveld - Mom Was On Tequila.mp3 is not proper! Label is 0 with a probability of 0.8


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Alexander Johansson   Mattias Fridell - Retsticka.mp3 is proper! Label is 1 with a probability of 0.6
Mike Parker - Rainmaker.mp3 is proper! Label is 1 with a probability of 0.8


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
