In [2]:
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pickle
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
Songs = []
with open("Songs.pkl", 'rb') as f:
    Songs = pickle.load(f) 
data = [vars(song) for song in Songs]
df = pd.DataFrame(data)
df

Unnamed: 0,path,company,generation,artist,name,mfccs_path
0,/Users/tylerho/Library/CloudStorage/GoogleDriv...,SM,1,TRAX,Are you Ready¿.mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
1,/Users/tylerho/Library/CloudStorage/GoogleDriv...,SM,1,TRAX,Paradise.mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
2,/Users/tylerho/Library/CloudStorage/GoogleDriv...,SM,1,TRAX,초우 Cold Rain.mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
3,/Users/tylerho/Library/CloudStorage/GoogleDriv...,SM,1,TRAX,아직은... 나 I Can Change.mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
4,/Users/tylerho/Library/CloudStorage/GoogleDriv...,SM,1,TRAX,Knife.mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
...,...,...,...,...,...,...
5522,/Users/tylerho/Library/CloudStorage/GoogleDriv...,JYP,5,KickFlip,Knock Knock.mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
5523,/Users/tylerho/Library/CloudStorage/GoogleDriv...,JYP,5,KickFlip,제끼자 (Skip It!) (Preview).mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
5524,/Users/tylerho/Library/CloudStorage/GoogleDriv...,JYP,5,KickFlip,Like A Monster.mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...
5525,/Users/tylerho/Library/CloudStorage/GoogleDriv...,JYP,5,KickFlip,Mama Said (뭐가 되려고？).mp3,/Users/tylerho/Library/CloudStorage/GoogleDriv...


In [4]:
# First, we must clean the data (specifically make the mfccs uniform length)
mfccs = []
i = 0
for mfccs_path in df["mfccs_path"]:
    if mfccs_path != '':
        mfccs.append(np.load(mfccs_path))
    else:
        i += 1

lengths = [mfcc.shape[1] for mfcc in mfccs]
max_len = max(lengths)
percentile_95 = int(np.percentile(lengths, 95))
print(i, max_len, percentile_95)

0 53604 12316


In [5]:
def load_pad_trunc(path, T_max):
    """
    Load an MFCC matrix of shape (n_mfcc, T) from `path`,
    truncate to T_max if T > T_max, else zero-pad on the right.
    Returns a 1D array of length n_mfcc * T_max.
    """
    if path != '':
        mfcc = np.load(path)             # shape: (n_mfcc, T)
        n_mfcc, T = mfcc.shape
        if T >= T_max:
            mfcc2 = mfcc[:, :T_max]
        else:
            pad_width = T_max - T
            mfcc2 = np.pad(mfcc,
                        pad_width=((0, 0), (0, pad_width)),
                        mode='constant',
                        constant_values=0)
        return mfcc2.ravel()             # shape: (n_mfcc*T_max,)
    else:
        return np.array([1])

In [6]:
# make all the mfccs the same length
fixed_length = percentile_95
feature_list = df['mfccs_path'].apply(lambda p: load_pad_trunc(p, fixed_length))


In [7]:
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

X = np.vstack(feature_list)
y = df['generation'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

scaler = StandardScaler().fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_s, y_train)

y_pred = knn.predict(X_test_s)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.59      0.23      0.33       252
           2       0.49      0.42      0.45       362
           3       0.38      0.62      0.47       335
           4       0.42      0.33      0.37       144
           5       0.05      0.15      0.08        13

    accuracy                           0.42      1106
   macro avg       0.39      0.35      0.34      1106
weighted avg       0.46      0.42      0.41      1106

Confusion Matrix:
 [[ 58  68 107  11   8]
 [ 19 151 164  19   9]
 [ 18  65 209  30  13]
 [  2  22  66  47   7]
 [  2   2   3   4   2]]


In [8]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(knn, X_train, y_train,
                         cv=cv,
                         scoring='accuracy',  # or 'f1_macro', etc.
                         n_jobs=-1)
print("CV accuracy:", scores, "mean:", scores.mean())

CV accuracy: [0.47683616 0.44457014 0.4581448  0.45022624 0.45927602] mean: 0.45781067055244523


In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [3,5,7,9],
    'weights': ['uniform','distance'],
    'metric': ['euclidean','manhattan']
}
gs = GridSearchCV(
    knn, param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
gs.fit(X_train, y_train)
print("Best params:", gs.best_params_)
print("Best CV score:", gs.best_score_)
best_knn = gs.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = best_knn.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

train_sizes, train_scores, valid_scores = learning_curve(
    best_knn, X_train, y_train,
    cv=cv, scoring='accuracy',
    train_sizes=np.linspace(0.1,1.0,5),
    n_jobs=-1
)
plt.plot(train_sizes, train_scores.mean(axis=1), label="train")
plt.plot(train_sizes, valid_scores.mean(axis=1), label="validation")
plt.xlabel("Training examples"); plt.ylabel("Accuracy")
plt.legend(); plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

nested_scores = cross_val_score(
    gs, X, y,
    cv=StratifiedKFold(5, shuffle=True, random_state=1),
    scoring='accuracy',
    n_jobs=-1
)
print("Nested CV accuracy:", nested_scores.mean())