In [73]:
%matplotlib inline

import os

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa # for audio and music analysis
import librosa.display
import csv
import math
import datatables
import ast
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
import utils
from sklearn.neighbors import KNeighborsClassifier

plt.rcParams['figure.figsize'] = (17, 5)

In [50]:
# Directory where mp3 are stored.
AUDIO_DIR = os.environ.get('AUDIO_DIR')

# Load metadata and features.
tracks = utils.load('tracks.csv')
genres = utils.load('genres.csv')
features = utils.load('features.csv')
echonest = utils.load('echonest.csv')

np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

tracks.shape, 
genres.shape, features.shape, echonest.shape

((163, 4), (106574, 518), (13129, 249))

In [67]:
medium = tracks['set', 'subset'] <= 'medium'

train = tracks['set', 'split'] == 'training'
val = tracks['set', 'split'] == 'validation'
test = tracks['set', 'split'] == 'test'

y_train = tracks.loc[medium & train, ('track', 'genre_top')]
y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
y_test = tracks.loc[medium & test, ('track', 'genre_top')]
y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
X_train = features.loc[medium & train,:]
X_test = features.loc[medium & test,:]

# Be sure training samples are shuffled.
X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
scaler = skl.preprocessing.StandardScaler(copy=False)
scaler.fit_transform(X_train)
scaler.transform(X_test)

#transform to PC space
estimator = PCA(n_components = 230)
X_train = estimator.fit_transform(X_train)
variance_explained = estimator.explained_variance_ratio_
print(np.sum(variance_explained))
X_test = estimator.transform(X_test)

print('{} training examples, {} testing examples'.format(y_train.size, y_test.size))
print('{} features, {} classes'.format(X_train.shape[1], np.unique(y_train).size))

0.959555803474
19922 training examples, 2573 testing examples
230 features, 16 classes


In [68]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=141, weights='distance')
neigh.fit(X_train, y_train)
score = neigh.score(X_test, y_test)
print('Accuracy: {:.2%}'.format(score))

Accuracy: 52.66%


In [89]:
#searching optimal k without PCA transformation
accuracy = []
for i in range(1, 10):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)

    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]
    print('{} features, {} classes'.format(X_train.shape[1], np.unique(y_train).size))
    
    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)


    from sklearn.neighbors import KNeighborsClassifier
    neigh = KNeighborsClassifier(n_neighbors=i, weights='distance')
    neigh.fit(X_train, y_train)
    score = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(score))
    accuracy.append(score)

518 features, 16 classes
Accuracy: 44.58%
518 features, 16 classes
Accuracy: 44.58%
518 features, 16 classes
Accuracy: 49.01%
518 features, 16 classes
Accuracy: 50.52%
518 features, 16 classes
Accuracy: 51.57%
518 features, 16 classes
Accuracy: 51.85%
518 features, 16 classes
Accuracy: 52.20%
518 features, 16 classes
Accuracy: 52.62%
518 features, 16 classes
Accuracy: 52.82%


In [96]:
#searching optimal number of PC with k=20
accuracy = []
for i in range(125, 145):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]

    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)

    #transform to PC space
    estimator = PCA(n_components = i)
    X_train = estimator.fit_transform(X_train)
    #variance_explained = estimator.explained_variance_ratio_
    #print('{:.2%} variance explained'.format(np.sum(variance_explained)))
    X_test = estimator.transform(X_test)

    neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
    neigh.fit(X_train, y_train)
    accuracy = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(accuracy))

Accuracy: 55.11%
Accuracy: 55.77%
Accuracy: 54.99%
Accuracy: 55.23%
Accuracy: 55.23%
Accuracy: 55.15%
Accuracy: 54.99%
Accuracy: 55.15%
Accuracy: 54.72%
Accuracy: 55.46%
Accuracy: 55.27%
Accuracy: 55.15%
Accuracy: 55.54%
Accuracy: 55.27%
Accuracy: 55.15%
Accuracy: 55.34%
Accuracy: 55.54%
Accuracy: 55.58%
Accuracy: 55.38%
Accuracy: 55.23%


In [95]:
#search for optimal k with 200 selected feature 
medium = tracks['set', 'subset'] <= 'medium'

train = tracks['set', 'split'] == 'training'
val = tracks['set', 'split'] == 'validation'
test = tracks['set', 'split'] == 'test'

y_train = tracks.loc[medium & train, ('track', 'genre_top')]
y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
y_test = tracks.loc[medium & test, ('track', 'genre_top')]
y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
X_train = features.loc[medium & train,:]
X_test = features.loc[medium & test,:]

# Be sure training samples are shuffled.
X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
scaler = skl.preprocessing.StandardScaler(copy=False)
scaler.fit_transform(X_train)
scaler.transform(X_test)

model = SelectKBest(k=169)
fit = model.fit(X_train, y_train)
X_train = fit.transform(X_train)
X_test = fit.transform(X_test)

neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
neigh.fit(X_train, y_train)
score = neigh.score(X_train, y_train)
print('Accuracy: {:.2%}'.format(score))

Accuracy: 99.98%


In [94]:
#search for optimal feature size with k = 20
medium = tracks['set', 'subset'] <= 'medium'

train = tracks['set', 'split'] == 'training'
val = tracks['set', 'split'] == 'validation'
test = tracks['set', 'split'] == 'test'

#searching optimal k without PCA transformation
for i in range(160, 170):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]

    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)
    
    model = SelectKBest(k=i)
    fit = model.fit(X_train, y_train)
    X_train = fit.transform(X_train)
    X_test = fit.transform(X_test)
    neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
    neigh.fit(X_train, y_train)
    score = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(score))

Accuracy: 57.75%
Accuracy: 57.75%
Accuracy: 57.60%
Accuracy: 57.29%
Accuracy: 57.83%
Accuracy: 57.68%
Accuracy: 57.36%
Accuracy: 57.25%
Accuracy: 57.21%
Accuracy: 57.87%


In [93]:
#searching optimal number of WEIGHTED PC with k=20 
for i in range(200,500,50):
    y_train = tracks.loc[medium & train, ('track', 'genre_top')]
    y_train = skl.preprocessing.LabelEncoder().fit_transform(y_train)
    y_test = tracks.loc[medium & test, ('track', 'genre_top')]
    y_test = skl.preprocessing.LabelEncoder().fit_transform(y_test)
    X_train = features.loc[medium & train,:]
    X_test = features.loc[medium & test,:]

    # Be sure training samples are shuffled.
    X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

    # Standardize features by removing the mean and scaling to unit variance.
    scaler = skl.preprocessing.StandardScaler(copy=False)
    scaler.fit_transform(X_train)
    scaler.transform(X_test)

    #transform to PC space
    estimator = PCA(n_components = i)
    X_train = estimator.fit_transform(X_train)
    X_test = estimator.transform(X_test)

    variance_explained = estimator.explained_variance_ratio_
    LAMBDA = np.diag(variance_explained) #diagonal matrix of loadings
    print('{:.2%} variance explained'.format(np.sum(variance_explained)))
    
    X_train = np.dot(X_train, LAMBDA) #PC weighted by eigenvalue
    X_test = np.dot(X_test, LAMBDA) #PC weighted by eigenvalue

    neigh = KNeighborsClassifier(n_neighbors=20, weights='distance')
    neigh.fit(X_train, y_train)
    accuracy = neigh.score(X_test, y_test)
    print('Accuracy: {:.2%}'.format(accuracy))

94.28% variance explained
Accuracy: 50.87%
96.80% variance explained
Accuracy: 50.87%
98.31% variance explained
Accuracy: 50.87%
99.21% variance explained
Accuracy: 50.87%
99.76% variance explained
Accuracy: 50.87%
99.96% variance explained
Accuracy: 50.87%
