In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns  # for nicer plots
sns.set(style="darkgrid")  # default style
from matplotlib import pyplot as plt

### Data Preprocessing

In [2]:
# Load the raw tracks table from a CSV file in the current working directory.
raw_df = pd.read_csv('spotify_tracks_dataset.csv')

In [11]:
# Build the modelling frame from an untouched copy of raw_df:
#   1. discard rows containing any null value,
#   2. drop the saved index column, the identifier/text columns and time_signature,
#   3. turn `explicit` into a 0/1 integer flag,
#   4. one-hot encode `key` and label the dummies with note names (key_0 -> C ... key_11 -> B).
df = (
    raw_df.copy()
    .dropna()
    .drop(columns=["Unnamed: 0", "track_id", "artists", "album_name", "track_name", "time_signature"])
    .assign(explicit=lambda frame: frame['explicit'].eq(True).astype('int64'))
    .pipe(pd.get_dummies, columns=['key'], dtype=int)
    .rename(columns={
        f"key_{number}": note
        for number, note in enumerate(
            ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
        )
    })
)

df.head()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,D,D#,E,F,F#,G,G#,A,A#,B
0,73,230666,0,0.676,0.461,-6.746,0,0.143,0.0322,1e-06,...,0,0,0,0,0,0,0,0,0,0
1,55,149610,0,0.42,0.166,-17.235,1,0.0763,0.924,6e-06,...,0,0,0,0,0,0,0,0,0,0
2,57,210826,0,0.438,0.359,-9.734,1,0.0557,0.21,0.0,...,0,0,0,0,0,0,0,0,0,0
3,71,201933,0,0.266,0.0596,-18.515,1,0.0363,0.905,7.1e-05,...,0,0,0,0,0,0,0,0,0,0
4,82,198853,0,0.618,0.443,-9.681,1,0.0526,0.469,0.0,...,1,0,0,0,0,0,0,0,0,0


In [4]:
# Column labels of the preprocessed frame.
df.columns

Index(['popularity', 'duration_ms', 'explicit', 'danceability', 'energy',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'track_genre', 'C', 'C#', 'D', 'D#',
       'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'],
      dtype='object')

In [5]:
# Distinct values of track_genre, the column used as the classification target below.
df['track_genre'].unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm',
       'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk',
       'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove',
       'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie-pop', 'indie', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop-film', 'pop',
       'pow

In [6]:
# Number of distinct genres, computed from the data itself rather than from a
# hand-pasted copy of the previous cell's output (which silently goes stale
# whenever the preprocessing changes).
df['track_genre'].nunique()

114

### Modeling Next Steps
* Decision Trees
* KNN
* Logistic Regression / NN


In [7]:
# KNN genre classifier on every feature column (one-hot key columns included):
# 80/20 split, standardise, then compare k = 12..16.
# NOTE(review): k is compared on the test split; consider selecting it on a
# separate validation split (or with cross-validation) and keeping the test
# split for the final estimate only.
random_state = 1
y = df['track_genre']
X = df.drop(columns='track_genre')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for n_neighbors in range(12, 17):
    print(f"{n_neighbors} nearest neighbors")
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

12 nearest neighbors
Accuracy: 0.19350877192982457
13 nearest neighbors
Accuracy: 0.19456140350877194
14 nearest neighbors
Accuracy: 0.19473684210526315
15 nearest neighbors
Accuracy: 0.19412280701754386
16 nearest neighbors
Accuracy: 0.19337719298245615


In [8]:
# Same experiment without the twelve one-hot key columns; k doubles from 2 to 128.
random_state = 1
y = df['track_genre']
X = df.drop(columns=['track_genre', 'C', 'C#', 'D', 'D#',
                     'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for n_neighbors in (2, 4, 8, 16, 32, 64, 128):
    print(f"{n_neighbors} nearest neighbors")
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

16 nearest neighbors
Accuracy: 0.23859649122807017
32 nearest neighbors
Accuracy: 0.24035087719298245
64 nearest neighbors
Accuracy: 0.2355263157894737
128 nearest neighbors
Accuracy: 0.22706140350877194


In [9]:
# Narrow the search around the best coarse value: k = 24, 32, ..., 56
# (key columns still excluded).
random_state = 1
y = df['track_genre']
X = df.drop(columns=['track_genre', 'C', 'C#', 'D', 'D#',
                     'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for n_neighbors in range(24, 57, 8):
    print(f"{n_neighbors} nearest neighbors")
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

24 nearest neighbors
Accuracy: 0.24087719298245613
32 nearest neighbors
Accuracy: 0.24035087719298245
40 nearest neighbors
Accuracy: 0.24083333333333334
48 nearest neighbors
Accuracy: 0.23684210526315788
56 nearest neighbors
Accuracy: 0.23701754385964913


In [11]:
# Finer scan over the even values k = 24, 26, ..., 40 (key columns excluded).
random_state = 1
y = df['track_genre']
X = df.drop(columns=['track_genre', 'C', 'C#', 'D', 'D#',
                     'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for n_neighbors in range(24, 41, 2):
    print(f"{n_neighbors} nearest neighbors")
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

24 nearest neighbors
Accuracy: 0.24087719298245613
26 nearest neighbors
Accuracy: 0.24127192982456142
28 nearest neighbors
Accuracy: 0.24153508771929824
30 nearest neighbors
Accuracy: 0.24135964912280702
32 nearest neighbors
Accuracy: 0.24035087719298245
34 nearest neighbors
Accuracy: 0.2417105263157895
36 nearest neighbors
Accuracy: 0.24140350877192981
38 nearest neighbors
Accuracy: 0.24157894736842106
40 nearest neighbors
Accuracy: 0.24083333333333334


In [12]:
# Check the values adjacent to the two best even settings (28 and 34).
random_state = 1
y = df['track_genre']
X = df.drop(columns=['track_genre', 'C', 'C#', 'D', 'D#',
                     'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for n_neighbors in (27, 28, 29, 33, 34, 35):
    print(f"{n_neighbors} nearest neighbors")
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

27 nearest neighbors
Accuracy: 0.24166666666666667
28 nearest neighbors
Accuracy: 0.24153508771929824
29 nearest neighbors
Accuracy: 0.2425438596491228
33 nearest neighbors
Accuracy: 0.2419736842105263
34 nearest neighbors
Accuracy: 0.2417105263157895
35 nearest neighbors
Accuracy: 0.24192982456140352


In [None]:
# Compare the two built-in neighbour weightings at a fixed k of 34
# (key columns excluded).
random_state = 1
y = df['track_genre']
X = df.drop(columns=['track_genre', 'C', 'C#', 'D', 'D#',
                     'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for weighting in ("uniform", "distance"):
    print(f"Weight function: {weighting}")
    clf = KNeighborsClassifier(n_neighbors=34, weights=weighting).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

In [179]:
# Scan the `p` argument of KNeighborsClassifier with distance weighting at k = 34
# (key columns excluded).
# NOTE(review): the output saved under this cell mentions "minkowski" and
# "precomputed", so it appears to come from an earlier version of the loop;
# re-run the cell to refresh it.
random_state = 1
y = df['track_genre']
X = df.drop(columns=['track_genre', 'C', 'C#', 'D', 'D#',
                     'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

for power in (1, 1.5, 2, 2.5, 3):
    print(f"Power: {power}")
    clf = KNeighborsClassifier(n_neighbors=34, weights="distance", p=power).fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    # Per-class breakdown, left disabled as in the original cell:
    #print(classification_report(y_test, y_pred))

minkowski nearest neighbors
Accuracy: 0.2293421052631579
precomputed nearest neighbors


ValueError: Negative values in data passed to precomputed distance matrix.

In [12]:
# Collapse the fine-grained track_genre labels into a few broad categories.
# The mapping is written category -> genres for readability and inverted into
# the genre -> category lookup; any genre not listed ends up as 'Other'.
# NOTE(review): 'trap' and 'electro-pop' are not in the genre list printed
# earlier in this notebook, so those two entries never match a row.
genre_to_category = {
    genre: category
    for category, genres in {
        'Electronic Dance Music': [
            'edm', 'house', 'electro', 'trance', 'techno', 'dubstep',
            'drum-and-bass', 'deep-house', 'detroit-techno', 'minimal-techno',
            'progressive-house', 'breakbeat',
        ],
        'Rock': [
            'alt-rock', 'rock', 'indie', 'indie-pop', 'punk', 'punk-rock',
            'hard-rock', 'metal', 'heavy-metal', 'black-metal', 'death-metal',
            'grunge',
        ],
        'Hip-Hop and R&B': ['hip-hop', 'r-n-b', 'trap'],
        'Pop': ['pop', 'electro-pop', 'synth-pop', 'k-pop', 'pop-film', 'power-pop'],
        'Latin & Reggae/Dancehall': [
            'latin', 'reggaeton', 'salsa', 'samba', 'reggae', 'dancehall',
        ],
        'Funk and Disco': ['funk', 'disco', 'groove'],
    }.items()
    for genre in genres
}

# Work on a copy so the per-genre frame `df` stays available to other cells.
df2 = df.copy()

# Unmapped genres come back as missing from .map() and are then labelled 'Other'.
df2['music_category'] = df2['track_genre'].map(genre_to_category).fillna('Other')

df2 = df2.drop(columns='track_genre')

In [13]:
# Number of rows per music_category value.
df2['music_category'].value_counts()

music_category
Other                       74000
Rock                        12000
Electronic Dance Music      12000
Latin & Reggae/Dancehall     6000
Pop                          4999
Funk and Disco               3000
Hip-Hop and R&B              2000
Name: count, dtype: int64

In [14]:
# KNN on the coarse music_category target (key columns excluded), scanning k in
# powers of two from 2 to 64.
# NOTE(review): per the value counts shown above, 'Other' accounts for 74000 of
# 113999 rows (about 65%), so the accuracies printed here should be read against
# that majority-class baseline rather than against zero.

# Defined here, as in every other modelling cell, so this cell does not depend
# on a value left behind by whichever cell happened to run before it.
random_state = 1

X = df2.drop(['music_category', 'C', 'C#', 'D', 'D#',
       'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'], axis=1)
y = df2['music_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardise with statistics from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

for i in [2, 4, 8, 16, 32, 64]:
    print(f"{i} nearest neighbors")
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

2 nearest neighbors
Accuracy: 0.6275438596491228
4 nearest neighbors
Accuracy: 0.6727631578947368
8 nearest neighbors
Accuracy: 0.6903947368421053
16 nearest neighbors
Accuracy: 0.6925877192982456
32 nearest neighbors
Accuracy: 0.6922368421052632
64 nearest neighbors
Accuracy: 0.6864473684210526


In [145]:
def EuclideanDistance(x1, x2):
    """Return the Euclidean (L2) distance between two equal-shaped vectors."""
    return np.sqrt(np.sum((x1 - x2)**2))


class KNN:
    """Brute-force k-nearest-neighbours classifier with inverse-distance voting.

    Each of the k nearest training points votes for its label with weight
    1/distance; the label with the largest total wins. If the nearest training
    point coincides with the query (distance 0), its label is returned directly.

    Parameters
    ----------
    n_neighbors : int, default 5
        Number of neighbours that vote. Values larger than the training set
        are capped at the training-set size.
    metric : callable, default EuclideanDistance
        ``metric(x, row) -> float`` distance between a query vector and one
        training row. The default metric is evaluated in vectorised form.

    Raises
    ------
    ValueError
        If ``n_neighbors`` is smaller than 1.
    """

    def __init__(self, n_neighbors=5, metric=EuclideanDistance):
        if n_neighbors < 1:
            raise ValueError("n_neighbors must be at least 1")
        self.k = n_neighbors
        self.metric = metric

    def fit(self, X_train, y_train):
        """Store the training data.

        ``X_train`` is a 2-D array-like (rows are samples) and ``y_train`` a
        1-D array-like of labels. Both are converted to NumPy arrays so that
        labels are looked up by position; a pandas Series with a shuffled
        index (e.g. the output of ``train_test_split``) is therefore safe.

        Raises
        ------
        ValueError
            If the inputs are empty or their lengths differ.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if X_train.shape[0] == 0:
            raise ValueError("X_train must contain at least one sample")
        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError("X_train and y_train must have the same number of rows")
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return a 1-D array with one predicted label per row of ``X_test``."""
        # np.asarray makes row iteration work for DataFrames as well; iterating
        # a DataFrame directly would yield column names, not rows.
        X_test = np.asarray(X_test)
        return np.array([self._predict_item(x) for x in X_test])

    def _distances(self, x):
        """Return the distance from ``x`` to every training row as a 1-D array."""
        if self.metric is EuclideanDistance:
            # Vectorised form of the default metric: one NumPy expression
            # instead of one Python call per training row.
            return np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        return np.array([self.metric(x, row) for row in self.X_train], dtype=float)

    def _predict_item(self, x):
        """Predict the label of a single query vector ``x``."""
        distances = self._distances(x)
        n_train = distances.shape[0]
        k = min(self.k, n_train)

        # Select the k smallest distances without sorting the whole array,
        # then order just those k from nearest to farthest.
        if k < n_train:
            candidates = np.argpartition(distances, k - 1)[:k]
        else:
            candidates = np.arange(n_train)
        nearest = candidates[np.argsort(distances[candidates], kind="stable")]

        # An exact match decides the label (and avoids dividing by zero below).
        if distances[nearest[0]] == 0:
            return self.y_train[nearest[0]]

        scores = {}
        for idx in nearest:
            label = self.y_train[idx]
            scores[label] = scores.get(label, 0.0) + 1.0 / distances[idx]
        # On equal scores the label seen first, i.e. the nearest one, wins.
        return max(scores, key=scores.get)

In [146]:
# Evaluate the hand-written KNN class on every feature column (one-hot key
# columns included) for k = 4, 8, 16, 32, 64.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt; a
# brute-force search over the full train/test split is expensive, so consider
# timing it on a small sample of X_test first.
random_state = 1
X = df.drop('track_genre', axis=1)
y = df['track_genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Standardise with statistics from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

for i in [4, 8, 16, 32, 64]:
    # Label each result with its k, as the other modelling cells do; without it
    # the printed accuracies cannot be attributed to a setting.
    print(f"{i} nearest neighbors")
    clf = KNN(n_neighbors=i)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    #print(classification_report(y_test, y_pred))

KeyboardInterrupt: 