# Spotify Automatic Track Organization

In [92]:
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import pandas as pd
import seaborn as sns
import json
import time
import spotipy
import os

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

from spotipy.oauth2 import SpotifyOAuth
from credentials import *

## Initializing `Spotipy` Access

In [2]:
scope = "user-library-read"

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope, 
    client_id=SPOTIPY_CLIENT_ID, 
    client_secret=SPOTIPY_CLIENT_SECRET,
    redirect_uri=SPOTIPY_REDIRECT_URI))

## Utility Functions

In [5]:
def extract_playlists_from_user():
    """
    Returns a dictionary of playlist URIs and their corresponding titles.
    """
    playlists = sp.current_user_playlists()
    return {playlist['uri']: playlist['name'] for playlist in playlists['items']}

In [7]:
def extract_songs_from_playlist(uri):
    """
    Returns a dictionary of track URIs and their corresponding names given a playlist.
    """
    playlist = sp.playlist_tracks(uri)
    
    return {song['track']['id']: song['track']['name'] for song in playlist['items']}

In [10]:
def create_playlist_df(songs: dict):
    """
    Returns a DataFrame representation of a given playlist.
    """
    column_labels = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 
                 'analysis_url', 'duration_ms', 'time_signature']

    features_df = pd.DataFrame(columns=column_labels)
    for (uri, title) in songs.items():
        features = sp.audio_features([uri])[0]
        features_df = features_df.append(pd.DataFrame(data=features, index=[title]), sort=False)
    
    return features_df

In [12]:
def create_df_from_uri(uri):
    """
    Ensemble function that combines songs extraction and DataFrame creation.
    """
    
    return create_playlist_df(extract_songs_from_playlist(uri))

In [144]:
def clean_raw_df(df: pd.DataFrame, pipe=True) -> pd.DataFrame:
    """Returns a clean version of the raw DataFrame"""
    df = df[~df.index.duplicated(keep='first')]
    if pipe:
        df = df.drop(["track_href", "id", "type", "analysis_url"], axis=1)
    df = df[df['time_signature'] > 0.0]
    df = df.replace({"mode": {1.0:"major", 0.0: "minor"}})
    df['duration_ms'] = df['duration_ms'].astype(float)

    return df

In [136]:
def one_hot(df: pd.DataFrame) -> pd.DataFrame:
    """One hot encodes the Mode and Time Signature Features"""
    oh_enc = OneHotEncoder()
    oh_enc.fit(df[['mode', "time_signature", "key"]])
    dummies = pd.DataFrame(oh_enc.transform(df[['mode', "time_signature", "key"]]).todense(), 
                           columns=oh_enc.get_feature_names(['mode', "time_signature", "key"]),
                           index = df.index)
    return df.join(dummies).drop(['mode', "time_signature", "key"], axis=1)

In [137]:
def center(df: pd.DataFrame) -> pd.DataFrame:
    """ Returns a design matrix for PCA from a given DataFrame"""
    means = np.mean(df)
    centered = df - means
    
    return centered

In [138]:
def design_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Returns the design matric of the DataFrame"""
    X = df.drop(['playlist'], axis=1)
    X = StandardScaler().fit_transform((X - np.mean(X)).to_numpy())
    y = df['playlist']
    return X, y

## Save Playlist Feature Data

In [59]:
# Load in dictionary of playlists
playlists = extract_playlists_from_user()
name_to_uri = {y:x for x,y in playlists.items()}

In [20]:
df_dict = {}
for (uri, name) in playlists.items():
    new_df = create_df_from_uri(uri)
    new_df.to_csv(f"../data/playlists/{name}", index=True)
    df_dict[name] = new_df

## Read in Playlist Data

In [139]:
column_labels = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 
                 'analysis_url', 'duration_ms', 'time_signature', 'playlist']

features_df = pd.DataFrame(columns=column_labels)
for playlist in os.listdir("../data/playlists"):
    if playlist == '.DS_Store':
        continue
    features = pd.read_csv(f"../data/playlists/{playlist}", index_col=0)
    features['playlist'] = name_to_uri[playlist]
    features_df = features_df.append(features, sort=False)
    
features_df = features_df.set_index('uri')
features_df.head()

Unnamed: 0_level_0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,track_href,analysis_url,duration_ms,time_signature,playlist
uri,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
spotify:track:7B0ofCHeQu2FoX7P28ZaPp,0.253,0.0118,8,-33.838,1,0.042,0.952,0.614,0.245,0.149,82.843,audio_features,7B0ofCHeQu2FoX7P28ZaPp,https://api.spotify.com/v1/tracks/7B0ofCHeQu2F...,https://api.spotify.com/v1/audio-analysis/7B0o...,316167,3,spotify:playlist:5cKX7ojWfXgfUO8NqnL51T
spotify:track:2tr4oclswJ6v3dfDlI01HD,0.3,0.212,6,-21.017,1,0.0333,0.847,0.861,0.062,0.0553,199.617,audio_features,2tr4oclswJ6v3dfDlI01HD,https://api.spotify.com/v1/tracks/2tr4oclswJ6v...,https://api.spotify.com/v1/audio-analysis/2tr4...,239769,3,spotify:playlist:5cKX7ojWfXgfUO8NqnL51T
spotify:track:7vbCfkJf89i4s745KbELgr,0.359,0.0224,3,-29.089,1,0.039,0.987,0.866,0.111,0.169,107.849,audio_features,7vbCfkJf89i4s745KbELgr,https://api.spotify.com/v1/tracks/7vbCfkJf89i4...,https://api.spotify.com/v1/audio-analysis/7vbC...,354989,3,spotify:playlist:5cKX7ojWfXgfUO8NqnL51T
spotify:track:25zykbJGBYXYjnq5VyhO3N,0.409,0.0523,1,-25.613,1,0.0343,0.993,0.943,0.117,0.344,145.833,audio_features,25zykbJGBYXYjnq5VyhO3N,https://api.spotify.com/v1/tracks/25zykbJGBYXY...,https://api.spotify.com/v1/audio-analysis/25zy...,227184,3,spotify:playlist:5cKX7ojWfXgfUO8NqnL51T
spotify:track:4RURs7dAXGYoqMKMbp4qxw,0.463,0.00629,2,-30.121,0,0.0364,0.98,0.882,0.0578,0.175,105.69,audio_features,4RURs7dAXGYoqMKMbp4qxw,https://api.spotify.com/v1/tracks/4RURs7dAXGYo...,https://api.spotify.com/v1/audio-analysis/4RUR...,220000,3,spotify:playlist:5cKX7ojWfXgfUO8NqnL51T


## Data Pre-processing

In [145]:
cleaned = clean_raw_df(features_df)
X, y = design_matrix(one_hot(clean_raw_df(features_df)))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

print("Train-Test Split", '\n----------')
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}\n")
print(f"Response Training Vector Shape: {y_train.shape}")
print(f"Response Test Vector Shape: {y_test.shape}")

Train-Test Split 
----------
Training Data Shape: (662, 28)
Testing Data Shape: (74, 28)

Response Training Vector Shape: (662,)
Response Test Vector Shape: (74,)


In [146]:
def score_model(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [147]:
clf = RandomForestClassifier()
knn = KNeighborsClassifier()
nn = MLPClassifier(max_iter=1000, solver='lbfgs')

In [148]:
for model in [clf, knn, nn]:
    print(f"Model: {model}, \nScore: {score_model(model, X_train, y_train, X_test, y_test)}\n")

Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False), 
Score: 0.2972972972972973

Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform'), 
Score: 0.21621621621621623

Model: MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), le

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


## Plotting

In [157]:
xx, yy = design_matrix(one_hot(clean_raw_df(features_df)))

In [174]:
new_df = one_hot(clean_raw_df(features_df))
xx, yy = design_matrix(new_df)

pca = PCA(n_components=3)

principal_components = pca.fit_transform(xx)

target_df_pca = pd.DataFrame(data=principal_components, columns=['PC_1', 'PC_2', 'PC_3'], index=new_df.index)
joined_df = new_df.join(target_df_pca)

In [175]:
joined_df['playlist_name'] = [playlists[id_] for id_ in joined_df['playlist']]

In [178]:
fig = px.scatter_3d(joined_df, x='PC_1', y='PC_2', z='PC_3', hover_name=joined_df.index, color="playlist_name")
fig.update_layout(
    height=600,
    width=900,
    title_text='PCA of Spotify Tracks'
)
fig.show()