In [1]:
from secrets import *

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth

import numpy as np
import pandas as pd
import seaborn as sns
import json
import time
import matplotlib.pyplot as plt
%matplotlib inline

# Select matplotlib's 'ggplot' style sheet for figures drawn after this point.
plt.style.use('ggplot')

In [84]:
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

In [3]:
# OAuth scope requested from Spotify for this session.
# NOTE(review): private playlists may additionally need "playlist-read-private" - confirm.
scope = "user-library-read"

# Authenticated client shared by every helper below; the credentials are the
# names brought in by the star-import at the top of the notebook.
sp = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=SPOTIPY_CLIENT_ID,
        client_secret=SPOTIPY_CLIENT_SECRET,
        redirect_uri=SPOTIPY_REDIRECT_URI,
        username=CLIENT_USERNAME,
        scope=scope,
    )
)

In [4]:
def extract_playlists_from_user():
    """
    Returns a dictionary of playlist URIs and their corresponding titles.

    The playlists endpoint is paginated, so every page is followed with
    ``sp.next`` instead of reading only the first page of results.

    Returns:
        dict: ``{playlist_uri: playlist_name}`` for the playlists of the
        user the module-level ``sp`` client is authenticated as.
    """
    playlists_by_uri = {}
    page = sp.current_user_playlists()
    while page:
        for playlist in page['items']:
            playlists_by_uri[playlist["uri"]] = playlist['name']
        # The paging object's 'next' link is empty on the last page.
        page = sp.next(page) if page.get('next') else None

    return playlists_by_uri

In [5]:
def extract_songs_from_playlist(uri):
    """
    Returns a dictionary of track ids and their corresponding names given a playlist.

    Every page of the playlist is read (the endpoint is paginated), and
    entries without a usable track id are skipped, since they cannot be
    looked up later.

    Args:
        uri: playlist identifier passed straight to ``sp.playlist_tracks``.

    Returns:
        dict: ``{track_id: track_name}``.
    """
    playlist_d = {}
    page = sp.playlist_tracks(uri)
    while page:
        for song in page['items']:
            track = song.get('track')
            # 'track' can be None (e.g. removed tracks) and 'id' can be
            # missing; previously this raised a TypeError or stored a None key.
            if not track or not track.get('id'):
                continue
            playlist_d[track['id']] = track['name']
        # The paging object's 'next' link is empty on the last page.
        page = sp.next(page) if page.get('next') else None

    return playlist_d

In [6]:
def create_playlist_df(songs: dict):
    """
    Returns a DataFrame representation of a given playlist.

    Args:
        songs: mapping of track id -> track title, as produced by
            ``extract_songs_from_playlist``.

    Returns:
        pd.DataFrame: one row per track that has audio features, indexed by
        track title, with the audio-feature fields as columns (the known
        columns first, in a fixed order; any unexpected fields after them).
        Tracks for which the API returns no features are left out.
    """
    column_labels = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness',
                     'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href',
                     'analysis_url', 'duration_ms', 'time_signature']
    # The audio-features endpoint accepts a list of ids per request, so the
    # tracks are fetched in batches rather than one HTTP request each.
    batch_size = 100

    track_ids = [track_id for track_id in songs if track_id]
    titles = []
    records = []
    for start in range(0, len(track_ids), batch_size):
        batch = track_ids[start:start + batch_size]
        batch_features = sp.audio_features(batch) or []
        for track_id, features in zip(batch, batch_features):
            # A None entry means no features were returned for that track.
            if features is None:
                continue
            titles.append(songs[track_id])
            records.append(features)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame again for every appended row.
    features_df = pd.DataFrame(records, index=titles)
    extra_columns = [col for col in features_df.columns if col not in column_labels]

    return features_df.reindex(columns=column_labels + extra_columns)

In [8]:
def create_df_from_uri(uri):
    """
    Convenience wrapper: fetch the songs of the playlist identified by ``uri``
    and turn them into an audio-features DataFrame in a single call.
    """
    return create_playlist_df(extract_songs_from_playlist(uri))

In [65]:
def clean_raw_df(df: pd.DataFrame, pipe=True) -> pd.DataFrame:
    """
    Returns a clean version of the raw DataFrame.

    Steps, in order: keep only the first row for each repeated index label,
    optionally drop the identifier/link columns (when ``pipe`` is true), keep
    rows whose ``time_signature`` is positive, and relabel ``mode`` values
    1.0/0.0 as "major"/"minor".
    """
    link_columns = ["track_href", "id", "type", "analysis_url", "uri"]
    mode_labels = {"mode": {1.0:"major", 0.0: "minor"}}

    first_occurrence = ~df.index.duplicated(keep='first')
    cleaned = df[first_occurrence]
    if pipe:
        cleaned = cleaned.drop(link_columns, axis=1)

    positive_signature = cleaned['time_signature'] > 0.0
    cleaned = cleaned[positive_signature]

    return cleaned.replace(mode_labels)

In [66]:
def one_hot(df: pd.DataFrame) -> pd.DataFrame:
    """
    One hot encodes the mode, time signature and key features.

    Args:
        df: frame containing the 'mode', 'time_signature' and 'key' columns.

    Returns:
        pd.DataFrame: ``df`` with those three columns replaced by one
        indicator column per observed category, named ``<feature>_<value>``.
    """
    categorical = ['mode', "time_signature", "key"]

    oh_enc = OneHotEncoder()
    encoded = oh_enc.fit_transform(df[categorical])

    # get_feature_names was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out; fall back to the old name on older releases.
    feature_names = getattr(oh_enc, "get_feature_names_out", None) or oh_enc.get_feature_names

    # toarray() gives a plain ndarray (todense() returns the legacy np.matrix).
    dummies = pd.DataFrame(encoded.toarray(),
                           columns=feature_names(categorical),
                           index=df.index)
    return df.join(dummies).drop(categorical, axis=1)

In [67]:
def center(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a column-centered copy of the DataFrame (a design matrix for PCA).

    Each column has its own mean subtracted, so every column of the result
    has mean zero.
    """
    # Use an explicit per-column mean: np.mean(df) forwards axis=None, which
    # pandas >= 2.0 interprets as one scalar mean over the whole frame.
    means = df.mean(axis=0)
    centered = df - means

    return centered

In [74]:
def design_matrix(df: pd.DataFrame) -> tuple:
    """
    Returns the design matrix and labels of the DataFrame.

    Args:
        df: frame with a 'playlist' label column; all other columns are
            treated as features.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the array returned by
        ``StandardScaler().fit_transform`` on the column-centered features
        and ``y`` is the 'playlist' column.
    """
    X = df.drop(['playlist'], axis=1)
    # Center with an explicit per-column mean: np.mean(X) forwards axis=None,
    # which pandas >= 2.0 interprets as one scalar mean over the whole frame.
    X = StandardScaler().fit_transform((X - X.mean(axis=0)).to_numpy())
    y = df['playlist']
    return X, y

In [75]:
# Column layout of the per-playlist CSV files, plus the 'playlist' label added below.
column_labels = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 
                 'analysis_url', 'duration_ms', 'time_signature', 'playlist']

# Read one frame per playlist file and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.
playlist_frames = []
# sorted(): os.listdir returns entries in arbitrary order, and the row order
# determines which rows the seeded train/test split selects.
for playlist in sorted(os.listdir("playlists")):
    playlist_path = os.path.join("playlists", playlist)
    # Skip sub-directories and hidden entries (e.g. checkpoint folders).
    if playlist.startswith(".") or not os.path.isfile(playlist_path):
        continue
    features = pd.read_csv(playlist_path, index_col=0)
    # The file name doubles as the playlist label.
    features['playlist'] = playlist
    playlist_frames.append(features)

if playlist_frames:
    features_df = pd.concat(playlist_frames, sort=False)
    # Known columns first, in their fixed order; unexpected ones afterwards.
    extra_columns = [col for col in features_df.columns if col not in column_labels]
    features_df = features_df.reindex(columns=column_labels + extra_columns)
else:
    features_df = pd.DataFrame(columns=column_labels)

features_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,playlist
Meditate (feat. J.I.D.),0.743,0.679,10,-8.295,0,0.286,0.0757,0.0,0.347,0.618,147.752,audio_features,0Eqm7hD828cATBLUx2fJox,spotify:track:0Eqm7hD828cATBLUx2fJox,https://api.spotify.com/v1/tracks/0Eqm7hD828cA...,https://api.spotify.com/v1/audio-analysis/0Eqm...,278084,4,"String playlistName = ""Workout"";"
Money Trees,0.716,0.531,7,-7.355,1,0.122,0.0703,0.0,0.224,0.344,71.994,audio_features,2HbKqm4o0w5wEeEFXm2sD4,spotify:track:2HbKqm4o0w5wEeEFXm2sD4,https://api.spotify.com/v1/tracks/2HbKqm4o0w5w...,https://api.spotify.com/v1/audio-analysis/2HbK...,386907,4,"String playlistName = ""Workout"";"
LIFE,0.901,0.497,7,-9.236,1,0.266,0.0192,0.0,0.109,0.463,120.025,audio_features,3au0pvHqcTtszysswan6AO,spotify:track:3au0pvHqcTtszysswan6AO,https://api.spotify.com/v1/tracks/3au0pvHqcTts...,https://api.spotify.com/v1/audio-analysis/3au0...,162750,4,"String playlistName = ""Workout"";"
"Costa Rica (with Bas & JID feat. Guapdad 4000, Reese LAFLARE, Jace, Mez, Smokepurpp, Buddy & Ski Mask The Slump God)",0.642,0.647,7,-6.312,0,0.428,0.517,0.0,0.0649,0.664,120.749,audio_features,5WnfnSpuNEGXyEt78PBA6d,spotify:track:5WnfnSpuNEGXyEt78PBA6d,https://api.spotify.com/v1/tracks/5WnfnSpuNEGX...,https://api.spotify.com/v1/audio-analysis/5Wnf...,217867,4,"String playlistName = ""Workout"";"
151 Rum,0.756,0.867,7,-3.626,1,0.162,0.155,0.00285,0.33,0.666,129.983,audio_features,22WV03i2lBbwNVCE1g671p,spotify:track:22WV03i2lBbwNVCE1g671p,https://api.spotify.com/v1/tracks/22WV03i2lBbw...,https://api.spotify.com/v1/audio-analysis/22WV...,156780,4,"String playlistName = ""Workout"";"


In [100]:
# Clean the combined playlist frame, then one-hot encode its categorical columns.
df = clean_raw_df(features_df).pipe(one_hot)

In [102]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['playlist']),
    df['playlist'],
    test_size=0.10,
    random_state=42,
)

In [105]:
# k-nearest-neighbours classifier that votes among the 3 closest training rows.
knn = KNeighborsClassifier(n_neighbors=3)

In [106]:
# Fit on the training split; the playlist name is the class label.
# NOTE(review): the features are passed unscaled, so large-valued columns such as
# duration_ms presumably dominate the distance - consider standardising first.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [94]:
# Export every playlist to its own CSV file and keep the frames in memory.
# NOTE(review): `playlists` is not defined in any visible cell; presumably it is
# the {uri: name} dict returned by extract_playlists_from_user() - confirm it is
# created before this cell runs.

# Write into the "playlists" directory, which is where the loading cell reads
# the CSV files from (previously they were written to the working directory).
os.makedirs("playlists", exist_ok=True)

df_dict = {}
for uri, name in playlists.items():
    new_df = create_df_from_uri(uri)
    # A path separator inside a playlist name would point to_csv at a
    # directory that does not exist.
    file_name = name.replace("/", "_").replace(os.sep, "_")
    new_df.to_csv(os.path.join("playlists", file_name), index=True)
    df_dict[name] = new_df