# Spotify Automatic Track Organization

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json
import time
import spotipy
import os

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from spotipy.oauth2 import SpotifyOAuth
from credentials import *

## Initializing `Spotipy` Access

In [None]:
# OAuth scope requested from Spotify for this session.
scope = "user-library-read"

# Build the OAuth manager first, then hand it to the client. The SPOTIPY_*
# constants are presumably supplied by the star import from `credentials`.
auth_manager = SpotifyOAuth(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
    redirect_uri=SPOTIPY_REDIRECT_URI,
    scope=scope,
)
sp = spotipy.Spotify(auth_manager=auth_manager)

## Utility Functions

In [None]:
def extract_playlists_from_user():
    """
    Returns a dictionary of playlist URIs and their corresponding titles.

    The Spotify API returns playlists one page at a time, so every page is
    walked with `sp.next` until no further page is reported; reading only the
    first page would silently drop the remaining playlists.
    """
    d = {}
    page = sp.current_user_playlists()
    while page:
        for playlist in page['items']:
            d[playlist["uri"]] = playlist['name']
        # `sp.next` returns None once the current page has no successor.
        page = sp.next(page)

    return d

In [None]:
def extract_songs_from_playlist(uri):
    """
    Returns a dictionary of track IDs and their corresponding names given a playlist.

    All pages of the playlist are read (not only the first one). Entries that
    carry no track object or no track id are skipped, because they cannot be
    looked up by id afterwards.
    """
    playlist_d = {}
    page = sp.playlist_tracks(uri)
    while page:
        for song in page['items']:
            track = song.get('track')
            # Presumably removed or local-file entries; verify against the API docs.
            if not track or track.get('id') is None:
                continue
            playlist_d[track['id']] = track['name']
        # `sp.next` returns None once the current page has no successor.
        page = sp.next(page)

    return playlist_d

In [None]:
def create_playlist_df(songs: dict):
    """
    Returns a DataFrame representation of a given playlist.

    Parameters
    ----------
    songs : dict
        Mapping of track id -> track title.

    Returns
    -------
    pd.DataFrame
        One row per track, indexed by track title, holding the audio features
        returned by Spotify. Tracks for which no features are returned are
        kept as all-NaN rows.
    """
    column_labels = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 
                 'analysis_url', 'duration_ms', 'time_signature']

    if not songs:
        return pd.DataFrame(columns=column_labels)

    # The audio-features endpoint accepts up to 100 ids per request, so the
    # tracks are fetched in batches instead of one request per track.
    batch_size = 100
    track_ids = list(songs.keys())
    titles = list(songs.values())

    rows = []
    index = []
    for start in range(0, len(track_ids), batch_size):
        batch_ids = track_ids[start:start + batch_size]
        batch_titles = titles[start:start + batch_size]
        # NOTE(review): a missing (None) response is treated as an empty batch
        # rather than raising; its tracks are then left out of the result.
        batch_features = sp.audio_features(batch_ids) or []
        for title, features in zip(batch_titles, batch_features):
            rows.append(features if features is not None else {})
            index.append(title)

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # appending row by row is quadratic anyway.
    features_df = pd.DataFrame(rows, index=index)
    extra_columns = [col for col in features_df.columns if col not in column_labels]
    features_df = features_df.reindex(columns=column_labels + extra_columns)

    return features_df

In [None]:
def create_df_from_uri(uri):
    """
    Convenience wrapper: fetch the songs of the playlist identified by `uri`
    and turn them straight into a feature DataFrame.
    """
    return create_playlist_df(extract_songs_from_playlist(uri))

In [None]:
def clean_raw_df(df: pd.DataFrame, pipe=True) -> pd.DataFrame:
    """
    Returns a clean version of the raw DataFrame.

    Steps, in order:
      1. keep only the first row for every repeated index label,
      2. when `pipe` is true, drop the identifier/link columns,
      3. keep only rows whose `time_signature` is greater than zero,
      4. relabel `mode` values 1.0 / 0.0 as "major" / "minor".
    """
    metadata_columns = ["track_href", "id", "type", "analysis_url", "uri"]

    is_repeat = df.index.duplicated(keep='first')
    cleaned = df.loc[~is_repeat]

    if pipe:
        cleaned = cleaned.drop(columns=metadata_columns)

    has_valid_signature = cleaned['time_signature'] > 0.0
    cleaned = cleaned.loc[has_valid_signature]

    mode_names = {1.0: "major", 0.0: "minor"}
    return cleaned.replace({"mode": mode_names})

In [None]:
def one_hot(df: pd.DataFrame) -> pd.DataFrame:
    """
    One hot encodes the Mode, Time Signature and Key features.

    The three categorical columns are replaced by their indicator columns;
    all other columns are returned unchanged.
    """
    categorical = ['mode', "time_signature", "key"]

    oh_enc = OneHotEncoder()
    encoded = oh_enc.fit_transform(df[categorical])

    # `get_feature_names` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    if hasattr(oh_enc, "get_feature_names_out"):
        feature_names = oh_enc.get_feature_names_out(categorical)
    else:
        feature_names = oh_enc.get_feature_names(categorical)

    # `toarray` yields a plain ndarray (`todense` returns the legacy np.matrix).
    dummies = pd.DataFrame(encoded.toarray(), columns=feature_names, index=df.index)
    return df.join(dummies).drop(categorical, axis=1)

In [None]:
def center(df: pd.DataFrame) -> pd.DataFrame:
    """
    Returns a design matrix for PCA from a given DataFrame.

    Each column has its own mean subtracted, so every column of the result
    has mean zero.
    """
    # Ask for column means explicitly: `np.mean(df)` collapses the whole frame
    # to a single scalar on pandas >= 2.0, which would not center per column.
    means = df.mean(axis=0)
    centered = df - means

    return centered

In [None]:
def design_matrix(df: pd.DataFrame) -> "tuple[np.ndarray, pd.Series]":
    """
    Returns the design matrix of the DataFrame together with its labels.

    Returns
    -------
    X
        Every column except `playlist`, standardized by `StandardScaler`.
    y
        The `playlist` column.
    """
    X = df.drop(['playlist'], axis=1)
    # StandardScaler already removes each column's mean, so no separate
    # centering step is needed (and `np.mean` on a DataFrame changes meaning
    # between pandas versions).
    X = StandardScaler().fit_transform(X.to_numpy())
    y = df['playlist']
    return X, y

## Save Playlist Feature Data

In [None]:
# `playlists` is not assigned anywhere earlier in the notebook, so fetch the
# current user's playlists (URI -> name) here before iterating over them.
playlists = extract_playlists_from_user()

# Build one feature DataFrame per playlist, save it as CSV and keep it in memory.
# NOTE(review): each CSV is written to the current working directory under the
# bare playlist name, while the loading cell reads from "../data/playlists" --
# confirm the files are moved there (or adjust the path) before loading.
df_dict = {}
for uri, name in playlists.items():
    new_df = create_df_from_uri(uri)
    new_df.to_csv(name, index=True)
    df_dict[name] = new_df

In [None]:
column_labels = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                 'instrumentalness', 'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 
                 'analysis_url', 'duration_ms', 'time_signature', 'playlist']

# Load every saved playlist CSV and tag its rows with the file name it came from.
playlist_dir = "../data/playlists"
frames = []
# Sorted so the row order does not depend on the file system's listing order.
for playlist in sorted(os.listdir(playlist_dir)):
    path = os.path.join(playlist_dir, playlist)
    # Skip sub-directories (e.g. notebook checkpoint folders); only files are CSVs.
    if not os.path.isfile(path):
        continue
    features = pd.read_csv(path, index_col=0)
    features['playlist'] = playlist
    frames.append(features)

if frames:
    # Concatenate once: DataFrame.append was removed in pandas 2.0.
    features_df = pd.concat(frames, sort=False)
    extra_columns = [col for col in features_df.columns if col not in column_labels]
    features_df = features_df.reindex(columns=column_labels + extra_columns)
else:
    features_df = pd.DataFrame(columns=column_labels)

features_df