In [37]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import os
from os import path
import random
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import timeit

In [10]:
def librosa_analysis(filepath):
    
    """Estimate a track's tempo and its dominant-chroma distribution
    
    input: path of audio file (loaded with librosa.load's default settings)
    
    output: two-element list [tempo, chroma_p]
        tempo: tempo estimate from librosa.beat.beat_track, as a plain float
        chroma_p: list with one value per chroma bin (row of the chromagram);
                  each value is that bin's share of all chromagram entries
                  that are >= 1. All zeros when no entry reaches 1.
    
    """
    
    y, sr = librosa.load(filepath)

    # Chroma features of the signal as loaded (no harmonic/percussive separation is done here)
    chromagram = librosa.feature.chroma_stft(y=y, sr=sr)
    chromaDF = pd.DataFrame(chromagram)

    # Zero every entry below 1 so that only full-intensity entries are counted
    chromaDF[chromaDF < 1] = 0
    chroma_f = chromaDF.sum(axis = 1)

    # Normalise once (the total is loop-invariant) and avoid a 0/0 -> NaN
    # distribution when nothing survived the threshold
    total = chroma_f.sum()
    if total > 0:
        chroma_p = [i / total for i in chroma_f]
    else:
        chroma_p = [0.0] * len(chroma_f)

    # Beat track on the same signal
    tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)

    # Some librosa releases return the tempo as a 1-element array rather than a
    # scalar; collapse it so callers can compare and scale it as a number
    tempo = float(np.atleast_1d(tempo)[0])

    return [tempo, chroma_p]

In [2]:
def plot_chroma_graph(path, x = 10, y = 4, f = 20, title = 'None'):
    
    """Plot chromagram
    
    input: str audio file path
           x, y: figure width and height, passed to plt.figure(figsize=(x, y))
           f: value written to matplotlib's 'font.size' rcParam
           title: plot title; the default 'None' means "use the file path"
    
    output: plot
    
    """
    
    if title == 'None':
        title = path
    
    # Unpack into names that do not collide with the `y` (figure height) parameter
    signal, sample_rate = librosa.load(path)

    # Compute chroma features from the loaded signal
    chroma = librosa.feature.chroma_stft(y=signal, sr=sample_rate)
    
    # Set the font size before any text is drawn so it applies to this figure
    # (the rcParams change persists for later plots, as before)
    plt.rcParams.update({'font.size': f})

    plt.figure(figsize=(x, y))
    librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
    plt.colorbar()
    plt.title(title)
    plt.tight_layout()
    plt.show()

In [3]:
def retrieve_chroma_bpm(path):
    
    """Analyse every non-hidden file in a directory
    
    input: directory path (with or without a trailing separator)
    
    output: DataFrame of track name ('track'), 'bpm', and chroma distribution
            (one column per note, 'C' .. 'B')
    
    """
    
    # Identify all file paths, skipping entries whose name starts with '.'
    track_names = [name for name in os.listdir(path) if not name.startswith('.')]
    # os.path.join inserts the separator when the directory path lacks one
    track_paths = [os.path.join(path, name) for name in track_names]

    # Extract chroma and BPM data from tracks; each result is [tempo, chroma_p]
    results = [librosa_analysis(track_path) for track_path in track_paths]

    bpms = [i[0] for i in results]
    chromas = [i[1] for i in results]

    # Dataframe to store results
    results_df = pd.DataFrame(list(zip(track_names, bpms)), columns = ['track', 'bpm'])
    notes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
    chroma_df = pd.DataFrame(chromas, columns = notes)
    results_df = pd.concat([results_df, chroma_df], axis = 1)

    # Fold tempos toward the 80-160 BPM range: double once below 80, halve once above 160
    results_df.loc[results_df.bpm < 80, 'bpm'] = results_df.loc[results_df.bpm < 80, 'bpm'] * 2
    results_df.loc[results_df.bpm > 160, 'bpm'] = results_df.loc[results_df.bpm > 160, 'bpm'] / 2

    return results_df

In [4]:
def get_spotify_song(search, cid, secret):
    
    """Return audio features of the first Spotify search hit
    
    input: str search query (replace all space with '+', e.g., beyonce+crazy+in+love),
           Spotify client id, Spotify client secret
    
    output: first element of sp.audio_features(...) for the top search result
    
    raises: IndexError when the search returns no tracks
    
    """
    
    client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)
    
    query = sp.search(search)
    items = query['tracks']['items']

    # Same exception type as indexing an empty list, but with a message that
    # says which query came back empty
    if not items:
        raise IndexError('No Spotify track found for search: {!r}'.format(search))

    song_id = items[0]['id']

    audio_feature = sp.audio_features(song_id)[0]
    
    return audio_feature

In [None]:
def get_spotify_metadata(results, cid, secret):
    
    """Query Spotify audio features for a list of songs
    
    input: DataFrame with a 'track' column of audio file names (e.g. the output of
           retrieve_chroma_bpm), Spotify client id, Spotify client secret
    
    output: DataFrame with 'track', 'search_name', the audio-feature fields returned
            by Spotify for the top search hit, and 'result_name' (name of that hit).
            Tracks with no usable hit keep their row with missing feature values.
    
    """
    
    # Copy so the caller's frame is neither modified nor a source of chained-assignment warnings
    tracks = results.loc[:, ['track']].copy()

    # Edit track names for Spotify API format.
    # regex=False: '.mp3' and '(' must be treated as literal text, not patterns.
    replacements = [('.mp3', ''), ('.wav', ''), ('.m4a', ''),
                    ('(', ''), (')', ''), (' - ', ' '), (' ', '+')]
    search_name = tracks['track']
    for old, new in replacements:
        search_name = search_name.str.replace(old, new, regex=False)
    tracks['search_name'] = search_name

    # Query each distinct search string once; repeated names would otherwise
    # multiply rows in the merge below
    search_list = tracks['search_name'].drop_duplicates().tolist()

    client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

    # One record per successful lookup, built only after both API calls succeeded
    # so a failure half-way cannot leave partial data behind
    records = []

    # Fetch track metadata from Spotify
    for search in search_list:

        try:
            query = sp.search(search)
            top_hit = query['tracks']['items'][0]
            audio_feature = sp.audio_features(top_hit['id'])[0]
        except Exception:
            # Best effort: report and move on (Ctrl-C is no longer swallowed)
            print(search, 'not found')
            continue

        if audio_feature is None:
            print(search, 'not found')
            continue

        record = dict(audio_feature)
        record['search_name'] = search
        record['result_name'] = top_hit['name']
        records.append(record)

    if records:
        spotify_results = pd.DataFrame(records)
    else:
        # Nothing found: keep the merge keys so the left merge still works
        spotify_results = pd.DataFrame(columns = ['search_name', 'result_name'])

    # Fold tempos toward the 80-160 BPM range: double once below 80, halve once above 160
    if 'tempo' in spotify_results.columns:
        spotify_results.loc[spotify_results.tempo < 80, 'tempo'] = spotify_results.loc[spotify_results.tempo < 80, 'tempo'] * 2
        spotify_results.loc[spotify_results.tempo > 160, 'tempo'] = spotify_results.loc[spotify_results.tempo > 160, 'tempo'] / 2

    spotify_results_final = tracks.merge(spotify_results, on = 'search_name', how = 'left')

    return spotify_results_final

In [37]:
def get_song_distances(results_df, bpm_threshold = 0.15):
    
    """Calculate chroma distribution difference and bpm increase for every permutation pair
    
    input: merged DataFrame: retrieve_chroma_bpm and get_spotify_metadata (needs the columns
           'track', 'bpm', the 12 note columns 'C'..'B', 'key', 'mode', 'instrumentalness'
           and 'speechiness'), float: bpm filter threshold
    
    output: DataFrame with one row per ordered pair of different tracks whose relative bpm
            difference is below bpm_threshold, sorted by 'dist' (ascending). Columns:
            track_x, track_y, dist (sum of squared chroma differences), bpm_inc
            (|bpm_x - bpm_y| / min(bpm_x, bpm_y)), then key/mode/instrumentalness/speechiness
            for each side (_x, _y).
    
    """
    
    notes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
    meta = ['key', 'mode', 'instrumentalness', 'speechiness']

    # Work on a copy: the helper 'join' key must not leak into the caller's DataFrame
    songs = results_df[['track', 'bpm'] + notes + meta].copy()

    # Perform cartesian join (compare each song to every other song)
    songs['join'] = 1
    pairs = songs.merge(songs, on = 'join', how = 'outer')

    s1_col = [note + '_x' for note in notes]
    s2_col = [note + '_y' for note in notes]

    # Sum of squared chroma differences; nansum skips missing values rather
    # than turning the whole distance into NaN
    sq_diff = (pairs[s1_col].astype(float).values - pairs[s2_col].astype(float).values) ** 2
    pairs['dist'] = np.nansum(sq_diff, axis = 1)

    # Calculate % increase in BPM between songs (relative to the slower one)
    pairs['bpm_inc'] = (pairs['bpm_x'] - pairs['bpm_y']).abs() / pairs[['bpm_x', 'bpm_y']].min(axis = 1)

    # Drop self-pairs by name. Filtering on dist != 0 would also discard
    # different tracks that happen to have identical chroma distributions.
    different_tracks = pairs['track_x'] != pairs['track_y']

    # Remove pairs with large bpm increases
    close_bpm = pairs['bpm_inc'] < bpm_threshold

    out_cols = (['track_x', 'track_y', 'dist', 'bpm_inc']
                + [col + '_x' for col in meta]
                + [col + '_y' for col in meta])

    # Stable sort keeps tied pairs in a deterministic order
    song_dist_bpm = (pairs.loc[different_tracks & close_bpm, out_cols]
                          .sort_values(by = 'dist', kind = 'mergesort')
                          .reset_index(drop = True))

    return song_dist_bpm

In [5]:
#Compare chromagraph distribution between songs
def plot_song_comparison(results_df, track1, track2):
    
    """Plot chroma distribution comparison between two songs
    
    input: DataFrame with a 'track' column and the 12 note columns 'C'..'B'
           (e.g. retrieve_chroma_bpm output), str track1, str track2
    
    output: plot (grouped bar chart, one pair of bars per note)
    
    raises: IndexError when track1 or track2 is not present in results_df
    """
    
    notes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']

    def _chroma_values(track):
        # Select the chroma columns by name so extra or reordered columns in
        # results_df cannot shift the selection
        rows = results_df.loc[results_df.track == track, notes]
        if rows.empty:
            raise IndexError('track not found in results_df: {!r}'.format(track))
        return rows.iloc[0].tolist()

    song1 = _chroma_values(track1)
    song2 = _chroma_values(track2)
    
    fig, ax = plt.subplots()
    x = np.arange(len(notes))
    width = 0.35

    # Two bars per note, offset half a bar width either side of the tick
    ax.bar(x - width/2, song1, width, label = track1)
    ax.bar(x + width/2, song2, width, label = track2)
    
    ax.set_xticks(x)
    ax.set_xticklabels(notes)
    ax.set_ylabel('chroma distribution')
    ax.legend()
    
    plt.show()

In [6]:
def create_tracklist(distances, set_length = 99, song_list = None, start_track = 'NA'):
    
    """ Create tracklist by minimizing chroma distribution difference
    
    Greedy walk: starting from start_track, repeatedly append the not-yet-used
    track with the smallest 'dist' to the current track.
    
    input: output from get_song_distances (DataFrame with track_x, track_y, dist),
           set length int (the default 99 means "as many as there are songs in song_list"),
           subset of songs to evaluate (list; None or empty means every track_x in distances),
           start track name (str; the default 'NA' means "pick one at random from song_list")
    
    output: list of track names; shorter than set_length when the walk reaches a
            track that has no unused candidate left in distances
    
    """
    
    if song_list is None or len(song_list) == 0:
        song_list = distances.track_x.drop_duplicates().values.tolist()
    else:
        song_list = list(song_list)
        
    if start_track == 'NA':
        # Uniform pick (randint(0, n) - 1 would give the last track double weight)
        start_track = random.choice(song_list)
        
    if set_length == 99:
        set_length = len(song_list)

    # Only tracks from song_list may be appended
    pool = distances[distances.track_y.isin(song_list)]
    
    track = start_track
    tracklist = [start_track]

    while len(tracklist) < set_length:
        search = pool[(pool.track_x == track) & 
                      (~pool.track_y.isin(tracklist))].sort_values(by = 'dist')

        # Dead end: no remaining pair from the current track, so stop here
        if search.empty:
            break

        track = search.track_y.values[0]

        tracklist.append(track)
        
    return tracklist

In [28]:
# Directory whose files are compared against the saved results below.
# NOTE(review): absolute path on one machine; consider a configurable base directory.
folder_path = "/Users/briancai/Drive/Music/Secret Smoothie"
# Raw directory listing; names starting with '.' are filtered out in the next cell
entries = os.listdir(folder_path)

In [29]:
# Keep only the listing entries whose first character is not a dot
track_names = [entry for entry in entries if entry[0] != '.']

In [30]:
track_names

['Flume - Never Be Like You (Disclosure Remix).mp3',
 'Disclosure - Latch.mp3',
 "Backstreet Boys - Everybody (Backstreet's Back).mp3",
 'Love Regenerator - CP-1.mp3',
 'Kanye West - Fade.mp3',
 'James Blake  - Limit To Your Love (Club remix).mp3',
 'Mr. Carmack - Pay For What.mp3',
 'Zhu - My Life (Kyle Watson Remix).mp3',
 'Coeus - Rebirth.wav',
 'AC Slater - Fly Kicks (Wax Motif Remix).mp3',
 'Disclosure - Omen (Jonas Rathsman Remix).mp3',
 'Hot Since 82 - Veins.mp3',
 'Vince Staples - So What.mp3',
 'Drake - Nice For What.mp3',
 'Schoolboy Q - Numb Numb Juice.mp3',
 "Caribou - Can't Do Without You (Tale Of Us Remix).mp3",
 'Kanye West - Flashing Lights.mp3',
 'The xx - Tides (Dixon Remix).mp3',
 'Denzel Curry - SPEEDBOAT.mp3',
 'Rihanna - Kiss It Better (Four Tet Remix).mp3',
 'The Weeknd - Snowchild.mp3',
 'The Weeknd - Blinding Lights.mp3',
 'Justin Timberlake - Cry Me a River.mp3',
 'Disclosure - When A Fire Starts To Burn.mp3',
 'Mathame - Magia.wav',
 'Bad Bunny - Estamos Bien

In [31]:
# CSV read into past_results below; presumably the saved librosa + Spotify output of earlier runs.
# NOTE(review): absolute path on one machine; consider a configurable data directory.
result_path = "/Users/briancai/Drive/NU/DJ_recommender/mashup-recommender/data/librosa_spotipy_output.csv"

In [32]:
# Load the table stored at result_path
past_results = pd.read_csv(result_path)

In [33]:
# Track names already present in the loaded results
current_list = past_results['track'].values.tolist()

In [34]:
'Vince Staples - So What.mp3' not in current_list

False

In [42]:
past_results.track.to_list()

['Kanye West - Flashing Lights.mp3',
 'Disclosure - Holding On.mp3',
 'Disclosure - Latch.mp3',
 'Four Tet - Baby.mp3',
 'Kodak Black - Patty Cake.mp3',
 'Chrome Sparks - Wake.mp3',
 'Beyonce - Formation.mp3',
 'Ekali - Unfaith.mp3',
 'Bonobo - Linked.mp3',
 'Against All Logic - You Are Going to Love Me and Scream.mp3',
 'The Weeknd - Blinding Lights.mp3',
 'The Weeknd - Snowchild.mp3',
 'AC Slater - Fly Kicks (Wax Motif Remix).mp3',
 'Cardi B - I Like It (Dillon Francis Remix).mp3',
 'Disclosure - When A Fire Starts To Burn.mp3',
 'Chris Lake - I Want You.mp3',
 'Chris Lake - Y.O.D.O.mp3',
 'Claptone - Cream.mp3',
 'Drake - Nice For What.mp3',
 'FISHER - Losing It.mp3',
 'Corona - Rhythm Of The Night (Blonde Remix).mp3',
 'Disclosure - Omen (Jonas Rathsman Remix).mp3',
 'Disclosure - Stimulation.mp3',
 'Meek Mill - Going Bad.mp3',
 'Dusky - Perth Remix.mp3',
 'Ginuwine - Pony.mp3',
 'FISHER - You Little Beauty.mp3',
 'Dillon Francis - Get Low.mp3',
 "Flume - Holdin' On.mp3",
 'Flume -

In [35]:
# Files in the folder that have no row in the loaded results yet
new_tracks = [name for name in track_names if name not in current_list]

In [36]:
new_tracks

['Coeus - Rebirth.wav', 'Mathame - Magia.wav']

In [38]:
# Full paths of the tracks that still need to be analysed
track_review = [path.join(folder_path, name) for name in new_tracks]

In [39]:
track_review

['/Users/briancai/Drive/Music/Secret Smoothie/Coeus - Rebirth.wav',
 '/Users/briancai/Drive/Music/Secret Smoothie/Mathame - Magia.wav']

In [40]:
def extract_chroma_bpm(dir_path, track_list):
    
    """Analyse a chosen list of audio files inside one directory
    
    input: directory path, list of file names (relative to that directory) to analyse
    
    output: DataFrame of track name ('track', taken from track_list), 'bpm', and
            chroma distribution (one column per note, 'C' .. 'B')
    
    """
    
    # Identify all file paths.
    # os.path.join is spelled out because a local variable named `path` would
    # shadow the imported `path` module for the whole function body.
    track_paths = [os.path.join(dir_path, name) for name in track_list]

    # Extract chroma and BPM data from tracks; each result is [tempo, chroma_p]
    results = [librosa_analysis(track_path) for track_path in track_paths]

    bpms = [i[0] for i in results]
    chromas = [i[1] for i in results]

    # Dataframe to store results, labelled with the names that were actually analysed
    results_df = pd.DataFrame(list(zip(track_list, bpms)), columns = ['track', 'bpm'])
    notes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
    chroma_df = pd.DataFrame(chromas, columns = notes)
    results_df = pd.concat([results_df, chroma_df], axis = 1)

    # Fold tempos toward the 80-160 BPM range: double once below 80, halve once above 160
    results_df.loc[results_df.bpm < 80, 'bpm'] = results_df.loc[results_df.bpm < 80, 'bpm'] * 2
    results_df.loc[results_df.bpm > 160, 'bpm'] = results_df.loc[results_df.bpm > 160, 'bpm'] / 2

    return results_df