# Song Recommendation System Based on Distance Matric

1. Extract data from Million Song Data Subset including song pitch, song timbre and loudeness which are major auditory attributes of musical tones.
2. Two variable combinations are used to compare recommendation results. One has only pitch, timbre and loudness, another add key and time_signature.
3. Three different distance matrics, Mahhatan Distance, Euclidean Ditance and Cosine Similarity were used to calculate the similarity between different songs.
4. Users input their favourite songs and our program search n-closest neighbors of every input song and find all the overlapped songs. The top 5 most frequency songs will be selected for users. 

In [1]:
import pandas as pd
import numpy  as np
import sklearn.neighbors as sn
import os
import re
import itertools as it
import operator 
import functools

In [2]:
def get_filenames(path):
    return([get_filenames(path+"/"+entry.name)
            if entry.is_dir() 
            else path+"/"+entry.name 
            for entry 
            in os.scandir(path)
           ])

def unlist(alist):
    return(list(it.chain.from_iterable(alist)
               )
          )

def var_list(base,numof):
    return([base+str(ndx) for ndx in range(numof)]
          )

def h1d_array(in_array,n): 
    # n1d is the number of elements in `in_array`
    n1d = functools.reduce(operator.mul,
                           list(in_array.shape))
    # return a 1 row 2D array with `n` columns
    b = np.ndarray(shape=(1,n1d),
                   buffer=in_array,
                   dtype=in_array.dtype
                  )[0:1,0:n]
    return(b)

## Using make_1row_df to extract data and make a data frame

In [3]:
def make_1row_df(filename='', metadata_vars=[], analysis_vars=[], remove=False):
    # open `filename` as a HDF5 file
    store = pd.HDFStore(filename,"r")
    if remove==True:
        # `metadata_vars` and `analysis_vars` contain the variables to remove
        metadata_vars = list({item for item 
                                  in list(store.root.metadata.songs.read().dtype.names) 
                                  if item not in metadata_vars})
        analysis_vars = list({item for item 
                                  in list(store.root.analysis.songs.read().dtype.names) 
                                  if item not in analysis_vars})
    # else: `metadata_vars` and `analysis_vars` contain the variables to keep
    
    # retrieve the first `n` values as a horizontal array of 1 dimension
    segments_pitches = h1d_array(store.root.analysis.segments_pitches.read(),60)
    segments_timbre  = h1d_array(store.root.analysis.segments_timbre.read(),60)
    bars_confidence  = h1d_array(store.root.analysis.bars_confidence.read(),10)
    artist_terms     = h1d_array(store.root.metadata.artist_terms.read(),3)
    
    # store these values as variables in single dataframes
    at_df = pd.DataFrame(artist_terms    ,columns=var_list('at_',artist_terms    .shape[1]))
    bc_df = pd.DataFrame(bars_confidence ,columns=var_list('bc_',bars_confidence .shape[1]))
    sp_df = pd.DataFrame(segments_pitches,columns=var_list('sp_',segments_pitches.shape[1]))
    st_df = pd.DataFrame(segments_timbre ,columns=var_list('st_',segments_timbre .shape[1]))
    
    # merge these single dataframes into one single row dataframe
    ret = pd.concat([
            # retrieve a single row dataframe from `/metadata/songs`
            pd.DataFrame(store.root.metadata.songs.read(), 
                         columns=metadata_vars),
            # retrieve a single row dataframe from `/analysis/songs`
            pd.DataFrame(store.root.analysis.songs.read(), 
                         columns=analysis_vars),
            #at_df, 
            bc_df, 
            sp_df,
            st_df],
            axis=1) # `axes=1` means stack the dataframes horizontally 
    # close the HDF5 file
    store.close()
    # return the merged dataframe
    return(ret)

In [4]:
path = "D:\\millionsongsubset_full\\MillionSongSubset\\data"
filenames = unlist(unlist(unlist(get_filenames(path))))

mss_df_list = [make_1row_df(filename=filename,
                            metadata_vars=['artist_familiarity','artist_hotttnesss',
                                           'song_hotttnesss','title',
                                           'artist_name',
                                           'artist_location','release',
                                           'artist_longitude','artist_latitude',
                                           'artist_id','song_id','track_id'],
                            # Omit: genre
                            analysis_vars=['duration','key','loudness','mode',
                                           'tempo','time_signature'],
                            # Omit: danceability, energy
                            remove=False
                           )
                for filename in filenames[0:10000] # get data from all 10,000 files
              ]

mss_df = pd.concat(mss_df_list,axis=0).reset_index(drop=True)
save_load_path = 'D:\\ML755'
mss_df.to_pickle(save_load_path+'\\mss_df_sr.pk60')


In [5]:
save_load_path = 'D:\\ML755'
mss_df = pd.read_pickle(save_load_path+'\\mss_df_sr.pk60')


mss_df['mode']            = mss_df['mode']           .astype('float64')
mss_df['key']             = mss_df['key']            .astype('category')
mss_df['time_signature']  = mss_df['time_signature'] .astype('category')


mss_df = pd.get_dummies(mss_df, columns=['key','time_signature'], prefix=['k','ts'])



In [6]:
num_rows = 10000
mss_song_artist = pd.concat([mss_df.loc[:num_rows, 'title'],
                            mss_df.loc[:num_rows, 'artist_name'],
                            mss_df.loc[:num_rows,'artist_id'],
                            mss_df.loc[:num_rows,'song_id']
                       ],
                       axis=1
                       )

mss_song_artist

Unnamed: 0,title,artist_name,artist_id,song_id
0,"b""I Didn't Mean To""",b'Casual',b'ARD7TVE1187B99BFB1',b'SOMZWCG12A8C13C480'
1,b'Soul Deep',b'The Box Tops',b'ARMJAGH1187FB546F3',b'SOCIWDW12A8C13D406'
2,b'Amor De Cabaret',b'Sonora Santanera',b'ARKRRTF1187B9984DA',b'SOXVLOJ12AB0189215'
3,b'Something Girls',b'Adam Ant',b'AR7G5I41187FB4CE6C',b'SONHOTT12A8C13493C'
4,b'Face the Ashes',b'Gob',b'ARXR32B1187FB57099',b'SOFSOCN12A8C143F5D'
5,b'The Moon And I (Ordinary Day Album Version)',b'Jeff And Sheri Easter',b'ARKFYS91187B98E58F',b'SOYMRWW12A6D4FAB14'
6,b'Keepin It Real (Skit)',b'Rated R',b'ARD0S291187B9B7BF5',b'SOMJBYD12A6D4F8557'
7,b'Drop of Rain',b'Tweeterfriendly Music',b'AR10USD1187B99F3F1',b'SOHKNRJ12A6701D1F8'
8,b'Pink World',b'Planet P Project',b'AR8ZCNI1187B9A069B',b'SOIAZJW12AB01853F1'
9,b'Insatiable (Instrumental Version)',b'Clp',b'ARNTLGG11E2835DDB9',b'SOUDSGM12AC9618304'


## First data frame including song pitch, timbre and loudness of 5 segments. 

In [7]:
# withdraw data 

num_rows = 10000
mss_num_df = pd.concat([mss_df.loc[:num_rows, 'sp_0':'sp_59'],
                        mss_df.loc[:num_rows, 'st_0':'st_59'],
                        mss_df.loc[:num_rows,'loudness']
                       ],
                       axis=1
                       )
mss_num_df.head()


Unnamed: 0,sp_0,sp_1,sp_10,sp_11,sp_12,sp_13,sp_14,sp_15,sp_16,sp_17,...,st_51,st_52,st_53,st_54,st_55,st_56,st_57,st_58,st_59,loudness
0,0.946,0.684,1.0,0.742,0.01,0.054,0.015,0.021,0.067,0.17,...,-58.292,16.52,-48.17,27.457,42.717,-13.197,3.489,-16.801,-8.547,-11.197
1,1.0,1.0,1.0,1.0,0.018,0.07,0.04,0.044,0.217,0.074,...,-21.399,-71.754,-23.274,32.708,7.204,38.913,20.644,20.334,-11.435,-9.843
2,1.0,0.911,0.096,0.147,0.489,1.0,0.561,0.258,0.153,0.096,...,13.779,-21.774,-7.484,-20.936,15.136,-1.335,-16.819,8.278,18.478,-9.689
3,0.651,0.592,0.693,0.663,0.506,0.135,0.109,0.102,0.104,0.075,...,12.729,-11.11,22.801,-17.896,-17.476,-21.478,10.794,-6.544,-29.117,-9.013
4,1.0,0.529,0.318,0.331,0.534,0.821,0.198,0.155,0.484,0.489,...,17.525,23.192,-1.157,-20.737,29.243,18.097,-3.008,46.894,13.278,-4.501


## Another combination of five variables including song pitch, timber loudness, key and time_signature.

In [8]:
mss_num_df2 = pd.concat([mss_df.loc[:num_rows, 'sp_0':'sp_59'],
                        mss_df.loc[:num_rows, 'st_0':'st_59'],
                        mss_df.loc[:num_rows, 'k_0' :'k_11'],
                        mss_df.loc[:num_rows, 'ts_0':'ts_7'],
                        mss_df.loc[:num_rows, 'loudness']
                       ],
                       axis=1
                       )

mss_num_df2.head()

Unnamed: 0,sp_0,sp_1,sp_10,sp_11,sp_12,sp_13,sp_14,sp_15,sp_16,sp_17,...,k_9,k_10,k_11,ts_0,ts_1,ts_3,ts_4,ts_5,ts_7,loudness
0,0.946,0.684,1.0,0.742,0.01,0.054,0.015,0.021,0.067,0.17,...,0,0,0,0,0,0,1,0,0,-11.197
1,1.0,1.0,1.0,1.0,0.018,0.07,0.04,0.044,0.217,0.074,...,0,0,0,0,0,0,1,0,0,-9.843
2,1.0,0.911,0.096,0.147,0.489,1.0,0.561,0.258,0.153,0.096,...,0,0,0,0,1,0,0,0,0,-9.689
3,0.651,0.592,0.693,0.663,0.506,0.135,0.109,0.102,0.104,0.075,...,0,0,0,0,0,0,1,0,0,-9.013
4,1.0,0.529,0.318,0.331,0.534,0.821,0.198,0.155,0.484,0.489,...,0,0,0,0,0,0,1,0,0,-4.501


In [9]:
# normalize all columns
from sklearn import preprocessing
mss_num_df = mss_num_df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
mss_num_df2 = mss_num_df2.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))



#get manhatten distance
dm_m1 = sn.DistanceMetric.get_metric('minkowski',p=1)
dm_m2 = sn.DistanceMetric.get_metric('minkowski',p=2)
mss_num_dm = dm_m1.pairwise(mss_num_df[:])         # distance matric of less variables using mahhatan distance
mss_num_dm2 = dm_m2.pairwise(mss_num_df[:])        # distance matric of less variables using Euclidean ditance
 
mss_num_dm3 = dm_m1.pairwise(mss_num_df2[:])       # distance matric of more variables using mahhatan distance
mss_num_dm4 = dm_m2.pairwise(mss_num_df2[:])       # distance matric of more variables using Euclidean distance







There are some NaN values and need to be removed to calculate cosine similarity. 

In [17]:
mss_num_df.isnull().sum()  #NaN in the dataframe 

sp_0        0
sp_1        0
sp_10       0
sp_11       0
sp_12       0
sp_13       0
sp_14       0
sp_15       0
sp_16       0
sp_17       0
sp_18       0
sp_19       0
sp_2        0
sp_20       0
sp_21       0
sp_22       0
sp_23       0
sp_24       0
sp_25       0
sp_26       0
sp_27       0
sp_28       0
sp_29       0
sp_3        0
sp_30       0
sp_31       0
sp_32       0
sp_33       0
sp_34       0
sp_35       0
           ..
st_33       0
st_34       0
st_35       0
st_36       0
st_37       0
st_38       0
st_39       0
st_4        0
st_40       0
st_41       0
st_42       0
st_43       0
st_44       0
st_45       0
st_46       0
st_47       0
st_48       1
st_49       1
st_5        0
st_50       1
st_51       1
st_52       1
st_53       1
st_54       1
st_55       1
st_56       1
st_57       1
st_58       1
st_59       1
loudness    0
dtype: int64

In [21]:
mss_num_df_nonan = mss_num_df.dropna()  # drop them to calculate the cosine_similarity 
mss_num_df_nonan2 = mss_num_df2.dropna()
mss_num_df_nonan.isnull().sum()
mss_num_df_nonan2.isnull().sum()

sp_0        0
sp_1        0
sp_10       0
sp_11       0
sp_12       0
sp_13       0
sp_14       0
sp_15       0
sp_16       0
sp_17       0
sp_18       0
sp_19       0
sp_2        0
sp_20       0
sp_21       0
sp_22       0
sp_23       0
sp_24       0
sp_25       0
sp_26       0
sp_27       0
sp_28       0
sp_29       0
sp_3        0
sp_30       0
sp_31       0
sp_32       0
sp_33       0
sp_34       0
sp_35       0
           ..
st_5        0
st_50       0
st_51       0
st_52       0
st_53       0
st_54       0
st_55       0
st_56       0
st_57       0
st_58       0
st_59       0
k_0         0
k_1         0
k_2         0
k_3         0
k_4         0
k_5         0
k_6         0
k_7         0
k_8         0
k_9         0
k_10        0
k_11        0
ts_0        0
ts_1        0
ts_3        0
ts_4        0
ts_5        0
ts_7        0
loudness    0
dtype: int64

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
mss_dm_cosine = cosine_similarity(mss_num_df_nonan[:])    # distance matric of less variables using cosine distance
mss_dm_cosine2 = cosine_similarity(mss_num_df_nonan2[:])  # distance matric of more variables using cosine distance
mss_dm_cosine[0:3,0:9]

array([[ 1.        ,  0.76921087,  0.79415471,  0.7664976 ,  0.75881616,
         0.70938656,  0.76134625,  0.79411514,  0.69567745],
       [ 0.76921087,  1.        ,  0.72431731,  0.77922757,  0.78486277,
         0.66644953,  0.78407663,  0.72675455,  0.68433512],
       [ 0.79415471,  0.72431731,  1.        ,  0.78840089,  0.8174149 ,
         0.72618746,  0.8109599 ,  0.80056821,  0.71304493]])

In [46]:
input_song = "Don't Let Her Pull You Down"

def find_loc(input_song):
    i = 0
    for title in mss_df['title']:
        title_df = title.decode("utf-8")
        if input_song == title_df:
            return (i)
        i = i + 1

loc_song = find_loc(input_song)
print(loc_song)

def find_loc2(location):
    j = 0
    a = []
    for distance in mss_num_dm[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm[0:num_rows,location],6)[5] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

        
def find_loc3(location):
    j = 0
    for distance in mss_dm_cosine[0:num_rows,location]:
        if distance == np.partition(mss_dm_cosine[0:num_rows,location],9999)[9998]:
            return j
        j = j + 1

loc_re_song= find_loc2(loc_song)
print(loc_re_song)        
recom_songs = mss_df['title'][loc_re_song[:]]

def name(songs):
    b =[]
    for song in songs:
        new_song = song.decode("utf-8")
        b.append(new_song)
    return b

recom_songs_final = name(recom_songs)
print(recom_songs_final)

9981
[3734, 3848, 6433, 7755, 8846]
['Dreidel_ Dreidel_ Dreidel_ Dreidel!', "Run's House", 'Tócale Las Palmas', "I Don't Like It Like This", 'Never Again (Album Version)']


## Search 50 closest neighbors and find similar songs 

In [30]:
input_songs = ['G G Kah', 'Where Are You', 'Mindless']


def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_num_dm[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return = find_all(input_songs)
songs_return



[26,
 138,
 357,
 364,
 456,
 522,
 672,
 833,
 1267,
 1747,
 1891,
 2269,
 2475,
 2663,
 2786,
 2890,
 3215,
 3375,
 3551,
 3659,
 3719,
 3793,
 4017,
 4224,
 4236,
 4507,
 5460,
 5489,
 5941,
 6205,
 6605,
 6646,
 7131,
 7285,
 7305,
 7318,
 7429,
 7768,
 7803,
 7872,
 8219,
 8733,
 8889,
 9049,
 9279,
 9347,
 9587,
 9818,
 9849,
 9858,
 250,
 326,
 342,
 1052,
 1240,
 1389,
 1532,
 2196,
 2202,
 2451,
 2669,
 2760,
 2805,
 2955,
 3587,
 3797,
 4655,
 4898,
 5291,
 5537,
 5849,
 5936,
 6131,
 6344,
 6616,
 6749,
 6974,
 7225,
 7286,
 7460,
 7713,
 7761,
 8216,
 8336,
 8465,
 8588,
 8601,
 8902,
 9029,
 9035,
 9049,
 9253,
 9309,
 9388,
 9418,
 9679,
 9795,
 9,
 26,
 364,
 420,
 556,
 595,
 1166,
 1346,
 1347,
 1845,
 1967,
 1970,
 2048,
 2081,
 2145,
 2404,
 2540,
 2619,
 2773,
 3088,
 3192,
 3685,
 3715,
 3913,
 4465,
 4496,
 4615,
 4691,
 5183,
 5323,
 5616,
 6129,
 6672,
 6747,
 6757,
 6761,
 6921,
 6939,
 7318,
 7623,
 7714,
 7763,
 7803,
 7914,
 8253,
 9642,
 9741,
 9797,
 9818,

In [31]:
songs_return.sort()
counts = Counter(songs_return)
largest_loc = heapq.nlargest(5, counts, key=counts.get)
mss_song_artist.loc[largest_loc,]

Unnamed: 0,title,artist_name,artist_id,song_id
26,b'Superconfidential',b'Clp',b'ARNTLGG11E2835DDB9',b'SOZQDIU12A58A7BCF6'
9049,b'Le Bug (Live 2005)',b'M',b'AR828WL1187FB47E81',b'SOIFMLF12A6D4F845B'
9818,b'Sugar Coated Sour',b'The Dillinger Escape Plan',b'ARMAC4T1187FB3FA4C',b'SOZXTKD12A8C13FC43'
364,"b""Ain't No Love Ain't No Use (Love To Infinity...",b'Tabu feat. Mitzib',b'AR1072G1187FB5B351',b'SOPDILX12AF72AA3C2'
7803,"b""Jam-Master Jammin'""",b'RUN-DMC',b'ARX9YIP1187B98A656',b'SODMJWA12A8C13BA03'


In [26]:
input_songs = ['G G Kah', 'Where Are You', 'Mindless']
# Less variables with ED

def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_num_dm2[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm2[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return2 = find_all(input_songs)
songs_return2

[26,
 138,
 321,
 357,
 364,
 456,
 522,
 584,
 672,
 833,
 1267,
 1747,
 1891,
 2269,
 2663,
 2890,
 3375,
 3526,
 3551,
 3659,
 3793,
 3945,
 4017,
 4236,
 4507,
 4743,
 5460,
 5895,
 5912,
 5941,
 5975,
 6205,
 6646,
 6931,
 7305,
 7318,
 7381,
 7557,
 7768,
 7803,
 7872,
 8209,
 8219,
 8714,
 8889,
 9049,
 9587,
 9642,
 9818,
 9858,
 250,
 326,
 342,
 1052,
 1162,
 1240,
 1389,
 1619,
 1643,
 2196,
 2202,
 2322,
 2451,
 2509,
 2669,
 2760,
 2979,
 3412,
 3544,
 3587,
 3905,
 4073,
 4464,
 4655,
 4723,
 4822,
 4898,
 5291,
 5670,
 5849,
 6131,
 6311,
 6313,
 6616,
 7225,
 7460,
 7713,
 7761,
 8336,
 8465,
 8588,
 8902,
 9029,
 9049,
 9253,
 9309,
 9418,
 9757,
 9795,
 9989,
 9,
 26,
 364,
 1347,
 1845,
 1970,
 2048,
 2079,
 2081,
 2145,
 2335,
 2404,
 2619,
 2703,
 2773,
 2942,
 3526,
 3685,
 3693,
 3715,
 3913,
 4610,
 4615,
 4691,
 4943,
 5183,
 5323,
 5331,
 5462,
 5616,
 5645,
 5975,
 6191,
 6654,
 6672,
 6716,
 6757,
 6921,
 6939,
 7623,
 7714,
 7803,
 7872,
 7914,
 8209,
 9347

In [27]:
from collections import Counter
songs_return2.sort()
counts = Counter(songs_return2)
print(counts)


Counter({8209: 2, 26: 2, 5975: 2, 9049: 2, 9818: 2, 364: 2, 7803: 2, 9858: 2, 9642: 2, 7872: 2, 3526: 2, 2048: 1, 4610: 1, 3587: 1, 9989: 1, 5895: 1, 6939: 1, 9: 1, 522: 1, 8465: 1, 5645: 1, 6672: 1, 2322: 1, 6931: 1, 5912: 1, 8219: 1, 1052: 1, 9757: 1, 2079: 1, 7713: 1, 4898: 1, 7460: 1, 9253: 1, 5670: 1, 4743: 1, 4655: 1, 5183: 1, 1347: 1, 5941: 1, 6921: 1, 7225: 1, 2619: 1, 3693: 1, 138: 1, 1845: 1, 833: 1, 8714: 1, 9795: 1, 1267: 1, 326: 1, 584: 1, 8588: 1, 2890: 1, 3659: 1, 4943: 1, 7761: 1, 4691: 1, 1619: 1, 5460: 1, 342: 1, 7768: 1, 2703: 1, 9309: 1, 3685: 1, 2145: 1, 6716: 1, 1891: 1, 2404: 1, 357: 1, 5331: 1, 2663: 1, 4615: 1, 3945: 1, 1643: 1, 2669: 1, 6205: 1, 4464: 1, 9587: 1, 6191: 1, 1240: 1, 2942: 1, 3715: 1, 9347: 1, 7557: 1, 3905: 1, 7305: 1, 1162: 1, 4236: 1, 1389: 1, 8336: 1, 2451: 1, 2196: 1, 7318: 1, 2202: 1, 4507: 1, 9885: 1, 9029: 1, 672: 1, 2979: 1, 8902: 1, 6311: 1, 6313: 1, 5291: 1, 321: 1, 6757: 1, 4017: 1, 1970: 1, 4723: 1, 3913: 1, 8889: 1, 2335: 1, 3544: 1

In [28]:
import heapq
largest_loc = heapq.nlargest(5, counts, key=counts.get)
print(largest_loc)

[8209, 26, 5975, 9049, 9818]


In [29]:
mss_song_artist.loc[largest_loc,]

Unnamed: 0,title,artist_name,artist_id,song_id
8209,b'Mark My Words (Album Version)',b'Hatebreed',b'ARPKATM1187B9B76E6',b'SOZLPVU12A6D4FD279'
26,b'Superconfidential',b'Clp',b'ARNTLGG11E2835DDB9',b'SOZQDIU12A58A7BCF6'
5975,b'There Are More Questions Than Answers',b'Johnny Nash',b'AREZWC61187FB52DCF',b'SOHJNUH12AB01891DE'
9049,b'Le Bug (Live 2005)',b'M',b'AR828WL1187FB47E81',b'SOIFMLF12A6D4F845B'
9818,b'Sugar Coated Sour',b'The Dillinger Escape Plan',b'ARMAC4T1187FB3FA4C',b'SOZXTKD12A8C13FC43'


In [32]:
input_songs = ['G G Kah', 'Where Are You', 'Mindless']
# More variable with MD

def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_num_dm3[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm3[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return3 = find_all(input_songs)
songs_return3

[26,
 138,
 357,
 364,
 443,
 456,
 833,
 1079,
 1327,
 1467,
 1747,
 1891,
 2131,
 2269,
 2322,
 2475,
 2786,
 2843,
 2890,
 2992,
 3659,
 3719,
 3793,
 3876,
 3966,
 4236,
 4507,
 4722,
 4743,
 5941,
 5993,
 6154,
 6205,
 6757,
 6946,
 7257,
 7305,
 7318,
 7411,
 7666,
 7803,
 7872,
 7984,
 8209,
 8219,
 8362,
 8913,
 9035,
 9049,
 9858,
 250,
 326,
 342,
 1240,
 1243,
 1389,
 1532,
 1882,
 2196,
 2669,
 2760,
 2805,
 3587,
 3681,
 3797,
 3967,
 4236,
 4245,
 4464,
 5291,
 5819,
 5936,
 6131,
 6275,
 6616,
 6749,
 6764,
 6837,
 6974,
 7225,
 7713,
 7761,
 7763,
 8216,
 8219,
 8465,
 8588,
 8601,
 8902,
 9029,
 9035,
 9049,
 9253,
 9373,
 9418,
 9426,
 9795,
 9,
 26,
 364,
 556,
 595,
 683,
 1088,
 1166,
 1346,
 1516,
 2048,
 2081,
 2256,
 2404,
 2619,
 2773,
 2873,
 2885,
 3088,
 3192,
 3365,
 3685,
 3715,
 3727,
 3913,
 4610,
 4691,
 4818,
 5183,
 5323,
 6129,
 6672,
 6757,
 6761,
 6939,
 7318,
 7329,
 7623,
 7714,
 7803,
 7914,
 8010,
 8041,
 8242,
 9049,
 9573,
 9642,
 9741,
 9815

In [33]:
songs_return3.sort()
counts = Counter(songs_return3)
largest_loc = heapq.nlargest(5, counts, key=counts.get)
mss_song_artist.loc[largest_loc,]

Unnamed: 0,title,artist_name,artist_id,song_id
9049,b'Le Bug (Live 2005)',b'M',b'AR828WL1187FB47E81',b'SOIFMLF12A6D4F845B'
26,b'Superconfidential',b'Clp',b'ARNTLGG11E2835DDB9',b'SOZQDIU12A58A7BCF6'
364,"b""Ain't No Love Ain't No Use (Love To Infinity...",b'Tabu feat. Mitzib',b'AR1072G1187FB5B351',b'SOPDILX12AF72AA3C2'
7803,"b""Jam-Master Jammin'""",b'RUN-DMC',b'ARX9YIP1187B98A656',b'SODMJWA12A8C13BA03'
4236,"b""I'm A Believer""",b'Tyrone Taylor',b'ARUBX2Y1187B99CD25',b'SOGTZDE12AB017E44A'


In [34]:
input_songs = ['G G Kah', 'Where Are You', 'Mindless']
# More variables with ED

def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_num_dm4[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm4[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return4 = find_all(input_songs)
songs_return4

[357,
 364,
 443,
 456,
 595,
 1079,
 1327,
 1467,
 1747,
 1891,
 2001,
 2131,
 2322,
 2512,
 2626,
 2786,
 2843,
 2890,
 2992,
 3171,
 3192,
 3876,
 3966,
 4236,
 4507,
 4681,
 4743,
 4890,
 5378,
 5804,
 5941,
 5993,
 6006,
 6205,
 6757,
 6946,
 7191,
 7257,
 7305,
 7411,
 7666,
 7803,
 7984,
 8209,
 8362,
 8913,
 9035,
 9595,
 9642,
 9858,
 250,
 326,
 342,
 455,
 456,
 1142,
 1240,
 1243,
 1389,
 2196,
 2293,
 2322,
 2418,
 2669,
 2760,
 2805,
 2979,
 3404,
 3410,
 3412,
 3587,
 3681,
 4236,
 4357,
 4464,
 4655,
 5030,
 5291,
 6131,
 6275,
 6616,
 6650,
 6749,
 6764,
 6931,
 7713,
 7761,
 7763,
 8219,
 8245,
 8465,
 8588,
 8601,
 8902,
 8933,
 9029,
 9049,
 9253,
 9426,
 9795,
 9,
 26,
 221,
 364,
 478,
 556,
 683,
 1088,
 1275,
 1376,
 1420,
 2048,
 2256,
 2404,
 2619,
 2773,
 2822,
 2885,
 3088,
 3127,
 3365,
 3447,
 3727,
 3913,
 4610,
 4691,
 4818,
 4986,
 5009,
 5183,
 5323,
 5331,
 6205,
 6532,
 6672,
 6757,
 6982,
 7623,
 7914,
 8010,
 8041,
 8157,
 8242,
 9049,
 9227,
 9573

In [35]:
songs_return4.sort()
counts = Counter(songs_return4)
largest_loc = heapq.nlargest(5, counts, key=counts.get)
mss_song_artist.loc[largest_loc,]

Unnamed: 0,title,artist_name,artist_id,song_id
2322,b'Use Of A Weapon (Album Version)',b'Between The Buried And Me',b'AR7GUNF1187B990CCF',b'SOUJQRS12A6D4F7ADE'
6205,b'Raining Revolution (Live) (Unplugged)',b'ARRESTED DEVELOPMENT',b'ARWQ3M31187FB4CF01',b'SOKELFE12A6D4F7911'
6757,b'When the Broken Hearted Love Again',b'Danielle Bollinger',b'ARORFIN1187FB3BE5C',b'SOHAMPZ12AB018AB4A'
364,"b""Ain't No Love Ain't No Use (Love To Infinity...",b'Tabu feat. Mitzib',b'AR1072G1187FB5B351',b'SOPDILX12AF72AA3C2'
9642,b'My Old Man Boogie',"b""The Reverend Peyton's Big Damn Band""",b'ARM2D2V1187B9AEB4F',b'SOZMKSN12AC96186C4'


In [36]:
input_songs = ['G G Kah', 'Where Are You', 'Mindless']
# Less variables using cosine_similarity 

def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_dm_cosine[0:num_rows,location]:
        if distance <= np.partition(mss_dm_cosine[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return_cosine = find_all(input_songs)
songs_return_cosine

[376,
 758,
 1310,
 1408,
 1635,
 1913,
 2097,
 2111,
 2225,
 2411,
 2565,
 2601,
 2901,
 2950,
 3063,
 3157,
 3330,
 3350,
 3997,
 4065,
 4560,
 5153,
 5215,
 5518,
 5823,
 5947,
 6012,
 6046,
 6218,
 6407,
 6559,
 6608,
 6648,
 6786,
 7060,
 7877,
 7967,
 8096,
 8175,
 8267,
 8273,
 8550,
 8608,
 8639,
 8706,
 8839,
 9037,
 9068,
 9270,
 9423,
 9540,
 210,
 625,
 662,
 674,
 758,
 840,
 883,
 886,
 891,
 924,
 1465,
 1481,
 1509,
 1576,
 1635,
 1889,
 1913,
 2181,
 2270,
 2601,
 2914,
 3036,
 3157,
 3198,
 3622,
 3997,
 4355,
 4560,
 5315,
 5803,
 5947,
 5968,
 6187,
 6302,
 6520,
 6559,
 6641,
 6786,
 7302,
 7476,
 7621,
 7709,
 7958,
 8020,
 8566,
 8761,
 9255,
 9270,
 9423,
 9599,
 9924,
 139,
 789,
 883,
 886,
 1066,
 1165,
 1533,
 1635,
 1913,
 2463,
 2665,
 2762,
 2950,
 3226,
 3351,
 3729,
 3769,
 3853,
 3917,
 4190,
 4233,
 4730,
 4781,
 5561,
 5691,
 5881,
 5925,
 6079,
 6538,
 6608,
 7060,
 7350,
 7476,
 7566,
 8007,
 8060,
 8193,
 8238,
 8550]

In [37]:
songs_return_cosine.sort()
counts = Counter(songs_return_cosine)
largest_loc = heapq.nlargest(5, counts, key=counts.get)
mss_song_artist.loc[largest_loc,]

Unnamed: 0,title,artist_name,artist_id,song_id
1635,b'I Did it for You',b'David Cook',b'ARFN2TE11A348EFFB4',b'SORCRQT12A8C142A5A'
1913,b'Pray On',b'Babbie Mason',b'ARK6JC91187B9B4B2D',b'SODWDRX12A8C136E77'
2601,b'Mine Again',b'Mariah Carey',b'ARKSZW81187B9B695D',b'SOAIURX12A67020F3B'
7476,b'Open your Eyes',b'Call To Preserve',b'AR01IP11187B9AF5D2',b'SOMVWOV12AAF3B3883'
9270,"b""I'm Glad""",b'Jennifer Lopez',b'AR7C6G11187B9B4C1E',b'SOPPDZH12AF72A283A'


In [38]:
input_songs = ['G G Kah', 'Where Are You', 'Mindless']
# More variables using cosine_similarity 


def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_dm_cosine2[0:num_rows,location]:
        if distance <= np.partition(mss_dm_cosine2[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return_cosine2 = find_all(input_songs)
songs_return_cosine2

[5,
 295,
 988,
 1310,
 1408,
 1528,
 1635,
 1913,
 1978,
 2111,
 2191,
 2225,
 2443,
 2535,
 2814,
 2901,
 2950,
 3063,
 3157,
 3303,
 3330,
 3581,
 3622,
 3802,
 3997,
 4065,
 4149,
 4371,
 4461,
 4560,
 4983,
 5153,
 5517,
 5518,
 5691,
 5947,
 6012,
 6046,
 6256,
 6407,
 6559,
 6648,
 6757,
 6775,
 7967,
 8096,
 8273,
 8550,
 9255,
 9270,
 9793,
 210,
 220,
 358,
 662,
 674,
 758,
 785,
 886,
 891,
 924,
 1465,
 1481,
 1509,
 1912,
 2110,
 2241,
 2270,
 2407,
 2601,
 2814,
 3036,
 3157,
 3305,
 3622,
 3997,
 4065,
 4560,
 5215,
 5218,
 5328,
 5517,
 5947,
 6398,
 6407,
 6460,
 6520,
 6559,
 6786,
 7092,
 7621,
 7661,
 8020,
 8566,
 8594,
 8685,
 8975,
 9036,
 9255,
 9270,
 9553,
 9874,
 139,
 570,
 749,
 789,
 883,
 886,
 1066,
 1165,
 1533,
 1913,
 2267,
 2443,
 2463,
 2665,
 2762,
 2950,
 3226,
 3351,
 3583,
 3729,
 3769,
 3853,
 3917,
 4183,
 4190,
 4233,
 4730,
 4781,
 5561,
 5691,
 5881,
 5925,
 5928,
 5985,
 6079,
 6148,
 6218,
 6256,
 6608,
 7060,
 7103,
 7350,
 7476,
 7566,

In [39]:
songs_return_cosine2.sort()
counts = Counter(songs_return_cosine2)
largest_loc = heapq.nlargest(5, counts, key=counts.get)
mss_song_artist.loc[largest_loc,]

Unnamed: 0,title,artist_name,artist_id,song_id
6407,"b""Make That Money (Scrooge's Song) (Album Vers...",b'Alice Cooper',b'AR1ABAO1187FB531E1',b'SOHPXSL12A8AE47FCB'
8550,b'Dog Tagged',b'Kill The Client',b'AR6HFW21187B9A8922',b'SOEZZFV12AB0184D09'
3622,b'Kyrie Eleison',b'Jane Winther',b'ARGAWJY11F50C50904',b'SOVAVUD12AB01832AE'
9270,"b""I'm Glad""",b'Jennifer Lopez',b'AR7C6G11187B9B4C1E',b'SOPPDZH12AF72A283A'
5691,b'Unholy Outburst #3',b'Ramesses',b'AROCXGR1187FB511B2',b'SOBFRQQ12AB018D108'


In [12]:
loc_song = 26

def find_loc2(location):
    j = 0
    for distance in mss_num_dm2[0:num_rows,location]:
        if distance == np.partition(mss_num_dm2[0:num_rows,location],2)[1]:
            return j
        j = j + 1

loc_re_song = find_loc2(loc_song)
        
mss_df['title'][loc_re_song].decode("utf-8")


'Where Are You'

In [14]:
loc_song = 26
def find_loc2(location):
    j = 0
    for distance in mss_num_dm4[0:num_rows,location]:
        if distance == np.partition(mss_num_dm4[0:num_rows,location],2)[1]:
            return j
        j = j + 1

loc_re_song = find_loc2(loc_song)
        
mss_df['title'][loc_re_song].decode("utf-8")

'Back To Burn'

In [47]:
input_songs = ['Dreidel_ Dreidel_ Dreidel_ Dreidel!', "Run's House", 'Tócale Las Palmas']


def find_locall(location,index):    
    j = 0
    a = []
    for distance in mss_num_dm[0:num_rows,location]:
        if distance <= np.partition(mss_num_dm[0:num_rows,location],index+1)[index] and distance !=0:
            a.append(j) 
        j = j + 1
    return a

def find_all(inputsongs):
    index = 50
    loc_song = []
    return_songs = []
    for song in input_songs:
        loc_song.append(find_loc(song))
    #print(loc_song)
    for location in loc_song:        
        #return_songs.append(find_locall(location, index))
        return_songs.append(find_locall(location, index))
    return_songlist = [item for sublist in return_songs for item in sublist]
    return return_songlist
        #print (return_songs)

songs_return = find_all(input_songs)
songs_return



[197,
 268,
 413,
 435,
 513,
 603,
 902,
 1508,
 1655,
 1975,
 2436,
 2480,
 2613,
 2699,
 3118,
 3386,
 3627,
 3670,
 3882,
 3932,
 4036,
 4237,
 4368,
 4583,
 4763,
 4862,
 5388,
 5541,
 6117,
 6321,
 6514,
 7016,
 7076,
 7086,
 7297,
 7472,
 7644,
 7681,
 8265,
 8782,
 8846,
 9103,
 9118,
 9672,
 9853,
 9899,
 9981,
 268,
 603,
 632,
 709,
 1508,
 1655,
 1677,
 2480,
 2515,
 2613,
 2699,
 2803,
 3118,
 3386,
 3627,
 3670,
 3740,
 3748,
 4004,
 4059,
 4095,
 4113,
 4118,
 4237,
 4763,
 4806,
 5066,
 5388,
 5671,
 5756,
 6321,
 6433,
 6637,
 6888,
 7021,
 7086,
 7755,
 7851,
 7854,
 8512,
 8782,
 8846,
 9014,
 9040,
 9072,
 9672,
 9825,
 9853,
 9933,
 9981,
 143,
 268,
 513,
 603,
 709,
 1323,
 1677,
 2421,
 2480,
 2515,
 2613,
 2699,
 2803,
 3386,
 3474,
 3476,
 3627,
 3670,
 3740,
 3748,
 3848,
 4004,
 4059,
 4095,
 4113,
 4118,
 4237,
 4763,
 4806,
 5671,
 5732,
 6202,
 6321,
 6339,
 6637,
 7086,
 7346,
 7525,
 7644,
 7755,
 7851,
 8133,
 8265,
 8511,
 8782,
 8846,
 9003,
 9072,
 

In [48]:
songs_return.sort()
counts = Counter(songs_return)
largest_loc = heapq.nlargest(5, counts, key=counts.get)
mss_song_artist.loc[largest_loc,]

Unnamed: 0,title,artist_name,artist_id,song_id
3627,b'Lo',b'Brice Kapel',b'ARSKZAQ1269FCD50EF',b'SOFVLOG12AB0180AB2'
268,b'Drop the Bass',b'The Nightraver & The Magican',b'ARHVU591187B99F46A',b'SOCWXYM12AB018BDF3'
2613,b'Freight Train',b'Elizabeth Cotten',b'ARZQSNO1187FB5BBAB',b'SOJURPV12A8C141B82'
3386,b'Passo De Anjo',b'Spok Frevo Orquestra',b'ARMQUTT12086C12BF5',b'SOFVVWJ12AB0183B6A'
8782,b'1000 Of Years Ago',b'Namatjira',b'ARNJCPV1269FCD1139',b'SODFVLR12AB0182A73'
