In [125]:
# Import the needs
import numpy as np # linear algebra
import pandas as pd # data processing

import json

In [126]:
# Build a human-readable label for a single song
def getMusicName(elem):
    """Return the label 'artist - song_title' for one song.

    Parameters
    ----------
    elem : mapping-like
        Any object that can be indexed with the keys 'artist' and
        'song_title' (for example a dict or a DataFrame row).

    Returns
    -------
    str
        The two values joined as '<artist> - <song_title>'.
    """
    artist, title = elem['artist'], elem['song_title']
    return '{} - {}'.format(artist, title)

In [127]:
# Load the dataset; index_col=0 makes the CSV's first column the DataFrame index
dfSongs = pd.read_csv('data.csv', index_col=0)

# One row per song and one column per attribute, so the lengths of the row
# index and of the column index give the two dataset dimensions
rows, cols = len(dfSongs.index), len(dfSongs.columns)

# Report both dimensions, one per line
print(
    'Number of songs: {}'.format(rows),
    'Number of attributes per song: {}'.format(cols),
    sep='\n',
)

Number of songs: 2017
Number of attributes per song: 16


In [None]:
# The cell above loads a CSV file into a pandas DataFrame,
# then computes and prints the number of rows (songs)
# and columns (attributes) in the dataset.

In [128]:
# Show the names of all columns (attributes) of the dfSongs DataFrame,
# to see which attributes are available for each song.

display(dfSongs.columns)

Index(['acousticness', 'danceability', 'duration_ms', 'energy',
       'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'speechiness', 'tempo', 'time_signature', 'valence', 'target',
       'song_title', 'artist'],
      dtype='object')

In [129]:
# Print the attribute types.
# info() prints a concise summary of the DataFrame:
# 1. Index range (number of rows)
# 2. Column names
# 3. Number of non-null values per column
# 4. Data type of each column (e.g., int64, float64, object)
# 5. Memory usage of the DataFrame

dfSongs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2017 entries, 0 to 2016
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   acousticness      2017 non-null   float64
 1   danceability      2017 non-null   float64
 2   duration_ms       2017 non-null   int64  
 3   energy            2017 non-null   float64
 4   instrumentalness  2017 non-null   float64
 5   key               2017 non-null   int64  
 6   liveness          2017 non-null   float64
 7   loudness          2017 non-null   float64
 8   mode              2017 non-null   int64  
 9   speechiness       2017 non-null   float64
 10  tempo             2017 non-null   float64
 11  time_signature    2017 non-null   float64
 12  valence           2017 non-null   float64
 13  target            2017 non-null   int64  
 14  song_title        2017 non-null   object 
 15  artist            2017 non-null   object 
dtypes: float64(10), int64(4), object(2)
memory usag

In [130]:
# Preview the title and artist of the first five songs
dfSongs[['song_title', 'artist']].head(5)

Unnamed: 0,song_title,artist
0,Mask Off,Future
1,Redbone,Childish Gambino
2,Xanny Family,Future
3,Master Of None,Beach House
4,Parallel Lines,Junior Boys


In [131]:
# Select a song: the row whose index label is 0
anySong = dfSongs.loc[0]
# Build its 'artist - song_title' label
anySongName = getMusicName(anySong)
print('name:', anySongName)

name: Future - Mask Off


In [132]:
# K-query for K-furthest neighbours
def kfnQuery(queryPoint, arrCharactPoints, k):
    """Return the index labels of the k points furthest from queryPoint.

    Parameters
    ----------
    queryPoint : pandas.Series or sequence of numbers
        Coordinates of the reference point. A Series is aligned with the
        columns of arrCharactPoints by label; a plain sequence is matched
        to the columns by position.
    arrCharactPoints : pandas.DataFrame
        Candidate points: one row per point, one numeric column per feature.
        The frame is not modified.
    k : int
        Number of furthest neighbours to return.

    Returns
    -------
    pandas.Index
        Labels of the k rows with the largest Euclidean distance to
        queryPoint, ordered by increasing distance (the furthest point is
        last). Empty when there are no candidates or k is 0.
    """
    # Compute every distance in one vectorized pass instead of a Python-level
    # call per row; this also copes with an empty candidate frame.
    diff = (arrCharactPoints - queryPoint).to_numpy(dtype=float)
    dist = pd.Series(np.linalg.norm(diff, axis=1), index=arrCharactPoints.index)

    # Ascending sort: the k furthest neighbours are the last k entries.
    return dist.sort_values().tail(k).index

# Range query for points beyond a certain radius
def rangeFurthestQuery(queryPoint, arrCharactPoints, radius):
    """Return the index labels of the points lying beyond radius from queryPoint.

    Parameters
    ----------
    queryPoint : pandas.Series or sequence of numbers
        Coordinates of the reference point. A Series is aligned with the
        columns of arrCharactPoints by label; a plain sequence is matched
        to the columns by position.
    arrCharactPoints : pandas.DataFrame
        Candidate points: one row per point, one numeric column per feature.
        The frame is not modified.
    radius : float
        Distance threshold.

    Returns
    -------
    pandas.Index
        Labels, in their original row order, of the rows whose Euclidean
        distance to queryPoint is strictly greater than radius.
    """
    diff = (arrCharactPoints - queryPoint).to_numpy(dtype=float)
    dist = np.linalg.norm(diff, axis=1)

    # Keep only the points OUTSIDE the radius. The previous condition
    # (dist <= radius) selected the points inside it, contradicting the
    # function's name and purpose.
    return arrCharactPoints.index[dist > radius]

In [133]:
# Run a neighbour query for one row of df, leaving that row out of the candidates
def querySimilars(df, columns, idx, func, param):
    """Query the neighbours of row idx using the given query function.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    columns : list
        Columns of df used as the feature space.
    idx : label
        Index label of the row used as the query point.
    func : callable
        Query function called as func(queryPoint, candidates, param).
    param : object
        Passed through to func as its third argument.

    Returns
    -------
    Whatever func returns.
    """
    features = df[columns].copy(deep=True)
    # The query row is read before it is dropped, so it can never be
    # returned as its own neighbour.
    return func(features.loc[idx], features.drop([idx]), param)

In [168]:
# Selecting song and attributes

# NOTE(review): the original note here read "danceability score of 0.867" --
# presumably the danceability of the selected song; confirm against dfSongs.loc[84]
songIndex = 84 # query point, selected song
# Feature columns that span the space in which distances are measured
columns = ['acousticness','danceability','energy','instrumentalness','liveness','speechiness','valence']

# Selecting query parameters: the query function and its third argument
func, param = kfnQuery, 3 # k=3

# Querying: the query song itself is excluded from the candidates
response = querySimilars(dfSongs, columns, songIndex, func, param)

In [169]:
# Select the song used as query point
anySong = dfSongs.loc[songIndex]
# Build its 'artist - song_title' label
anySongName = getMusicName(anySong)

# Print the query point: index label followed by the song label
print('# Query Point')
print(songIndex, anySongName)

# Query Point
84 James Brown - The Payback


In [170]:
# Show the index labels returned by the query
response

Index([1596, 1554, 1876], dtype='int64')

In [171]:
# print('# Similar songs')
# for idx in response:
#     anySong = dfSongs.loc[idx]
#     anySongName = getMusicName(anySong)
    
#     print(idx, anySongName)

In [172]:
# Selecting song and attributes
songIndex = 6 # query point, selected song
# 7 attributes spanning the space in which distances are measured
columns = ['acousticness','danceability','energy','instrumentalness','liveness','speechiness','valence']

# Selecting query parameters: the query function and its third argument
func, param = kfnQuery, 3 # k=3 (number of furthest neighbours, not a distance threshold)

# Querying: the query song itself is excluded from the candidates
response = querySimilars(dfSongs, columns, songIndex, func, param)

In [173]:
# Show the index labels of the furthest neighbours found for the queried song
response

Index([1530, 1546, 1876], dtype='int64')

In [176]:
# print('# Similar songs')
# for idx in response:
#     anySong = dfSongs.loc[idx]
#     anySongName = getMusicName(anySong)
    
#     print(idx, anySongName)

# Similar songs
1530 Passport Quartet - I'm a Fool to Want You
1546 Jan Lundgren Trio - Almas vaggvisa
1876 Frédéric Chopin - Nocturne No.1 In B Flat Minor, Op.9 No.1


In [190]:
# similar songs // regular recommendations

# Defining the query point and the attributes
# k = 3
# queryPoint = [1, 1, 1] # query point

# # adjust attributes to 
# columns = ['danceability','energy','valence']

# # Searching for the songs
# arr = dfSongs[columns].copy(deep=True)
# response = kfnQuery(queryPoint, arr, k)

# # Printing
# for idx in response:
#     anySong = dfSongs.loc[idx]
#     anySongName = getMusicName(anySong)
    
#     print(idx, anySongName)

In [191]:
# Break-out recommendations / furthest neighbours

# Defining the query point and the attributes.
# Here the query point is a hand-picked point in feature space rather than an
# existing song; its values are matched by position to the entries of `columns`.
k = 3
queryPoint = [0.2, 0.2, 0.2] # query point
columns = ['instrumentalness','energy','speechiness']

# Searching for the songs: every song is a candidate, none is dropped
arr = dfSongs[columns].copy(deep=True)
response = kfnQuery(queryPoint, arr, k)

# Printing the index label and 'artist - song_title' label of each result
# print('# Active, cheerful songs')
for idx in response:
    anySong = dfSongs.loc[idx]
    anySongName = getMusicName(anySong)
    
    print(idx, anySongName)

1318 Obscure of Acacia - Rope
1349 Aversions Crown - Prismatic Abyss
1322 Walking Dead On Broadway - Pitchblack
