# Getting data from Spotify for RecNet

In [None]:
import pandas as pd
import requests
from requests.structures import CaseInsensitiveDict
import json

# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
import numpy as np


### Update your username below. This is just to identify that it's your data, so it can be anything. In the actual implementation this can be replaced with the RecNet username.

In [None]:
# Label stamped on every exported row and used as the prefix of the output file names.
username = ""

### Step 1: Download a CSV file from https://watsonbox.github.io/exportify/, and set the `path` variable below to its location.

In [None]:
# Load the exported liked-songs CSV into `df`; the cells below read their
# track and album URIs from it.
path = "liked.csv"
df = pd.read_csv(filepath_or_buffer=path)
print("Downloaded data:")
df

In [None]:
# Keep only the columns of interest and split the artist list into the first
# listed artist and the remaining ones.
print("Relevant data: ")
# Take an explicit copy so the new columns are added to an independent frame
# rather than to a selection of `df`.
data = df[['Track URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)', 'Popularity']].copy()
# Whole-column assignment replaces the row-by-row `.at` writes and also
# creates both columns when the frame is empty.
data['Artist'] = [names.split(',')[0] for names in data['Artist Name(s)']]
data['Other Artists'] = [','.join(names.split(',')[1:]) for names in data['Artist Name(s)']]
data

### Assign the token generated on the Spotify console below (as a string)
Generate the token here: https://developer.spotify.com/console/get-audio-features-several-tracks/

Note: Make sure you copy the entire token (it's quite long and may contain a lot of hyphens, so avoid selecting it with a double click; use cmd+A or its equivalent instead).

In [None]:
# Bearer token pasted from the Spotify console; sent in the Authorization header of every API request.
token = ""

In [None]:
# Fetch Spotify audio features for every track in `df` and keep the columns
# used for clustering, keyed by 'Track URI'.
track_ids = [track_uri.split(':')[2] for track_uri in df['Track URI']]

url = "https://api.spotify.com/v1/audio-features"
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
headers["Content-Type"] = "application/json"
headers["Authorization"] = "Bearer " + token

# The endpoint caps how many ids one call may carry (100 per the Spotify Web
# API reference), so the ids are sent in batches. Joining each batch also
# avoids the dangling separator that appending '%2C' after every id produced.
audio_features_batch_size = 100
audio_features_json = {'audio_features': []}
for start in range(0, len(track_ids), audio_features_batch_size):
    batch_ids = track_ids[start:start + audio_features_batch_size]
    resp = requests.get(url, headers=headers, params={'ids': ','.join(batch_ids)})
    # Surface an expired/invalid token as an HTTP error instead of a KeyError below.
    resp.raise_for_status()
    # Entries may come back as null for ids Spotify cannot resolve; skip those.
    audio_features_json['audio_features'] += [
        features for features in resp.json()['audio_features'] if features is not None
    ]

audio_features_df = pd.json_normalize(audio_features_json['audio_features'])
audio_features = audio_features_df[['uri', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']]
audio_features = audio_features.rename(columns={'uri': 'Track URI'})
audio_features

### Extracting album ids and getting genre details

In [None]:
# Look up the genres of every distinct album in `df` and collect them in
# `album_genres` (columns: 'Album URI', 'genres').
album_ids = [album_uri.split(':')[2] for album_uri in df['Album URI']]
# De-duplicate while keeping first-seen order.
album_ids = list(dict.fromkeys(album_ids))

batch_size = 15
batches = [album_ids[start:start + batch_size] for start in range(0, len(album_ids), batch_size)]

url = "https://api.spotify.com/v1/albums"
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
headers["Content-Type"] = "application/json"
headers["Authorization"] = "Bearer " + token

# Gather one frame per batch and concatenate once at the end. Building the
# result from scratch means re-running this cell no longer appends to the
# `album_genres` left over from a previous run.
album_genre_frames = []
for batch in batches:
    resp = requests.get(url, headers=headers, params={'ids': ','.join(batch)})
    # Surface an expired/invalid token as an HTTP error instead of a KeyError below.
    resp.raise_for_status()
    album_details_json = resp.json()
    # Entries may come back as null for ids Spotify cannot resolve; skip those.
    found_albums = [album for album in album_details_json['albums'] if album is not None]
    if not found_albums:
        continue

    album_details_df = pd.json_normalize(found_albums)
    album_genre_frames.append(
        album_details_df[['uri', 'genres']].rename(columns={'uri': 'Album URI'})
    )

if album_genre_frames:
    album_genres = pd.concat(album_genre_frames, ignore_index=True)
else:
    # No albums at all: keep the expected columns so the later merge still works.
    album_genres = pd.DataFrame(columns=['Album URI', 'genres'])
album_genres


## Merging all the retrieved data and saving it to `<username>spotifydata.csv` (the username is used as a prefix, with no separator).

In [None]:
# Join the track list with its audio features (on track) and album genres
# (on album), then tag every row with the owner's username.
data = (
    df[['Track URI', 'Album URI', 'Track Name', 'Artist URI(s)', 'Artist Name(s)', 'Popularity']]
    .merge(audio_features, on=['Track URI'])
    .merge(album_genres, on=['Album URI'])
)
data['Username'] = username
data

In [None]:
# Persist the merged table; the file name is the username immediately followed by 'spotifydata.csv'.
data.to_csv(path_or_buf=username + 'spotifydata.csv')

In [None]:
# Audio-feature columns fed to the clustering model, in the order the model sees them.
columns = [
    "energy",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "loudness",
    "danceability",
    "valence",
    "liveness",
]

In [None]:
# Min-max scale the loudness column so its smallest value maps to 0 and its largest to 1.
loudness = audio_features['loudness']
loudness_min = loudness.min()
loudness_span = loudness.max() - loudness_min
audio_features["loudness"] = (loudness - loudness_min) / loudness_span

In [None]:
# Persist the feature table; the file name is the username immediately followed by 'songFeatures.csv'.
audio_features.to_csv(path_or_buf=username + 'songFeatures.csv')

### Taking the data and using it to get similar songs based on user likes

The script ahead takes `audio_features` to be the liked songs of user 1 and `audio_features_p2` to be the liked songs of user 2.

In [None]:
# Reload the two tables written by the export cells above.
data = pd.read_csv(filepath_or_buffer=username + 'spotifydata.csv')
audio_features = pd.read_csv(filepath_or_buffer=username + 'songFeatures.csv')

In [None]:
def euclidean_distance(arg1, arg2):
    '''
    Compute the straight-line (L2) distance between two points.

    Parameters
              arg1 (numpy array): first point
              arg2 (numpy array): second point, same length as arg1
    Output
              float distance
    '''
    delta = arg1 - arg2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [None]:
# !pip install yellowbrick
# !pip install scikit-learn-extra

In [None]:
from sklearn_extra.cluster import KMedoids
from sklearn.cluster import KMeans

In [None]:
# Display only the feature columns that the clustering cells operate on.
audio_features[columns]

In [None]:
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Run yellowbrick's elbow search (distortion metric, k=(2, 8)) over a
# K-Medoids estimator and record the suggested k together with its score.
model = KMedoids()
visualizer = KElbowVisualizer(
    model,
    k=(2, 8),
    metric='distortion',
    timings=False,
)
visualizer.fit(audio_features[columns])
value = visualizer.elbow_value_
score = visualizer.elbow_score_


In [None]:
# Fit the 5-cluster K-Medoids model on the feature columns, with a fixed random_state.
kmed = KMedoids(n_clusters=5, random_state=12022)
kmed = kmed.fit(audio_features[columns])

In [None]:
# Display the cluster label the fitted model assigned to each training row.
kmed.labels_

In [None]:
# Spot check: predict the cluster of one song (the row at position 3).
# NOTE(review): the hard-coded position requires at least four rows in audio_features.
kmed.predict([audio_features.iloc[3][columns]])

In [None]:
# Pick the index labels of the 5 songs closest to the centre of cluster 1.
target_center = kmed.cluster_centers_[1]
distance_to_center = audio_features[columns].apply(
    lambda song: euclidean_distance(song, target_center),
    axis=1,
)
chosen_element = distance_to_center.nsmallest(5).index

In [None]:
def getAlbumFromTrack(tracks, features_data):
    '''
    Map track URIs to the IDs of the albums they belong to.

    Parameters
              tracks (pandas Series or other iterable of str): track URIs to look up
              features_data (pandas DataFrame): table with 'Track URI' and
                  'Album URI' columns
    Output
              list of str album IDs (the part of each album URI after the last
              ':'), in the same order as tracks
    Raises
              ValueError if a track URI is not present in features_data
    '''
    album_ids = []
    # Iterate over the values directly: Series.iteritems() no longer exists in
    # pandas 2.x, and the index labels were never used.
    for track in tracks:
        matches = features_data.loc[features_data['Track URI'] == track, 'Album URI']
        if matches.empty:
            raise ValueError('Track URI not found in features_data: ' + str(track))
        # A track listed more than once still maps to one album; take the first
        # match instead of failing like `.item()` does on multiple rows.
        album_uri = matches.iloc[0]
        album_ids.append(album_uri.rsplit(':', 1)[-1])
    return album_ids

In [None]:
# Resolve the chosen songs' track URIs to the IDs of their albums.
chosen_track_uris = audio_features.loc[chosen_element, 'Track URI']
album_id = getAlbumFromTrack(chosen_track_uris, data)

In [None]:
def getTrackSuggestionFromAlbum(album_ids):
    '''
    Suggest one randomly chosen track from each of the given albums.

    Uses the module-level `token` as the bearer token for the Spotify API.

    Parameters
              album_ids (iterable of str): Spotify album IDs
    Output
              list of dicts with keys 'song_name' and 'song_url', one per album
              that returned at least one track (albums with no tracks are skipped)
    Raises
              requests.HTTPError if Spotify answers with an error status
              (for example when the token has expired)
    '''
    base_url = "https://api.spotify.com/v1/albums/"
    headers = CaseInsensitiveDict()
    headers["Accept"] = "application/json"
    headers["Content-Type"] = "application/json"
    headers["Authorization"] = "Bearer " + token

    ret = []
    for album_id in album_ids:
        resp = requests.get(base_url + album_id + '/tracks', headers=headers)
        # Report a rejected request as such instead of a KeyError on 'items'.
        resp.raise_for_status()
        # Only the tracks in this single response are considered; pagination
        # is not followed.
        album_tracks = resp.json()['items']
        if not album_tracks:
            # np.random.randint(0) would raise; nothing to suggest for this album.
            continue

        album_tracks_df = pd.json_normalize(album_tracks)
        idx = np.random.randint(len(album_tracks_df))
        ret.append({'song_name': album_tracks_df['name'].iloc[idx], 'song_url': album_tracks_df['external_urls.spotify'].iloc[idx]})
    return ret

In [None]:
# Fetch and display one randomly chosen track for each selected album.
getTrackSuggestionFromAlbum(album_id)

Test for compatibility between person 1 and person 2


In [None]:
# Load person 2's liked-songs CSV. This rebinds `path` and `df`, so the cells
# that follow read person 2's tracks.
path = "liked_p2.csv"
df = pd.read_csv(filepath_or_buffer=path)
print("Downloaded data:")
df

In [None]:
# Fetch Spotify audio features for every track in `df` (person 2 at this
# point) and keep the clustering columns, keyed by 'Track URI'.
track_ids = [track_uri.split(':')[2] for track_uri in df['Track URI']]

url = "https://api.spotify.com/v1/audio-features"
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
headers["Content-Type"] = "application/json"
headers["Authorization"] = "Bearer " + token

# The endpoint caps how many ids one call may carry (100 per the Spotify Web
# API reference), so the ids are sent in batches. Joining each batch also
# avoids the dangling separator that appending '%2C' after every id produced.
audio_features_batch_size = 100
audio_features_json = {'audio_features': []}
for start in range(0, len(track_ids), audio_features_batch_size):
    batch_ids = track_ids[start:start + audio_features_batch_size]
    resp = requests.get(url, headers=headers, params={'ids': ','.join(batch_ids)})
    # Surface an expired/invalid token as an HTTP error instead of a KeyError below.
    resp.raise_for_status()
    # Entries may come back as null for ids Spotify cannot resolve; skip those.
    audio_features_json['audio_features'] += [
        features for features in resp.json()['audio_features'] if features is not None
    ]

audio_features_df = pd.json_normalize(audio_features_json['audio_features'])
audio_features_p2 = audio_features_df[['uri', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']]
audio_features_p2 = audio_features_p2.rename(columns={'uri': 'Track URI'})
# Min-max scale loudness using person 2's own minimum and maximum.
# NOTE(review): the model these features are compared against was fitted on
# loudness scaled with person 1's range, so the two scales may not line up.
# Confirm whether person 1's min/max should be reused here.
audio_features_p2["loudness"] = (audio_features_p2['loudness']-audio_features_p2['loudness'].min())/(audio_features_p2['loudness'].max()-audio_features_p2['loudness'].min())


In [None]:
# Use the fitted K-Medoids model for the comparison cells below. This rebinds
# `model`, which earlier held the estimator handed to the elbow visualizer.
model = kmed

In [None]:
# Raw CSV export for person 2.
# NOTE(review): `data2` is not referenced by any other cell visible in this notebook.
data2 = pd.read_csv('liked_p2.csv')

In [None]:
# Display person 2's audio-feature table.
audio_features_p2

In [None]:
# Predict, one song at a time, the cluster of each row in audio_features.
classes = [
    model.predict(song.values.reshape(1, -1))[0]
    for _, song in audio_features[columns].iterrows()
]


In [None]:
# Prepare the similarity score: for each cluster, find the largest distance
# between one of the training songs assigned to it and the cluster center.
# These per-cluster maxima are used afterwards to normalise distances.
similarity_score_1_2 = 0

# Take the cluster ids from the fitted model instead of hard-coding 0..4, so
# this cell keeps working if the number of clusters is changed.
classes = list(range(len(model.cluster_centers_)))
cluster_map = pd.DataFrame()
cluster_map['data_index'] = audio_features[columns].index.values
cluster_map['cluster'] = model.labels_
# Pair each song with its cluster center by position: labels_ follows row
# order, whereas looking rows up with iloc via index labels is only correct
# when the index happens to be 0..n-1.
cluster_map['dists'] = [
    euclidean_distance(center, song)
    for center, song in zip(model.cluster_centers_[model.labels_], audio_features[columns].values)
]

max_dists = [
    cluster_map.loc[cluster_map['cluster'] == classVal, 'dists'].max()
    for classVal in classes
]
# cluster_map
max_dists

In [None]:
# Assign each of person 2's songs to one of the model's clusters and show the result.
p2_prediction = model.predict(
    audio_features_p2[columns]
)
p2_prediction

In [None]:
# Average, over person 2's songs, of the distance from each song to the center
# of the cluster it was assigned to, divided by that cluster's largest
# training distance.
# Start from zero so that re-running this cell does not add to a previous result.
similarity_score_1_2 = 0
for index, pred in enumerate(p2_prediction):
  # p2_prediction was computed from audio_features_p2, so the matching row must
  # come from that table too (reading audio_features here compared person 1's
  # songs against person 2's cluster assignments).
  t = euclidean_distance(audio_features_p2[columns].iloc[index], model.cluster_centers_[pred]) / max_dists[pred]
  similarity_score_1_2+=t
similarity_score_1_2 /= len(p2_prediction)

In [None]:
# Display the final score.
# NOTE(review): it is a mean of normalised distances, so presumably a lower
# value means the two users are more alike. Confirm the intended reading.
similarity_score_1_2