# **Building Music Recommendation System using Spotify Dataset**


In [12]:
!pip3 install -r requirements.txt

Collecting google
  Downloading google-3.0.0-py2.py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting sklearn
  Using cached sklearn-0.0.post1-py3-none-any.whl
Installing collected packages: sklearn, google
Successfully installed google-3.0.0 sklearn-0.0.post1


In [None]:
# Mount Google Drive only when running on Google Colab; on any other kernel the
# google.colab module is unavailable and the cell would abort a Restart & Run All.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available - skipping Google Drive mount")

if drive is not None:
    drive.mount("/content/gdrive")

# **Import Libraries**

In [1]:
import os
import numpy as np
import pandas as pd

import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")

# **Read Data**

In [2]:
# TODO: create data, genre_data, and year_data variables and import data

In [1]:
# print data info

# **Data Understanding by Visualization and EDA**

# **Music Over Time**

Using the data grouped by year, we can understand how the overall sound of music has changed from 1921 to 2020.

In [5]:
# TODO: Create function to sort into decade bins and visualize

In [6]:
# TODO: select acousticness, danceability, energy, instrumentalness, liveness, valence (A, D, E, I, L, V) and visualize over time

# **Characteristics of Different Genres**

This dataset contains the audio features for different songs along with the audio features for different genres. We can use this information to compare different genres and understand their unique differences in sound.

In [7]:
# TODO: Get top 10 most popular genres and visualize their valence, energy, danceability, acousticness (V, E, D, A)

# **Clustering Genres with K-Means**

Here, the simple K-means clustering algorithm is used to divide the genres in this dataset into ten clusters based on the numerical audio features of each genre.

In [10]:
# EXAMPLE
# StandardScaler() => rescales every feature to zero mean and unit variance so that
#                     no single feature dominates the distance computation
# KMeans() => KMeans algorithm
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# random_state pins the centroid initialisation so cluster labels are reproducible between runs.
cluster_pipeline = Pipeline([('scaler', StandardScaler()),
                             ('kmeans', KMeans(n_clusters=10, random_state=42))])

# The 'cluster' column written below is numeric; drop it if it already exists so that
# re-running this cell does not feed the previous labels back in as a feature.
X = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)

In [11]:
# Visualizing the Clusters with t-SNE
# t-SNE => algorithm that projects multi-dimensional data down to a low-dimensional
#          space (2D or 3D) that we can plot and interpret
from sklearn.manifold import TSNE

# random_state makes the (otherwise stochastic) embedding reproducible between runs.
tsne_pipeline = Pipeline([('scaler', StandardScaler()),
                          ('tsne', TSNE(n_components=2, verbose=1, random_state=42))])
genre_embedding = tsne_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)

# Assign by position (to_numpy) rather than by index label: the embedding rows follow the
# row order of genre_data, while `projection` always has a fresh 0..n-1 index.
projection['genres'] = genre_data['genres'].to_numpy()
projection['cluster'] = genre_data['cluster'].to_numpy()

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.002s...
[t-SNE] Computed neighbors for 2973 samples in 0.148s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.777516
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.113174
[t-SNE] KL divergence after 1000 iterations: 1.391461


# **Clustering Songs with K-Means**

In [8]:
# StandardScaler() => standardize data (mathematical thing)
# KMeans() => KMeans algorithm

# TODO: Create pipeline (Standardize data, then apply KMeans), select X and y variable data and columns list, fit and predict data, and save predictions

In [9]:
# Visualizing the Clusters with PCA
from sklearn.decomposition import PCA

# PCA also allows us to view multi-dimensional data in a 2D way
# think of representing a hand on a flat surface => a shadow!

# TODO: Create Pipeline (StandardScaler, PCA), fit and transform X data from before, save our data and visualize

# **Build Recommender System**

* Based on the analysis and visualizations, it’s clear that similar genres tend to have data points that are located close to each other while similar types of songs are also clustered together.
* This observation makes perfect sense. Similar genres will sound similar and will come from similar time periods while the same can be said for songs within those genres. We can use this idea to build a recommendation system by taking the data points of the songs a user has listened to and recommending songs corresponding to nearby data points.
* [Spotipy](https://spotipy.readthedocs.io/en/2.16.1/) is a Python client for the Spotify Web API that makes it easy for developers to fetch data and query Spotify’s catalog for songs. Install it with `pip install spotipy`.
* After installing Spotipy, you will need to create an app on the [Spotify Developer’s page](https://developer.spotify.com/) and save your Client ID and secret key.

In [14]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from collections import defaultdict

# Create the Spotify Web API client used by find_song().
# Credentials are read from the SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET environment
# variables so real secrets never have to be saved inside the notebook; the placeholder
# strings are only a fallback that marks where the values belong.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=os.environ.get("SPOTIPY_CLIENT_ID", "YOUR ID"),
    client_secret=os.environ.get("SPOTIPY_CLIENT_SECRET", "YOUR SECRET")))

def find_song(name, year):
    """Look up a song on Spotify and return its features as a one-row DataFrame.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used to narrow the search.

    Returns
    -------
    pandas.DataFrame or None
        One row holding 'name', 'year', 'explicit', 'duration_ms', 'popularity' and
        every field of the track's audio-features record, or None when the search
        yields no track or no audio features are available for it.
    """
    # NOTE(review): Spotify's documented field-filter syntax has no space after the
    # colon (track:NAME year:YYYY) - confirm this query filters as intended.
    results = sp.search(q='track: {} year: {}'.format(name,year), limit=1)
    items = results['tracks']['items']
    if not items:
        return None

    track = items[0]
    features = sp.audio_features(track['id'])
    # The audio-features response may be empty or contain a null entry for the track;
    # treat that like "not found" instead of failing on None.items().
    if not features or features[0] is None:
        return None

    song_data = {
        'name': [name],
        'year': [year],
        'explicit': [int(track['explicit'])],
        'duration_ms': [track['duration_ms']],
        'popularity': [track['popularity']],
    }
    # Audio-feature fields are scalars; pandas broadcasts them against the
    # one-element lists above. A repeated key (duration_ms) is overwritten in place.
    song_data.update(features[0])

    return pd.DataFrame(song_data)

In [15]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

# Numeric feature columns that make up a song's vector (read by get_mean_vector).
number_cols = [
    'valence',
    'year',
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness',
    'mode',
    'popularity',
    'speechiness',
    'tempo',
]


def get_song_data(song, spotify_data):
    """Fetch the record for one song, preferring the local dataset over Spotify.

    `song` is a mapping with 'name' and 'year' keys. The first row of `spotify_data`
    whose 'name' and 'year' both match is returned; when no row matches, the result
    of find_song(name, year) is returned instead.
    """
    same_name = spotify_data['name'] == song['name']
    same_year = spotify_data['year'] == song['year']
    matches = spotify_data[same_name & same_year]

    # No local match: fall back to querying Spotify.
    if len(matches) == 0:
        return find_song(song['name'], song['year'])

    return matches.iloc[0]
        

def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Parameters
    ----------
    song_list : list of dict
        Songs described by 'name' and 'year' keys.
    spotify_data : pandas.DataFrame
        Dataset searched first by get_song_data().

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per column in `number_cols`.

    Raises
    ------
    ValueError
        If none of the songs could be resolved, so there is nothing to average.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)
        if song_data is None:
            print('Warning: {} does not exist in Spotify or in database'.format(song['name']))
            continue
        # get_song_data returns a Series for a dataset hit but a one-row DataFrame from
        # the find_song fallback; flatten both to the same 1-D float shape so that the
        # vectors can be stacked regardless of where each song came from.
        song_vector = np.asarray(song_data[number_cols].values, dtype=float).ravel()
        song_vectors.append(song_vector)

    if not song_vectors:
        raise ValueError('None of the given songs could be found in Spotify or in database')

    song_matrix = np.vstack(song_vectors)
    return np.mean(song_matrix, axis=0)


def flatten_dict_list(dict_list):
    """Turn a list of dicts into a single dict of lists.

    Every key seen in any of the dictionaries maps to the list of values found for
    that key, in input order. An empty `dict_list` yields an empty mapping.

    Returns a defaultdict without a default factory, so looking up a key that was
    never seen still raises KeyError.
    """
    # A list factory while building means keys do not have to be pre-registered from
    # dict_list[0]; this handles an empty input and dicts with differing keys.
    flattened_dict = defaultdict(list)

    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)

    # Remove the factory so callers cannot create keys by accident via lookup.
    flattened_dict.default_factory = None
    return flattened_dict


def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend the songs in `spotify_data` closest to the average of `song_list`.

    Parameters
    ----------
    song_list : list of dict
        Seed songs, each with 'name' and 'year' keys.
    spotify_data : pandas.DataFrame
        Song dataset; must contain the `number_cols` feature columns plus
        'name', 'year' and 'artists'.
    n_songs : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict
        Up to `n_songs` records with 'name', 'year' and 'artists', ordered from most
        to least similar. Songs whose name matches a seed song are excluded.
    """
    metadata_cols = ['name', 'year', 'artists']

    # Names of the seed songs, used below so they are not recommended back.
    song_dict = flatten_dict_list(song_list)

    # Mean feature vector of the seed songs as a single row (1, n_features).
    song_center = np.asarray(get_mean_vector(song_list, spotify_data), dtype=float).reshape(1, -1)
    if song_center.shape[1] != len(number_cols):
        raise ValueError('Could not build a feature vector from the given songs')

    # Standardize the dataset and the seed vector with the same scaler so that
    # large-valued columns (e.g. duration_ms, tempo) do not dominate the distance.
    features = spotify_data[number_cols].to_numpy(dtype=float)
    scaler = StandardScaler().fit(features)
    scaled_data = scaler.transform(features)
    scaled_song_center = scaler.transform(song_center)

    # Cosine distance from the seed vector to every song; smaller means more similar.
    distances = cdist(scaled_song_center, scaled_data, 'cosine')[0]
    ranked = np.argsort(distances)

    rec_songs = spotify_data.iloc[ranked]
    # Filter out the seed songs before truncating so that n_songs results remain.
    rec_songs = rec_songs[~rec_songs['name'].isin(song_dict['name'])]
    return rec_songs[metadata_cols].head(n_songs).to_dict(orient='records')

Call the ```recommend_songs()``` function in the following manner: 

In [16]:
# Example: ask for recommendations based on five seed songs. Each seed is a dict with
# 'name' and 'year' keys; `data` is the songs DataFrame created in the "Read Data" section.
recommend_songs([{'name': 'Meh', 'year':2020},
                {'name': 'Location', 'year': 2018},
                {'name': 'Call Me', 'year': 2018},
                {'name': 'JACKIE BROWN', 'year': 2022},
                {'name': 'DEAD MAN WALKING', 'year': 2018}],  data)

[{'name': 'Zoo York (feat. Fivio Foreign & Pop Smoke)',
  'year': 2020,
  'artists': "['Lil Tjay', 'Fivio Foreign', 'Pop Smoke']"},
 {'name': '(i hope you) miss me', 'year': 2020, 'artists': "['Joseph Black']"},
 {'name': 'Wishing For A Hero (feat. BJ The Chicago Kid)',
  'year': 2020,
  'artists': "['Polo G', 'BJ The Chicago Kid']"},
 {'name': 'Swervin (feat. 6ix9ine)',
  'year': 2018,
  'artists': "['A Boogie Wit da Hoodie', '6ix9ine']"},
 {'name': 'Needed Me', 'year': 2016, 'artists': "['Rihanna']"},
 {'name': 'Somebody',
  'year': 2019,
  'artists': "['Internet Money', 'Lil Tecca', 'A Boogie Wit da Hoodie']"},
 {'name': 'F.N', 'year': 2019, 'artists': "['Lil Tjay']"},
 {'name': 'BERETTA (feat. Wifisfuneral)',
  'year': 2020,
  'artists': "['$NOT', 'Wifisfuneral']"},
 {'name': 'You Got It', 'year': 2020, 'artists': "['Vedo']"},
 {'name': 'Dolly (with Lil Uzi Vert)',
  'year': 2020,
  'artists': "['Lil Tecca', 'Lil Uzi Vert']"}]