# Building Recommender Systems with Spotify Data

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spotipy
import datapane as dp
import chart_studio.plotly as py
import chart_studio
import os
# username = os.environ['PLOTLY_USERNAME']
# api_key = os.environ['PLOTLY_API_KEY']
# chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
# Set the default matplotlib font size to 22 for all subsequent figures.
plt.rcParams.update({'font.size': 22})
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Reading the Data

In [None]:
import os
from pathlib import Path

# Directory holding the three CSV inputs. It defaults to the original author's
# local Windows path; set the SPOTIFY_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "SPOTIFY_DATA_DIR",
    "D:\\bs4\\music-recommendation\\SpotifyRecommenderSystem\\data",
))

# pandas infers zip compression from the ".zip" suffix.
spotify_data = pd.read_csv(DATA_DIR / "data.csv.zip")
genre_data = pd.read_csv(DATA_DIR / "data_by_genres.csv")
data_by_year = pd.read_csv(DATA_DIR / "data_by_year.csv")
# Last expression of the cell: the notebook displays the first ten rows.
spotify_data.head(10)

In [None]:
# Print the column names, dtypes and non-null counts of spotify_data.
spotify_data.info()

In [None]:
# Print the column names, dtypes and non-null counts of genre_data.
genre_data.info()

In [None]:
# Print the column names, dtypes and non-null counts of data_by_year.
data_by_year.info()

## Exploratory Data Analysis

In [None]:
# Distribution of track popularity: density-normalised histogram with a KDE
# curve on top. `sns.distplot` is deprecated; `histplot` with these options
# is its documented replacement.
sns.histplot(x=spotify_data['popularity'], kde=True, stat='density')

### Music Over Time

In [None]:
def get_decade(year):
    """Return the decade containing *year* as a string, e.g. 1995 -> '1990'."""
    # Dividing by ten and truncating drops the final digit of the year.
    tens = int(year / 10)
    return str(tens * 10)

# Label every track with the decade of its release year.
spotify_data['decade'] = spotify_data['year'].map(get_decade)

In [None]:
# Bar chart of the number of tracks per decade.
sns.set(rc={'figure.figsize': (11, 6)})
# Pass the column as `x=`: seaborn deprecated, and in later releases stopped
# accepting, the plotted variable as a positional argument.
sns.countplot(x=spotify_data['decade'])

In [None]:
import plotly.express as px

# Audio-feature columns of data_by_year to draw, one line per feature.
sound_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
]

# Line chart of each feature against the year.
fig = px.line(data_frame=data_by_year, x='year', y=sound_features)
fig.show()


# report = dp.Report(dp.Plot(fig) ) #Create a report
# report.publish(name='music_over_time', open=True, visibility='PUBLIC') #Publish the report

In [None]:
# Line chart of the tempo column against the year.
fig = px.line(data_frame=data_by_year, x='year', y='tempo')
fig.show()


# report = dp.Report(dp.Plot(fig) ) #Create a report
# report.publish(name='music_tempo_over_time', open=True, visibility='PUBLIC') #Publish the report

### Characteristics of Different Genres

In [None]:
# The ten genre rows with the highest popularity value.
top10_genres = genre_data.nlargest(n=10, columns='popularity')

In [None]:
# Grouped bar chart comparing four audio features across the top-10 genres.
fig = px.bar(
    data_frame=top10_genres,
    x='genres',
    y=['valence', 'energy', 'danceability', 'acousticness'],
    barmode='group',
)
fig.show()
# report = dp.Report(dp.Plot(fig) ) #Create a report
# report.publish(name='sound_of_different_genres', open=True, visibility='PUBLIC') #Publish the report

## Clustering Genres

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Standardise the numeric genre features, then cluster them with DBSCAN.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('dbscan', DBSCAN(eps=4, min_samples=2))
    # ('dbscan', DBSCAN(eps=0.5, min_samples=5))
])

# Numeric feature columns only. A 'cluster' column left over from a previous
# run of this cell is excluded so old labels are never fed back in as a feature.
X = genre_data.select_dtypes(np.number).drop(columns='cluster', errors='ignore')

# Run the whole pipeline in one pass so DBSCAN labels the *scaled* features.
# Calling fit_predict on the bare 'dbscan' step would refit it on the raw,
# unscaled X and discard the scaler's work. Label -1 marks noise points.
genre_data['cluster'] = cluster_pipeline.fit_predict(X)

In [None]:
from sklearn.manifold import TSNE

# Standardise the genre features and embed them in two dimensions with t-SNE.
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=2)),
])
genre_embedding = tsne_pipeline.fit_transform(X)

# Plot-ready table: 2-D coordinates plus each genre's name and cluster label.
projection = pd.DataFrame(genre_embedding, columns=['x', 'y']).assign(
    genres=genre_data['genres'],
    cluster=genre_data['cluster'],
)

In [None]:
import plotly.express as px

# Scatter plot of the 2-D genre projection, coloured by cluster label.
fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'genres'],
)
fig.show()


# report = dp.Report(dp.Plot(fig) ) #Create a report
# report.publish(name='clustering_genres', open=True, visibility='PUBLIC') #Publish the report

## Clustering Songs

In [None]:
# Standardise the numeric song features, then cluster them with DBSCAN.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('dbscan', DBSCAN(eps=3, min_samples=5))
])

# Numeric feature columns only. A 'cluster_label' column left over from a
# previous run of this cell is excluded so old labels are not used as a feature.
X = spotify_data.select_dtypes(np.number).drop(columns='cluster_label', errors='ignore')
number_cols = list(X.columns)

# Run the whole pipeline in one pass so DBSCAN labels the *scaled* features.
# Calling fit_predict on the bare 'dbscan' step would refit it on the raw,
# unscaled X. Label -1 marks noise points.
spotify_data['cluster_label'] = cluster_pipeline.fit_predict(X)

In [None]:
# spotify_data['cluster_label'] = cluster_pipeline.named_steps['dbscan'].fit_predict(X)

In [None]:
from sklearn.decomposition import PCA

# Standardise the song features and reduce them to two principal components.
pca_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Plot-ready table: 2-D coordinates plus each song's title and cluster label.
projection = pd.DataFrame(song_embedding, columns=['x', 'y']).assign(
    title=spotify_data['name'],
    cluster=spotify_data['cluster_label'],
)

In [None]:
import plotly.express as px

# Scatter plot of the 2-D song projection, coloured by cluster label.
fig = px.scatter(
    data_frame=projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()


# report = dp.Report(dp.Plot(fig) ) #Create a report
# report.publish(name='clustering_songs', open=True, visibility='PUBLIC') #Publish the report

## Building a Content-Based Recommender System

In [None]:
# Import libraries
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Initialize Spotify API
#
# SECURITY: API credentials must not live in source control, so they are read
# from the environment. A missing variable raises KeyError naming it.
# The key pair that used to be hard-coded here remains visible in the file's
# history and should be revoked and regenerated in the Spotify developer dashboard.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

# Define a function to get audio features of a track
def get_audio_features(track):
    """Look up *track* on Spotify and return its audio features.

    Args:
        track: Free-text search query (the notebook uses "<title> - <artist>").

    Returns:
        dict mapping each of the nine feature names below to the value
        reported by Spotify for the first search hit.

    Raises:
        IndexError: if the search returns no track for the query.
        ValueError: if Spotify returns no audio features for the matched track.
    """
    feature_names = (
        "danceability",
        "energy",
        "loudness",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
    )
    # Only the best match is requested; an empty list means nothing matched.
    hits = sp.search(track, limit=1)["tracks"]["items"]
    if not hits:
        raise IndexError(f"No Spotify track found for query {track!r}")
    # audio_features returns one entry per requested id.
    # NOTE(review): the entry appears to be None when Spotify has no analysis
    # for the track -- confirm against the spotipy documentation.
    audio_features = sp.audio_features(hits[0]["id"])[0]
    if audio_features is None:
        raise ValueError(f"Spotify returned no audio features for {track!r}")
    # Keep only the numeric features used for clustering.
    return {name: audio_features[name] for name in feature_names}


# Example seed tracks, written as "<title> - <artist>" search queries.
tracks = [
    "Bad Habits - Ed Sheeran",
    "Levitating - Dua Lipa",
    "Blinding Lights - The Weeknd",
    "Happier Than Ever - Billie Eilish",
    "Circles - Post Malone",
]

# Define a function to get recommendations based on DBSCAN clustering
def get_recommendations(tracks, *, eps=4, min_samples=2):
    """Cluster *tracks* by audio features and print Spotify recommendations per cluster.

    Each track's audio features are fetched with ``get_audio_features``,
    standardised, and clustered with DBSCAN. For every cluster found, the
    member tracks are printed, followed by five recommendations that Spotify
    returns when seeded with those members.

    Args:
        tracks: Sequence of search queries (e.g. "<title> - <artist>").
        eps: DBSCAN neighbourhood radius, measured in standardised feature space.
        min_samples: DBSCAN minimum neighbourhood size for a core point.

    Returns:
        None. All results are printed.
    """
    # Spotify's recommendations endpoint accepts at most five seeds per request.
    max_seeds = 5
    feature_columns = [
        "danceability", "energy", "loudness", "speechiness", "acousticness",
        "instrumentalness", "liveness", "valence", "tempo",
    ]

    # Nothing to scale or cluster: report zero clusters instead of letting the
    # scaler fail on an empty matrix.
    if not tracks:
        print("Number of clusters: 0")
        return

    # Collect one record per track and build the frame in a single step
    # (DataFrame.append was removed in pandas 2.0 and was quadratic anyway).
    rows = [{"track": track, **get_audio_features(track)} for track in tracks]
    df = pd.DataFrame(rows, columns=["track", *feature_columns])

    # Scale the audio features so no single feature dominates the distances.
    X = StandardScaler().fit_transform(df[feature_columns])
    dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    print("dbscan:", dbscan)
    # One label per track; -1 marks noise points that belong to no cluster.
    labels = dbscan.labels_
    print("labels:", labels)
    # Number of clusters, excluding the noise label.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(f"Number of clusters: {n_clusters}")

    # Track names grouped by cluster label 0..n_clusters-1.
    clusters = [df.loc[labels == i, "track"].to_list() for i in range(n_clusters)]
    for i, members in enumerate(clusters):
        print(f"Cluster {i}:")
        print(members)

    # For each cluster, ask Spotify for 5 recommendations seeded by its tracks.
    for i, members in enumerate(clusters):
        print(f"Recommendations for cluster {i}:")
        seed_ids = [
            sp.search(track, limit=1)["tracks"]["items"][0]["id"]
            for track in members[:max_seeds]
        ]
        recommendations = sp.recommendations(seed_tracks=seed_ids, limit=5)["tracks"]
        for rec in recommendations:
            print(rec["name"], "-", rec["artists"][0]["name"])
            
# Run the recommender on the example seed tracks.
get_recommendations(tracks)