In [None]:
''' Install Spotipy package '''

!pip install spotipy

In [None]:
''' Attach Google Drive to the Colab runtime '''

from google.colab import drive

# Later cells read and write under 'drive/MyDrive/...' (relative paths), so
# they presumably rely on the working directory being /content - verify if
# this notebook is run outside the default Colab setup.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
''' IMPORTS '''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN, KMeans, SpectralClustering, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture

from sklearn.manifold import TSNE
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

In [None]:
''' LOAD THE SOURCE TABLES '''

# One CSV per aggregation level: individual tracks, per-genre, per-year and
# per-artist. The order of the paths matches the order of the target names.
data, genre_data, year_data, artist_data = map(pd.read_csv, [
    'drive/MyDrive/USML/v2/data.csv',
    'drive/MyDrive/USML/v2/data_by_genres.csv',
    'drive/MyDrive/USML/v2/data_by_year.csv',
    'drive/MyDrive/USML/v2/data_by_artist.csv',
])

In [None]:
''' PREVIEW TRACK-LEVEL DATA '''

# Show the first five rows of the track table.
data.head(n=5)

Unnamed: 0,valence,year,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo
0,0.0594,1921,0.982,"['Sergei Rachmaninoff', 'James Levine', 'Berli...",0.279,831667,0.211,0,4BJqT0PrAfrxzMOxytFOIz,0.878,10,0.665,-20.096,1,"Piano Concerto No. 3 in D Minor, Op. 30: III. ...",4,1921,0.0366,80.954
1,0.963,1921,0.732,['Dennis Day'],0.819,180533,0.341,0,7xPhfUan2yNtyFG0cUWkt8,0.0,7,0.16,-12.441,1,Clancy Lowered the Boom,5,1921,0.415,60.936
2,0.0394,1921,0.961,['KHP Kridhamardawa Karaton Ngayogyakarta Hadi...,0.328,500062,0.166,0,1o6I8BglA6ylDMrIELygv1,0.913,3,0.101,-14.85,1,Gati Bali,5,1921,0.0339,110.339
3,0.165,1921,0.967,['Frank Parker'],0.275,210000,0.309,0,3ftBPsC5vPBKxYSee08FDH,2.8e-05,5,0.381,-9.316,1,Danny Boy,3,1921,0.0354,100.109
4,0.253,1921,0.957,['Phil Regan'],0.418,166693,0.193,0,4d6HGyGT8e121BsdKmw9v6,2e-06,3,0.229,-10.096,1,When Irish Eyes Are Smiling,2,1921,0.038,101.665


In [None]:
''' PREVIEW GENRE-LEVEL DATA '''

# Show the first five rows of the per-genre table.
genre_data.head(n=5)

Unnamed: 0,mode,genres,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,21st century classical,0.979333,0.162883,160297.7,0.071317,0.606834,0.3616,-31.514333,0.040567,75.3365,0.103783,27.833333,6
1,1,432hz,0.49478,0.299333,1048887.0,0.450678,0.477762,0.131,-16.854,0.076817,120.285667,0.22175,52.5,5
2,1,8-bit,0.762,0.712,115177.0,0.818,0.876,0.126,-9.18,0.047,133.444,0.975,48.0,7
3,1,[],0.651417,0.529093,232880.9,0.419146,0.205309,0.218696,-12.288965,0.107872,112.857352,0.513604,20.859882,7
4,1,a cappella,0.676557,0.538961,190628.5,0.316434,0.003003,0.172254,-12.479387,0.082851,112.110362,0.448249,45.820071,7


In [None]:
''' PREVIEW YEAR-LEVEL DATA '''

# Show the first five rows of the per-year table.
year_data.head(n=5)

Unnamed: 0,mode,year,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,1921,0.886896,0.418597,260537.166667,0.231815,0.344878,0.20571,-17.048667,0.073662,101.531493,0.379327,0.653333,2
1,1,1922,0.938592,0.482042,165469.746479,0.237815,0.434195,0.24072,-19.275282,0.116655,100.884521,0.535549,0.140845,10
2,1,1923,0.957247,0.577341,177942.362162,0.262406,0.371733,0.227462,-14.129211,0.093949,114.01073,0.625492,5.389189,0
3,1,1924,0.9402,0.549894,191046.707627,0.344347,0.581701,0.235219,-14.231343,0.092089,120.689572,0.663725,0.661017,10
4,1,1925,0.962607,0.573863,184986.92446,0.278594,0.418297,0.237668,-14.146414,0.111918,115.521921,0.621929,2.604317,5


In [None]:
''' PREVIEW ARTIST-LEVEL DATA '''

# Show the first five rows of the per-artist table.
artist_data.head(n=5)

Unnamed: 0,mode,count,acousticness,artists,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,key
0,1,9,0.590111,"""Cats"" 1981 Original London Cast",0.467222,250318.555556,0.394003,0.0114,0.290833,-14.448,0.210389,117.518111,0.3895,38.333333,5
1,1,26,0.862538,"""Cats"" 1983 Broadway Cast",0.441731,287280.0,0.406808,0.081158,0.315215,-10.69,0.176212,103.044154,0.268865,30.576923,5
2,1,7,0.856571,"""Fiddler On The Roof” Motion Picture Chorus",0.348286,328920.0,0.286571,0.024593,0.325786,-15.230714,0.118514,77.375857,0.354857,34.857143,0
3,1,27,0.884926,"""Fiddler On The Roof” Motion Picture Orchestra",0.425074,262890.962963,0.24577,0.073587,0.275481,-15.63937,0.1232,88.66763,0.37203,34.851852,0
4,1,7,0.510714,"""Joseph And The Amazing Technicolor Dreamcoat""...",0.467143,270436.142857,0.488286,0.0094,0.195,-10.236714,0.098543,122.835857,0.482286,43.0,5


<h2> Dimensionality Reduction </h2>

In [None]:
''' PCA - Music data'''

# Numeric audio features of every track; identifiers, names and dates are excluded.
X = data[['danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# Standardise before projecting: columns such as duration_ms and tempo are on
# far larger scales than the 0-1 features and would otherwise dominate the
# principal components. random_state keeps the projection repeatable when
# scikit-learn selects its randomized SVD solver.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components = 2, random_state = 42))])
X_embedded_pca = pca_pipeline.fit_transform(X)

# Store the 2-D coordinates. index = False keeps the positional index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0' column.
temp_df = pd.DataFrame({'x': X_embedded_pca[:, 0], 'y': X_embedded_pca[:, 1]})
temp_df.to_csv('drive/MyDrive/USML/v2/pca_music.csv', index = False)

In [None]:
''' tSNE - Music data '''

# Numeric audio features of every track; identifiers, names and dates are excluded.
X = data[['danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# Standardise first so large-valued columns (duration_ms, tempo) do not
# dominate the pairwise distances t-SNE works with. t-SNE is stochastic, so
# random_state is fixed to make the embedding reproducible across runs.
# NOTE: this is the slow cell of the notebook (the neighbour search alone took
# ~12 minutes on the full track table in the recorded run).
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components = 2, verbose = 1, random_state = 42))])
X_embedded_tsne = tsne_pipeline.fit_transform(X)

# Store the 2-D coordinates. index = False keeps the positional index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0' column.
temp_df = pd.DataFrame({'x': X_embedded_tsne[:, 0], 'y': X_embedded_tsne[:, 1]})
temp_df.to_csv('drive/MyDrive/USML/v2/tsne_music.csv', index = False)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 170653 samples in 0.733s...
[t-SNE] Computed neighbors for 170653 samples in 743.705s...
[t-SNE] Computed conditional probabilities for sample 1000 / 170653
[t-SNE] Computed conditional probabilities for sample 2000 / 170653
[t-SNE] Computed conditional probabilities for sample 3000 / 170653
[t-SNE] Computed conditional probabilities for sample 4000 / 170653
[t-SNE] Computed conditional probabilities for sample 5000 / 170653
[t-SNE] Computed conditional probabilities for sample 6000 / 170653
[t-SNE] Computed conditional probabilities for sample 7000 / 170653
[t-SNE] Computed conditional probabilities for sample 8000 / 170653
[t-SNE] Computed conditional probabilities for sample 9000 / 170653
[t-SNE] Computed conditional probabilities for sample 10000 / 170653
[t-SNE] Computed conditional probabilities for sample 11000 / 170653
[t-SNE] Computed conditional probabilities for sample 12000 / 170653
[t-SNE] Computed conditional proba

In [None]:
''' PCA - Genre data'''

# Per-genre averages of the numeric audio features; the genre label itself is excluded.
X = genre_data[['danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# Standardise before projecting so large-valued columns (duration_ms, tempo)
# do not dominate the principal components. random_state keeps the projection
# repeatable when scikit-learn selects its randomized SVD solver.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components = 2, random_state = 42))])
X_embedded_pca = pca_pipeline.fit_transform(X)

# Store the 2-D coordinates. index = False keeps the positional index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0' column.
temp_df = pd.DataFrame({'x': X_embedded_pca[:, 0], 'y': X_embedded_pca[:, 1]})
temp_df.to_csv('drive/MyDrive/USML/v2/pca_genre.csv', index = False)

In [None]:
''' tSNE - Genre data '''

# Per-genre averages of the numeric audio features; the genre label itself is excluded.
X = genre_data[['danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# Standardise first so large-valued columns (duration_ms, tempo) do not
# dominate the pairwise distances t-SNE works with. t-SNE is stochastic, so
# random_state is fixed to make the embedding reproducible across runs.
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components = 2, verbose = 1, random_state = 42))])
X_embedded_tsne = tsne_pipeline.fit_transform(X)

# Store the 2-D coordinates. index = False keeps the positional index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0' column.
temp_df = pd.DataFrame({'x': X_embedded_tsne[:, 0], 'y': X_embedded_tsne[:, 1]})
temp_df.to_csv('drive/MyDrive/USML/v2/tsne_genre.csv', index = False)

In [None]:
''' PCA - Artist data'''

# Per-artist averages of the numeric audio features; the artist name and track count are excluded.
X = artist_data[['danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# Standardise before projecting so large-valued columns (duration_ms, tempo)
# do not dominate the principal components. random_state keeps the projection
# repeatable when scikit-learn selects its randomized SVD solver.
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components = 2, random_state = 42))])
X_embedded_pca = pca_pipeline.fit_transform(X)

# Store the 2-D coordinates. index = False keeps the positional index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0' column.
temp_df = pd.DataFrame({'x': X_embedded_pca[:, 0], 'y': X_embedded_pca[:, 1]})
temp_df.to_csv('drive/MyDrive/USML/v2/pca_artist.csv', index = False)

In [None]:
''' tSNE - Artist data '''

# Per-artist averages of the numeric audio features. 'popularity' is included
# so this embedding is built from the same 13 features as the artist PCA cell
# and the track/genre cells (the original list here omitted it).
X = artist_data[['danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_ms', 'popularity']]

# Standardise first so large-valued columns (duration_ms, tempo) do not
# dominate the pairwise distances t-SNE works with. t-SNE is stochastic, so
# random_state is fixed to make the embedding reproducible across runs.
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components = 2, verbose = 1, random_state = 42))])
X_embedded_tsne = tsne_pipeline.fit_transform(X)

# Store the 2-D coordinates. index = False keeps the positional index out of
# the file, so reading it back does not produce a spurious 'Unnamed: 0' column.
temp_df = pd.DataFrame({'x': X_embedded_tsne[:, 0], 'y': X_embedded_tsne[:, 1]})
temp_df.to_csv('drive/MyDrive/USML/v2/tsne_artist.csv', index = False)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 28680 samples in 0.058s...
[t-SNE] Computed neighbors for 28680 samples in 16.295s...
[t-SNE] Computed conditional probabilities for sample 1000 / 28680
[t-SNE] Computed conditional probabilities for sample 2000 / 28680
[t-SNE] Computed conditional probabilities for sample 3000 / 28680
[t-SNE] Computed conditional probabilities for sample 4000 / 28680
[t-SNE] Computed conditional probabilities for sample 5000 / 28680
[t-SNE] Computed conditional probabilities for sample 6000 / 28680
[t-SNE] Computed conditional probabilities for sample 7000 / 28680
[t-SNE] Computed conditional probabilities for sample 8000 / 28680
[t-SNE] Computed conditional probabilities for sample 9000 / 28680
[t-SNE] Computed conditional probabilities for sample 10000 / 28680
[t-SNE] Computed conditional probabilities for sample 11000 / 28680
[t-SNE] Computed conditional probabilities for sample 12000 / 28680
[t-SNE] Computed conditional probabilities for sa

In [None]:
''' Load the saved track-level PCA and tSNE embeddings '''

# Reload the 2-D coordinates written by the dimensionality-reduction cells so
# later cells can use them without recomputing the projections.
pca_music, tsne_music = (
    pd.read_csv('drive/MyDrive/USML/v2/pca_music.csv'),
    pd.read_csv('drive/MyDrive/USML/v2/tsne_music.csv'),
)

In [None]:
# Preview the reloaded PCA embedding. The frame loaded above is `pca_music`;
# a bare `pca` is never defined in this notebook and raises NameError on a
# fresh kernel.
pca_music.head()

Unnamed: 0.1,Unnamed: 0,x,y
0,0,3.810762,3.237547
1,1,-0.287322,-3.140135
2,2,3.306169,1.978608
3,3,1.503984,1.011282
4,4,1.439878,0.103652


In [None]:
# Preview the reloaded t-SNE embedding. The frame loaded above is `tsne_music`;
# a bare `tsne` is never defined in this notebook and raises NameError on a
# fresh kernel.
tsne_music.head()

Unnamed: 0.1,Unnamed: 0,x,y
0,0,3.810762,3.237547
1,1,-0.287322,-3.140135
2,2,3.306169,1.978608
3,3,1.503984,1.011282
4,4,1.439878,0.103652
