In [1]:
# Only run if you need spotipy to pull from Spotify API


import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Code to Pull Data from API

In [73]:
#Authentication - without user
cid = ""
secret = "" 

## you will need to use the spotify keys generated for ur account
## log in to spotify for developers using ur spotify acc
## and go the "dashboard", create new app
## and then it should give you a client key/ID and a secret key/ID to use here


client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [221]:
data = pd.read_csv("muse_v3.csv")

# pull relevant columns from original dataset (MuseV3) and drop entres w/o spotify_id entries
data = data.loc[:,["track", "artist","seeds", "spotify_id"]]
data = data.dropna(subset=["spotify_id"])
data = data.reset_index(drop=True)

# print(data.loc[36463])
# print(data.loc[53048])
# print(data.loc[53049])

# drop outlying entries that give invalid spotify_id
data = data.drop([36463, 53048, 53049], axis=0)
data = data.reset_index(drop=True)

spotify_URIs = data.loc[:,"spotify_id"]

data.to_csv("cleaned.csv")

track                The Worst Thing
artist              Natalie Merchant
seeds                    ['sensual']
spotify_id    1oxGZTK9dJsxGFFQg5Bp4A
Name: 36463, dtype: object
track          Duo: Stabat Mater dolorosa
artist        Giovanni Battista Pergolesi
seeds                          ['sacred']
spotify_id         03XXOqmlkcR7m09uiBnJXA
Name: 53048, dtype: object
track         Duo: O quam tristis et afflicta
artist            Giovanni Battista Pergolesi
seeds                              ['sacred']
spotify_id             7oOrKaeaN9TILjfVW0Nesq
Name: 53049, dtype: object


In [240]:
num_tracks = len(spotify_URIs)
print(num_tracks)

# pull audio features from Spotify API 100 at a time

audio_features = sp.audio_features(tracks=spotify_URIs[:100])
audio_data = pd.DataFrame(audio_features)
for i in range(100, num_tracks, 100): 
    if (i >= 61600):
        audio_features = sp.audio_features(tracks=spotify_URIs[i:])
        print(i)
        print("LAST")
    else:
        audio_features = sp.audio_features(tracks=spotify_URIs[i:i+100])

    more_data = pd.DataFrame(audio_features)

    audio_data = pd.concat((audio_data, more_data), axis=0)

    print(audio_data)

61627
    danceability  energy  key  loudness  mode  speechiness  acousticness  \
0          0.548   0.847    1    -3.237     1       0.1860      0.062200   
1          0.249   0.949    2    -2.642     0       0.0678      0.001310   
2          0.668   0.787    1    -4.226     1       0.0429      0.109000   
3          0.805   0.918    9    -4.554     1       0.2120      0.049300   
4          0.657   0.960    5    -3.524     0       0.0700      0.001690   
..           ...     ...  ...       ...   ...          ...           ...   
95         0.511   0.961    7    -4.450     0       0.0651      0.000064   
96         0.827   0.676    7    -9.735     1       0.2620      0.025700   
97         0.817   0.658    2    -6.212     1       0.3060      0.108000   
98         0.361   0.871   10    -4.821     0       0.3100      0.001190   
99         0.210   0.841    1    -5.676     0       0.0715      0.000227   

    instrumentalness  liveness  valence    tempo            type  \
0           0

In [241]:
print(data.shape)
print(audio_data.shape)
audio_data = audio_data.reset_index(drop=True)

# combine original dataset containing titles/artists with the pulled audio features

full_data = pd.concat((data, audio_data), axis=1)
full_data = full_data.reset_index(drop=True)

audio_data.to_csv("audio_feat.csv")

full_data.to_csv("full_data.csv")

(61627, 4)
(61627, 18)


# Removing Duplicate Tags

full_data.csv contains the cleaned data w/ pulled audio features from Spotify.

The code below then cuts songs that are associated with multiple tags, leaving only songs w/ a singular emotion tag.

In [5]:
full_data = pd.read_csv("full_data.csv")
full_data = full_data.drop("Unnamed: 0", axis=1)

# get only rows where the "seeds"/tags columns does not contain a "," that indicated multiple tags
single_tags = full_data[full_data["seeds"].str.contains(",") == False]

# the list of the different tags and # of occurrences is written to tags.csv
# full dataset with only single tags is written to single_tags.csv
single_tags["seeds"].value_counts().to_csv("tags.csv")
single_tags.to_csv("single_tags.csv")

In [6]:
print(full_data.shape)
print(single_tags.shape)

print(single_tags["seeds"].shape)

# this only cuts about 13000 from the dataset. 

# PROBLEM: too many labels.
# there are about 2560 different labels
# and each label accounts for at maximum 1.5% of the data points

(61627, 22)
(48390, 22)
(48390,)


# Creating CSV of Features for Single Tags

In [7]:
relevant_data_single_tags = single_tags.drop(["track", "artist", "seeds", "spotify_id", "type", "id", "uri", "track_href", "analysis_url", "duration_ms"], axis=1)
print(relevant_data_single_tags)

relevant_data_single_tags.to_csv("relevant_features_single_tags.csv")

       danceability  energy  key  loudness  mode  speechiness  acousticness  \
0             0.548   0.847    1    -3.237     1       0.1860      0.062200   
1             0.249   0.949    2    -2.642     0       0.0678      0.001310   
2             0.668   0.787    1    -4.226     1       0.0429      0.109000   
4             0.657   0.960    5    -3.524     0       0.0700      0.001690   
5             0.431   0.962    6    -3.269     1       0.0789      0.000008   
...             ...     ...  ...       ...   ...          ...           ...   
61622         0.396   0.915    0    -5.126     0       0.1560      0.033400   
61623         0.244   0.866    2    -5.774     0       0.0674      0.004330   
61624         0.236   0.107   10   -20.091     0       0.0344      0.901000   
61625         0.542   0.909    2    -8.977     0       0.1230      0.071800   
61626         0.495   0.291    0   -11.355     1       0.0280      0.727000   

       instrumentalness  liveness  valence    tempo

# Initial PCA Analysis

In [8]:
# perform PCA on full_data (for unsupervised learning)
# drop irrelevant columns to get only relevant features 
relevant_data = full_data.drop(["track", "artist", "seeds", "spotify_id", "type", "id", "uri", "track_href", "analysis_url", "duration_ms"], axis=1)
print(relevant_data.shape)

relevant_data.to_csv("features_only.csv")

print(relevant_data.columns)

scaler = StandardScaler()
scaler.fit(relevant_data.to_numpy())
scaled_data = scaler.transform(relevant_data.to_numpy())

# pca with all 12 componenets (no features removed)
pca_full = PCA(12)
pca_full.fit(scaled_data)
pca_full_data = pca_full.transform(scaled_data)

# prints the percentage of variance attributed/explained by each component
print(pca_full.explained_variance_ratio_)

# prints the principal components (eigenvectors)
# see how much each original feature contributes to each component
print(pca_full.components_)


(61627, 12)
Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'time_signature'],
      dtype='object')
[0.25852876 0.11352705 0.09887135 0.09709431 0.08114961 0.07430625
 0.07126963 0.06620282 0.06305546 0.03538144 0.02872126 0.01189205]
[[-2.71376318e-01 -4.91197283e-01 -2.60754062e-02 -4.77282660e-01
  -5.01374101e-03 -1.22084970e-01  4.38932157e-01  2.37321677e-01
  -1.12023665e-01 -3.54964914e-01 -1.63473538e-01 -1.65576491e-01]
 [-5.93338512e-01  2.85677089e-01  2.89767506e-02  1.70698389e-01
  -8.29119744e-02 -6.26395174e-02 -2.80481147e-01  3.05583785e-01
   2.46985388e-01 -4.39007180e-01  2.85135797e-01 -1.35939994e-01]
 [ 8.58634334e-02  1.42282930e-03  6.44998286e-01 -6.58323292e-02
  -6.75323034e-01  2.50451146e-01  1.97411423e-02  9.80832646e-02
   1.23890346e-01 -1.13856203e-04 -1.60910734e-01  4.72297541e-02]
 [-6.12362988e-02 -3.03506694e-02 -1.66545236e-01 -6.90313