# Top Charts Song Classification

## Outline
1. <a href='#1'>Introduction</a>
2. <a href='#2'>Dataset Setup</a>
3. <a href='#3'>Results</a>
4. <a href='#4'>Discussion</a>

## 1. Introduction

## 2. Dataset Setup

### Imports

In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from spotipy import Spotify, oauth2
import secret
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import time
from multiprocessing import Pool
import math
from datetime import datetime


In [2]:
def import_data(path='unedited_song_data.csv'):
    """
    Import all the song data from the raw, unedited csv
    Returns the csv as a dataframe object (still uncleaned)
    Link to Dataset on Kaggle: https://www.kaggle.com/datasets/multispiros/34740-hit-and-nonhit-songs-spotify-features?resource=download
    :param path: location of the csv to load; defaults to 'unedited_song_data.csv',
                 which is resolved relative to the current working directory
    """
    return pd.read_csv(path)

### Spotify API Setup

In [294]:
artist_popularities = {} # concatenate artist popularities from all class instances

class SpotifyAPI:
    """
    Wrapper around a spotipy client that looks up the primary artist's
    popularity for batches of track ids and accumulates the results.

    Attributes:
        sp: the authenticated spotipy client
        request_delay: seconds to pause after every API request
        artist_popularity: DataFrame with one row per resolved track
                           ('Track ID', 'Artist ID', 'Artist Popularity')
        tracks: every raw track object returned so far
        artists: every raw artist object returned so far
    """

    def __init__(self, client_id: str, client_secret: str, request_delay: float = 30):
        """
        Create a SpotifyAPI object
        :param client_id: Your Spotify client ID
        :param client_secret: Your Spotify client secret
        :param request_delay: seconds to sleep after each API request (default 30)
        """
        credentials_manager = oauth2.SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
        self.sp = Spotify(client_credentials_manager=credentials_manager)
        self.request_delay = request_delay
        self.artist_popularity = pd.DataFrame(columns=['Track ID', 'Artist ID', 'Artist Popularity'])
        self.tracks = []
        self.artists = []

    def _throttled_call(self, endpoint, **kwargs):
        """
        Call a spotipy endpoint, then pause for request_delay seconds.
        On failure the error is printed and re-raised, so the caller sees the
        real cause instead of continuing with a response that was never assigned.
        :param endpoint: bound spotipy method to invoke
        :param kwargs: keyword arguments forwarded to the endpoint
        """
        try:
            return endpoint(**kwargs)
        except Exception as se:
            print("Error details: {}".format(se))
            raise
        finally:
            # Pause after success and after failure alike
            time.sleep(self.request_delay)

    def get_tracks(self, track_ids):
        """
        Fetch the track objects for up to 50 track ids.
        Returns the list of track objects and also appends them to self.tracks
        :param track_ids: list of Spotify track ids (at most 50)
        :raises ValueError: if more than 50 ids are given
        """
        if len(track_ids) > 50:
            # ValueError is still an Exception, so existing handlers keep working
            raise ValueError("Limit is 50 tracks")

        track_objects = self._throttled_call(self.sp.tracks, tracks=track_ids)
        actual_tracks = track_objects['tracks']
        self.tracks.extend(actual_tracks)
        return actual_tracks

    def get_artist_popularity(self, track_ids):
        """
        Look up the popularity of each track's first-listed artist and append
        the rows to self.artist_popularity.
        :param track_ids: list of Spotify track ids (at most 50)
        """
        track_objects = self.get_tracks(track_ids)

        # NOTE(review): the Web API is documented to return null for ids it
        # cannot resolve; skip those while keeping track id and artist id paired.
        resolved = [
            (track_id, track['artists'][0]['id'])
            for track_id, track in zip(track_ids, track_objects)
            if track is not None
        ]
        if not resolved:
            return
        resolved_track_ids = [track_id for track_id, _ in resolved]
        artist_ids = [artist_id for _, artist_id in resolved]

        artist_objects_dict = self._throttled_call(self.sp.artists, artists=artist_ids)
        artist_objects = artist_objects_dict['artists']
        self.artists.extend(artist_objects)

        popularity_values = [artist['popularity'] for artist in artist_objects]

        artist_pop_df = pd.DataFrame({
            'Track ID': resolved_track_ids,
            'Artist ID': artist_ids,
            'Artist Popularity': popularity_values,
        })
        if self.artist_popularity.empty:
            # The first batch replaces the empty placeholder so its column dtypes are kept
            self.artist_popularity = artist_pop_df
        else:
            self.artist_popularity = pd.concat([self.artist_popularity, artist_pop_df], ignore_index=True)
        print('---artist_popularity length', len(self.artist_popularity))


#### Spotify API Clients

In [295]:
# One SpotifyAPI wrapper per credential pair kept in the local `secret` module.
spotify_api_0 = SpotifyAPI(
    client_id=secret.spotify_client_id_tester,
    client_secret=secret.spotify_secret_key_tester,
)
spotify_api_1 = SpotifyAPI(
    client_id=secret.spotify_client_id_1,
    client_secret=secret.spotify_secret_key_1,
)
spotify_api_2 = SpotifyAPI(
    client_id=secret.spotify_client_id_2,
    client_secret=secret.spotify_secret_key_2,
)
spotify_api_3 = SpotifyAPI(
    client_id=secret.spotify_client_id_3,
    client_secret=secret.spotify_secret_key_3,
)
spotify_api_4 = SpotifyAPI(
    client_id=secret.spotify_client_id_4,
    client_secret=secret.spotify_secret_key_4,
)

In [296]:
# Position i in this list is the client that process_dataset_portion(i) uses.
spotify_clients = [
    spotify_api_0,
    spotify_api_1,
    spotify_api_2,
    spotify_api_3,
    spotify_api_4,
]

In [297]:
# Load the raw csv, drop incomplete rows and the columns not needed here,
# then keep a single row per (whitespace-stripped) artist name.
unedited_song_data = import_data()
test_cleaned_song_data = unedited_song_data.dropna().drop(
    columns=['track_title', 'key', 'time_signature']
)

test_cleaned_song_data['artist_name'] = test_cleaned_song_data['artist_name'].str.strip()
# reset_index() keeps the original row labels in an 'index' column
dups_removed = test_cleaned_song_data.drop_duplicates(subset=['artist_name']).reset_index()

In [298]:
# Split the one-row-per-artist frame into five parts; process_dataset_portion
# pairs fifths[i] with spotify_clients[i].
fifths = np.array_split(dups_removed, 5)

In [299]:
def split_dataframe(df, chunk_size = 50):
    """
    Cuts df into consecutive slices of at most chunk_size rows.
    Returns the slices as a list (the last one may be shorter)
    :param df: the sliceable object (e.g. a DataFrame) to cut up
    :param chunk_size: maximum number of rows per slice, default 50
    """
    total_chunks = math.ceil(len(df) / chunk_size)
    return [
        df[start * chunk_size:(start + 1) * chunk_size]
        for start in range(total_chunks)
    ]

In [300]:
# Chunk the first portion with split_dataframe's default size (50 rows) for a manual trial.
test_df = split_dataframe(fifths[0])

In [301]:
# Track ids of the second chunk, for trying a single API batch by hand.
test_list = test_df[1].track_id.tolist()
# spotify_api_0.get_artist_popularity(test_list)

In [302]:
# Display the rows the tester client has collected so far.
spotify_api_0.artist_popularity

Unnamed: 0,Track ID,Artist ID,Artist Popularity


In [303]:
def process_dataset_portion(idx):
    """
    Runs the artist popularity lookup for one portion of the dataset.
    Uses spotify_clients[idx] on fifths[idx], one chunk at a time, and prints
    a timestamped progress line after each chunk. Results accumulate on the
    client's artist_popularity DataFrame; nothing is returned.
    :param idx: index of the portion (and of the matching client) to process
    """
    client = spotify_clients[idx]
    portion = fifths[idx]

    # Split dataframe into chunks of 50 to adhere to rate/size limit
    batches = split_dataframe(portion)
    print('----------------------api:', idx, '   start:', datetime.now().strftime("%H:%M:%S"))
    for batch_number, batch in enumerate(batches, start=1):
        client.get_artist_popularity(batch.track_id.tolist())
        print('api:', idx, '   finished:', batch_number, '            ', datetime.now().strftime("%H:%M:%S"))
    print('-----------------------api:', idx, '   end:', datetime.now().strftime("%H:%M:%S"))

In [304]:
# Sumana: 0, 1, 2
# (presumably the portion indices assigned to Sumana; only portion 0 is run in this cell)
process_dataset_portion(0)

----------------------api: 0    start: 15:42:15
---artist_popularity length 50
api: 0    finished: 1              15:43:15
---artist_popularity length 100
api: 0    finished: 2              15:44:16
---artist_popularity length 150
api: 0    finished: 3              15:45:16


Process SpawnPoolWorker-107:
Traceback (most recent call last):
  File "/Users/Sumana/opt/anaconda3/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/Sumana/opt/anaconda3/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/Sumana/opt/anaconda3/lib/python3.8/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/Sumana/opt/anaconda3/lib/python3.8/multiprocessing/queues.py", line 355, in get
    with self._rlock:
  File "/Users/Sumana/opt/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
Process SpawnPoolWorker-106:
Traceback (most recent call last):
  File "/Users/Sumana/opt/anaconda3/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/Sumana/opt/anaconda3/lib/python3.8/multiprocessing/process.py", line 108, in run
    self

KeyboardInterrupt: 

In [None]:
# Gerri: 3 & 4
# (presumably the portion indices assigned to Gerri; only portion 3 is run in this cell)
# Note: after the run, the matching client's .artist_popularity attribute
# (e.g. spotify_api_3.artist_popularity) holds the collected dataframe
process_dataset_portion(3)

### Data Cleaning Methods

In [49]:
def clean_data(unedited_song_data):
    """
    Cleans the given DataFrame without modifying it: rows with missing values
    are dropped, whitespace around artist_name is stripped, and the
    track_title, key, time_signature and track_id columns are removed.
    Returns the cleaned DataFrame
    NOTE: artist_name is kept as-is; replacing it with the artist popularity
    metric from the Spotify API is not wired in yet.
    :param unedited_song_data: the unedited and uncleaned DataFrame object
    """
    cleaned_song_data = unedited_song_data.dropna()
    cleaned_song_data = cleaned_song_data.assign(
        artist_name=cleaned_song_data['artist_name'].str.strip()
    )

    # TODO: merge in popularity here, e.g. add_popularity_info(cleaned_song_data),
    # before track_id is discarded

    # TODO: scale data

    return cleaned_song_data.drop(
        columns=['track_title', 'key', 'time_signature', 'track_id']
    )
    

In [None]:
def scale_train_test_x(train, test, scaler = None):
    '''
    Fits a scaler on the training data only, then applies that same fitted
    scaler to both the training and the testing data.
    Returns the scaled train and test data, in that order
    :param train: the training data the scaler is fitted on and applied to
    :param test: the testing data the fitted scaler is applied to
    :param scaler: object with fit/transform methods; a new StandardScaler when omitted
    '''
    fitted = StandardScaler() if scaler is None else scaler
    fitted.fit(train)
    scaled_train = fitted.transform(train)
    scaled_test = fitted.transform(test)
    return scaled_train, scaled_test

In [25]:
def split_data(cleaned_song_data):
    """
    Splits the data into a 75%/25% split with the 25% allotted for testing
    Scales the features with a scaler fitted on the training portion only
    Returns the data split into x_train, x_test, y_train and y_test, where
    x_train and x_test are the scaled features and y_train and y_test are
    the untouched labels
    NOTE(review): every feature column must be numeric for the default
    scaler; a text column such as artist_name presumably has to be removed
    or encoded beforehand - confirm against clean_data.
    :param cleaned_song_data: the cleaned DataFrame object
    :raises KeyError: if no 'on_chart' label column (any casing) is present
    """
    # The dataset spells the label 'On_chart', so match the name case-insensitively
    label_col = next(
        (col for col in cleaned_song_data.columns if str(col).lower() == 'on_chart'),
        None,
    )
    if label_col is None:
        raise KeyError('on_chart')

    song_y = cleaned_song_data[label_col]
    song_x = cleaned_song_data.drop([label_col], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(song_x, song_y, test_size=0.25)
    x_train = x_train.reset_index(drop=True)
    x_test = x_test.reset_index(drop=True)
    y_train = y_train.reset_index(drop=True)
    y_test = y_test.reset_index(drop=True)

    # Only the features are scaled; the labels must stay as they are
    x_train, x_test = scale_train_test_x(x_train, x_test)
    return x_train, x_test, y_train, y_test

### Cleaning the Data

In [43]:
# Load the raw csv into a DataFrame.
uncleaned_song_data = import_data()

In [44]:
# Display the raw DataFrame.
uncleaned_song_data

Unnamed: 0,track_title,artist_name,track_id,duration_ms,energy,key,mode,time_signature,acousticness,danceability,instrumentalness,liveness,loudness,speechiness,valence,tempo,On_chart
0,Red Planet - Red Planet Radio,Joe,3FeufAV1f6fajILIQuAXMw,256080,0.96800,7,0,4,0.02150,0.698,0.862000,0.1110,-11.012,0.0337,0.3900,136.884,0
1,Red Planet - Red Planet Extended,Joe,2o7oYjZuo3S2QitY4R5dII,395442,0.97000,2,1,4,0.00339,0.724,0.896000,0.1060,-8.415,0.0358,0.4320,136.868,0
2,Basejump,Robin Schulz,6nWoNNkfffbmHU9z3kGPao,391680,0.40600,5,0,4,0.00433,0.803,0.825000,0.0948,-9.444,0.0608,0.0769,125.014,0
3,"Goldberg Variations, BWV 988 (Arr. for Accordi...",Johann Sebastian Bach,5Zg1UOcmee5oiRw3gbXna0,347840,0.00267,7,0,5,0.91000,0.150,0.028700,0.1250,-24.701,0.0434,0.1050,135.652,0
4,"Goldberg Variations, BWV 988 (Arr. for Accordi...",Johann Sebastian Bach,2ZWmWGoZS2OTGZSnMhck7Q,181733,0.22600,7,1,4,0.76800,0.290,0.000002,0.1050,-14.990,0.0385,0.4700,98.348,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34735,Además de Mí - Remix,Rusherking,7I8L3vYCLThw2FDrE6LuzE,330040,0.61300,6,1,4,0.03960,0.397,0.000000,0.1920,-5.077,0.0549,0.4050,78.400,1
34736,Drunk,Elle King & Miranda Lambert,0QULNNd9z5s35entfiiXoa,245626,0.88400,5,1,4,0.00516,0.612,0.000000,0.0997,-4.400,0.0459,0.6260,119.991,1
34737,Breaking Up Was Easy In The 90's,Sam Hunt,4sf2L157iEgAR7yrCNLgSq,215933,0.64900,9,1,4,0.23100,0.562,0.000000,0.3410,-5.400,0.0494,0.3760,145.913,1
34738,Nobody,Dylan Scott,5TWAIHYaOnYg4txfmCgon5,160707,0.74700,0,1,4,0.51900,0.573,0.000001,0.0867,-6.460,0.0319,0.6450,79.952,1


In [45]:
# Print the column dtypes and non-null counts of the raw data.
uncleaned_song_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34740 entries, 0 to 34739
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_title       34739 non-null  object 
 1   artist_name       34739 non-null  object 
 2   track_id          34740 non-null  object 
 3   duration_ms       34740 non-null  int64  
 4   energy            34740 non-null  float64
 5   key               34740 non-null  int64  
 6   mode              34740 non-null  int64  
 7   time_signature    34740 non-null  int64  
 8   acousticness      34740 non-null  float64
 9   danceability      34740 non-null  float64
 10  instrumentalness  34740 non-null  float64
 11  liveness          34740 non-null  float64
 12  loudness          34740 non-null  float64
 13  speechiness       34740 non-null  float64
 14  valence           34740 non-null  float64
 15  tempo             34740 non-null  float64
 16  On_chart          34740 non-null  int64 

In [52]:
# Run the cleaning step on the raw data.
cleaned_song_data = clean_data(uncleaned_song_data)

In [47]:
# Display the cleaned DataFrame.
cleaned_song_data

Unnamed: 0,artist_name,duration_ms,energy,mode,acousticness,danceability,instrumentalness,liveness,loudness,speechiness,valence,tempo,On_chart
0,Joe,256080,0.96800,0,0.02150,0.698,0.862000,0.1110,-11.012,0.0337,0.3900,136.884,0
1,Joe,395442,0.97000,1,0.00339,0.724,0.896000,0.1060,-8.415,0.0358,0.4320,136.868,0
2,Robin Schulz,391680,0.40600,0,0.00433,0.803,0.825000,0.0948,-9.444,0.0608,0.0769,125.014,0
3,Johann Sebastian Bach,347840,0.00267,0,0.91000,0.150,0.028700,0.1250,-24.701,0.0434,0.1050,135.652,0
4,Johann Sebastian Bach,181733,0.22600,1,0.76800,0.290,0.000002,0.1050,-14.990,0.0385,0.4700,98.348,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34735,Rusherking,330040,0.61300,1,0.03960,0.397,0.000000,0.1920,-5.077,0.0549,0.4050,78.400,1
34736,Elle King & Miranda Lambert,245626,0.88400,1,0.00516,0.612,0.000000,0.0997,-4.400,0.0459,0.6260,119.991,1
34737,Sam Hunt,215933,0.64900,1,0.23100,0.562,0.000000,0.3410,-5.400,0.0494,0.3760,145.913,1
34738,Dylan Scott,160707,0.74700,1,0.51900,0.573,0.000001,0.0867,-6.460,0.0319,0.6450,79.952,1


In [48]:
# Print the column dtypes and non-null counts after cleaning.
cleaned_song_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34739 entries, 0 to 34739
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       34739 non-null  object 
 1   duration_ms       34739 non-null  int64  
 2   energy            34739 non-null  float64
 3   mode              34739 non-null  int64  
 4   acousticness      34739 non-null  float64
 5   danceability      34739 non-null  float64
 6   instrumentalness  34739 non-null  float64
 7   liveness          34739 non-null  float64
 8   loudness          34739 non-null  float64
 9   speechiness       34739 non-null  float64
 10  valence           34739 non-null  float64
 11  tempo             34739 non-null  float64
 12  On_chart          34739 non-null  int64  
dtypes: float64(9), int64(3), object(1)
memory usage: 3.7+ MB


### Training Models

In [None]:
def predict_model(model, x):
    '''
    Runs the given trained model on the given x values.
    Returns whatever model.predict produces for x (the Y predictions)
    :param model: the trained model to predict with
    :param x: x data to get y predictions for
    '''
    return model.predict(x)

In [None]:
def train_random_forest(x_train, y_train, n_estimators=100):
    '''
    Builds a RandomForestClassifier with the requested number of trees and
    fits it on the given training data.
    Returns the fitted random forest model
    :param x_train: the x data to fit the forest on
    :param y_train: the y data to fit the forest on
    :param n_estimators: number of trees in the forest, default 100
    '''
    forest = RandomForestClassifier(n_estimators=n_estimators)
    forest.fit(x_train, y_train)
    return forest

In [None]:
def get_metrics(model, x_test, y_true, y_pred):
    '''
    Computes the evaluation metrics for one set of predictions.
    Returns, in order: the confusion matrix, the classification report, the
    accuracy score, the precision score, the recall score, the f1 score and
    the auc score
    The auc score is computed from the second column of
    model.predict_proba(x_test); every other metric compares y_true with y_pred.
    :param model: the model the predictions came from (used for predict_proba)
    :param x_test: the testing data we used to predict from
    :param y_true: the true y data
    :param y_pred: the y data predicted by the model
    '''
    return (
        confusion_matrix(y_true, y_pred),
        classification_report(y_true, y_pred),
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, model.predict_proba(x_test)[:, 1]),
    )

    

#### Random Forest Ensemble

In [2]:
# NOTE(review): x_train, x_test, y_train and y_test must already exist when this
# cell runs (presumably from split_data(cleaned_song_data)) - confirm.
n_estimators = list(range(1,500))
accuracies = []
precisions = []
recalls = []
f1_scs = []
aucs = []

# train one forest per estimator count (1 through 499) and store evaluation metrics
for ne in n_estimators:
    rfc = train_random_forest(x_train, y_train, ne)

    y_pred = predict_model(rfc, x_test)

    conf_matrix, class_report, accuracy, precision, recall, f1_sc, auc = get_metrics(rfc, x_test, y_test, y_pred)
    accuracies.append(accuracy)
    precisions.append(precision)
    recalls.append(recall)
    f1_scs.append(f1_sc)
    aucs.append(auc)

## plot all evaluation metrics as y values with estimator as x
# each line carries a label so that plt.legend() has entries to show
plt.plot(n_estimators, accuracies, color="red", label="Accuracy")
plt.plot(n_estimators, precisions, color="blue", label="Precision")
plt.plot(n_estimators, recalls, color="pink", label="Recall")
plt.plot(n_estimators, f1_scs, color="green", label="F1 Score")
plt.plot(n_estimators, aucs, color="purple", label="AUC")
plt.legend()
plt.xlabel('Number of Estimators')
plt.ylabel('Evaluation Metrics')
plt.title('Random Forest Ensemble Evaluation Metrics Based on Number of Estimators')

## plot feature importance of random forest using an ensemble trained on the
## number of estimators chosen from the plot above
best_n_estimators = 100
rfc = train_random_forest(x_train, y_train, best_n_estimators)
importances = rfc.feature_importances_

# every column except the last one; assumes the label column comes last,
# as it does in the cleaned frame displayed earlier in the notebook
feature_names = cleaned_song_data.columns[:-1]

fig = plt.figure()
ax = fig.add_axes([0,0,1,1])
ax.bar(feature_names, importances)
plt.xticks(rotation=90)
plt.xlabel('Features')
plt.ylabel('Feature Importances')
plt.title(f'Feature Importances for Random Forest Ensemble using {best_n_estimators} Number of Estimators')
plt.show()


[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185

#### Random Forest Ensemble Analysis

Best number of estimators to use: 