# Clustering and Testing

This notebook outlines the modelling approach of the project.

The basic methodology is as follows:
- Organise Spotify feature data - scale and dummify - from top 4000 RYM
- Organise rating data and associated Spotify data - from user's ratings
- Cluster on original spotify data (from top 4000 RYM) to form global clustering model
- Go through each user and predict their ratings based on:
    - Genre - find all genres of songs they listen to and observe scores
    - Clusters - use global clustering model to predict labels of songs they have scored
- Collect all results and determine optimal clustering k-values
- Check statistical significance and performance of clustering mode


In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data handling
import numpy as np
import pandas as pd
import re

# Data prep
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

#Clustering
from sklearn.cluster import KMeans

# Modelling
from sklearn.neighbors import KNeighborsClassifier

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from scipy.stats import ttest_ind

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Storing models
import joblib

In [None]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

# Clustering data preparation

In [29]:
# Import from top 4000 albums from rym
df = pd.read_csv('../Cluster_Data.csv')

In [5]:
df.head()

Unnamed: 0,acousticness,album,analysis_url,danceability,duration_ms,energy,spotifyid,instrumentalness,key,liveness,...,valence,artist,descriptors,genre,album_id,desc1,desc2,desc3,desc4,desc5
0,0.00792,OK Computer,https://api.spotify.com/v1/audio-analysis/3OsU...,0.306,287880.0,0.872,3OsUjkcv1C1v5udFhgRSFg,0.8,2.0,0.261,...,0.648,Radiohead,"melancholic, anxious, futuristic, alienation, ...","Alternative Rock, Art Rock",0,melancholic,anxious,futuristic,alienation,existential
1,0.0377,OK Computer,https://api.spotify.com/v1/audio-analysis/2nTs...,0.252,387213.0,0.849,2nTsKOXIVGDf2iPeVQO2Gm,0.00592,5.0,0.0545,...,0.194,Radiohead,"melancholic, anxious, futuristic, alienation, ...","Alternative Rock, Art Rock",0,melancholic,anxious,futuristic,alienation,existential
2,0.0498,OK Computer,https://api.spotify.com/v1/audio-analysis/19yG...,0.316,267693.0,0.592,19yGmm9FjEZdZc5j98WDe4,0.00139,0.0,0.103,...,0.32,Radiohead,"melancholic, anxious, futuristic, alienation, ...","Alternative Rock, Art Rock",0,melancholic,anxious,futuristic,alienation,existential
3,0.229,OK Computer,https://api.spotify.com/v1/audio-analysis/4Na0...,0.293,267187.0,0.276,4Na0siMtWOW9pJoWJ1Ponv,0.119,7.0,0.167,...,0.196,Radiohead,"melancholic, anxious, futuristic, alienation, ...","Alternative Rock, Art Rock",0,melancholic,anxious,futuristic,alienation,existential
4,0.000124,OK Computer,https://api.spotify.com/v1/audio-analysis/4aOA...,0.352,299560.0,0.674,4aOAzvRdOsZSwZIgwcdeL0,0.12,9.0,0.18,...,0.145,Radiohead,"melancholic, anxious, futuristic, alienation, ...","Alternative Rock, Art Rock",0,melancholic,anxious,futuristic,alienation,existential


In [7]:
df.shape

(43110, 30)

In [30]:
# Store descriptor list for later
desc_list = ['desc1', 'desc2', 'desc3', 'desc4', 'desc5']

In [31]:
def prep_data(data):
    
    data.drop_duplicates(inplace = True)
    data[desc_list] = data[desc_list].applymap(lambda x: str(x).strip().replace('nan', 'None'))
    df.dropna(inplace = True)
    
    return data

In [32]:
def min_scale(var):
    scaler = MinMaxScaler()
    return pd.DataFrame(scaler.fit_transform(var),
                        columns = var.columns,
                        index = var.index)

def std_scale(var):
    scaler = StandardScaler()
    return pd.DataFrame(scaler.fit_transform(var),
                        columns = var.columns,
                        index = var.index)

In [33]:
def extract_spotify_features(users_df):
    return std_scale(pd.concat([
                        users_df[['danceability', 'energy', 'loudness', 'speechiness',
                            'acousticness', 'instrumentalness','liveness', 'valence',
                            'tempo', 'duration_ms', 'mode']],
                        (pd.get_dummies(users_df[['key']].astype(str), drop_first = True))
                               ], axis = 1))

In [34]:
df = prep_data(df)

In [35]:
# Prep sonic features
sonic_feats = extract_spotify_features(df)

In [36]:
# Append album to the dataframe
sonic_feats['album'] = df['album']

In [37]:
# Grouped album level data
sonic_feats_album = sonic_feats.groupby('album').median()
# Song level data (more useful to have the index as album even though we can no longer see which song is which)
sonic_feats_song = sonic_feats.set_index('album', drop = True)

In [38]:
sonic_feats_album.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_10.0,key_11.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""Al final de este viaje...""",0.548053,-1.245233,-0.23749,0.192247,1.419212,-0.719557,-0.575312,-0.12083,-0.752621,-0.264216,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,-0.371436
"""Born Into Trouble as the Sparks Fly Upward.""",-1.466124,-1.083358,-0.699521,-0.403986,0.460425,1.657276,0.438884,-1.329176,-0.484609,0.955661,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,-0.371436
"""Heroes""",0.016498,0.798667,0.680494,-0.379478,-0.941171,0.215452,-0.49621,-0.37109,0.110189,-0.178413,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,-0.371436
"""L'homme à tête de chou""",0.420376,-0.288211,-1.081716,-0.090868,-0.780532,-0.71897,-0.510335,0.616877,0.579795,-0.651054,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,-0.371436
"""Love and Theft""",0.727844,0.458906,0.916754,-0.532445,-0.803381,-0.719557,-0.603562,0.74761,-0.430626,-0.319279,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,-0.371436


In [39]:
sonic_feats_song.head()

Unnamed: 0_level_0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,...,key_10.0,key_11.0,key_2.0,key_3.0,key_4.0,key_5.0,key_6.0,key_7.0,key_8.0,key_9.0
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
OK Computer,-0.845976,1.085062,0.618723,-0.353279,-0.965365,1.526712,0.289156,0.837256,1.600086,0.179774,...,-0.245537,-0.282822,2.67956,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,-0.371436
OK Computer,-1.127388,1.003235,0.753919,-0.284824,-0.881878,-0.702935,-0.877593,-0.858538,1.463249,0.778092,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,3.514992,-0.246214,-0.369543,-0.240647,-0.371436
OK Computer,-0.793863,0.088905,0.351827,-0.525684,-0.847956,-0.715654,-0.603562,-0.387899,1.038513,0.058181,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,-0.371436
OK Computer,-0.913724,-1.035329,-0.053927,-0.479202,-0.345574,-0.385424,-0.241955,-0.851067,0.105173,0.055133,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,2.706043,-0.240647,-0.371436
OK Computer,-0.606255,0.380637,0.334845,-0.513852,-0.987221,-0.382617,-0.168503,-1.041564,-0.511399,0.250127,...,-0.245537,-0.282822,-0.373196,-0.176295,-0.309894,-0.284496,-0.246214,-0.369543,-0.240647,2.692252


This is all the data we will need to feed the clustering algorithm.

We have sound features scaled and dummified for album level and song level data.

# Modelling data preparation

Now that we have our clustering data prepared, lets now set up our users' data.

In [40]:
filepath = '../'
# Spotify data for our users' albums ratings
spot = pd.read_csv(filepath + 'SQL_SpotifyMetrics_Export.csv', index_col=0)
# Ratings data for users
ratings = pd.read_csv(filepath + 'SQL_Ratings_Export.csv', index_col=0)
# Genre data
rym_genres = pd.read_csv(filepath + 'SQL_RYM_Genre_Export.csv', index_col=0)
# Album metadata
albums = pd.read_csv(filepath + 'SQL_Album_Export.csv', index_col=0)
# Album genre data
album_genres = pd.read_csv(filepath + 'SQL_Album_genres_Export.csv', index_col=0)

In [41]:
# Set names for tidyness
spot.columns = ['acousticness', 'analysisUrl', 'danceability', 'duration_ms', 'energy',
       'spotId', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode',
       'songId', 'songName', 'speechiness', 'tempo', 'timeSignature',
       'trackHref', 'type', 'uri', 'valence', 'albumId', 'artistName',
       'artistId', 'albumName']

ratings.columns = ['ratingId', 'score', 'ratingYear',
                   'ratingMonth', 'ratingDay', 'identifierRYM',
                   'albumId', 'profileId']

df = pd.merge(spot, ratings, on = 'albumId', how = 'left')

rym_genres.columns = ['genreId', 'genreName']

album_genres = pd.merge(album_genres, rym_genres, on = 'genreId', how = 'left')

album_genres = album_genres.sort_values('albumId').drop_duplicates(subset = 'albumId')

df = pd.merge(df, album_genres, on = 'albumId', how = 'left')

df.dropna(inplace = True)

In [42]:
df.head()

Unnamed: 0,acousticness,analysisUrl,danceability,duration_ms,energy,spotId,instrumentalness,key,liveness,loudness,...,albumName,ratingId,score,ratingYear,ratingMonth,ratingDay,identifierRYM,profileId,genreId,genreName
0,0.00121,https://api.spotify.com/v1/audio-analysis/53dm...,0.159,79929.0,0.403,53dmBFkPpJQM41LF3wJdFM,0.561,2.0,0.213,-14.229,...,The 1975,1.0,4.0,2018.0,12.0,10.0,[Rating106661327],1.0,7.0,Pop Rock
1,0.00121,https://api.spotify.com/v1/audio-analysis/53dm...,0.159,79929.0,0.403,53dmBFkPpJQM41LF3wJdFM,0.561,2.0,0.213,-14.229,...,The 1975,17284.0,3.5,2018.0,10.0,5.0,[Rating104194021],9.0,7.0,Pop Rock
2,0.00121,https://api.spotify.com/v1/audio-analysis/53dm...,0.159,79929.0,0.403,53dmBFkPpJQM41LF3wJdFM,0.561,2.0,0.213,-14.229,...,The 1975,59563.0,5.0,2019.0,1.0,4.0,[Rating107608198],52.0,7.0,Pop Rock
3,0.00121,https://api.spotify.com/v1/audio-analysis/53dm...,0.159,79929.0,0.403,53dmBFkPpJQM41LF3wJdFM,0.561,2.0,0.213,-14.229,...,The 1975,75797.0,2.0,2018.0,11.0,20.0,[Rating105894323],54.0,7.0,Pop Rock
4,0.00121,https://api.spotify.com/v1/audio-analysis/53dm...,0.159,79929.0,0.403,53dmBFkPpJQM41LF3wJdFM,0.561,2.0,0.213,-14.229,...,The 1975,120692.0,2.0,2018.0,8.0,9.0,[Rating102099544],41.0,7.0,Pop Rock


# Modelling functions

In [22]:
# If we want to bring in a separate clustering model
def retrieve_cluster_model(file):
    return joblib.load(file)

In [43]:
# Some users will only have rated songs with certain values from dummified vars
# This function fills in the remaining ones if need be
def assert_columns(users_df, necessary_cols):
    missing_cols = set(necessary_cols) - set(users_df.columns)   
    for col in missing_cols:
        users_df[col] = 0    
    return users_df

In [None]:
# Create clustered labels for each user's songs
def generate_new_genre_classification(users_df, clustering_model):
    return clustering_model.predict(users_df)

In [None]:
# Return binned scores for 2-6 bins
def split_scores_into_bins(nbins, y):
    
    label_dict = {
        2: ['low', 'high'],
        3: ['low', 'mid', 'high'],
        4: ['low', 'mid-low', 'mid-high', 'high'],
        5: ['low', 'mid-low', 'mid', 'mid-high', 'high'],
        6: ['bottom', 'low', 'mid-low', 'mid-high', 'high', 'top'],
    }
    
    return pd.cut(y, bins = nbins, labels = label_dict[nbins])

In [23]:
# Resample data up/down based on method
def resample(X, y, method):
    
    if method == 'up':        
        Xs, ys = RandomOverSampler(random_state=0).fit_resample(X, y) 
    else:
        Xs, ys = RandomUnderSampler(random_state=0, replacement = True).fit_resample(X, y)
    
    return pd.DataFrame(Xs, columns = X.columns), ys

In [None]:
# Train test split
def split_data(X, y, split):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = split, random_state = 1, shuffle = True)
    return X_train, X_test, y_train, y_test

In [44]:
# Create classification data per user at song level 
def create_classification_data_per_user(user, sample_method, classes, cluster_model, split = False):
    
    # Clear any nans per user
    user_df = df[df['profileId'] == user].dropna(axis = 0)
    
    # Get their sound features
    relevant_sound_features = std_scale(extract_spotify_features(user_df))

    # Check columns
    necessary_cols = set(sonic_feats_album.columns) - set(relevant_sound_features.columns)
    relevant_sound_features = assert_columns(relevant_sound_features, necessary_cols)

    # Create cluster labels
    clusters = generate_new_genre_classification(relevant_sound_features, cluster_model)

    # Dummy code cluster labels as variables
    cluster_df = pd.get_dummies(pd.DataFrame(clusters, index = user_df.index)[0].astype(str))

    # Merge them in
    user_df = pd.merge(user_df, cluster_df, left_index = True, right_index = True, how = 'left')

    # Min Max scale
    X_clust = min_scale(user_df.groupby('albumId').sum().iloc[:,21:])
    
    # Create genre data seperately
    X_gen = std_scale(pd.get_dummies(pd.merge(user_df[['genreName', 'albumId', 'genreId']],
                           rym_genres, on='genreId', how = 'left', suffixes=('', '_y')) \
                           .drop('genreName_y', axis = 1).groupby('albumId').mean().astype(str)))
    
    # Collect score per album
    y = user_df.groupby('albumId').mean()['score']
    
    # Bin scores into categories
    y_binned = split_scores_into_bins(classes, y)
    
    # Resample to balance
    Xs_clust, ys_clust = resample(X_clust, y_binned, sample_method)
    Xs_gen, ys_gen = resample(X_gen, y_binned, sample_method)
    
    if split == False:
        
        return Xs_clust, ys_clust, Xs_gen, ys_gen
    
    else:
        
        X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(Xs_clust, ys_clust, train_size = 0.8)
        X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(Xs_gen, ys_gen, train_size = 0.8)
        
        return X_train_c, X_test_c, y_train_c, y_test_c, X_train_g, X_test_g, y_train_g, y_test_g
    

In [25]:
# Model on genre and clusters specific users' ratings
def create_model(user, sample_method, classes, split, cluster_model, model):
    
    X_train_c, X_test_c, y_train_c, y_test_c, \
    X_train_g, X_test_g, y_train_g, y_test_g \
        = create_classification_data_per_user(user, sample_method, classes, cluster_model, split)
    
    model_c = model.fit(X_train_c, y_train_c)
    score_c_test = model_c.score(X_test_c, y_test_c)
    score_c_train = cross_val_score(model_c, X_train_c, y_train_c, cv = 5).mean()
    
    model_g = model.fit(X_train_g, y_train_g)
    score_g_train = cross_val_score(model_g, X_train_g, y_train_g, cv = 5).mean()
    score_g_test = model_g.score(X_test_g, y_test_g)
    
    return score_c_train, score_g_train, score_c_test, score_g_test, X_train_c.shape[0]
    

In [26]:
# Collect the results
def collect_results(sampling_method, classes, model, cluster_model):
    
    # Sort labelling for final results dataframe
    model_name = (str(sampling_method)+('/')+str(classes)+('/')+str(method.__str__().split('(')[0]))
    cluster_model_name = str(cluster_model.__str__().split(',')[4]).split()[0]
    
    # Model and collect
    results = []
    for user in list(df.profileId.unique()):
        try:
            score_c_train, score_g_train, score_c_test, score_g_test, length = create_model(user, sampling_method, classes, True, cluster_model, model)
            results.append([score_c_train, score_g_train, score_c_test, score_g_test, length, user, model_name, cluster_model_name])
        except:
            results.append([0, 0, 0, 0, 0, user, model_name, cluster_model_name])
    
    return pd.DataFrame(results, columns = ['clustScoreTrain', 'genreScoreTrain',
                                             'clustScoreTest', 'genreScoreTest',
                                             'N', 'User', 'Model', 'clusterModel']).sort_values(by = 'clustScoreTest', ascending = False)


In [29]:
# Prepare results df
all_results_song = pd.DataFrame(columns = ['clustScoreTrain', 'genreScoreTrain',
                                      'clustScoreTest', 'genreScoreTest',
                                      'N', 'User', 'Model', 'clusterModel'])

k_range = range(15,45)

# Prepare range of KMeans groups
# Can also run Spectral, Affinity Propogation and Birch for data shape but will be
# computationally demanding
for k in tqdm(k_range):
    cm = KMeans(n_clusters = k).fit(sonic_feats_song)
    for n in range(10,50):
        # KNN for computational tractability, can use others
        method = KNeighborsClassifier(n_neighbors = n)
        for bins in [2,3,4]:
            try:
                results = collect_results('down', bins, method, cm)
                all_results_song = pd.concat([results, all_results_song], axis = 0)
            except:
                print("Couldn't run {}, {}, {}, {}".format('down', bins, method, cm))
                

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [30]:
# Remove all zero scores
results = all_results_song[(all_results_song['clustScoreTrain']>0) & (all_results_song['genreScoreTrain']>0)]

In [31]:
# Print wins
results['clust_wins_test'] = results.apply(lambda x: x[2]>x[3], axis = 1)
results['clust_wins_train'] = results.apply(lambda x: x[0]>x[1], axis = 1)

In [34]:
# Check wins per clustering job
for i in k_range:
    print(i, results[(results['clusterModel']=='n_clusters={}'.format(i))&(results['N']>0)].groupby('User').median().apply(lambda x: x[2]>=x[3], axis = 1).mean())

15 0.5454545454545454
16 0.42424242424242425
17 0.6212121212121212
18 0.5303030303030303
19 0.5454545454545454
20 0.5074626865671642
21 0.5522388059701493
22 0.5671641791044776
23 0.48484848484848486
24 0.4696969696969697
25 0.6515151515151515
26 0.5151515151515151
27 0.5223880597014925
28 0.5909090909090909
29 0.5757575757575758
30 0.5757575757575758
31 0.5074626865671642
32 0.696969696969697
33 0.5454545454545454
34 0.5303030303030303
35 0.5909090909090909
36 0.5151515151515151
37 0.5373134328358209
38 0.5757575757575758
39 0.6515151515151515
40 0.5909090909090909
41 0.6119402985074627
42 0.4393939393939394
43 0.5303030303030303
44 0.45454545454545453


# Success!

K = 32 is our winner.

Lets rerun with just a subset for our k = 32.

In [45]:
final_model_results = pd.DataFrame(columns = ['clustScoreTrain', 'genreScoreTrain',
                                      'clustScoreTest', 'genreScoreTest',
                                      'N', 'User', 'Model', 'clusterModel'])

optimal_cluster_model = KMeans(n_clusters = 32).fit(sonic_feats_song)
models = [KNeighborsClassifier(i) for i in range(5,30)]

for method in tqdm(models):
    for num in [2,3,4]:
        try:
            results = collect_results('down', num, method, optimal_cluster_model)
            final_model_results = pd.concat([results, final_model_results], axis = 0)
        except:
            print("Couldn't run {}, {}, {}, {}".format('down', bins, method, cm))

HBox(children=(FloatProgress(value=0.0, max=25.0), HTML(value='')))




In [46]:
# Filter for non-zeros
final_model_results = final_model_results[(final_model_results['clustScoreTrain']>0) & (final_model_results['genreScoreTrain']>0)]

In [47]:
# Inspect
final_model_results.sort_values('genreScoreTest', ascending = False)[0:50]

Unnamed: 0,clustScoreTrain,genreScoreTrain,clustScoreTest,genreScoreTest,N,User,Model,clusterModel
49,0.483333,0.566667,0.2,1.0,19,43.0,down/2/KNeighborsClassifier,n_clusters=32
20,0.5,0.3,0.333333,1.0,9,35.0,down/2/KNeighborsClassifier,n_clusters=32
68,0.3,0.6,0.4,1.0,17,51.0,down/2/KNeighborsClassifier,n_clusters=32
60,0.477778,0.457778,0.583333,0.916667,48,80.0,down/2/KNeighborsClassifier,n_clusters=32
0,0.586111,0.461111,0.545455,0.909091,41,1.0,down/2/KNeighborsClassifier,n_clusters=32
11,0.59,0.694167,0.75,0.9,78,65.0,down/2/KNeighborsClassifier,n_clusters=32
54,0.485714,0.428571,0.666667,0.888889,35,11.0,down/2/KNeighborsClassifier,n_clusters=32
53,0.543939,0.613636,0.333333,0.866667,57,86.0,down/2/KNeighborsClassifier,n_clusters=32
34,0.5375,0.4625,0.35,0.85,80,23.0,down/2/KNeighborsClassifier,n_clusters=32
40,0.58,0.54,0.166667,0.833333,24,81.0,down/2/KNeighborsClassifier,n_clusters=32


In [151]:
test_wins = round(final_model_results.apply(lambda x: x[2]>=x[3], axis = 1).mean()*100,1)

In [152]:
train_wins = round(final_model_results.apply(lambda x: x[0]>=x[1], axis = 1).mean()*100,1)

In [153]:
print('Spirit scores higher than genre in {}% of training instances'.format(train_wins))
print('Spirit scores higher than genre in {}% of testing instances'.format(test_wins))

Spirit scores higher than genre in 62.7% of training instances
Spirit scores higher than genre in 59.7% of testing instances


In [133]:
final_model_results.shape

(4253, 8)

In [125]:
ttest_results = ttest_ind(final_model_results['clustScoreTest'], final_model_results['genreScoreTest'])
print('Test statistic comparing means of genre test scores and clustering test scores {} with a p-value of {}.\nThis tells us that the groups are not significantly similar'.format(round(ttest_results[0],3),round(ttest_results[1],3)))
print('')
ttest_results = ttest_ind(final_model_results['clustScoreTrain'], final_model_results['genreScoreTrain'])
print('Test statistic comparing means of genre train scores and clustering train scores {} with a p-value of {}.\nThis tells us that the groups are not significantly similar'.format(round(ttest_results[0],3),round(ttest_results[1],3)))


Test statistic comparing means of genre test scores and clustering test scores 3.934 with a p-value of 0.0.
This tells us that the groups are not significantly similar

Test statistic comparing means of genre train scores and clustering train scores 8.33 with a p-value of 0.0.
This tells us that the groups are not significantly similar


In [132]:
trimmed = final_model_results[final_model_results['N']>0]
ttest_results = ttest_ind(final_model_results['clustScoreTrain'], final_model_results['genreScoreTrain'])
print('- Average cross-validated training score (genre) {}'.format(round(trimmed['genreScoreTrain'].mean(),3)))
print('- Average cross-validated training score (spirit) {}'.format(round(trimmed['clustScoreTrain'].mean(),3)))
print('- Significantly different with p-value: {} and t-statistic: {}'.format(round(ttest_results[1],3),round(ttest_results[0],3)))
print('--------------------------')
ttest_results = ttest_ind(final_model_results['clustScoreTest'], final_model_results['genreScoreTest'])
print('- Average cross-validated testing score (genre) {}'.format(round(trimmed['genreScoreTest'].mean(),3)))
print('- Average cross-validated testing score (spirit) {}'.format(round(trimmed['clustScoreTest'].mean(),3)))
print('- Significantly different with p-value: {} and t-statistic: {}'.format(round(ttest_results[1],3),round(ttest_results[0],3)))

- Average cross-validated training score (genre) 0.405
- Average cross-validated training score (spirit) 0.428
- Significantly different with p-value: 0.0 and t-statistic: 8.33
--------------------------
- Average cross-validated testing score (genre) 0.393
- Average cross-validated testing score (spirit) 0.407
- Significantly different with p-value: 0.0 and t-statistic: 3.934


In [119]:
trimmed = final_model_results[final_model_results['N']>0]
print(trimmed['genreScoreTest'].mean())
print(trimmed['clustScoreTest'].mean())

0.3931241222907847
0.407215601131557


In [52]:
# Export
final_model_results.to_csv('finalResults.csv')

# ---------------------------------------------------
# Attempted Scoring Function

The below is the attempt to select value of k for clustering analysis on a custom scoring function.

The desire was to select K based on the set of clusters that optimised purity of descriptors per cluster.

i.e. a cluster with just the word 'Dark' would be scored more highly than a cluster with the words 'Dark', 'Happy' and 'Sad'.

The collective scores of all the clusters in each clustering run (each value of K) would be aggregated and the best performing value of K would be selected.

The descriptors were taken from rateyourmusic.com. Ideally the clustering would have take these as an input variable but they couldn't be collected for all albums so couldn't be used in modelling. Rather, the attempt was to use them as a scoring metric for how the clustering performed.

In [14]:
def perform_clustering(model, df):
    
    model.fit(df)
    
    return pd.DataFrame({'cluster':model.labels_}, index = df.index)


In [15]:
descs = df[['desc1', 'desc2', 'desc3', 'desc4', 'album']].set_index('album', drop=True)

def append_descriptors(df, type_of_df):
    
    if type_of_df == 'album':
        return pd.merge(df, descs, left_index = True, right_index = True).drop_duplicates()
    else:
        return pd.concat([df, descs], axis = 1)

Typical metrics for scoring clustering don't really apply.

We want to know the extent to which our clustering has managed to split clusters by RYM descriptors.

We will create functions to calculate the sum of the quantity of descriptors per cluster over the total amount of the descriptors in the data. And then we divide by the size of the cluster.

In [16]:
# Turn descriptors into a list
def get_descriptors_for_cluster(df, cluster_id):
    cluster_rows = df[df["cluster"] == cluster_id]
    descriptor_list = list(cluster_rows[['desc1','desc2','desc3','desc4']].values.ravel('K'))
    return [word for word in descriptor_list if word != 'None']

# Map all descriptors to a cluster_id
def get_descriptors_by_cluster(df, num_clusters):
    return { _id: get_descriptors_for_cluster(df, _id) for _id in range(num_clusters)}

# Count descriptors per cluster
def count_descriptors_by_cluster(df, num_clusters):
    raw = get_descriptors_by_cluster(df, num_clusters)
    return { _id: pd.Series(d).value_counts() for _id, d in raw.items() }

def score_all(clusters):
    total = [i for c in clusters.values() for i in c]
    
    def score(c):
        return (sum([c.count(i) / total.count(i) for i in set(c)])) / len(c)
    
    return {k: score(v) for k, v in clusters.items()}


In [17]:
def get_scores_per_clustering_run(df,number_of_clusters):
    scores = []
    for cid, score in score_all(get_descriptors_by_cluster(df, number_of_clusters)).items():
        scores.append([score,len(get_descriptors_for_cluster(df, cid))])
    return scores, np.mean(scores), np.median(scores)

# Optimal Clustering at a Song Level

Can add other models into AWS but locally can't handle processing this quantity of data.

In [125]:
models = [model for models in [
    [KMeans(n_clusters = i) for i in range(10,50)]
    ] for model in models]


In [126]:
scores = []
for model in tqdm(models):
    try:
        clust_df = append_descriptors(perform_clustering(model, sonic_feats_song), 'song')
        nclusters = len(clust_df['cluster'].unique())    
        scores.append([model, get_scores_per_clustering_run(clust_df, nclusters)])
    except:
        pass

HBox(children=(FloatProgress(value=0.0, max=40.0), HTML(value='')))




In [127]:
mm = []
for i in range(len(scores)):
    mm.append([scores[i][1][1], scores[i][1][2], scores[i][0]])

In [176]:
mmdf = pd.DataFrame(mm)
mmdf['nclusts'] = mmdf[2].apply(lambda x: str(x).split()[4].strip()[11:13])

In [177]:
mmdf['1*'] = mmdf[1]*(mmdf['nclusts'].astype(int))

In [178]:
mmdf.sort_values('1*')

Unnamed: 0,0,1,2,nclusts,1*
36,1713.42449,110.500835,"KMeans(algorithm='auto', copy_x=True, init='k-...",46,5083.038415
8,4378.750587,759.000759,"KMeans(algorithm='auto', copy_x=True, init='k-...",18,13662.01366
4,5629.822,986.00078,"KMeans(algorithm='auto', copy_x=True, init='k-...",14,13804.010917
7,4636.324113,821.500769,"KMeans(algorithm='auto', copy_x=True, init='k-...",17,13965.513072
5,5254.500582,946.500765,"KMeans(algorithm='auto', copy_x=True, init='k-...",15,14197.511469
12,3582.614227,657.000768,"KMeans(algorithm='auto', copy_x=True, init='k-...",22,14454.016886
10,3940.87559,736.500788,"KMeans(algorithm='auto', copy_x=True, init='k-...",20,14730.015754
9,4148.290061,779.50079,"KMeans(algorithm='auto', copy_x=True, init='k-...",19,14810.515011
13,3426.848409,653.50078,"KMeans(algorithm='auto', copy_x=True, init='k-...",23,15030.517929
11,3753.214874,726.500793,"KMeans(algorithm='auto', copy_x=True, init='k-...",21,15256.516647


In [133]:
for i in range(0,10):
    print(list(pd.DataFrame(mm).sort_values(1, ascending = False)[20:30][2])[i])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=30, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=33, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=32, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=31, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=34, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
KMeans(algorithm='auto', copy_

# Best models:

1. KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=20, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)    
2. KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)
3. KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=13, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)   

None of these clustering models performed particularly well, so they method was dropped.