# Running this file requires a bunch of stuff to be run first. You'll probably need to start with reshape_and_analyze.ipynb.

In [19]:
import pandas as pd
import re
import time
import numpy as np
import ast
import sys
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import pickle

In [2]:
# Load in the subset so you can see what row corresponds to what.
# `subset` is passed as the `reference` table to check_artists further down,
# which looks rows up by position and reads 'artist_name' / 'artist_terms'.

filename = "../EDA/millionsong_subset.csv"
subset = pd.read_csv(filename)

In [3]:
#load in values
#only do these one at a time, because otherwise there may be memory issues.
#alternatively, if you've got a computer that can handle it, give it a shot.
#just make sure to restart the kernel and not run anything else at the time.
# NOTE(review): 'Unnamed: 0' is presumably the row index that was written out
# when these CSVs were saved -- confirm against the notebook that produced them.

pitches = pd.read_csv('pitches_half.csv', index_col = 'Unnamed: 0')
timbre = pd.read_csv('segments_timbre_unpacked_95_percent_nonnan.csv', index_col = 'Unnamed: 0')
non_nest = pd.read_csv('95_percent_full_non_nested_list_columns.csv', index_col = 'Unnamed: 0')

In [64]:
# Drops any column where null values constitute more than `threshold` (a fraction of the rows) of the column
def drop_na_cols(df, threshold = 0.95):
    """Return ``df`` restricted to columns that are not too sparse.

    A column is kept when its number of null entries is less than or equal
    to ``df.shape[0] * threshold``; every other column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    threshold : float
        Largest tolerated fraction of null rows per column. ``0`` keeps only
        columns without any null value.

    Returns
    -------
    pandas.DataFrame
        The selected columns of ``df``, in their original order.
    """
    # Absolute number of nulls a column may contain and still be kept.
    max_nulls = df.shape[0] * threshold
    null_counts = df.isna().sum()
    within_limit = null_counts <= max_nulls
    # Indexing the boolean Series by itself leaves only the True entries,
    # whose labels are the columns to keep.
    kept_columns = within_limit[within_limit].index
    return df[kept_columns]

# Imputes the music-related data by repeating it until it fills the row
# Basically, this amounts to repeating a song X number of times until it takes the same amount of space as the longest song in the dataset.
# Kind of a crummy method, but it worked better than using means or constants, as far as I can tell
def fill_row(row):
    """Fill the gaps in one row by cycling through its non-null values.

    The non-null values of ``row`` are taken in order and repeated end to
    end until they span the full length of the row. A row without any
    non-null value is filled with ``-1`` instead.

    Parameters
    ----------
    row : pandas.Series
        One row of a DataFrame (as passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    pandas.Series
        A Series of the same length as ``row`` carrying ``row``'s index.
    """
    valid_values = row.dropna().values
    if len(valid_values) == 0:
        # Keep the row's own labels here as well. Without them the result
        # gets a fresh 0..n-1 index, and DataFrame.apply(axis=1) then aligns
        # this row against different column labels than the other rows,
        # which introduces NaNs into the combined frame.
        return pd.Series([-1] * len(row), index=row.index)

    # One repetition more than strictly needed, then cut to the row length.
    repeat_count = (len(row) // len(valid_values)) + 1
    filled_row = np.tile(valid_values, repeat_count)[:len(row)]
    return pd.Series(filled_row, index=row.index)

#See above. Basically applies this to the entire dataset. Also thresholds it.
# I re-call the drop_na_cols at the end to remove anything where there might still be a null value, which shouldn't happen but does sometimes.
def fill_impute(df,threshold=0.95):
    """Drop sparse columns of ``df`` and impute what is left row by row.

    Steps, in order:
      1. ``drop_na_cols`` removes columns with too many nulls (``threshold``).
      2. ``fill_row`` is applied to every row to fill the remaining gaps.
      3. Column labels are converted to strings.
      4. ``drop_na_cols`` runs again with ``threshold=0`` so that any column
         still holding a null is removed.

    Returns the resulting DataFrame.
    """
    dense = drop_na_cols(df, threshold=threshold)
    imputed = dense.apply(fill_row, axis=1)
    imputed.columns = imputed.columns.astype(str)
    # Safety net: only completely filled columns survive this pass.
    return drop_na_cols(imputed, threshold=0)

# Utility function for running a PCA, because I'm lazy. 
def foolproof_PCA(df,components):
    """Standardize ``df`` and project it onto its principal components.

    Parameters
    ----------
    df : array-like
        Data matrix handed to ``StandardScaler.fit_transform``.
    components
        Forwarded unchanged as ``n_components`` to ``sklearn`` ``PCA``.

    Returns
    -------
    tuple
        ``(exp_var, pca_performed)``: the fitted PCA's
        ``explained_variance_ratio_`` and the transformed data.
    """
    # StandardScaler is not among this file's top-level imports, so bring it
    # into scope here; otherwise the call below fails with a NameError.
    from sklearn.preprocessing import StandardScaler

    scaled = StandardScaler().fit_transform(df)
    use_pca = PCA(n_components=components)
    pcafit = use_pca.fit(scaled)
    exp_var = pcafit.explained_variance_ratio_
    pca_performed = use_pca.transform(scaled)
    return exp_var, pca_performed

# Gets cosine similarities between one row and every other row, then sorts it in descending order of similarity
def get_cossim(d, reference, index):
    """Rank every row of ``d`` by cosine similarity to row ``index``.

    Parameters
    ----------
    d : 2-D array
        Matrix indexed as ``d[index, :]``; one row per item.
    reference
        Not used by this function; kept in the signature for callers.
    index : int
        Position of the query row in ``d``.

    Returns
    -------
    list of (int, float)
        ``(row_position, similarity)`` pairs, highest similarity first.
    """
    query = d[index, :].reshape(1, -1)
    # cosine_similarity returns a 1 x n_rows matrix; take its single row.
    scores = cosine_similarity(query, d)[0]
    ranked = list(enumerate(scores))
    ranked.sort(key=lambda pair: -pair[1])
    return ranked

# Runs through original data set and prints out artist names and artist terms. Mostly for checking if the rest of this is working well.
def check_artists(inlist, reference,listlen=10):
    """Print the artist name and leading artist terms for the top entries.

    Parameters
    ----------
    inlist : list
        Ranked entries whose element ``[0]`` is a row position in
        ``reference`` and whose element ``[1]`` is the score to display.
    reference : pandas.DataFrame
        Table with an ``'artist_name'`` column and an ``'artist_terms'``
        column holding the string form of a Python list literal.
    listlen : int
        Maximum number of entries to print. Fewer are printed when
        ``inlist`` is shorter, instead of raising ``IndexError``.

    Prints one line per entry of the form
    ``(row_position, artist_name, first_five_terms, score)``.
    """
    lines = []
    # Clamp to what is available; max(..., 0) keeps a negative listlen
    # printing nothing rather than slicing from the end of the list.
    for entry in inlist[:max(listlen, 0)]:
        row_id, score = entry[0], entry[1]
        record = reference.iloc[row_id]
        # artist_terms is stored as text; literal_eval parses it safely.
        terms = ast.literal_eval(record['artist_terms'])[0:5]
        lines.append(str((row_id, record['artist_name'], terms, score)))
    print('\n'.join(lines))


# Combines and weights the similarity measurements from earlier
def combine_cos_sims(inlist, weights = None):
    """Merge several similarity rankings into one weighted ranking.

    Each ranking is first sorted by its id (element ``[0]``), so that entry
    ``i`` of every ranking is combined with entry ``i`` of the others. The
    ids of the result are taken from the first ranking.

    Parameters
    ----------
    inlist : list of list of (id, score)
        One ranking per similarity measure.
    weights : sequence of numbers, optional
        One multiplier per ranking, in the same order as ``inlist``.
        Defaults to a weight of 1 for every ranking. The weights are used
        as given (they are not normalized).

    Returns
    -------
    list of (id, combined_score)
        Sorted by combined score, highest first. Empty if ``inlist`` is empty.

    Also prints the length of every ranking as a quick sanity check.
    """
    # Identity test: `weights == None` is evaluated element-wise for numpy
    # arrays and then fails inside `if`.
    if weights is None:
        weights = [1 for _ in inlist]
    #else:
        #weights = [i/sum(weights) for i in weights]
    by_id = [sorted(scores, key=lambda pair: pair[0]) for scores in inlist]
    print([len(scores) for scores in by_id])
    if not by_id:
        return []
    combined = [
        (entry[0], sum([by_id[j][pos][1] * weights[j] for j in range(len(by_id))]))
        for pos, entry in enumerate(by_id[0])
    ]
    return sorted(combined, key=lambda pair: -pair[1])

def similar_artists_id_and_cossim(df, index):
    """Rank the entries of one row of ``df`` from largest to smallest.

    The row at position ``index`` is taken, its first two entries are
    skipped, and the remaining entries are sorted in descending order.

    Returns
    -------
    list of (label, value)
        The column label and the value of each remaining entry, largest
        value first.
    """
    record = df.iloc[index]
    ranked = record[2:len(record)].sort_values(ascending=False)
    labels = ranked.index
    pairs = []
    for pos in range(len(ranked)):
        pairs.append((labels[pos], ranked.iloc[pos]))
    return pairs

In [5]:
# For timbre: drop the sparse columns and fill the remaining gaps per row
# (fill_impute with its default threshold). The result replaces `timbre`.

timbre = fill_impute(timbre)
#timbre.to_csv('timbre_imputed.csv', index=False)

In [6]:
# For pitch: same imputation as for timbre. The result replaces `pitches`.
pitches = fill_impute(pitches)
#pitches.to_csv('pitches_imputed.csv', index = False)

In [7]:
# Standardize each imputed table and reduce it with PCA, requesting 2000
# components. Each call returns (explained variance ratios, projected data).
pitches_exp_var, pitches_PCA = foolproof_PCA(pitches, 2000)
timbre_exp_var, timbre_PCA = foolproof_PCA(timbre,2000)

At this point, let's go over the non_nested list

In [8]:
# For every column of non_nest, keep only the first two underscore-separated
# parts of its name; columns sharing that prefix are imputed together below.
split_column_list = ['_'.join(i.split('_')[0:2]) for i in non_nest.columns]

In [9]:
# Impute non_nest one column group at a time (groups = columns sharing the
# same two-part name prefix), then put the imputed groups side by side.

# dict.fromkeys de-duplicates while keeping first-seen order, so the group
# numbering and the final column order are the same on every run (a set
# would iterate in an arbitrary order).
split_column_uniques = list(dict.fromkeys(split_column_list))
split_column_dict = {key: i for i, key in enumerate(split_column_uniques)}
# (prefix, group number, column position) for every column of non_nest.
split_column_numbered = [(key, split_column_dict[key], i) for i, key in enumerate(split_column_list)]

imputed_groups = []

for subset_key, subset_index in split_column_dict.items():
    split_indices = [i[2] for i in split_column_numbered if i[1] == subset_index]
    selected_columns = non_nest.columns[split_indices]
    non_nest_subset = non_nest[selected_columns]
    non_nest_subset.columns = non_nest_subset.columns.astype(str)
    non_nest_subset_imp = fill_impute(non_nest_subset)
    imputed_groups.append(non_nest_subset_imp)

# Concatenate once at the end instead of growing a frame inside the loop.
# Stays None when non_nest has no columns at all.
non_nest_full_imp = pd.concat(imputed_groups, axis=1) if imputed_groups else None

In [10]:
# Standardize + PCA on the imputed non-nested features, requesting 2000 components.
non_nest_exp_var, non_nest_PCA = foolproof_PCA(non_nest_full_imp, 2000)

In [11]:
# Load a precomputed component table from disk.
# NOTE(review): presumably PCA components of the artist terms, produced by
# another notebook -- confirm where 'artist_term_components.csv' comes from.
genre_PCA = pd.read_csv('artist_term_components.csv',index_col = 'Unnamed: 0')

In [22]:
# Load the pickled `similar_artists` table.
# NOTE: unpickling can execute arbitrary code, so only load a file that was
# produced by this project -- never one from an untrusted source.
with open('relevant_artist_columns', 'rb') as f: 
    similar_artists = pickle.load(f)

In [65]:
# Row position of the query song; every ranking below is relative to it.
compare_row = 910

# One ranking per feature set: (row position, cosine similarity) pairs,
# most similar first. `subset` is passed along but not used by get_cossim.
non_nest_PCA_checklist = get_cossim(non_nest_PCA, subset, compare_row)
#non_nest_checklist = get_cossim(non_nest_full_imp.to_numpy(),subset,compare_row)
pitches_PCA_checklist  = get_cossim(pitches_PCA,subset,compare_row)
#pitches_checklist = get_cossim(pitches.to_numpy(),subset,compare_row)
timbre_PCA_checklist = get_cossim(timbre_PCA,subset,compare_row)
#timbre_checklist = get_cossim(timbre.to_numpy(),subset,compare_row)
genre_PCA_checklist = get_cossim(genre_PCA.to_numpy(),subset,compare_row)
# This ranking is read straight from the query row of `similar_artists`
# rather than computed with cosine_similarity here.
similar_artists_checklist = similar_artists_id_and_cossim(similar_artists,compare_row)

In [88]:
# Rankings to combine. `weights` is positional: weights[j] multiplies the
# scores of use_list[j], so keep both lists in the same order.
use_list = [non_nest_PCA_checklist,pitches_PCA_checklist,timbre_PCA_checklist,genre_PCA_checklist,similar_artists_checklist]
#use_list = [non_nest_checklist,pitches_checklist,timbre_checklist,genre_PCA_checklist]
weights = [4,0.5,5,5,5]
#weights = [1,1,1,1]
#weights=[0,1,0,0]

# Prints the length of every ranking and returns the weighted, re-sorted list.
cossim_combined = combine_cos_sims(use_list,weights)

[10000, 10000, 10000, 10000, 10000]


In [89]:
# Print the 20 best combined matches with artist name and leading artist terms.
check_artists(cossim_combined,subset,20)

(910, "b'Finntroll'", [b'heavy metal', b'viking metal', b'folk rock', b'black metal', b'progressive trance'], np.float64(19.499999999999986))
(1485, "b'Finntroll'", [b'heavy metal', b'viking metal', b'folk rock', b'black metal', b'progressive trance'], np.float64(16.261640343511576))
(5984, "b'Finntroll'", [b'heavy metal', b'viking metal', b'folk rock', b'black metal', b'progressive trance'], np.float64(15.252781375854784))
(7777, "b'Finntroll'", [b'heavy metal', b'viking metal', b'folk rock', b'black metal', b'progressive trance'], np.float64(15.249437389957688))
(3027, "b'Finntroll'", [b'heavy metal', b'viking metal', b'folk rock', b'black metal', b'progressive trance'], np.float64(14.853312254576537))
(5641, "b'Elvenking'", [b'folk metal', b'heavy metal', b'speed metal', b'power metal', b'viking metal'], np.float64(13.476437286706217))
(6185, "b'Elvenking'", [b'folk metal', b'heavy metal', b'speed metal', b'power metal', b'viking metal'], np.float64(13.25422849183229))
(233, "b'Moon