In [241]:
import pandas as pd
import re
import time
import numpy as np
import ast
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import prince

In [3]:
# Load the song-level CSV into a DataFrame; later cells read its
# 'artist_terms' and 'artist_mbtags' columns.

filename = "../EDA/millionsong_subset.csv"
subset = pd.read_csv(filename)

In [169]:
def get_top_terms(inlist, number=5):
    """Return the leading ``number`` entries of ``inlist``.

    An empty list is handed back unchanged (the same object); otherwise a
    slice is returned, capped at the length of ``inlist``.
    """
    return inlist if inlist == [] else inlist[:min(number, len(inlist))]

In [259]:
def listify(instr):
    """Parse a stringified list of quoted terms into a list of clean strings.

    Parameters
    ----------
    instr : str, iterable of str, None or float NaN
        A string is split on commas into raw elements; any other iterable is
        used as the raw elements directly. ``None`` and NaN (a missing cell)
        yield an empty list.

    Returns
    -------
    list of str
        For every raw element that contains a quote character: the text after
        its first quote, with all quotes and square brackets removed.
        Elements without any quote are dropped.

    Notes
    -----
    Because the input is split on commas and every quote is stripped, a term
    that itself contains a comma is split in two and apostrophes inside a
    term are removed.
    """
    # A missing cell used to reach the loop below and fail with
    # "TypeError: 'float' object is not iterable"; treat it as "no terms".
    if instr is None or (isinstance(instr, float) and instr != instr):
        return []

    rawlist = instr.split(',') if isinstance(instr, str) else instr

    pattern = r"[\'\"](.*)"
    outlist = []
    for element in rawlist:
        # Keep everything after the first quote character, if there is one.
        found = re.search(pattern, element)
        if found is None:
            continue
        outlist.append(re.sub(r'[\"\'\]\[]', '', found.group(1)))
    return outlist

def collect_list(use_col):
    """Count occurrences of every parsed term across ``use_col``.

    Each entry of ``use_col`` is passed through ``listify``; the result maps
    each term to the number of times it appeared, in first-seen order.
    """
    counts = {}
    for cell in use_col:
        for term in listify(cell):
            counts[term] = counts.get(term, 0) + 1
    return counts

def sort_genre(in_dict):
    """Return ``(key, count)`` pairs ordered from highest to lowest count.

    The sort is stable, so keys with equal counts keep the order in which
    they appear in ``in_dict``.
    """
    ranked = list(in_dict.items())
    ranked.sort(key=lambda pair: -pair[1])
    return ranked

# Tally how often each parsed entry occurs in the two tag columns of `subset`.
artist_term_dict = collect_list(subset['artist_terms'])
artist_mbtags_dict = collect_list(subset['artist_mbtags'])

# Rank each tally from most to least frequent as (name, count) pairs.
artist_term_sorted = sort_genre(artist_term_dict)
artist_mbtags_sorted = sort_genre(artist_mbtags_dict)

In [260]:
# Parse every 'artist_terms' cell into a Python list of term strings.
artist_terms_literal = subset['artist_terms'].apply(listify)

In [263]:
# Cap each row's term list at `term_number` entries (leading entries are kept).
term_number = 3000
artist_terms_literal = artist_terms_literal.apply(
    lambda terms: get_top_terms(terms, number=term_number)
)

In [265]:
# Sanity check: show the first parsed term of the first row.
str(artist_terms_literal.iloc[0][0])

'hip hop'

In [267]:
# Zero-filled indicator matrix: one row per row of `subset`, one column per
# distinct term found in 'artist_terms'. Built directly from a scalar so no
# per-column Python list of zeros has to be materialised, and so `genre_df`
# is never temporarily bound to a plain dict.
genre_df = pd.DataFrame(
    0,
    index=range(subset.shape[0]),
    columns=list(artist_term_dict),
)

In [367]:
# Switch on, for every row, the indicator columns named in that row's term list.
for i, use_list in enumerate(artist_terms_literal):
    genre_df.loc[i, use_list] = 1

In [273]:
# Spot check: indicator value of the 'heavy metal' column at row position 910.
genre_df['heavy metal'].iloc[910]

1

In [277]:
def manhattan(x, y, frame=None):
    """Manhattan (L1) distance between two rows of a DataFrame.

    Parameters
    ----------
    x, y : int
        Positional row indices (used with ``.iloc``).
    frame : pandas.DataFrame, optional
        Frame whose rows are compared. Defaults to the notebook-level
        ``genre_df``, which keeps existing ``manhattan(x, y)`` calls working.

    Returns
    -------
    The sum of the absolute element-wise differences between the two rows.
    """
    if frame is None:
        frame = genre_df
    row_delta = frame.iloc[y] - frame.iloc[x]
    return abs(row_delta.to_numpy()).sum()

In [279]:
# Distance from every row of genre_df to the reference row, smallest first.
compare_number = 910
outlist = sorted(
    [(i, manhattan(i, compare_number)) for i in range(genre_df.shape[0])],
    key=lambda pair: pair[1],
)

In [281]:
# Last entry of the ascending ranking: the (row, distance) pair farthest from
# the reference row.
outlist[-1]

(5842, 112)

In [351]:
# Fit a 10-component PCA on the indicator matrix.
# NOTE: the estimator is prince.PCA even though the variable is called `mca`;
# the name is kept because later cells refer to it.
# random_state is pinned so that re-running the notebook reproduces the same
# components (prince's default SVD engine is, per its documentation, a
# randomized solver).
mca = prince.PCA(n_components=10, random_state=42)
mca = mca.fit(genre_df)


In [353]:
# Project genre_df onto the fitted components.
# NOTE(review): `mca` was already fitted on genre_df in the previous cell, so
# this call fits it a second time — presumably `mca.transform(genre_df)` would
# be enough; confirm against the installed prince version.
test = mca.fit_transform(genre_df)

In [355]:
# Display the projected coordinates returned by fit_transform.
test

component,0,1,2,3,4,5,6,7,8,9
0,3.049361,3.218337,-6.811547,-0.382060,8.643797,-2.722817,-7.468654,-2.938329,-0.519635,4.662224
1,-1.986512,-1.522330,4.797115,-4.219568,-0.265010,0.458569,2.404783,-2.718200,0.696164,0.679887
2,-2.477696,1.740065,-2.313956,2.443366,-1.432996,-3.758712,-0.127287,0.946832,0.708592,-0.893924
3,3.890561,-3.072097,1.966947,-2.223197,-0.746983,1.324332,1.715971,-1.687887,-0.522158,1.104874
4,5.191418,1.201986,-0.582506,-4.108287,0.361188,-1.128096,0.597573,-0.602123,0.255631,1.429320
...,...,...,...,...,...,...,...,...,...,...
9995,2.132998,8.196952,0.405285,-0.059156,-1.517625,1.747695,-4.509106,-0.461919,-0.954016,0.371439
9996,-2.729619,-3.793287,3.640437,0.022569,-0.510473,-1.303835,0.626607,0.241273,0.163218,-0.616316
9997,-1.142729,0.376992,-2.805691,1.352466,0.931952,-0.239663,-2.134002,1.053076,1.215466,-0.685071
9998,-1.816540,1.944021,-1.229976,0.338535,1.111180,-0.496078,-0.160441,1.211583,0.635061,-0.481046


In [357]:
# Rank every row of `test` by cosine similarity to the reference row,
# most similar first.
compare_number = 910
# The reference row now follows `compare_number`; it used to be hard-coded to
# 910, so changing `compare_number` had no effect on the result.
a = test.iloc[compare_number, :].to_numpy().reshape(1, -1)
# One call scores the reference row against all rows at once instead of
# invoking cosine_similarity separately for each row.
similarities = cosine_similarity(a, test.to_numpy())[0]
outlist = sorted(enumerate(similarities), key=lambda pair: -pair[1])

In [363]:
#test.to_csv("artist_term_components.csv")