In [None]:
# Mount Google Drive at /content/drive; the CSV and pickle files read below live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import pandas as pd
import numpy as np

# TSNE
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, output_notebook, save, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
from bokeh import palettes
from bokeh.palettes import Spectral, Spectral4, Spectral6, Category20, Turbo256, Dark2, inferno, Plasma256, Plasma, Paired
from bokeh.transform import factor_cmap
from bokeh.models.widgets import Tabs, Panel

# Configure Bokeh to display plots inline in the notebook output cells.
output_notebook()

In [None]:
# Import dataset

music_df = pd.read_csv("/content/drive/My Drive/hetrec2011-lastfm-2k/music_tags_new.csv")

# Number of artist rows recorded for each user. A plain count is used instead of
# np.count_nonzero, which would skip an artist whose ID is 0 and runs as a slow
# per-group Python call.
df = (
    music_df.groupby('userID')['artistID']
    .count()
    .reset_index()
    .rename(columns={'artistID': 'no_of_artists'})
)

# Drop users that have fewer than 10 artist rows.
list_id = list(df[df['no_of_artists'] < 10]['userID'])
music_df_new = music_df[~music_df['userID'].isin(list_id)]
item_list = pd.read_csv("/content/drive/My Drive/hetrec2011-lastfm-2k/item_list.csv")

# Merge to get all information
music_df_new = pd.merge(music_df_new, item_list, left_on = 'artistID', right_on = 'artistID')

# Create count tagvalue column: number of '|'-separated entries in tagValue.
# Computed as a vectorised column so it stays aligned with the frame's own index
# (no intermediate list / positional concat needed).
music_df_new['tagValue_count'] = music_df_new['tagValue'].str.split('|').str.len()
print("Maximum number of tags", max(music_df_new.tagValue_count))
music_df_new.head()

Maximum number of tags 1


Unnamed: 0,userID,artistID,timestamp,tagValue,item_id,tagValue_count
0,2,52,1238536800000,other,49,1
1,43,52,1272664800000,other,49,1
2,255,52,1225494000000,other,49,1
3,283,52,1222812000000,other,49,1
4,320,52,1138748400000,other,49,1


In [None]:
# Keep only the rows whose tagValue holds exactly one tag
music_1tagvalue = music_df_new.loc[music_df_new['tagValue_count'].eq(1)]
print("Dataset contains", len(music_1tagvalue), "rows")
print("Number of unique tags", music_1tagvalue['tagValue'].nunique(dropna=False))

Dataset contains 81449 rows
Number of unique tags 33


In [None]:
# Single-tag rows again, this time leaving out the catch-all 'other' tag
music_2tagvalue = music_1tagvalue.loc[music_1tagvalue['tagValue'].ne('other')]
print("Dataset contains", len(music_2tagvalue), "rows")
print("Number of unique tags", music_2tagvalue['tagValue'].nunique(dropna=False))

Dataset contains 53041 rows
Number of unique tags 32


In [None]:
# Keep only the tags that label at least `n` rows of the single-tag data
col = 'tagValue'
n = 1000
rows_per_tag = music_1tagvalue.groupby(col)[col].transform('count')
fliter_df = music_1tagvalue.loc[rows_per_tag >= n]
print("Dataset contains", len(fliter_df), "rows")
print("Number of unique tags", fliter_df['tagValue'].nunique(dropna=False))

Dataset contains 74519 rows
Number of unique tags 20


In [None]:
# Only top 3 tags value
top3_tags = ['rock','pop','alternative']
music_top3 = music_df_new.loc[music_df_new['tagValue'].isin(top3_tags)]
print("Dataset contains", len(music_top3), "rows")

# Only top 5 tags value
top5_tags = ['rock','pop','alternative','electronic','indie']
music_top5 = music_df_new.loc[music_df_new['tagValue'].isin(top5_tags)]
print("Dataset contains", len(music_top5), "rows")

Dataset contains 15529 rows
Dataset contains 24822 rows


In [None]:
# Rows tagged 'piano', 'seen live' or 'folk' (presumably "sig" = significant tags; TODO confirm)
sig_tags = ['piano','seen live','folk']
music_sig = music_df_new.loc[music_df_new['tagValue'].isin(sig_tags)]

In [None]:
# Import trained embeddings
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that you produced yourself or otherwise trust.

# Item embeddings from the GMF model (presumably one row per item_id; verify against the training code).
with open("/content/drive/My Drive/gmf_item_embedding_neg.pickle", 'rb') as gmf_item:
  trained_gmf_items = pickle.load(gmf_item)
# Item embeddings from the MLP model; later wrapped in a DataFrame whose row index is used as item_id.
with open("/content/drive/My Drive/mlp_item_embeddings_neg.pickle", 'rb') as mlp_item:
  trained_mlp_items = pickle.load(mlp_item)

In [None]:
# choose any of the 2 embedding layers below:
# 'trained_gmf_items', 'trained_mlp_items'

def visualise(embedding_layer, n_components=2, perplexity=30, n_iter=1000, learning_rate=10, random_state=None):
  '''Project an embedding matrix to a low-dimensional space with t-SNE.

  input:
    embedding_layer: array-like handed unchanged to TSNE.fit_transform.
    n_components: dimensionality of the projection.
    perplexity, n_iter, learning_rate: forwarded to the TSNE constructor.
    random_state: optional seed forwarded to TSNE; None (the default) keeps
      the estimator's own unseeded behaviour.
  output: the array returned by TSNE.fit_transform.
  '''
  # Forward the caller's settings; previously these were hard-coded literals,
  # so every argument except embedding_layer was silently ignored.
  tsne = TSNE(n_components=n_components, verbose=1, perplexity=perplexity,
              n_iter=n_iter, learning_rate=learning_rate, random_state=random_state)
  tsne_results = tsne.fit_transform(embedding_layer)

  return tsne_results

# Run t-SNE on both trained item-embedding matrices: GMF first, then MLP
gmf_item_2d, mlp_item_2d = [
    visualise(embedding) for embedding in (trained_gmf_items, trained_mlp_items)
]

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 10720 samples in 0.074s...
[t-SNE] Computed neighbors for 10720 samples in 14.316s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10720
[t-SNE] Computed conditional probabilities for sample 2000 / 10720
[t-SNE] Computed conditional probabilities for sample 3000 / 10720
[t-SNE] Computed conditional probabilities for sample 4000 / 10720
[t-SNE] Computed conditional probabilities for sample 5000 / 10720
[t-SNE] Computed conditional probabilities for sample 6000 / 10720
[t-SNE] Computed conditional probabilities for sample 7000 / 10720
[t-SNE] Computed conditional probabilities for sample 8000 / 10720
[t-SNE] Computed conditional probabilities for sample 9000 / 10720
[t-SNE] Computed conditional probabilities for sample 10000 / 10720
[t-SNE] Computed conditional probabilities for sample 10720 / 10720
[t-SNE] Mean sigma: 0.283936
[t-SNE] KL divergence after 250 iterations with early exaggeration: 95.492859
[t-SNE] KL

In [None]:
def tsne_item_visual(tsne_results, name, dataset):
  '''Render an interactive Bokeh scatter of 2-D t-SNE item coordinates, coloured by tag.

  input:
    tsne_results: 2-D array read as [:, 0] (x) and [:, 1] (y); row i is treated
      as the point for item_id i.
    name: text appended to the plot title.
    dataset: DataFrame joined on 'item_id'; it must also provide 'tagValue'.
  output: None. The plot is displayed with show() inside a single tab; hovering
    a point shows its item id. Nothing is written to disk.
  '''

  df_combine = pd.DataFrame({'item_id': range(len(tsne_results)),
                             'x-tsne': tsne_results[:, 0],
                             'y-tsne': tsne_results[:, 1]})
  df_combine = pd.merge(df_combine, dataset, left_on = 'item_id', right_on = 'item_id').dropna()
  # dataset can hold many rows for the same item and tag (for example one per
  # user). They all share the same coordinates, so draw each (item, tag) pair
  # once instead of stacking identical glyphs on top of each other.
  df_combine = df_combine.drop_duplicates(subset=['item_id', 'tagValue'])

  source = ColumnDataSource(dict( item_id = df_combine['item_id'], x = df_combine['x-tsne'], y = df_combine['y-tsne'],
                            tagValue = df_combine['tagValue']))

  # Distinct tags in order of first appearance; a plain list is passed to factor_cmap
  unique_tagValue = list(df_combine['tagValue'].unique())

  def define_palette(len_cluster):
    '''Pick a palette with at least len_cluster colours (capped at 256).'''
    if len_cluster <= 2:
      # Dark2 is indexed with 3 here, so the palette is simply longer than needed
      palette = Dark2[3]
    elif len_cluster <= 8:
      palette = Dark2[len_cluster]
    elif len_cluster <= 20:
      palette = Category20[len_cluster]
    else:
      try:
        palette = inferno(len_cluster)
      except ValueError:
        # More colours requested than inferno can supply: fall back to 256
        palette = inferno(256)
    return palette

  title = 'T-SNE visualization of embeddings '+ name

  plot_tagValue = figure(plot_width=1000, plot_height=600,
                     title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                     x_axis_type=None, y_axis_type=None, min_border=1)
  plot_tagValue.scatter(x='x', y='y', source=source,
                alpha=0.7, size=10, fill_color=factor_cmap('tagValue', palette = define_palette(len(unique_tagValue)), factors = unique_tagValue),
                legend_group='tagValue')
  # hover tools
  tagValue_hover = plot_tagValue.select(dict(type=HoverTool))
  tagValue_hover.tooltips = {"content": "itemid: @item_id"}
  plot_tagValue.legend.location = "top_left"
  plot_tagValue.legend.orientation = "horizontal"
  # Clicking a legend entry hides the points of that tag
  plot_tagValue.legend.click_policy="hide"

  # Wrap the plot in a single panel
  By_tagValue = Panel(child=plot_tagValue, title='By tagValue')

  # Assign the panel to Tabs
  tabs = Tabs(tabs=[By_tagValue])

  # Show the tabbed layout
  show(tabs)

In [None]:
# Top5 tagValue: GMF item embeddings, coloured by the five tags kept in music_top5
tsne_item_visual(gmf_item_2d, 'gmf_item_embeddings', music_top5)

In [None]:
# Top5 tagValue: MLP item embeddings, coloured by the five tags kept in music_top5
tsne_item_visual(mlp_item_2d, 'mlp_item_embeddings', music_top5)

Predict tag values from the item embeddings

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
import xgboost as xgb

def logistic(trainx, trainy, testx, testy):
    """Return the test accuracy of a liblinear logistic regression.

    The model is fitted on (trainx, trainy); its predictions for testx are
    compared element-wise with testy and the share of matches is returned.
    """
    classifier = LogisticRegression(solver='liblinear', fit_intercept=True)
    fitted = classifier.fit(trainx, trainy)
    matches = fitted.predict(testx) == testy
    return sum(matches) / len(testy)

def LDA(trainx, trainy, testx, testy):
    """Return the test accuracy of linear discriminant analysis (lsqr solver).

    The model is fitted on (trainx, trainy); its predictions for testx are
    compared element-wise with testy and the share of matches is returned.
    """
    discriminant = LinearDiscriminantAnalysis(solver='lsqr')
    fitted = discriminant.fit(trainx, trainy)
    matches = fitted.predict(testx) == testy
    return sum(matches) / len(testy)

def KNN(trainx, trainy, testx, testy):
    """Return the test accuracy of a default-configured k-nearest-neighbours classifier.

    The model is fitted on (trainx, trainy); its predictions for testx are
    compared element-wise with testy and the share of matches is returned.
    """
    classifier = KNeighborsClassifier()
    fitted = classifier.fit(trainx, trainy)
    matches = fitted.predict(testx) == testy
    return sum(matches) / len(testy)

def SVM(trainx, trainy, testx, testy):
    """Return the test accuracy of a default-configured support vector classifier.

    The model is fitted on (trainx, trainy); its predictions for testx are
    compared element-wise with testy and the share of matches is returned.
    """
    classifier = svm.SVC()
    fitted = classifier.fit(trainx, trainy)
    matches = fitted.predict(testx) == testy
    return sum(matches) / len(testy)

def balSVM(trainx, trainy, testx, testy):
    """Return the test accuracy of a support vector classifier with balanced class weights.

    The model is fitted on (trainx, trainy); its predictions for testx are
    compared element-wise with testy and the share of matches is returned.
    """
    classifier = svm.SVC(class_weight='balanced')
    fitted = classifier.fit(trainx, trainy)
    matches = fitted.predict(testx) == testy
    return sum(matches) / len(testy)

def XG(trainx, trainy, testx, testy):
    """Return the test accuracy of an XGBoost classifier (multi:softprob objective, seed 42).

    The model is fitted on (trainx, trainy); its predictions for testx are
    compared element-wise with testy and the share of matches is returned.
    """
    booster = xgb.XGBClassifier(objective="multi:softprob", random_state=42)
    fitted = booster.fit(trainx, trainy)
    matches = fitted.predict(testx) == testy
    return sum(matches) / len(testy)

def NB(trainx, trainy, testx, testy):
    """Return the test accuracy of a Gaussian naive Bayes classifier.

    The model is fitted on (trainx, trainy); its predictions for testx are
    compared element-wise with testy and the share of matches is returned.
    """
    classifier = GaussianNB()
    fitted = classifier.fit(trainx, trainy)
    matches = fitted.predict(testx) == testy
    return sum(matches) / len(testy)

def tree(trainx, trainy, testx, testy, random_state=None):
    """Return the test accuracy of a decision-tree classifier.

    Args:
        trainx, trainy: training features and labels passed to ``fit``.
        testx, testy: held-out features and labels; predictions for ``testx``
            are compared element-wise with ``testy``.
        random_state: optional seed forwarded to ``DecisionTreeClassifier`` so
            a run can be reproduced. ``None`` (the default) leaves the
            estimator unseeded, as before.

    Returns:
        Fraction of test rows whose predicted label equals ``testy``.
    """
    classifier = DecisionTreeClassifier(random_state=random_state)
    fitted = classifier.fit(trainx, trainy)
    return sum(fitted.predict(testx) == testy) / len(testy)

def RF(trainx, trainy, testx, testy, random_state=None):
    """Return the test accuracy of a random-forest classifier.

    Args:
        trainx, trainy: training features and labels passed to ``fit``.
        testx, testy: held-out features and labels; predictions for ``testx``
            are compared element-wise with ``testy``.
        random_state: optional seed forwarded to ``RandomForestClassifier`` so
            a run can be reproduced. ``None`` (the default) leaves the
            estimator unseeded, as before.

    Returns:
        Fraction of test rows whose predicted label equals ``testy``.
    """
    forest = RandomForestClassifier(random_state=random_state)
    fitted = forest.fit(trainx, trainy)
    return sum(fitted.predict(testx) == testy) / len(testy)
  
def zero_rule_algorithm(trainy, testy):
    """Baseline accuracy: always predict the most frequent training label.

    Args:
        trainy: iterable of training labels (list, array or Series).
        testy: iterable of test labels with a length.

    Returns:
        Fraction of ``testy`` entries equal to the majority label of
        ``trainy``. When several labels tie, the one seen first in ``trainy``
        is used.

    Raises:
        ValueError: if ``trainy`` is empty.
        ZeroDivisionError: if ``testy`` is empty.
    """
    from collections import Counter

    # One pass over the labels instead of calling list.count for every element.
    label_counts = Counter(trainy)
    # Counter keeps insertion order, so max() returns the first-seen label on ties.
    prediction = max(label_counts, key=label_counts.get)
    # Compare label by label so plain lists work as well as arrays and Series.
    hits = sum(label == prediction for label in testy)
    return hits / len(testy)

In [None]:
# Create combine data set: one row per distinct (embedding, artist, tag) record
dataset = (
    pd.DataFrame(trained_mlp_items)
    .assign(item_id=lambda frame: frame.index)   # row position doubles as item_id
    .merge(fliter_df, left_on='item_id', right_on='item_id')
    .dropna()
    .drop(['userID', 'timestamp'], axis=1)       # user-level columns are not features
    .drop_duplicates()
    .reset_index(drop=True)
)
# Integer-encode the tag names, numbered in order of first appearance
dataset['Label'] = pd.factorize(dataset['tagValue'])[0]
# Lookup from integer code back to tag name
label_code_dict = dict(zip(dataset['Label'], dataset['tagValue']))
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,item_id,artistID,tagValue,tagValue_count,Label
0,0.02871,0.228472,-0.182835,-0.257693,0.233152,0.102852,0.063559,0.226125,-0.087413,-0.09768,0.034947,0.228931,0.16711,-0.083559,0.159514,-0.040975,0.095757,-0.058876,-0.121407,0.085236,-0.105945,-0.057172,0.065878,0.180889,0.082445,0.041867,-0.113571,0.037968,0.050365,0.003092,0.102777,0.106085,0,1,other,1,0
1,0.180358,-0.30569,0.347976,0.101505,0.100503,0.256151,-0.380656,0.115021,0.141255,-0.167207,-0.089147,0.18079,0.072377,0.050508,0.19818,0.22294,0.033853,-0.079205,0.006591,-0.075997,-0.155587,0.278437,0.0754,-0.262052,-0.1595,-0.030429,-0.072197,-0.12344,-0.392146,0.023985,-0.035292,0.105229,1,2,other,1,0
2,0.047774,-0.005385,-0.0068,0.074297,-0.044058,0.06238,0.113256,0.070319,-0.143604,-0.001148,-0.053092,0.157422,-0.12191,0.155959,0.25171,-0.252012,-0.137892,0.016689,0.021251,0.228982,0.157899,-0.030645,-0.038248,0.020716,0.125289,-0.119088,0.099001,-0.025166,-0.352689,0.112921,0.015134,0.194624,2,3,other,1,0
3,0.06633,-0.089108,-0.068609,-0.254234,0.245231,-0.02725,-0.027798,0.249476,-0.000915,0.076621,-0.016975,0.202802,0.068392,0.079113,0.147152,0.05758,0.114935,-0.215153,-0.106445,0.111061,0.16152,-0.027549,0.110118,0.038921,-0.0872,-0.010252,-0.016572,-0.208289,0.124241,0.046822,0.10525,0.05832,3,4,other,1,0
4,0.036263,-0.068945,0.003374,-0.201651,-0.013347,0.123709,0.001213,0.169078,-0.04844,-0.196258,0.135842,0.021389,0.166956,0.131793,0.011068,-0.139461,0.118141,-0.148755,0.186051,0.111637,-0.055538,0.066024,0.044078,-0.081221,0.031313,-0.075919,-0.024338,-0.088109,-0.109116,0.177316,0.109931,0.061967,4,5,other,1,0


In [None]:
# Features are the first 32 columns of `dataset`; the target is the integer tag code
X = dataset.iloc[:, np.arange(32)]
Y = dataset['Label']
print("input:\n", X)
print("Target:\n", Y)

input:
             0         1         2   ...        29        30        31
0     0.028710  0.228472 -0.182835  ...  0.003092  0.102777  0.106085
1     0.180358 -0.305690  0.347976  ...  0.023985 -0.035292  0.105229
2     0.047774 -0.005385 -0.006800  ...  0.112921  0.015134  0.194624
3     0.066330 -0.089108 -0.068609  ...  0.046822  0.105250  0.058320
4     0.036263 -0.068945  0.003374  ...  0.177316  0.109931  0.061967
...        ...       ...       ...  ...       ...       ...       ...
9548 -0.039096  0.010348 -0.093432  ...  0.216307  0.189981  0.207877
9549 -0.111866  0.066978 -0.072040  ...  0.287445  0.144671  0.114676
9550 -0.016811  0.156694  0.088300  ...  0.127399 -0.115773  0.268949
9551 -0.087291  0.169154 -0.041797  ... -0.037944  0.272202  0.167765
9552 -0.112296  0.012771 -0.009605  ...  0.188161  0.116888  0.172059

[9553 rows x 32 columns]
Target:
 0        0
1        0
2        0
3        0
4        0
        ..
9548     0
9549     8
9550    13
9551     0
9552   

In [None]:
# k-fold
# for mlp implicit
# One accuracy list per model; each gets one entry per fold.
logistic_acc = []
LDA_acc = []
KNN_acc = []
SVM_acc = []
balSVM_acc = []
XG_acc = []
NB_acc = []
tree_acc = []
rf_acc = []
baseline = []
# Training-row positions of every fold, keyed by fold number.
index_dict = {}


kf = KFold(n_splits = 3, shuffle = True, random_state = 2) # split train and test using K-folds
for i, (train_index, test_index) in enumerate(kf.split(X)):
    index_dict[i] = train_index
    trainX, testX = X.iloc[train_index], X.iloc[test_index]
    trainy, testy = Y.iloc[train_index], Y.iloc[test_index]

    # Run all models
    logistic_acc.append(logistic(trainX,trainy,testX,testy))
    LDA_acc.append(LDA(trainX,trainy,testX,testy))
    KNN_acc.append(KNN(trainX,trainy,testX,testy))
    SVM_acc.append(SVM(trainX,trainy,testX,testy))
    balSVM_acc.append(balSVM(trainX,trainy,testX,testy))
    XG_acc.append(XG(trainX,trainy,testX,testy))
    NB_acc.append(NB(trainX,trainy,testX,testy))
    tree_acc.append(tree(trainX,trainy,testX,testy))
    rf_acc.append(RF(trainX,trainy,testX,testy))
    baseline.append(zero_rule_algorithm(trainy,testy))

# One column per model, one row per fold. The class-weighted SVM is fitted on
# every fold above, so its scores are reported as well instead of being dropped.
results = pd.DataFrame(
    {'Baseline': baseline,
     'Logistic Regression': logistic_acc,
     'Linear discriminant analysis': LDA_acc,
     'Naive Bayes': NB_acc,
     'K-Neighbors Classifier': KNN_acc,
     'Decision Tree': tree_acc,
     'Random forest': rf_acc,
     'Support Vector Machines': SVM_acc,
     'Balanced Support Vector Machines': balSVM_acc,
     'XGBoost': XG_acc
    })
# Append the mean over folds, then show models as rows, best mean first.
results.loc['Accuracy'] = results.mean()
results = results.transpose().sort_values(by=['Accuracy'], ascending=False)
results

Unnamed: 0,0,1,2,Accuracy
Support Vector Machines,0.551962,0.539259,0.545854,0.545692
XGBoost,0.549765,0.540201,0.543656,0.54454
Random forest,0.552276,0.534234,0.544598,0.543703
Logistic Regression,0.540345,0.532977,0.535176,0.536166
Baseline,0.538148,0.529837,0.531721,0.533235
Linear discriminant analysis,0.539403,0.520415,0.527638,0.529152
K-Neighbors Classifier,0.525589,0.512249,0.528266,0.522035
Decision Tree,0.363893,0.351131,0.355214,0.356746
Naive Bayes,0.324333,0.320666,0.329774,0.324924


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
# Fresh per-model accuracy lists for the stratified shuffle-split run
logistic_acc, LDA_acc, KNN_acc, SVM_acc = [], [], [], []
XG_acc, NB_acc, tree_acc, rf_acc = [], [], [], []
baseline = []

# Three seeded, label-stratified random train/test splits
sss = StratifiedShuffleSplit(n_splits=3, random_state=2)
for train_index, test_index in sss.split(X, Y):
    trainX, trainy = X.iloc[train_index], Y.iloc[train_index]
    testX, testy = X.iloc[test_index], Y.iloc[test_index]

    # Run all models (same order as the other evaluation cells)
    logistic_acc.append(logistic(trainX, trainy, testX, testy))
    LDA_acc.append(LDA(trainX, trainy, testX, testy))
    KNN_acc.append(KNN(trainX, trainy, testX, testy))
    SVM_acc.append(SVM(trainX, trainy, testX, testy))
    XG_acc.append(XG(trainX, trainy, testX, testy))
    NB_acc.append(NB(trainX, trainy, testX, testy))
    tree_acc.append(tree(trainX, trainy, testX, testy))
    rf_acc.append(RF(trainX, trainy, testX, testy))
    baseline.append(zero_rule_algorithm(trainy, testy))

# One column per model, one row per split
results = pd.DataFrame({
    'Baseline': baseline,
    'Logistic Regression': logistic_acc,
    'Linear discriminant analysis': LDA_acc,
    'Naive Bayes': NB_acc,
    'K-Neighbors Classifier': KNN_acc,
    'Decision Tree': tree_acc,
    'Random forest': rf_acc,
    'Support Vector Machines': SVM_acc,
    'XGBoost': XG_acc,
})
# Append the mean over splits, then list models as rows, best mean first
results.loc['Accuracy'] = results.mean()
results = results.T.sort_values(by='Accuracy', ascending=False)
results

Unnamed: 0,0,1,2,Accuracy
Support Vector Machines,0.550209,0.548117,0.551255,0.549861
XGBoost,0.547071,0.550209,0.547071,0.548117
Random forest,0.550209,0.547071,0.543933,0.547071
Logistic Regression,0.536611,0.540795,0.538703,0.538703
Baseline,0.533473,0.533473,0.533473,0.533473
Linear discriminant analysis,0.526151,0.529289,0.538703,0.531381
K-Neighbors Classifier,0.518828,0.544979,0.523013,0.52894
Decision Tree,0.351464,0.356695,0.353556,0.353905
Naive Bayes,0.303347,0.334728,0.354603,0.330893


In [None]:
from sklearn.model_selection import StratifiedKFold
# Fresh per-model accuracy lists for the stratified k-fold run.
logistic_acc = []
LDA_acc = []
KNN_acc = []
SVM_acc = []
XG_acc = []
NB_acc = []
tree_acc = []
rf_acc = []
baseline = []

# random_state only takes effect when shuffle=True; recent scikit-learn
# releases raise a ValueError when a seed is given without shuffling.
# Shuffle with the same seed as the KFold evaluation so the folds are both
# reproducible and independent of the row order of `dataset`.
skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 2)
for train_index, test_index in skf.split(X,Y):
    trainX, testX = X.iloc[train_index], X.iloc[test_index]
    trainy, testy = Y.iloc[train_index], Y.iloc[test_index]

    # Run all models
    logistic_acc.append(logistic(trainX,trainy,testX,testy))
    LDA_acc.append(LDA(trainX,trainy,testX,testy))
    KNN_acc.append(KNN(trainX,trainy,testX,testy))
    SVM_acc.append(SVM(trainX,trainy,testX,testy))
    XG_acc.append(XG(trainX,trainy,testX,testy))
    NB_acc.append(NB(trainX,trainy,testX,testy))
    tree_acc.append(tree(trainX,trainy,testX,testy))
    rf_acc.append(RF(trainX,trainy,testX,testy))
    baseline.append(zero_rule_algorithm(trainy,testy))

# One column per model, one row per fold.
results = pd.DataFrame(
    {'Baseline': baseline,
     'Logistic Regression': logistic_acc,
     'Linear discriminant analysis': LDA_acc,
     'Naive Bayes': NB_acc,
     'K-Neighbors Classifier': KNN_acc,
     'Decision Tree': tree_acc,
     'Random forest': rf_acc,
     'Support Vector Machines': SVM_acc,
     'XGBoost': XG_acc
    })
# Append the mean over folds, then show models as rows, best mean first.
results.loc['Accuracy'] = results.mean()
results = results.transpose().sort_values(by=['Accuracy'], ascending=False)
results



Unnamed: 0,0,1,2,Accuracy
Support Vector Machines,0.543485,0.538631,0.534548,0.538888
Random forest,0.542229,0.540829,0.531721,0.53826
XGBoost,0.541287,0.538945,0.532349,0.537527
Logistic Regression,0.542857,0.532663,0.532663,0.536061
Baseline,0.533124,0.533291,0.533291,0.533236
Linear discriminant analysis,0.500157,0.531721,0.519158,0.517012
K-Neighbors Classifier,0.512402,0.509736,0.514761,0.5123
Decision Tree,0.297017,0.335113,0.359925,0.330685
Naive Bayes,0.248666,0.348932,0.383794,0.327131
