In [1]:
from neo4j import GraphDatabase
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import time
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
start = time.time()

In [3]:
client_id = "e37e4ca76eaf4ccc90759700fae67208"
client_secret = "d5013a3166b0439d8913884b639cd5ae"
neo4j_url = "bolt://localhost:7687"                               # url of the neo4j database.
neo4j_username = "neo4j"                                            # neo4j username. defaults to 'neo4j'.
neo4j_password = "1234"

In [4]:
def create_neo4j_session(url, user, password):
    driver = GraphDatabase.driver(url, auth=(user, password))
    return driver.session()

In [5]:
neo4j = create_neo4j_session(url=neo4j_url, user=neo4j_username, password=neo4j_password)

In [6]:
main_user = 'sari'
followinguser = []

neighbours = """
            MATCH (u1:User)-[f:FOLLOWS]->(u:User)
WHERE u1.username = $mainuser
WITH (f.mutualweight * f.similarity * f.influence) as score, u.username as user
WHERE score IS NOT NULL  // exclude rows where score is null
RETURN user, score
ORDER BY score DESC LIMIT 5"""
recos = {}           
result = neo4j.run(neighbours, mainuser = main_user)
for item in result:
    recos[item[0]] = item[1]
    
followinguser = list(recos.keys())
print(followinguser)

['faturmuhammad', 'tolliesw', 'avrilestmonmois', 'der', 'niko']


In [7]:
def tracks_mainuser(followinguser):
    # create dataset from list track mainuser
    tracks_user = """
        MATCH (u1:User)-[:LISTEN]->(t:Track)<-[:LISTEN]-(u2:User), (t:Track)-[:HAS_ARTIST]->(a:Artist)-[:HAS_GENRE]->(g1:Genre)
        WHERE u1 <> u2
        AND u1.username = $mainuser AND u2.username = $followinguser
        WITH distinct t as tracklike, g1.name as genre 

        MATCH (u3:User)-[:LISTEN]->(t1:Track)-[:HAS_ARTIST]->(a:Artist)-[:HAS_GENRE]->(g:Genre)
        WHERE g.name in genre AND u3.username = $mainuser
        with distinct t1 as trackgenre, tracklike

        match (t:Track)-[:HAS_ARTIST]->(ar:Artist)
        where t in [tracklike]
        with DISTINCT ar as artistlike ,trackgenre, tracklike

        match (t:Track)-[:IN_ALBUM]->(al:Album)
        where t in [tracklike]
        with DISTINCT al as albumlike, artistlike ,trackgenre, tracklike

        match (t:Track)-[:HAS_ARTIST]->(a:Artist)-[:HAS_GENRE]->(g:Genre)
        where t in [trackgenre] and a in [artistlike]
        return t.name as track_name, COLLECT(distinct g.name) AS genre,  t.acousticness as acousticness, t.danceability as danceability,
                            t.energy as energy, t.liveness as liveness, t.loudness as loudness, t.speechiness as speechiness, 
                            t.tempo as tempo
                        """

    playlist_user = pd.DataFrame([dict(_) for _ in neo4j.run(tracks_user, mainuser = main_user, followinguser = followinguser)])
    playlist_user = playlist_user.drop_duplicates(subset=['track_name'], keep="last")
    playlist_user.reset_index(drop = True, inplace = True)
    return playlist_user

In [8]:
def tracks_followinguser(followinguser):
    # create dataset from list track followinguser
    tracks_recom = """
        MATCH (u1:User)-[:LISTEN]->(t:Track)<-[:LISTEN]-(u2:User), (t:Track)-[:HAS_ARTIST]->(a:Artist)-[:HAS_GENRE]->(g1:Genre)
        WHERE u1 <> u2
        AND u1.username = $mainuser AND u2.username = $followinguser
        WITH distinct t as tracklike, g1.name as genre 

        MATCH (u3:User)-[:LISTEN]->(t1:Track)-[:HAS_ARTIST]->(a:Artist)-[:HAS_GENRE]->(g:Genre)
        WHERE g.name in genre AND u3.username = $followinguser
        with distinct t1 as trackgenre, tracklike

        match (t:Track)-[:HAS_ARTIST]->(ar:Artist)
        where t in [tracklike]
        with DISTINCT ar as artistlike ,trackgenre, tracklike

        match (t:Track)-[:IN_ALBUM]->(al:Album)
        where t in [tracklike]
        with DISTINCT al as albumlike, artistlike ,trackgenre, tracklike

        match (t:Track)-[:HAS_ARTIST]->(a:Artist)-[:HAS_GENRE]->(g:Genre)
        where t in [trackgenre] and a in [artistlike]
        return t.name as track_name, COLLECT(distinct g.name) AS genre, t.acousticness as acousticness, t.danceability as danceability,
                            t.energy as energy, t.liveness as liveness, t.loudness as loudness, t.speechiness as speechiness, 
                            t.tempo as tempo
                        """

    recoms = pd.DataFrame([dict(_) for _ in neo4j.run(tracks_recom, mainuser = main_user, followinguser = followinguser)])
    recoms = recoms.drop_duplicates(subset=['track_name'], keep="last")
    recoms.reset_index(drop = True, inplace = True)
    return recoms

In [9]:
def create_similarity_score(df1,df2):
    assert list(df1.columns[2:]) == list(df2.columns[2:])
    features = list(df1.columns[2:])
    df_features1,df_features2 = df1[features],df2[features]
    
    #cosine similarity antara lagu yang disukai user pada genre tersebut dibandingkan dengan 
    cosine_sim = cosine_similarity(df_features1, df_features2)
    return cosine_sim

In [10]:
def minmaxscaler(df1,df2):
    #menyamakan jumlah column dataframe
    assert list(df1.columns[2:]) == list(df2.columns[2:])
    features = list(df1.columns[2:])
    df_features1,df_features2 = df1[features],df2[features]
    
    # Combine the two datasets along the rows axis
    combined_df = pd.concat([df_features1, df_features2], axis=0)

    # Initialize an instance of MinMaxScaler and fit_transform the combined dataset
    scaler = MinMaxScaler()
    scaled_df = scaler.fit_transform(combined_df)

    # Split the scaled dataset back into the two original datasets using indexing
    df_features1_new = pd.DataFrame(scaled_df[:len(df_features1)], columns=df_features1.columns)
    df_features2_new = pd.DataFrame(scaled_df[len(df_features1):], columns=df_features2.columns)
    
    df_features1_scaler = pd.concat([df1.drop(df_features1, axis=1), df_features1_new], axis=1)
    df_features2_scaler = pd.concat([df2.drop(df_features2, axis=1), df_features2_new], axis=1)
    
    return df_features1_scaler, df_features2_scaler

In [11]:
## generate data from neo4j
all_track_follsuer = pd.DataFrame(columns=['track_name','genre', 'acousticness','danceability','energy','liveness','loudness', 'speechiness', 'tempo'])
all_track_user = pd.DataFrame(columns=['track_name','genre', 'acousticness','danceability','energy','liveness','loudness', 'speechiness', 'tempo'])

for user in followinguser:
    tracks_user = tracks_mainuser(user)
    tracks_folluser = tracks_followinguser(user)
    
    all_track_user = pd.concat([all_track_user, tracks_user])
    all_track_follsuer = pd.concat([all_track_follsuer, tracks_folluser])
    

In [12]:
# cleaning data
all_track_user = all_track_user.drop_duplicates(subset=['track_name'], keep="last")
all_track_user.reset_index(drop = True, inplace = True)
all_track_user

all_track_follsuer = all_track_follsuer.drop_duplicates(subset=['track_name'], keep="last")
all_track_follsuer.reset_index(drop = True, inplace = True)

all_track_user

Unnamed: 0,track_name,genre,acousticness,danceability,energy,liveness,loudness,speechiness,tempo
0,BOOMBAYAH,"[k-pop, pop, k-pop girl group]",0.00264,0.658,0.836,0.507,-3.282,0.0549,124.969
1,WHISTLE,"[k-pop, pop, k-pop girl group]",0.00682,0.822,0.712,0.0955,-4.877,0.15,102.874
2,WE GO,"[k-pop, k-pop girl group]",0.0162,0.666,0.925,0.203,-3.193,0.102,124.001
3,Rollin',"[k-pop, k-pop girl group]",0.0312,0.69,0.878,0.188,-3.908,0.0917,124.948
4,We Ride,"[k-pop, k-pop girl group]",0.182,0.602,0.947,0.132,-1.613,0.0615,110.055
5,LOCO,"[k-pop, k-pop girl group]",0.0109,0.764,0.886,0.325,-3.067,0.177,102.012
6,SWIPE,"[k-pop, k-pop girl group]",0.00151,0.781,0.757,0.19,-4.268,0.0547,96.009
7,RUMOR,[k-pop],0.0491,0.726,0.758,0.077,-3.324,0.178,88.999
8,"thank u, next",[pop],0.28,0.724,0.647,0.102,-5.642,0.0658,106.96
9,ghostin,[pop],0.418,0.287,0.364,0.185,-8.295,0.0306,103.777


In [13]:
all_track_follsuer

Unnamed: 0,track_name,genre,acousticness,danceability,energy,liveness,loudness,speechiness,tempo
0,BREATHE,[k-pop],0.7840,0.609,0.246,0.0825,-8.449,0.0376,123.773
1,"1, 2, 3, 4",[k-pop],0.3640,0.905,0.649,0.0881,-4.116,0.0473,115.962
2,Yours,[k-pop],0.1370,0.676,0.774,0.1060,-3.988,0.0338,86.994
3,Tip Toe (with LeeHi),[k-pop],0.6710,0.767,0.692,0.1750,-5.686,0.1940,89.942
4,We'll shine brighter than any other stars,[k-pop],0.5150,0.511,0.530,0.4720,-6.525,0.1390,169.641
...,...,...,...,...,...,...,...,...,...
82,Attention,"[k-pop, k-pop girl group]",0.2360,0.811,0.648,0.0761,-3.684,0.0368,104.988
83,Cookie,"[k-pop, k-pop girl group]",0.0596,0.847,0.638,0.0894,-3.764,0.0575,156.935
84,H.S.K.T. (feat. Wonstein),[k-pop],0.2520,0.756,0.672,0.1170,-5.711,0.0373,113.975
85,For You,[k-pop],0.7240,0.588,0.452,0.1070,-7.778,0.0390,83.936


In [14]:
# normalization dataset mainuser and followinguser
user_scaler, folls_scaler = minmaxscaler(all_track_user,all_track_follsuer)

In [15]:
# Splitting the dataset into training and test sets
X_train, X_test = train_test_split(user_scaler, test_size=0.2, random_state=41)

In [16]:
print(len(X_train))
X_train

32


Unnamed: 0,track_name,genre,acousticness,danceability,energy,liveness,loudness,speechiness,tempo
15,Interlude,[korean pop],0.899061,0.533981,0.255911,0.113874,0.0,0.280759,0.396324
4,We Ride,"[k-pop, k-pop girl group]",0.19249,0.509709,0.974965,0.136126,0.906615,0.076087,0.332733
22,Feel My Rhythm,"[k-pop, k-pop girl group]",0.067964,0.202265,0.965229,0.431937,0.865649,0.431082,0.732125
10,Energetic,"[k-pop boy group, k-pop]",0.060739,0.690939,0.890125,0.109948,0.799207,0.040472,0.457705
33,Savage,"[k-pop, k-pop girl group]",0.130864,0.711974,0.880389,0.282723,0.939358,0.232192,0.640382
5,LOCO,"[k-pop, k-pop girl group]",0.010694,0.771845,0.890125,0.388743,0.799868,0.343201,0.265683
28,Rewrite The Stars,"[post-teen pop, pop, hollywood, movie tunes]",0.075189,0.642395,0.518776,0.123037,0.510755,0.023127,0.457705
36,Hype Boy,"[k-pop, k-pop girl group]",0.283866,0.495146,0.958275,0.337696,0.773218,0.454209,0.24871
2,WE GO,"[k-pop, k-pop girl group]",0.016326,0.613269,0.944367,0.229058,0.790617,0.16975,0.448993
17,I am not your ocean anymore,[korean pop],0.203115,0.419094,0.511822,0.061649,0.598121,0.049029,0.082756


In [17]:
print(len(X_test))
X_test

8


Unnamed: 0,track_name,genre,acousticness,danceability,energy,liveness,loudness,speechiness,tempo
29,Heart Attack (츄),"[k-pop, k-pop girl group]",0.054788,0.608414,0.849791,0.736911,0.771236,0.068224,0.24069
32,Next Level,"[k-pop, k-pop girl group]",0.517619,0.86246,0.842837,0.082068,0.836576,0.317761,0.324238
8,"thank u, next",[pop],0.296616,0.70712,0.557719,0.096859,0.610822,0.086031,0.306932
39,HOLO,[k-pop],0.815123,0.192557,0.317107,0.102094,0.558916,0.002081,0.91511
14,lovelovelove,[korean pop],0.981937,0.255663,0.090403,0.099476,0.299537,0.028215,0.361852
27,WEE WOO,"[k-pop, k-pop girl group]",0.094208,0.885113,0.933241,0.032984,0.81668,0.160962,0.498845
7,RUMOR,[k-pop],0.051282,0.710356,0.7121,0.064136,0.781,0.345513,0.157201
31,aenergy,"[k-pop, k-pop girl group]",0.108552,0.739482,0.905424,0.15445,1.0,0.13321,0.299004


In [18]:
data_folls = folls_scaler
final_recomms = pd.DataFrame(columns=['track_name','genre','acousticness',
                                           'danceability','energy','liveness',
                                           'loudness', 'speechiness', 'tempo', 'similarity'])

recomms_count = 0
while final_recomms.shape[0] < 10:
    similarity_score = create_similarity_score(X_train,data_folls)
    recomms = data_folls.iloc[[np.argmax(i) for i in similarity_score]]
    # print(final_recomms)
    similarity = []
    for i in similarity_score:
        kambing = np.amax(i)
        if kambing >= 0.999997:
            third_highest = np.partition(i, -3)[-3]
            similarity.append(third_highest)
        else:
            similarity.append(kambing)

    recomms.insert(loc=9, column="similarity", value=similarity)
    recomms = recomms.drop_duplicates(subset=['track_name'], keep="last")
    recomms = recomms[~recomms["track_name"].isin(X_train["track_name"])]


    recomms = recomms.sort_values(by='similarity', ascending=False).head(10)
    value_list = recomms['track_name'].tolist()
    data_folls = data_folls[~data_folls['track_name'].isin(value_list)]
    
    final_recomms = pd.concat([final_recomms, recomms])
    row_count = final_recomms.shape[0]
    
# data_folls
final_recomms = final_recomms.sort_values(by='similarity', ascending=False).head(10)
final_recomms.reset_index(drop = True, inplace = True)
final_recomms   

Unnamed: 0,track_name,genre,acousticness,danceability,energy,liveness,loudness,speechiness,tempo,similarity
0,PLAYING WITH FIRE,"[k-pop, pop, k-pop girl group]",0.041507,0.679612,0.7121,0.147906,0.693855,0.14593,0.224209,0.993382
1,Be In Love,"[k-pop, k-pop girl group]",0.030882,0.781553,0.646732,0.065576,0.625211,0.078168,0.207419,0.992774
2,Feel Good (SECRET CODE),"[k-pop, k-pop girl group]",0.045545,0.600324,0.930459,0.344241,0.750753,0.157493,0.457321,0.99275
3,In the morning,"[k-pop, k-pop girl group]",0.08677,0.906149,0.789986,0.087435,0.682329,0.285384,0.582844,0.991693
4,Talk To Me,"[k-pop, k-pop girl group]",0.305117,0.480583,0.788595,0.414921,0.671977,0.329325,0.789029,0.98982
5,Yours,[k-pop],0.144677,0.62945,0.734353,0.102094,0.732252,0.012026,0.140486,0.989457
6,How You Like That,"[k-pop, pop, k-pop girl group]",0.072851,0.875405,0.74548,0.034555,0.730343,0.146161,0.499112,0.988559
7,Honeymoon Avenue,[pop],0.230741,0.65534,0.503477,0.418848,0.640849,0.028677,0.457363,0.988389
8,Not Shy,"[k-pop, k-pop girl group]",0.163802,0.791262,0.924896,0.314136,0.798913,0.236818,0.257472,0.988161
9,Lovesick Girls,"[k-pop, pop, k-pop girl group]",0.004563,0.600324,0.675939,0.133508,0.675428,0.085106,0.482289,0.987861


In [19]:
from scipy.stats import pearsonr

recoms_mae= final_recomms.drop(['similarity','genre','track_name'], axis=1)
test_mae = X_test.drop(['genre','track_name'], axis=1)

row = len(final_recomms)
row1 = len(X_test)
correlation_mean = []
for i in range(row):
    my_array = []
    recoms = recoms_mae.iloc[i].astype(float)
    recoms_track = final_recomms.iloc[i]
    print(recoms_track['track_name'])
    for x in range(row1):
        test = test_mae.iloc[x].astype(float)
        corr, _ = pearsonr(recoms, test)
        
        test_track = X_test.iloc[x]
        my_array.append(corr)

    max_value = max(my_array)
    correlation_mean.append(max_value)
    print(max_value)
    print('\n')
    
mean = sum(correlation_mean) / len(correlation_mean)
# print(mean)
print("mean correlation :", mean )

PLAYING WITH FIRE
0.9798670416918451


Be In Love
0.9627636650085201


Feel Good (SECRET CODE)
0.9228690302587951


In the morning
0.972432205849745


Talk To Me
0.7136163267679214


Yours
0.9809359549029517


How You Like That
0.9882603606517276


Honeymoon Avenue
0.8112299066858722


Not Shy
0.9638924382440694


Lovesick Girls
0.9650984838225114


mean correlation : 0.9260965413883959


In [20]:
# Concatenate the two DataFrames into a single DataFrame
recoms_mae= final_recomms.drop(['similarity','genre','track_name'], axis=1)
test_mae = X_test.drop(['genre','track_name'], axis=1)

df = pd.concat([test_mae, recoms_mae], axis=1)
# df
# Calculate the absolute errors for each feature column
for col in test_mae.columns:
    df['abs_error_' + col] = np.abs(recoms_mae[col] - test_mae[col])
#     print(df['abs_error_' + col])

# Calculate the mean absolute error across all feature columns
mae = df[[col for col in df.columns if col.startswith('abs_error_')]].mean().mean()

# Print the result
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.19605063452563926
