In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

In [2]:
# Neo4j connection settings. Environment variables take precedence so real
# credentials never need to be committed with the notebook; the fallbacks are
# the values that used to be hard-coded here.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")
driver = GraphDatabase.driver(uri, auth=(username, password))

spotify_data = pd.read_csv("spotify.csv")

# Rows whose title matches one of the three Regina Spektor seed songs.
# NOTE(review): the match is on track_name only, so same-titled tracks by other
# artists are kept as well. If the CSV has an artist column, filter on it too.
regina_songs = ['The Call', 'Two Birds', 'Samson']
is_seed_song = spotify_data['track_name'].isin(regina_songs)
regina_data = spotify_data[is_seed_song]

# Fixed seed so a Restart & Run All draws the same random sample every time.
RANDOM_SEED = 42
sample_size = 1000
remaining_data = spotify_data[~is_seed_song]  # everything except the seed songs

# Fill the sample up to sample_size rows in total. Clamp at zero so an
# unexpectedly large number of title matches cannot produce a negative n,
# which DataFrame.sample rejects.
n_random_rows = max(0, min(sample_size - len(regina_data), len(remaining_data)))
sample_data = remaining_data.sample(n=n_random_rows, replace=False, random_state=RANDOM_SEED)

# Seed songs are appended after the random rows, so they sit at the end of sample_data.
sample_data = pd.concat([sample_data, regina_data])

In [3]:
#get cosine similarity of songs to every other song 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Numeric audio features that form each song's vector for cosine similarity.
features = ['danceability', 'energy', 'tempo', 'loudness', 'valence']
df_1 = sample_data[features]

# NOTE(review): the features are compared on their raw scales. The mean pair
# score printed further down is ~0.994, which suggests one feature dominates
# every vector (presumably tempo - confirm). Consider standardising the
# columns before computing similarity.

# Square (n x n) array: entry [i, j] is the similarity of song i and song j.
cosine_sim_results = cosine_similarity(df_1)

# Same matrix labelled with the sample's index, kept for ad-hoc inspection.
cos_sim_df = pd.DataFrame(cosine_sim_results, columns=df_1.index, index=df_1.index)

# Column layout of the pair table: track1_name, track2_name, similarity_score.
cols = ['track1_name', 'track2_name', 'similarity_score']
# Empty frame with that layout; nothing below fills it, it is retained only so
# any other cell that references the name keeps working.
songs_cosine_similarities = pd.DataFrame(columns=cols)

# Build one record per unordered pair (i < j). np.triu_indices walks the strict
# upper triangle row by row, i.e. the same pairs in the same order as a nested
# "for i / for j in range(i + 1, n)" loop, but without a per-pair row lookup.
track_names = sample_data['track_name'].to_numpy()
first_pos, second_pos = np.triu_indices(len(track_names), k=1)
pair_scores = cosine_sim_results[first_pos, second_pos]

dictionary_list = [
    {'track1_name': first_name,
     'track2_name': second_name,
     'similarity_score': pair_score}
    for first_name, second_name, pair_score
    in zip(track_names[first_pos], track_names[second_pos], pair_scores)
]

In [4]:
# Turn the list of pair records into a table with one row per song pair.
df_final = pd.DataFrame(dictionary_list)
# Trailing expression: the cell displays the average similarity over all pairs.
df_final.loc[:, 'similarity_score'].mean()

0.9944313664850571

In [5]:
# Persist the pair table (including its integer row index) to disk.
similarity_csv_path = 'spotify_with_similarity_score.csv'
df_final.to_csv(similarity_csv_path)

In [6]:
# Only pairs scoring above this threshold become SIMILAR_TO relationships.
SIMILARITY_THRESHOLD = 0.80
# Rows sent per query, so one huge parameter list is not pushed through a
# single transaction.
BATCH_SIZE = 10_000

# Create the Song nodes and the SIMILAR_TO relationship for every row.
# - MERGE on the nodes: no other cell creates :Song nodes, so a plain MATCH
#   would find nothing and no relationship would ever be written.
# - MERGE the relationship without the score in the pattern and SET it
#   afterwards, so re-running the notebook updates the existing relationship
#   and does not add a second one whenever the score differs.
# NOTE(review): songs are keyed by name only, so different tracks sharing a
# title collapse into one node. A uniqueness constraint / index on :Song(name)
# would also speed up the MERGE lookups.
query = """
UNWIND $rows AS row
WITH row
WHERE row.similarity_score > $threshold
MERGE (s1:Song {name: row.track1_name})
MERGE (s2:Song {name: row.track2_name})
MERGE (s1)-[s:SIMILAR_TO]->(s2)
SET s.score = row.similarity_score
"""

# Pre-filter in pandas so rows below the threshold are never sent to the server.
params = {"rows": df_final[df_final["similarity_score"] > SIMILARITY_THRESHOLD].to_dict("records")}

# Run the query batch by batch using the driver.
with driver.session() as session:
    rows = params["rows"]
    for batch_start in range(0, len(rows), BATCH_SIZE):
        batch = rows[batch_start:batch_start + BATCH_SIZE]
        # consume() waits for the query to finish and surfaces server-side errors here.
        session.run(query, {"rows": batch, "threshold": SIMILARITY_THRESHOLD}).consume()

ServiceUnavailable: Couldn't connect to localhost:7687 (resolved to ()):
Failed to establish connection to ResolvedIPv6Address(('::1', 7687, 0, 0)) (reason [Errno 61] Connection refused)
Failed to establish connection to ResolvedIPv4Address(('127.0.0.1', 7687)) (reason [Errno 61] Connection refused)

In [None]:
# Recommend the five songs most similar to any of the seed songs.
# - The score is read from r.score, the property the load query writes.
# - s1 (the non-seed end of the relationship) is returned, since that is the
#   recommendation; s2 is only used to anchor the match on a seed song.
# - The pattern is undirected so the result does not depend on which song of a
#   pair happened to be stored as the relationship's start node.
# - max() collapses a song that is similar to several seeds into one row.
query = """
MATCH (s1:Song)-[r:SIMILAR_TO]-(s2:Song)
WHERE NOT s1.name IN $seed_songs
  AND s2.name IN $seed_songs
  AND r.score > $min_score
RETURN s1.name AS song_name, max(r.score) AS similarity_score
ORDER BY similarity_score DESC
LIMIT 5
"""

with driver.session() as session:
    # regina_songs is the seed-title list defined in the data-loading cell.
    result = session.run(query, {"seed_songs": regina_songs, "min_score": 0.8})

    # Print the recommended songs (records must be read while the session is open)
    print("Song Recommendations:")
    for record in result:
        print(f"{record['song_name']} (Similarity Score: {record['similarity_score']})")

In [None]:
# Fetch every Song node; each record has a single column "n" holding the node.
query = """
MATCH (n:Song)
RETURN n
"""

# Execute the query and convert the results to a Pandas DataFrame.
# The result is streamed from the server, so it has to be read before the
# session is closed; reading it after the "with" block can fail or come back
# empty depending on the driver version.
with driver.session() as session:
    results = session.run(query)
    column_names = results.keys()
    df = pd.DataFrame([record.values() for record in results], columns=column_names)

# Display the DataFrame in the notebook
display(df)