In [10]:
from neo4j import GraphDatabase
import pandas as pd
import numpy as np

uri = "bolt://localhost:7687"
username = "neo4j"
password = "password"
driver = GraphDatabase.driver(uri, auth=(username, password))

# Read the Spotify data
spotify_data = pd.read_csv("spotify.csv")

# Filter the data set to only include the three Regina Spektor songs
regina_songs = ['The Call', 'Two Birds', 'Samson']
regina_data = spotify_data[spotify_data['track_name'].isin(regina_songs)]

# Sample the remaining songs from the dataset
sample_size = 1000
remaining_data = spotify_data[-spotify_data['track_name'].isin(regina_songs)]
sample_data = remaining_data.sample(n=min(sample_size, len(remaining_data)), replace=False)

# Combine the Regina Spektor songs with the sampled data
sample_data = pd.concat([sample_data, regina_data])

# Create the graph database
with driver.session() as session:
    # Create nodes for each song and their properties
    for index, row in sample_data.iterrows():
        query = "CREATE (:Song {name: $name, artist: $artist, danceability: $danceability, energy: $energy, loudness: $loudness, acousticness: $acousticness, instrumentalness: $instrumentalness, liveness: $liveness, tempo: $tempo, valence: $valence, genre: $genre})"
        session.run(query, name=row['track_name'], artist=row['artists'], danceability=row['danceability'], energy=row['energy'], loudness=row['loudness'], acousticness=row['acousticness'], instrumentalness=row['instrumentalness'], liveness=row['liveness'], tempo=row['tempo'], valence=row['valence'], genre=row['track_genre'])

    # Connect similar songs via edges
    for index1, row1 in sample_data.iterrows():
        for index2, row2 in sample_data.iterrows():
            if index1 != index2:
                distance = np.linalg.norm(row1[['danceability', 'energy', 'loudness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'valence']] - row2[['danceability', 'energy', 'loudness', 'acousticness', 'instrumentalness', 'liveness', 'tempo', 'valence']])
                if distance < 2:
                    query = "MATCH (s1:Song {name: $name1}), (s2:Song {name: $name2}) CREATE (s1)-[:SIMILAR_TO {distance: $distance}]->(s2)"
                    session.run(query, name1=row1['track_name'], name2=row2['track_name'], distance=distance)


In [16]:
# Generate FIVE song recommendations for Professor Rachlin based on his liking of Regina Spektor songs
with driver.session() as session:
    # Find songs that are similar to Regina Spektor songs but exclude Regina Spektor songs
    query = """
    MATCH (s1:Song)-[r:SIMILAR_TO]->(s2:Song)
    WHERE s1.artist <> 'Regina Spektor' AND s1.name <> 'The Call' AND s1.name <> 'Two Birds' AND s1.name <> 'Samson'
    AND NOT s2.name IN ['The Call', 'Two Birds', 'Samson'] AND s2.artist <> 'Regina Spektor'
    RETURN DISTINCT s2.name AS song_name, s2.artist AS artist_name, r.distance AS similarity_score
    ORDER BY similarity_score DESC
    LIMIT 5
    """
    result = session.run(query)

    # Print the recommended songs
    print("Song Recommendations:")
    for record in result:
        print(f"{record['song_name']} by {record['artist_name']} (Similarity Score: {record['similarity_score']})")

Song Recommendations:
Super Freak by Rick James (Similarity Score: 1.9998579856002277)
Absurdity by Iron Lung (Similarity Score: 1.9998579856002277)
Kaagaz Ki Naav by Last Minute India (Similarity Score: 1.9997891924593336)
Yo Perreo Sola by Bad Bunny (Similarity Score: 1.9997891924593336)
誰よりも高く跳べ! by Keyakizaka46 (Similarity Score: 1.9997546273480589)
