In [21]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import tqdm 
from openai import OpenAI

from pinecone import Pinecone

from langchain.graphs import Neo4jGraph
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext

from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel
from langchain.chains import GraphCypherQAChain

In [2]:
load_dotenv(dotenv_path='secrets.env')
graph = Neo4jGraph(
    url=os.environ["NEO4J_URI"],
    username=os.environ["NEO4J_USERNAME"],
    password=os.environ["NEO4J_PASSWORD"]
)

In [3]:
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [31]:
def get_embeddings(text):
    response = client.embeddings.create(
        model="text-embedding-3-small",
        input = text
    )
    return response.data[0].embedding

In [5]:
df = pd.read_csv('dataset.csv')
# df.head()
track_id_list = df['track_id']
track_artists_list = df['artists']
track_name_list = df['track_name']

In [23]:
# initialize Pinecone
api_key = os.environ['PINECONE_API_KEY']
environment = os.environ['PINECONE_ENVIRONMENT']
pinecone = Pinecone(api_key=api_key, environment=environment)

index_name = "cos-15"
pinecone_index = pinecone.Index(index_name)

In [6]:
print(df['popularity'])

0         73
1         55
2         57
3         71
4         82
          ..
113995    21
113996    22
113997    22
113998    41
113999    22
Name: popularity, Length: 114000, dtype: int64


In [15]:
# normalize relevant columns
# popularity	duration_ms	explicit	danceability	energy	key	loudness	mode
# 	speechiness	acousticness	instrumentalness	liveness	valence	tempo	time_signature

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# df['popularity_normalized'] = scaler.fit_transform(df[['popularity']])
df['duration_ms_normalized'] = scaler.fit_transform(df[['duration_ms']])
df['explicit_normalized'] = scaler.fit_transform(df[['explicit']])
df['danceability_normalized'] = scaler.fit_transform(df[['danceability']])
df['energy_normalized'] = scaler.fit_transform(df[['energy']])
df['key_normalized'] = scaler.fit_transform(df[['key']])
df['loudness_normalized'] = scaler.fit_transform(df[['loudness']])
df['mode_normalized'] = scaler.fit_transform(df[['mode']])
df['speechiness_normalized'] = scaler.fit_transform(df[['speechiness']])
df['acousticness_normalized'] = scaler.fit_transform(df[['acousticness']])
df['instrumentalness_normalized'] = scaler.fit_transform(df[['instrumentalness']])
df['liveness_normalized'] = scaler.fit_transform(df[['liveness']])
df['valence_normalized'] = scaler.fit_transform(df[['valence']])
df['tempo_normalized'] = scaler.fit_transform(df[['tempo']])
df['time_signature_normalized'] = scaler.fit_transform(df[['time_signature']])
# If you want to replace the original column instead of creating a new one, use:
# df['popularity'] = scaler.fit_transform(df[['popularity']])



In [19]:
# UPSERT
to_upsert = []

# iterate through the df
for index, row in df.iterrows():
    values = [row['popularity_normalized'],
              row['duration_ms_normalized'],
              row['explicit_normalized'],
              row['danceability_normalized'],
              row['energy_normalized'],
              row['key_normalized'],
              row['loudness_normalized'],
              row['mode_normalized'],
              row['speechiness_normalized'],
              row['acousticness_normalized'],
              row['instrumentalness_normalized'],
              row['liveness_normalized'],
              row['valence_normalized'],
              row['tempo_normalized'],
              row['time_signature_normalized']
    ]
    data = {
        'id': row['track_id'],
        'values': values,
        'metadata': {
            'track_name': row['track_name'],
            'artists': row['artists']
        }
    }
    to_upsert.append(data)



In [35]:
to_upsert_song_names = []
for index, row in df.iterrows():
    if index % 100 == 0:
        print(index, "/", len(df))
    name = row['track_name']
    id = row['track_id']
    vector = get_embeddings(name)
    data = {
        'id': id,
        'values': vector,
        'metadata': {
            'track_name': row['track_name'],
            'artists': row['artists']
        }
    }
    to_upsert_song_names.append(data)

batch_size = 10
for i in tqdm.tqdm(range(0, len(to_upsert_song_names), batch_size), desc="Upserting batches"):
    batch = to_upsert_song_names[i:i + batch_size]
    pinecone_index.upsert(vectors=batch, namespace="SongNames")

0 / 114000
100 / 114000
200 / 114000
300 / 114000
400 / 114000
500 / 114000
600 / 114000
700 / 114000
800 / 114000
900 / 114000
1000 / 114000
1100 / 114000
1200 / 114000
1300 / 114000
1400 / 114000
1500 / 114000


KeyboardInterrupt: 

In [26]:
to_upsert[:1]

[{'id': '5SuOikwiRyPMVoIQDJUgSV',
  'values': [0.73,
   0.04404296492750552,
   0.0,
   0.6862944162436548,
   0.461,
   0.09090909090909091,
   0.7913915247026616,
   0.0,
   0.14818652849740932,
   0.032329317269076306,
   1.01e-06,
   0.358,
   0.7185929648241205,
   0.3612453363575103,
   0.8],
  'metadata': {'track_name': 'Comedy', 'artists': 'Gen Hoshino'}}]

In [28]:
pinecone_index.upsert(vectors=to_upsert[:1], namespace="Tracks")

{'upserted_count': 1}

In [37]:
import math

accepted_songs = []
for index, row in df.iterrows():
    if index % 100 == 0:
        print(index, "/", len(df))
    if not (pd.isnull(row['track_name']) or pd.isnull(row['artists'])):
        if not ("'" in row['track_name']) and not ("'" in row['artists']) and not ('"' in row['track_name']) and not ('"' in row['artists']):
            accepted_songs.append(row)

df = pd.DataFrame(accepted_songs)
    

0 / 114000
100 / 114000
200 / 114000
300 / 114000
400 / 114000
500 / 114000
600 / 114000
700 / 114000
800 / 114000
900 / 114000
1000 / 114000
1100 / 114000
1200 / 114000
1300 / 114000
1400 / 114000
1500 / 114000
1600 / 114000
1700 / 114000
1800 / 114000
1900 / 114000
2000 / 114000
2100 / 114000
2200 / 114000
2300 / 114000
2400 / 114000
2500 / 114000
2600 / 114000
2700 / 114000
2800 / 114000
2900 / 114000
3000 / 114000
3100 / 114000
3200 / 114000
3300 / 114000
3400 / 114000
3500 / 114000
3600 / 114000
3700 / 114000
3800 / 114000
3900 / 114000
4000 / 114000
4100 / 114000
4200 / 114000
4300 / 114000
4400 / 114000
4500 / 114000
4600 / 114000
4700 / 114000
4800 / 114000
4900 / 114000
5000 / 114000
5100 / 114000
5200 / 114000
5300 / 114000
5400 / 114000
5500 / 114000
5600 / 114000
5700 / 114000
5800 / 114000
5900 / 114000
6000 / 114000
6100 / 114000
6200 / 114000
6300 / 114000
6400 / 114000
6500 / 114000
6600 / 114000
6700 / 114000
6800 / 114000
6900 / 114000
7000 / 114000
7100 / 114000
7200

In [38]:
len(df)

104917

In [29]:
batch_size = 10
for i in tqdm.tqdm(range(0, len(to_upsert), batch_size), desc="Upserting batches"):
    batch = to_upsert[i:i + batch_size]
    pinecone_index.upsert(vectors=batch, namespace="Tracks")

Upserting batches:  58%|█████▊    | 6590/11400 [24:50<18:08,  4.42it/s]  


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Sat, 14 Sep 2024 22:09:44 GMT', 'Content-Type': 'text/plain', 'Content-Length': '80', 'Connection': 'keep-alive', 'server': 'envoy'})
HTTP response body: Unexpected token.
ta": {"track_name": NaN, "artists": NaN}
                    ^


In [30]:
batch_size = 10
for i in tqdm.tqdm(range(70000, len(to_upsert), batch_size), desc="Upserting batches"):
    try:
        batch = to_upsert[i:i + batch_size]
        pinecone_index.upsert(vectors=batch, namespace="Tracks")
    except:
        continue

Upserting batches: 100%|██████████| 4400/4400 [11:13<00:00,  6.54it/s]


In [None]:
topK = 10

query = "Someone Like You"

vector = get_embeddings(query)

for i in tqdm.tqdm(range(0, len(df)), desc="computing top K"):
    try:
        pinecone_index.query(namespace="Tracks", vector=, top_k=10, include_metadata=True)
    except:
        continue

# Neo4j

In [None]:
class Property(BaseModel):
    key: str = Field(..., description="key")
    value: str = Field(..., description="value")

class Node(BaseNode):
    properties: Optional[List[Property]] = Field(None, description="node properties")

class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(None, description="relationship properties")

class KnowledgeGraph(BaseModel):
    nodes: List[Node] = Field(..., description="nodes in the graph")
    rels: List[Relationship] = Field(..., description="relationships in the graph")

In [7]:
import uuid

16a6337c-f844-4434-95ea-09f4c89f710a


In [None]:
nodes = [] # str list
nodeTopK = [] # list of lists

In [None]:
# add every node to the graph database
for nodeName in nodes:
    query = (
        f"CREATE (n:{nodeName} {{name: '{nodeName}'}})"
    )
    graph.query(query)

In [14]:
# Function to add a node to the Neo4j graph
def add_node_to_graph(nodeName: str, trackId: str, artists: str):
    # Create a dictionary of properties
    query = (
        f"CREATE (n:Track {{ name: '{nodeName}', id: '{trackId}', artists: '{artists}' }})"
    )
    # run Node query
    graph.query(query)

In [11]:
add_node_to_graph("test1", "testId1")

In [12]:
# Delete a specific node
def delete_node_from_graph(node_id: str):
    query = (
        f"MATCH (n:Track {{id: '{node_id}'}}) "
        "DETACH DELETE n"
    )
    graph.query(query)

# delete_node_from_graph("testId1")


In [26]:
# Delete all nodes
def delete_all_nodes():
    query = (
        "MATCH (n) "
        "DETACH DELETE n"
    )
    graph.query(query)

# Uncomment the following line to execute the deletion
delete_all_nodes()


In [21]:
print(len(track_id_list))

114000


In [27]:
for i in range(len(track_id_list)):
    if i % 100 == 0:
        print(i, "/", len(track_id_list))
    try:
        add_node_to_graph(track_name_list[i], track_id_list[i], track_artists_list[i])
    except:
        continue

0 / 114000
100 / 114000
200 / 114000
300 / 114000
400 / 114000
500 / 114000
600 / 114000
700 / 114000
800 / 114000
900 / 114000
1000 / 114000
1100 / 114000
1200 / 114000
1300 / 114000
1400 / 114000


# spotify idea
- build a graph of songs / artists that can do recommendation engines
# predev replication
- build a knowledge graph for planning a full stack website

# frontend
- text box
- Graph visualization in JavaScript