In [46]:
import ast
import json
import os
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [47]:
# Load the raw inputs: the node table (one row per artist) and the edge list
# (pairs of artist ids in columns id_0 / id_1).
nodes = pd.read_csv('assets/nodes.csv')
edges = pd.read_csv('assets/edges.csv')

In [48]:
# Show the edge table as the cell's output (quick sanity check of the load).
edges

Unnamed: 0,id_0,id_1
0,76M2Ekj8bG8W7X2nbx2CpF,7sfl4Xt5KmfyDs2T3SVSMK
1,0hk4xVujcyOr6USD95wcWb,7Do8se3ZoaVqUt3woqqSrD
2,38jpuy3yt3QIxQ8Fn1HTeJ,4csQIMQm6vI2A2SCVDuM2z
3,6PvcxssrQ0QaJVaBWHD07l,6UCQYrcJ6wab6gnQ89OJFh
4,2R1QrQqWuw3IjoP5dXRFjt,4mk1ScvOUkuQzzCZpT6bc0
...,...,...
300381,0PK0Dx3s9et0Uf4XbdFpiW,6SmpksRq3jxeDZ7roKDb6F
300382,5hqRsNHDZH1jHzI9LgxFRZ,7KYseTkErcYMRvcIqacxUq
300383,0wjb0t7aYMuIwyuXnPOilh,2p1fiYHYiXz9qi0JJyxBzN
300384,3ciRvbBIVz9fBoPbtSYq4x,5einkgXXrjhfYCyac1FANB


In [49]:
# Rank artists by popularity (highest first) and discard the "chart_hits"
# column.  errors="ignore" keeps this cell re-runnable: on a second execution
# the column is already gone and a plain drop would raise KeyError.
nodes = nodes.sort_values(by="popularity", ascending=False).drop(
    columns="chart_hits", errors="ignore"
)

# "genres" is parsed with ast.literal_eval, i.e. it is expected to hold the
# string repr of a Python list.  Values that are no longer strings (already
# parsed by a previous run of this cell) are passed through unchanged instead
# of making literal_eval raise.
nodes['genres'] = nodes['genres'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# Keep only artists that carry at least one genre tag.
nodes = nodes[nodes['genres'].apply(lambda x: len(x) > 0)]

# Popularity scores of the remaining artists, in ranked order.
popularity = nodes["popularity"].to_list()

In [50]:
from collections import Counter

# (popularity score, number of artists with that score), highest score first.
c = sorted(Counter(popularity).items(), key=lambda pair: pair[0], reverse=True)

brk = 3000  # stop once at least this many artists have been accumulated
ct = 0      # running number of artists at or above the current score
e = 0       # resulting popularity cutoff; stays 0 if brk is never reached
for k, v in c:
    ct += v
    if ct < brk:
        continue
    # First score at which the cumulative count reaches brk.
    e = k
    print(k)
    break

62


In [51]:
# Keep artists whose popularity is at or above the cutoff e.
popular_enough = nodes['popularity'] >= e
nodes = nodes[popular_enough]

# Remove every artist whose name occurs more than once (keep=False flags all
# occurrences, not just the repeats), so names can serve as unique keys.
name_is_repeated = nodes['name'].duplicated(keep=False)
nodes = nodes[~name_is_repeated]
nodes

Unnamed: 0,spotify_id,name,followers,popularity,genres
11203,4q3ewBCX7sLwd24euuV69X,Bad Bunny,55669387.0,100,"[reggaeton, trap latino]"
18726,3TVXtAsR1Inumwj472S9r4,Drake,66852536.0,95,"[canadian hip hop, canadian pop, hip hop, rap,..."
4646,06HL4z0CvFAxyc27GXpf02,Taylor Swift,58554324.0,94,[pop]
13879,1Xyo4u8uXC1ZmMpatF05PJ,The Weeknd,49387909.0,93,"[canadian contemporary r&b, canadian pop, pop]"
19533,5K4W6rqBFWDnAN6FQUkS6x,Kanye West,18170307.0,91,"[chicago rap, rap]"
...,...,...,...,...,...
25121,7N0SBLJFpCyQSsv4MfRJ5d,Swarnalatha,772882.0,62,"[filmi, tamil pop]"
16640,6VxCmtR7S3yz4vnzsJqhSV,Sheppard,270154.0,62,"[australian indie, australian pop, folk-pop, n..."
7941,5xYkM2vMrE23taj6tl7qkm,Pk,654426.0,62,"[funk carioca, trap funk]"
131164,6DJEUXZm0e2rAohdoZ5Voo,Black Thought,190045.0,62,"[alternative hip hop, boom bap, east coast hip..."


In [52]:
# Per-artist genre lists, and the set of every distinct genre among them.
genres = nodes["genres"].to_list()
all_genres = set().union(*genres)

In [53]:
# Four fixed metadata columns followed by one indicator column per genre.
# all_genres is a set; for strings its iteration order can change from one
# interpreter run to the next (hash randomization), which would shuffle the
# feature columns between runs.  Sorting gives a reproducible layout.
columns = ["id", "name", "followers", "popularity"] + sorted(all_genres)

# Column name -> position of that column inside a row list.
genres_to_ind = {column: position for position, column in enumerate(columns)}

In [54]:
# One row per artist: id, name, followers, popularity, then 0/1 genre flags.
data = []

for _, artist in nodes.iterrows():
    # Start from an all-zero row and fill in the four metadata slots.
    r = [0] * len(columns)
    r[:4] = (
        artist["spotify_id"],
        artist["name"],
        artist["followers"],
        artist["popularity"],
    )
    # Switch on the indicator of every genre this artist is tagged with.
    for g in artist["genres"]:
        r[genres_to_ind[g]] = 1

    data.append(r)

df = pd.DataFrame(data=data, columns=columns)
# Keep only the columns whose column-wise sum differs from 0.
df = df.loc[:, df.sum() != 0]

### Feature matrix

In [55]:
# Lookup tables from artist id / artist name to the row position in df.
id_map = dict(zip(df["id"], df.index))
name_map = dict(zip(df["name"], df.index))

In [56]:
# Everything after the "id" and "name" columns is a feature
# (followers, popularity and the genre indicators).
mat = df.iloc[:, 2:].to_numpy()

# Fit the min-max scaler on the features, then apply it to the same matrix.
scaler = MinMaxScaler()
scaler.fit(mat)
X = scaler.transform(mat)

### adjacency

In [57]:
# Square 0/1 adjacency matrix with one row/column per row of X.
n = len(X)
A = np.zeros((n, n))

# Ids of the artists that survived the filtering above.
ids = set(id_map.keys())

# Walk the two id columns directly instead of DataFrame.iterrows(), which
# builds a full Series object for every one of the edge rows.
for u, v in zip(edges["id_0"], edges["id_1"]):
    # Ignore self-loops and edges that touch an artist we did not keep.
    if u in ids and v in ids and u != v:
        ui, vi = id_map[u], id_map[v]
        # Mark the edge in both directions so A stays symmetric.
        A[ui, vi] = 1
        A[vi, ui] = 1

### making graph

In [58]:
# Inverse of name_map: row position -> artist name.
ind_to_name = dict(zip(name_map.values(), name_map.keys()))

In [59]:
# Turn the adjacency matrix into a NetworkX graph.
# NOTE(review): with default arguments this is presumably an undirected
# nx.Graph whose nodes are the integer row indices of A — confirm against the
# NetworkX documentation.
G = nx.from_numpy_array(A)

### remove degree-zero nodes and every component except the largest

In [60]:
# Nodes to delete from the graph, each listed exactly once.
# Start with the isolated nodes (degree 0).
empty = [node for node in G.nodes() if G.degree[node] == 0]
already_listed = set(empty)

# Connected components, largest first; only the first one is kept.
ccs = sorted(nx.connected_components(G), key=len, reverse=True)

for cc in ccs[1:]:
    # Report the artists of every discarded component by name.
    pcc = [ind_to_name[a] for a in cc]
    # An isolated node is also a one-node component, so it is already in
    # `empty`; skip it here to avoid listing it twice.
    empty.extend(a for a in cc if a not in already_listed)
    already_listed.update(cc)
    print(pcc)

['Einár', '23', 'Ballinciaga', 'Veronica Maggio', 'Victor Leksell', 'ADAAM', 'Hov1']
['İrem Derici', 'Kurtuluş Kuş', 'Mustafa Ceceli', 'Burak Bulut']
['System Of A Down', 'Wu-Tang Clan']
['Em Beihold', 'Stephen Sanchez']
['Freddie Dredd', 'Lil Darkie']
['Pastel Ghost', 'Mr.Kitty']
['Megadeth', 'Lamb of God']
['Enya', 'Howard Shore']
['Grover Washington, Jr.', 'Bill Withers']
['iamjakehill', 'Josh A']
['Los Acosta', 'Grupo Bryndis']
['Brooke Ligertwood', 'Hillsong UNITED']
['Egzod', 'Neoni']
['Aimyon', 'RADWIMPS']
['11:11 Music Group', 'creamy']
['Fiersa Besari', 'Feby Putri']
['Kep1er', 'LOONA']
['HA SUNG WOON', 'Punch']
['Skusta Clee', 'Flow G']
['Alice Cooper', 'Theory of a Deadman']
['Harry Styles']
['Arctic Monkeys']
['Die drei ???']
['Adele']
['Red Hot Chili Peppers']
['Twenty One Pilots']
['Cigarettes After Sex']
['Nirvana']
['Zach Bryan']
['ABBA']
['AC/DC']
['Kate Bush']
['Mitski']
['Fleetwood Mac']
["Guns N' Roses"]
['Slipknot']
['Pink Floyd']
['Melanie Martinez']
['Mac DeMarco

In [61]:
# Delete every node collected in `empty` from the graph (modifies G in place).
G.remove_nodes_from(empty)

In [62]:
# Rebuild the adjacency matrix from the pruned graph; this overwrites the
# full-size A built earlier.
# NOTE(review): rows/columns presumably follow G's remaining node order —
# confirm this matches df once the same rows are dropped from it.
A = nx.to_numpy_array(G)

In [63]:
# This is the first write into components/.  np.save does not create missing
# directories, so make sure the folder exists before saving the pruned
# adjacency matrix.
os.makedirs("components", exist_ok=True)
np.save("components/A.npy", A)

In [64]:
# Write the pruned graph to disk in GraphML format.
nx.write_graphml(G, 'components/graph.graphml')

In [65]:
# Drop the rows of the removed graph nodes (labels in `empty`) and renumber
# the remaining rows from 0.
# NOTE: not re-runnable — after the renumbering the labels in `empty` refer to
# different rows, so executing this cell twice drops the wrong artists.
df = df.drop(index=empty).reset_index(drop=True)

In [66]:
# Persist the pruned node table (the row index is written as the first column).
df.to_csv('components/nodes.csv')

# Rebuild the lookups against the renumbered df, in both directions.
id_map = pd.Series(df.index, index=df["id"]).to_dict()
name_map = pd.Series(df.index, index=df["name"]).to_dict()
ind_to_name = {ind: name for name, ind in name_map.items()}
ind_to_id = {ind: artist_id for artist_id, ind in id_map.items()}

# Write each lookup to its own JSON file.  The integer keys of the ind_to_*
# maps are stored as strings, since JSON object keys are always strings.
for path, mapping in (
    ("components/id_to_ind.json", id_map),
    ("components/name_to_ind.json", name_map),
    ("components/ind_to_id.json", ind_to_id),
    ("components/ind_to_name.json", ind_to_name),
):
    with open(path, "w") as file:
        json.dump(mapping, file, indent=2)

# Re-derive the feature matrix from the pruned table, re-fit the scaler on it
# and save the scaled features.
mat = df.iloc[:, 2:].to_numpy()

scaler = MinMaxScaler()
scaler.fit(mat)
X = scaler.transform(mat)

np.save("components/X.npy", X)