# Package Imports

In [1]:
#from pymongo import MongoClient
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import networkx as nx
import random
import math
import pickle
import torch
import pandas as pd
import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.utils.convert import to_networkx, from_networkx
from torch_geometric.utils import to_undirected, is_undirected
import numpy as np
from networkx import to_dict_of_dicts
from torch_geometric.loader import NeighborLoader
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.__version__)
print(torch_geometric.__version__)

2.2.2+cu118
2.5.3


# Load Data

## Load User Track Graph

In [3]:
import pickle

# First, extract the contents of dataset.rar to a folder

# Then, load the data from the dataset.pickle file
with open('data/MRecury_data/dataset_typed.pickle', 'rb') as f:
    dataset = pickle.load(f)


# access the different parts of the dataset:
full_graph = dataset['full']
train_graph = dataset['train']
test_graph = dataset['test']
users_mapping = dataset['users']
#artist_tracks_mapping = dataset['artist-tracks']
# Load New Mapping: 
artist_tracks_mapping = pd.read_csv('data/new_artist_tracks_mapping_df.csv')


# Accessing nodes and edges of the graphs:
# For example, to access nodes and edges of the full graph:
full_nodes = full_graph.nodes()
full_edges = full_graph.edges()

train_nodes = train_graph.nodes()
train_edges = train_graph.edges()

test_nodes = test_graph.nodes()
test_edges = test_graph.edges()


In [4]:
def create_dataframe_from_graph(graph):
    # Initialize lists to store extracted information
    user_ids = []
    song_ids = []
    scrobbles = []
    positions = []
    dates = []

    # Iterate over users
    for user_id in tqdm(range(3307)):
        if user_id in graph:
            # Iterate over the songs
            for song_id, songs_info in graph[user_id].items():
                user_ids.append(user_id)
                song_ids.append(song_id)
                scrobbles.append(songs_info['scrobbles'])
                positions.append(songs_info['pos'])
                dates.append(songs_info['date'])
                
    # Create a DataFrame from the lists
    graph_df = pd.DataFrame({
        'User_ID': user_ids,
        'Song_ID': song_ids,
        'Scrobbles': scrobbles,
        'Position': positions,
        'Date': dates
    })
    
    return graph_df

In [5]:
full_graph_df = create_dataframe_from_graph(full_graph)
train_graph_df = create_dataframe_from_graph(train_graph)
test_graph_df = create_dataframe_from_graph(test_graph)

  0%|          | 0/3307 [00:00<?, ?it/s]

100%|██████████| 3307/3307 [00:01<00:00, 2073.27it/s]
100%|██████████| 3307/3307 [00:01<00:00, 2583.43it/s]
100%|██████████| 3307/3307 [00:00<00:00, 14536.85it/s]


## Load Social Graph

In [6]:
def load_social(file_users, file_edges, users_ids):
    df_users = pd.read_csv(file_users, sep='\t', names=['id', 'user'])
    df_edges = pd.read_csv(file_edges, sep=' ', names=['origin', 'destination'])
    old_new = {}
    for _, r in tqdm(df_users.iterrows(), total=len(df_users)):
        if r['user'] in users_ids:
            old_new[r['id']] = users_ids[r['user']]
    social_graph = nx.DiGraph()
    social_graph.add_nodes_from(old_new.values())
    for _, r in tqdm(df_edges.iterrows(), total=len(df_edges)):
        if r['origin'] in old_new and r['destination'] in old_new:
            social_graph.add_edge(old_new[r['origin']], old_new[r['destination']])
    return social_graph

social_graph = load_social('data/MRecury_data/lastfm.nodes', 'data/MRecury_data/lastfm.edges', users_mapping)

100%|██████████| 136420/136420 [00:01<00:00, 75447.52it/s]
100%|██████████| 1685524/1685524 [00:17<00:00, 98588.49it/s] 


# Transformation into PyG Graph

## Graph Data Inspection for Transformation

In [7]:
user_nodes = [node for node in full_nodes if full_graph.nodes[node]['type'] == 'user']

In [8]:
dataset = test_graph
print(f'Dataset: {dataset}:')
print('======================')

#print(f'Number of graphs: {len(full_graph)}') # seems like the number of graphs is wrong, this is identital with nodes
print(f'Number of nodes: {len(full_nodes)}')
print(f'Number of edges: {len(full_edges)}')
print(f'Number of User Nodes: {len(user_nodes)}') # i know this is the case from my inspection in Notebook 1. Also I could inspect the train_graph_df if needed.
print(f'Number of Tracks Nodes: {(len(full_nodes) - len(user_nodes))}')



#print(f'Number of features: {dataset.num_features}')
#print(f'Number of classes: {dataset.num_classes}')

Dataset: Graph with 159128 nodes and 453301 edges:
Number of nodes: 255321
Number of edges: 3018209
Number of User Nodes: 3307
Number of Tracks Nodes: 252014


## Train Graph Preprocessing (Skip on Rerun)
Cleaning the Train Graph of missing songs and resetting Indices of each Node for Pyg Graph Init

Can be Skipped on Re-Runs for the same dataset (eg. Train)

In [9]:
## Initialize Mapping

In [10]:
artist_tracks_mapping = pd.read_csv('data/new_artist_tracks_mapping_df.csv')
artist_tracks_mapping = artist_tracks_mapping.rename(columns = {'Song_Node_ID':'Song_ID'})
artist_tracks_mapping

Unnamed: 0,Artist,Song_Name,Song_ID
0,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,3307
1,Black Kids,Hit The Heartbrakes,3308
2,Black Kids,I've Underestimated My Charm (Again),3309
3,Black Kids,Partie Traumatic,3310
4,Black Kids,I'm Making Eyes at You,3311
...,...,...,...
252008,Jamie Lancaster,Boys Don't Cry,255203
252009,Sleeperstar,I Was Wrong,255208
252010,Anthony Naples,Mad Disrespect,255228
252011,Irene,Stardust,255253


In [11]:
# Merge the train_graph_df with artist_tracks_mapping on the "Song_ID" column
full_graph_df = pd.merge(full_graph_df, artist_tracks_mapping, on="Song_ID", how="left")

# Display the structure of the merged DataFrame
full_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
0,0,3307,370,inf,,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...
1,0,3308,357,inf,,Black Kids,Hit The Heartbrakes
2,0,3309,349,inf,,Black Kids,I've Underestimated My Charm (Again)
3,0,3310,347,inf,,Black Kids,Partie Traumatic
4,0,3311,346,inf,,Black Kids,I'm Making Eyes at You
...,...,...,...,...,...,...,...
3018204,3306,3815,1,inf,,Kings of Leon,Use Somebody
3018205,3306,14156,1,800.0,"Saturday 23 Jan 2010, 3:01pm",Kings of Leon,Wasted Time
3018206,3306,12191,1,inf,,Kiss,Love Gun
3018207,3306,173432,1,inf,,Kittie,Charlotte


In [12]:
## length of the Train Graph df before (and after merge). 14 is wrong, should be 13, since one song was double in the original graph and mapping. Hence we have to remove this song
len(full_graph_df["Song_ID"].unique())

252014

In [13]:
len(full_nodes) - len(user_nodes)

252014

In [14]:
## length of the Train Graph df before (and after merge). 14 is wrong, should be 13, since one song was double in the original graph and mapping. Hence we have to remove this song
len(full_graph_df["User_ID"].unique())

3307

In [15]:
len(user_nodes)

3307

### check for NaNs in Songname or ID
Our 1 song " 	Artist 	Song_Name 	Song_Node_ID
74634 	Banda UÓ 	Cavalo de Fogo 	ID: '68691" that was correctly excluded in the new mapping (and appeared twice with different IDs in the old mapping) is still being joined here because
it exists in the original Train Df (that we take as the merge input). Hence it will not get left joined with any New Mapping data, and we have to delete it.

In [16]:
full_graph_df["Song_ID"][full_graph_df["Song_ID"].isnull()]

Series([], Name: Song_ID, dtype: int64)

In [17]:
full_graph_df[full_graph_df["Song_Name"].isnull()]

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
2523296,2614,68691,22,inf,,,
2545671,2645,68691,158,1058.0,"Sunday 12 Sep 2021, 1:20pm",,


In [18]:
full_graph_df[full_graph_df["Song_Name"] == "Cavalo de Fogo"]

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
308474,170,194224,17,inf,,Banda UÓ,Cavalo de Fogo
478336,348,194224,51,inf,,Banda UÓ,Cavalo de Fogo
1840837,1694,194224,59,inf,,Banda UÓ,Cavalo de Fogo
1965452,1867,194224,27,inf,,Banda UÓ,Cavalo de Fogo
2021407,1939,194224,61,inf,,Banda UÓ,Cavalo de Fogo
2150856,2107,194224,52,inf,,Banda UÓ,Cavalo de Fogo
2204124,2179,194224,95,1307.0,"Wednesday 8 Sep 2021, 5:55pm",Banda UÓ,Cavalo de Fogo
2216821,2196,194224,18,inf,,Banda UÓ,Cavalo de Fogo
2255599,2249,194224,46,inf,,Banda UÓ,Cavalo de Fogo
2286079,2289,194224,10,inf,,Banda UÓ,Cavalo de Fogo


In [19]:
full_graph_df.loc[2545670]

User_ID           2645
Song_ID         129235
Scrobbles          158
Position           inf
Date              None
Artist          Anitta
Song_Name    Paradinha
Name: 2545670, dtype: object

In [20]:
## identify this song by ID:
full_graph_df[full_graph_df["Song_ID"] == 68691]

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
2523296,2614,68691,22,inf,,,
2545671,2645,68691,158,1058.0,"Sunday 12 Sep 2021, 1:20pm",,


###  remove empty songs / duplicate Song_ID

In [21]:
full_graph_df = full_graph_df.drop(2523296).reset_index()

In [22]:
full_graph_df[full_graph_df["Song_Name"].isnull()]

Unnamed: 0,index,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
2545670,2545671,2645,68691,158,1058.0,"Sunday 12 Sep 2021, 1:20pm",,


In [23]:
full_graph_df = full_graph_df.drop(2545670).reset_index() # - 1 because we remove the first one too

In [24]:
full_graph_df[full_graph_df["Song_ID"] == 68691]

Unnamed: 0,level_0,index,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name


In [25]:
## no more empty song names
full_graph_df["Song_Name"][full_graph_df["Song_Name"].isnull()]

Series([], Name: Song_Name, dtype: object)

### Reset Song Indices of Song IDS:



In [26]:
#Same function but wrote it myself
New_Song_ID_dict = {}
new_ID_Counter = 0
missing_id = 68691

for song_id in np.sort(full_graph_df['Song_ID'].unique()):
    #print(song_id)
    if song_id != missing_id:
        
        #new_ID = new_ID_Counter
        New_Song_ID_dict[song_id] = new_ID_Counter
        new_ID_Counter += 1
        #print(New_Song_ID)

#New_Song_ID_dict


In [27]:
# Apply the mapping to the dataset
full_graph_df['Song_ID'] = full_graph_df['Song_ID'].map(New_Song_ID_dict)

In [28]:
full_graph_df

Unnamed: 0,level_0,index,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
0,0,0,0,0,370,inf,,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...
1,1,1,0,1,357,inf,,Black Kids,Hit The Heartbrakes
2,2,2,0,2,349,inf,,Black Kids,I've Underestimated My Charm (Again)
3,3,3,0,3,347,inf,,Black Kids,Partie Traumatic
4,4,4,0,4,346,inf,,Black Kids,I'm Making Eyes at You
...,...,...,...,...,...,...,...,...,...
3018202,3018203,3018204,3306,508,1,inf,,Kings of Leon,Use Somebody
3018203,3018204,3018205,3306,10849,1,800.0,"Saturday 23 Jan 2010, 3:01pm",Kings of Leon,Wasted Time
3018204,3018205,3018206,3306,8884,1,inf,,Kiss,Love Gun
3018205,3018206,3018207,3306,170124,1,inf,,Kittie,Charlotte


In [29]:
np.sort(full_graph_df['Song_ID'].unique())

array([     0,      1,      2, ..., 252010, 252011, 252012], dtype=int64)

In [30]:
# Iterate through the New_Song_ID column and check if IDs run consecutively from 0
consecutive = True
prev_id = -1

for new_id in np.sort(full_graph_df['Song_ID'].unique()):
    if new_id != prev_id + 1:
        consecutive = False
        break
    prev_id = new_id

# Print the result
if consecutive:
    print("New Song IDs run consecutively from 0.")
else:
    print("New Song IDs do not run consecutively from 0.")


New Song IDs run consecutively from 0.


In [31]:
full_graph_df["Song_ID"].unique().max(), len(full_graph_df["Song_ID"].unique())

(252012, 252013)

### check number of songs

In [32]:
unique_song_ids= full_graph_df["Song_ID"].unique()
print("Length:", len(unique_song_ids))
print("Max Value:", full_graph_df["Song_ID"].unique().max())
#sorted(unique_song_ids) 

Length: 252013
Max Value: 252012


In [35]:
full_graph_df = full_graph_df.drop(columns=['level_0', 'index'])

In [36]:
full_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
0,0,0,370,inf,,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...
1,0,1,357,inf,,Black Kids,Hit The Heartbrakes
2,0,2,349,inf,,Black Kids,I've Underestimated My Charm (Again)
3,0,3,347,inf,,Black Kids,Partie Traumatic
4,0,4,346,inf,,Black Kids,I'm Making Eyes at You
...,...,...,...,...,...,...,...
3018202,3306,508,1,inf,,Kings of Leon,Use Somebody
3018203,3306,10849,1,800.0,"Saturday 23 Jan 2010, 3:01pm",Kings of Leon,Wasted Time
3018204,3306,8884,1,inf,,Kiss,Love Gun
3018205,3306,170124,1,inf,,Kittie,Charlotte


### Save Clean TrainGraph DF

In [38]:
import os

# Check if the file already exists
if not os.path.exists('data/full_graph_df_clean.csv'):
    # Save the merged DataFrame to a CSV file
    full_graph_df.to_csv('data/full_graph_df_clean.csv', index=False)
else:
    # Load the existing CSV file
    full_graph_df = pd.read_csv('data/full_graph_df_clean.csv')

## Import Clean TrainGraph DF (Dont Skip on Re-Run)

In [39]:
full_graph_df[full_graph_df["Song_Name"] == "Oblique City"]

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
6102,0,185405,8,130.0,"Thursday 23 Sep 2021, 9:31am",Phoenix,Oblique City
41312,5,185405,2,inf,,Phoenix,Oblique City
69988,8,185405,2,inf,,Phoenix,Oblique City
307123,169,185405,10,inf,,Phoenix,Oblique City
401257,267,185405,69,inf,,Phoenix,Oblique City
1106489,1038,185405,30,inf,,Phoenix,Oblique City
1272505,1205,185405,25,inf,,Phoenix,Oblique City
1596257,1594,185405,8,inf,,Phoenix,Oblique City
1768939,1672,185405,1,inf,,Phoenix,Oblique City
1829550,1678,185405,2,inf,,Phoenix,Oblique City


## Node Data Setup

### Select All Unique Users

In [40]:
user_ids = len(full_graph_df["User_ID"].unique())
user_ids

3307

### Select All Unique Song Nodes

In [41]:
song_ids = len(full_graph_df["Song_ID"].unique())
song_ids

252013

### Select All Unique Artists

In [42]:
artist_ids = len(full_graph_df["Artist"].unique())
artist_ids

28120

In [43]:
unique_artists = full_graph_df["Artist"].unique()

In [44]:
# Initialize the starting node ID for artists
#reset this to 0! Important because PyG resets indices for each node type on initialization, and then my edge mapping will be wrong if it doesnt start from 0
artist_node_id = 0

# Dictionary to store node IDs for artists
artist_nodes_dic = {}
artist_nodes = []

# Iterate over each artist and assign node IDs
for artist in unique_artists:

    artist_nodes_dic[artist] = artist_node_id
    
    artist_nodes.append(artist_node_id)
    artist_node_id += 1



In [45]:
artist_nodes_dic

{'Black Kids': 0,
 'Foster the People': 1,
 'The Vaccines': 2,
 'Of Monsters and Men': 3,
 'Panic! at the Disco': 4,
 'The xx': 5,
 'Architecture in Helsinki': 6,
 'Grouplove': 7,
 'Two Door Cinema Club': 8,
 'Slow Club': 9,
 'White Lies': 10,
 'We Have Band': 11,
 'Empire of the Sun': 12,
 'The Ting Tings': 13,
 'Electric Guest': 14,
 'Arcade Fire': 15,
 'The Maccabees': 16,
 'Alabama Shakes': 17,
 'San Cisco': 18,
 'CHVRCHES': 19,
 'Diamond Rings': 20,
 'Cold Specks': 21,
 "Allo Darlin'": 22,
 'Lorde': 23,
 'The 1975': 24,
 'The Wombats': 25,
 'Marina & the Diamonds': 26,
 'Bloc Party': 27,
 'Kate Nash': 28,
 'La Roux': 29,
 'Wolf Gang': 30,
 'Adele': 31,
 'Kaiser Chiefs': 32,
 'Clarice Falcão': 33,
 'Mystery Jets': 34,
 'She Wants Revenge': 35,
 'Penguin Prison': 36,
 'The Killers': 37,
 'The Temper Trap': 38,
 'Arctic Monkeys': 39,
 'Metronomy': 40,
 'of Montreal': 41,
 'The Naked and Famous': 42,
 'Hospitality': 43,
 'Los Campesinos!': 44,
 'Phoenix': 45,
 'Santigold': 46,
 'MGMT'

In [46]:
len(artist_nodes)

28120

## Edge Indices Setup

### User-Tracks Edges
Collect Edge Information from User-Tracks

#### Reset Song_ID Index

##### With edge attributes as List

In [47]:
# 1. Find unique User_IDs
unique_user_ids = len(full_graph_df['User_ID'].unique())
unique_user_ids

3307

#### Create User-Track Edge Indices & Attributes

In [48]:
def create_user_track_edge_index_and_attributes(graph_df):
    
    """
    Create edge index and attributes from a graph.

    Args:
    - graph as NetworkX Graph Object

    Returns:
    - user_song_edge_index (list): List of edges represented by node indices.
    - user_song_edge_scrobbel_attributes (list): List of scrobble attributes for each edge.
    """

    # Initialize index and dictionary
    user_song_edge_index = []
    user_song_edge_scrobbel_attributes = []
    
    for user_id, song_info in graph_df[["User_ID", "Song_ID", "Scrobbles"]].groupby("User_ID"):
        
        scrobbles_per_user_node = []
        
        song_ids = song_info['Song_ID']#.tolist()
        scrobbles = song_info['Scrobbles']
        #song_infos = song_info[['Song_ID', 'Scrobbles']]#
        #print(song_infos)
        #iterate over each song node and add its ID
        for song_id in song_ids:
            # get user and id information
            #print(song_id)
            user_id = user_id
            song_id = song_id

            ## for debudding wrong nodeID error which caused the problem in the LNH Sampler. 13 instead of 12
            if song_id == 252013:
                print(song_id, user_id)
            
            #store as edge tuple
            current_edge_directed = [user_id, song_id]
    
            #append to edge list
            user_song_edge_index.append(current_edge_directed)
            
        for scrobble in scrobbles:
            # Extract edge attributes and append to list
            scrobbles_per_song = scrobble
            scrobbles_per_user_node.append(scrobbles_per_song)
            
        user_song_edge_scrobbel_attributes.extend(scrobbles_per_user_node)
    return user_song_edge_index, user_song_edge_scrobbel_attributes
        
    


In [49]:
user_song_edge_index, user_song_edge_scrobbel_attributes = create_user_track_edge_index_and_attributes(full_graph_df)

In [50]:
print(len(user_song_edge_index), len(user_song_edge_scrobbel_attributes))

3018207 3018207


In [51]:
# Convert the list of edge attributes to a tensor
user_song_edge_attr_tensor = torch.tensor(user_song_edge_scrobbel_attributes, dtype=torch.long)
user_song_edge_attr_tensor.t().size()

torch.Size([3018207])

In [52]:
# Convert the list of edge index to a tensor
user_song_edge_index = torch.tensor(user_song_edge_index , dtype=torch.long)
user_song_edge_index.t().size()

torch.Size([2, 3018207])

### User User Edges
Collect User User Edge Information

#### User User Edge Index

In [53]:
def create_user_user_edge_index(social_graph):
    """
    Create edge index for user-user relationships in a social graph.

    Args:
    - social_graph: NetworkX Graph Object

    Returns:
    - user_user_edge_index (list): List of edges represented by node indices.
     One way - Directed Only. (Will be made undirected withing PyG
    """
    #initialize index
    user_user_edge_index = []
    
    # Iterate over all user nodes
    for user_node in full_graph_df['User_ID'].unique():
        #print(type(user_node))
        #print(user_node)
           
        # iterate over all edges of each user node in the social graph
        for key, value in social_graph[user_node].items():
            ## add edges twice for undirection - Not necessary since this will be done in PyG
            current_edge_directed = [user_node, key]
            current_edge_undirected = [key, user_node]
            user_user_edge_index.append(current_edge_directed)
            #user_user_edge_index.append(current_edge_undirected)
    return user_user_edge_index
    
user_user_edge_index = create_user_user_edge_index(social_graph)

In [54]:
user_user_edge_index = torch.tensor(user_user_edge_index , dtype=torch.long)
user_user_edge_index.t().size()

torch.Size([2, 142919])

###  Artist-Track Edges

#### Creating a Artist-Track Dictionary to feed into the edge Data

In [55]:
full_graph_df[["Artist","Song_ID","Song_Name"]]

Unnamed: 0,Artist,Song_ID,Song_Name
0,Black Kids,0,I'm Not Gonna Teach Your Boyfriend How to Danc...
1,Black Kids,1,Hit The Heartbrakes
2,Black Kids,2,I've Underestimated My Charm (Again)
3,Black Kids,3,Partie Traumatic
4,Black Kids,4,I'm Making Eyes at You
...,...,...,...
3018202,Kings of Leon,508,Use Somebody
3018203,Kings of Leon,10849,Wasted Time
3018204,Kiss,8884,Love Gun
3018205,Kittie,170124,Charlotte


In [56]:
def artist_to_song(df):
    # Initialize an empty dictionary to store the mapping of artists to songs
    artist_to_songs = {}
    
    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        artist = row['Artist']
        song_name = row['Song_Name']
        song_id = row['Song_ID']
        
        # Check if the artist is already in the dictionary
        if artist in artist_to_songs:
            # Add the song to the nested dictionary
            artist_to_songs[artist][song_id] = song_name
        else:
            # Create a new nested dictionary with the song and add it to the dictionary
            artist_to_songs[artist] = {song_id: song_name}


    return artist_to_songs

In [57]:
artist_to_songs_dic = artist_to_song(full_graph_df)


In [58]:
artist_to_songs_dic

{'Black Kids': {0: "I'm Not Gonna Teach Your Boyfriend How to Dance With You",
  1: 'Hit The Heartbrakes',
  2: "I've Underestimated My Charm (Again)",
  3: 'Partie Traumatic',
  4: "I'm Making Eyes at You",
  5: 'Listen To Your Body Tonight',
  6: 'Hurricane Jane',
  7: 'Look at Me (When I Rock Wichoo)',
  8: 'Love Me Already',
  9: 'I Wanna Be Your Limousine',
  54: 'My Christian Name',
  800: 'Obligatory Drugs',
  1191: 'V-Card (Not Nuthin’)',
  1231: 'Natural Born Kissers',
  1466: 'In A Song',
  1522: 'If My Heart Is Broken',
  1652: 'All The Emotions',
  1653: 'Way Into Leather',
  84466: "I'm Making Eyes At You (Joy Electric Remix)",
  51448: 'Hurricane Jane (The Cansecos Remix)',
  84467: "I'm Not Gonna Teach Your Boyfriend How to Dance with You - The Twelves Remix"},
 'Foster the People': {10: 'Helena Beat',
  11: 'Waste',
  12: 'Pumped Up Kicks',
  13: 'Call It What You Want',
  14: 'Houdini',
  15: 'I Would Do Anything for You',
  16: 'Miss You',
  17: 'Life on the Nickel',


In [59]:
def create_song_artist_edge_index(artist_tracks_mapping_dic):
    # init artist and songs dict from mapping df
    #artist_tracks_mapping_dic = dict(zip(artist_tracks_mapping['Artist'], artist_tracks_mapping['Song_ID']))
    
    # Initialize variables to store edge index and attributes
    artist_song_edge_index = []

     
    # Iterate over each artist and their songs
    for artist, songs in artist_tracks_mapping_dic.items():
        # Get the node ID of the current artist
        artist_node_id = artist_nodes_dic[artist]
        
        #print(artist, songs)
        #print(artist)
        #print(songs)
        
        # Iterate over each song and its node ID
        for song_node_id, song_name  in songs.items():
            
            # Create directed edges from artist to song
            #print(artist, song_node_id)
            #print(artist, song_node_id, song_name)
            artist_song_edge_index.append([artist_node_id, song_node_id])#, song_name, artist])
            
            # Optional: Create directed edges from song to artist
            #artist_song_edge_index.append([song_node_id, artist_node_id])
    return artist_song_edge_index
            
artist_song_edge_index = create_song_artist_edge_index(artist_to_songs_dic)
#artist_song_edge_index

In [60]:
artist_song_edge_index = torch.tensor(artist_song_edge_index , dtype=torch.long)
artist_song_edge_index.t().size()

torch.Size([2, 252013])

In [61]:
artist_song_edge_index

tensor([[     0,      0],
        [     0,      1],
        [     0,      2],
        ...,
        [ 28117, 112259],
        [ 28118,  14295],
        [ 28119, 115034]])

## Check All Edges
Edges are undirected:

In [62]:
print(len(user_song_edge_index))
print(len(user_user_edge_index))
print(len(artist_song_edge_index))

3018207
142919
252013


In [63]:
print(f"The Edge {user_song_edge_index} is undirected: {is_undirected(user_song_edge_index)}.")
print(f"The Edge {user_user_edge_index} is undirected: {is_undirected(user_user_edge_index)}.")
print(f"The Edge {artist_song_edge_index} is undirected: {is_undirected(artist_song_edge_index)}.")

The Edge tensor([[     0,      0],
        [     0,      1],
        [     0,      2],
        ...,
        [  3306,   8884],
        [  3306, 170124],
        [  3306, 230175]]) is undirected: False.
The Edge tensor([[   0,  763],
        [   0, 1435],
        [   0,  122],
        ...,
        [3306,  326],
        [3306,  926],
        [3306,  700]]) is undirected: False.
The Edge tensor([[     0,      0],
        [     0,      1],
        [     0,      2],
        ...,
        [ 28117, 112259],
        [ 28118,  14295],
        [ 28119, 115034]]) is undirected: False.


# Full Graph
## Prepare Data 
For Graph Creation. Full Graph needs information like Song Tags and Audio features, which are not present in base graph
Could also be called "Rhich" Graph

## load tags and audio features

## Clean Audio df from missing ID

In [8]:
# Load Track Tag
audio_df = pd.read_csv('data/final_audio_df.csv')
train_graph_df = pd.read_csv('data/full_graph_df_clean.csv') # LOADING TEST GRAPH; just not renaming all variables... lazy


In [9]:
audio_df

Unnamed: 0,Artist,Song_Name,Song_Tags,Song_ID,Spotify_ID,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,key,First_Genre
0,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,"indie,indie pop,indie rock",3307,,0.570667,0.829722,-5.416167,0.333333,0.048622,0.025465,0.097041,0.219406,0.743167,135.018667,4.833333,indie
1,Black Kids,Hit The Heartbrakes,"indie,indie rock,indie pop",3308,6XetDleVJnCxOZK2ILx4b5,0.523000,0.920000,-3.381000,0.000000,0.052500,0.000839,0.000000,0.300000,0.568000,122.980000,1.000000,indie
2,Black Kids,I've Underestimated My Charm (Again),"indie,black kids,indie pop",3309,,0.570667,0.829722,-5.416167,0.333333,0.048622,0.025465,0.097041,0.219406,0.743167,135.018667,4.833333,indie
3,Black Kids,Partie Traumatic,"indie,indie pop,pop",3310,2gPsk7vlntbDBsMOMbRJsI,0.550000,0.943000,-3.518000,0.000000,0.046100,0.022800,0.000004,0.573000,0.693000,121.987000,6.000000,indie
4,Black Kids,I'm Making Eyes at You,"indie,indie pop,black kids",3311,0yA7xMdPNxUZRhgLCWFaNQ,0.510000,0.763000,-7.716000,1.000000,0.045900,0.013100,0.000010,0.125000,0.654000,179.894000,4.000000,indie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252008,Juveniles,Ambitions,"french,electro,10s",19818,3F85tg9I8OsoB0Ed4LjTTp,0.570000,0.895000,-5.399000,0.000000,0.040100,0.002200,0.000195,0.185000,0.795000,161.005000,11.000000,french
252009,Daedelus,Get Off Your HiHats,"american i like,electronic,funky",204002,1XkhxJCr2V41yk9i5Gp0NI,0.620000,0.863000,-9.318000,1.000000,0.069000,0.001440,0.879000,0.083700,0.922000,139.985000,5.000000,american i like
252010,Floetry,Let Me In,"Neo-Soul,soul,floetry",222619,10bdAt1jHulsSx7FmAQFSw,0.726000,0.497000,-7.249000,1.000000,0.195000,0.282000,0.000000,0.198000,0.762000,73.024000,1.000000,Neo-Soul
252011,Katatonia,Sleeper,"doom metal,Progressive metal,metal",145949,7HTH7REu9c2MmrbY94fDko,0.451000,0.660000,-6.875000,1.000000,0.027500,0.011700,0.002310,0.249000,0.176000,95.050000,5.000000,doom metal


In [10]:
train_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name
0,0,0,370,inf,,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...
1,0,1,357,inf,,Black Kids,Hit The Heartbrakes
2,0,2,349,inf,,Black Kids,I've Underestimated My Charm (Again)
3,0,3,347,inf,,Black Kids,Partie Traumatic
4,0,4,346,inf,,Black Kids,I'm Making Eyes at You
...,...,...,...,...,...,...,...
3018202,3306,508,1,inf,,Kings of Leon,Use Somebody
3018203,3306,10849,1,800.0,"Saturday 23 Jan 2010, 3:01pm",Kings of Leon,Wasted Time
3018204,3306,8884,1,inf,,Kiss,Love Gun
3018205,3306,170124,1,inf,,Kittie,Charlotte


reset audio df Song Ids to 0


In [3]:
# Remove the missing Song_IDs from audio_df and reset its other IDS starting from 0, so they are consecutive
print("Removing missing Song_IDs from audio_df and resetting its other IDS starting from 0, so they are consecutive...")
audio_df = audio_df.sort_values('Song_ID').reset_index(drop=True)
audio_df['Song_ID'] = range(len(audio_df))

Removing missing Song_IDs from audio_df and resetting its other IDS starting from 0, so they are consecutive...


In [4]:
# filter the audio_df in such a way that only songs remain that are present in the train_graph_df
audio_df = audio_df[audio_df['Song_ID'].isin(train_graph_df['Song_ID'].unique())]
len(audio_df), len(train_graph_df['Song_ID'].unique())


(252013, 252013)

In [5]:
# Check if any Song_ID is missing in audio_df
audio_missing_ids = set(range(audio_df['Song_ID'].min(), audio_df['Song_ID'].max() + 1)) - set(audio_df['Song_ID'])
print(f"Missing Song_IDs in audio_df: {audio_missing_ids}")

# Check if any Song_ID is missing in train_graph_df
train_graph_missing_ids = set(range(train_graph_df['Song_ID'].min(), train_graph_df['Song_ID'].max() + 1)) - set(train_graph_df['Song_ID'])
print(f"Missing Song_IDs in train_graph_df: {train_graph_missing_ids}")



# Check if any Song_ID is missing in audio_df after resetting
audio_missing_ids = set(range(audio_df['Song_ID'].min(), audio_df['Song_ID'].max() + 1)) - set(audio_df['Song_ID'])
print(f"Missing Song_IDs in audio_df: {audio_missing_ids}")

# Check if any Song_ID is missing in train_graph_df after resetting
train_graph_missing_ids = set(range(train_graph_df['Song_ID'].min(), train_graph_df['Song_ID'].max() + 1)) - set(train_graph_df['Song_ID'])
print(f"Missing Song_IDs in train_graph_df: {train_graph_missing_ids}")

# Print the number of unique Song_IDs in both dfs
print(f"Number of unique Song_IDs in audio_df: {audio_df['Song_ID'].nunique()}")
print(f"Number of unique Song_IDs in train_graph_df: {train_graph_df['Song_ID'].nunique()}")


Missing Song_IDs in audio_df: set()
Missing Song_IDs in train_graph_df: set()
Missing Song_IDs in audio_df: set()
Missing Song_IDs in train_graph_df: set()
Number of unique Song_IDs in audio_df: 252013
Number of unique Song_IDs in train_graph_df: 252013


## Process Tags
Remove NaN tags, and make a dict that assigns the tags correctly to their according SongIDs Songs


In [69]:
# show all values were tags are nan in "Song Tags"
audio_df[audio_df['Song_Tags'].isnull()]

# convert these values to a string with "Unknown"
audio_df.loc[audio_df['Song_Tags'].isnull(), 'Song_Tags'] = 'Unknown'

In [70]:
# Sort the DataFrame by Song_ID in ascending order
audio_df = audio_df.sort_values(by='Song_ID', ascending=True)

# Replace "None" with a special token
audio_df['Song_Tags'] = audio_df['Song_Tags'].str.replace('None', 'unknown')

# Ensure all entries in 'Song_Tags' are strings
audio_df['Song_Tags'] = audio_df['Song_Tags'].astype(str)

# Extract and process tags
tags = audio_df['Song_Tags'].str.split(',')

# Flatten the list of tags and count unique tags
all_tags = [tag for sublist in tags for tag in sublist]
unique_tags = set(all_tags)
print(f"Number of unique tags: {len(unique_tags)}")

Number of unique tags: 29103


In [71]:
from sklearn.preprocessing import LabelEncoder

audio_df = audio_df.sort_values(by='Song_ID', ascending=True)

# Extract and process tags
tags = audio_df['Song_Tags'].str.split(',')

# Flatten the list of tags and create a mapping from tags to indices
all_tags = [tag for sublist in tags for tag in sublist]
label_encoder = LabelEncoder()
label_encoder.fit(all_tags)
tag_to_index = {tag: idx for idx, tag in enumerate(label_encoder.classes_)}

In [72]:
# Convert tags to indices
tags_indices = tags.apply(lambda x: [tag_to_index[tag] for tag in x])

# Pad the sequences to ensure they have the same length
from torch.nn.utils.rnn import pad_sequence

tags_indices_padded = pad_sequence([torch.tensor(t) for t in tags_indices], batch_first=True, padding_value=-1)

In [73]:
tags_indices_tensor = tags_indices_padded

In [74]:
tags_indices_tensor.shape

torch.Size([252013, 3])

create artist node tags. Top 3 most common tags for each artist


In [75]:
from collections import Counter

# Create a dictionary to store tags for each artist
artist_tags = {}

# Iterate over each song and its tags, ensuring Song_ID ascending order is kept
for idx, row in audio_df.sort_values(by='Song_ID', ascending=True).iterrows():
    artist = row['Artist']
    song_tags = row['Song_Tags'].split(',')
    
    if artist not in artist_tags:
        artist_tags[artist] = []
    
    artist_tags[artist].extend(song_tags)

# For each artist, find the top 3 most common tags
artist_top_tags = {}
for artist, tags in artist_tags.items():
    most_common_tags = [tag for tag, count in Counter(tags).most_common(3)]
    artist_top_tags[artist] = most_common_tags

# Convert artist tags to indices
artist_tags_indices = {artist: [tag_to_index[tag] for tag in tags] for artist, tags in artist_top_tags.items()}

# Create a tensor for artist tags, padding sequences to ensure they have the same length
artist_tags_indices_padded = pad_sequence([torch.tensor(tags) for tags in artist_tags_indices.values()], batch_first=True, padding_value=-1)

# Convert to tensor
artist_tags_tensor = artist_tags_indices_padded


In [76]:
artist_tags_tensor.shape

torch.Size([28120, 3])

In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, to_hetero
from torch_geometric.data import HeteroData

# Convert to torch tensors
audio_features_tensor = torch.tensor(audio_df.iloc[:, 5:16].values, dtype=torch.float)
tags_indices_tensor = tags_indices_padded

final tags and audio features embeddings to use as node features in graph creation

In [78]:
audio_features_tensor.shape, tags_indices_tensor.shape

(torch.Size([252013, 11]), torch.Size([252013, 3]))

# Create Full Graph 3 Nodes - Rich
Including Audio features and  Track Tags Data

In [79]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [80]:
# Create HeteroData object
data = HeteroData()


################### NODES ###################

# Save node indices:
data["users"].node_id = torch.arange(user_ids)
data["songs"].node_id = torch.arange(song_ids)
data["artists"].node_id = torch.arange(artist_ids)

# Add song nodes with audio and tag features
data['songs'].x_audio = audio_features_tensor
data['songs'].x_tag = tags_indices_tensor

# Add artist nodes with tag features
data['artists'].x_tag = artist_tags_tensor


################### EDGES ###################

# Add Edge Indices and Edge Attributes
data['users', 'listens_to', 'songs'].edge_index = user_song_edge_index.t().contiguous()
data['users', 'is_friends_with', 'users'].edge_index = user_user_edge_index.t().contiguous()
data['artists', 'makes', 'songs'].edge_index = artist_song_edge_index.t().contiguous()

# Add Edge Attributes
data['users', 'listens_to', 'songs'].edge_attr = user_song_edge_attr_tensor

# enable undirected edges for message passing
data = T.ToUndirected(merge = False)(data)


In [81]:
print('========================Nodes==============================')

# Gather some statistics about the graph.
print(f'Number of total nodes: {data.num_nodes}')
print(f'Number of user nodes: {data['users'].num_nodes}')
print(f'Number of song nodes: {data['songs'].num_nodes}')
print(f'Number of artist nodes: {data['artists'].num_nodes}')

print('========================Edges==============================')


print(f'Number of total edges: {data.num_edges}')
print(f'Number of Listening edges: {data['rev_listens_to'].num_edges}')
print(f'Number of Friends edges: {data['rev_is_friends_with'].num_edges}')
print(f'Number of Artist Makes Songs edges: {data['rev_makes'].num_edges}')

print('========================Reverse_Edges==============================')


print(f'Number of total edges: {data.num_edges}')
print(f'Number of Listening edges: {data['listens_to'].num_edges}')
print(f'Number of Friends edges: {data['is_friends_with'].num_edges}')
print(f'Number of Artist Makes Songs edges: {data['makes'].num_edges}')

print('========================Degree==============================')


print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')

print('========================Directed==============================')


print(f'Is undirected: {data.is_undirected()}')

Number of total nodes: 283440
Number of user nodes: 3307
Number of song nodes: 252013
Number of artist nodes: 28120
Number of total edges: 6826278
Number of Listening edges: 3018207
Number of Friends edges: 142919
Number of Artist Makes Songs edges: 252013
Number of total edges: 6826278
Number of Listening edges: 3018207
Number of Friends edges: 142919
Number of Artist Makes Songs edges: 252013
Average node degree: 24.08
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [85]:
import os

model_path = 'data/pyg_data/full_graph_hetero_data_3_nodes_rich.pt'

if os.path.exists(model_path):
    data = torch.load(model_path)
else:
    torch.save(data, model_path)
