# Package Imports

In [1]:
#from pymongo import MongoClient
from collections import defaultdict, Counter
from tqdm.auto import tqdm
import networkx as nx
import random
import math
import pickle
import torch
import pandas as pd
import torch_geometric
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.utils.convert import to_networkx, from_networkx
from torch_geometric.utils import to_undirected, is_undirected
import numpy as np
from networkx import to_dict_of_dicts
from torch_geometric.loader import NeighborLoader
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.__version__)

2.2.2+cu118


In [3]:
print(torch_geometric.__version__)

2.5.3


# Load Data

## Load User Track Graph

In [4]:
import pickle

# First, extract the contents of dataset.rar to a folder

# Then, load the data from the dataset.pickle file
with open('data/MRecury_data/dataset_typed.pickle', 'rb') as f:
    dataset = pickle.load(f)

# access the different parts of the dataset:
full_graph = dataset['full']
train_graph = dataset['train']
test_graph = dataset['test']
users_mapping = dataset['users']
#artist_tracks_mapping = dataset['artist-tracks']
# Load New Mapping: 
artist_tracks_mapping = pd.read_csv('data/new_artist_tracks_mapping_df.csv')


# Accessing nodes and edges of the graphs:
full_nodes = full_graph.nodes()
full_edges = full_graph.edges()

train_nodes = train_graph.nodes()
train_edges = train_graph.edges()

test_nodes = test_graph.nodes()
test_edges = test_graph.edges()



In [5]:
def create_dataframe_from_graph(graph):
    # Initialize lists to store extracted information
    user_ids = []
    song_ids = []
    scrobbles = []
    positions = []
    dates = []

    # Iterate over users
    for user_id in tqdm(range(3307)):
        if user_id in graph:
            # Iterate over the songs
            for song_id, songs_info in graph[user_id].items():
                user_ids.append(user_id)
                song_ids.append(song_id)
                scrobbles.append(songs_info['scrobbles'])
                positions.append(songs_info['pos'])
                dates.append(songs_info['date'])
                
    # Create a DataFrame from the lists
    graph_df = pd.DataFrame({
        'User_ID': user_ids,
        'Song_ID': song_ids,
        'Scrobbles': scrobbles,
        'Position': positions,
        'Date': dates
    })
    
    return graph_df

full_graph_df = create_dataframe_from_graph(full_graph)
train_graph_df = create_dataframe_from_graph(train_graph)
test_graph_df = create_dataframe_from_graph(test_graph)

100%|██████████| 3307/3307 [00:01<00:00, 2158.72it/s]
100%|██████████| 3307/3307 [00:01<00:00, 2690.16it/s]
100%|██████████| 3307/3307 [00:00<00:00, 16491.74it/s]


## Load Social Graph

In [6]:
def load_social(file_users, file_edges, users_ids):
    df_users = pd.read_csv(file_users, sep='\t', names=['id', 'user'])
    df_edges = pd.read_csv(file_edges, sep=' ', names=['origin', 'destination'])
    old_new = {}
    for _, r in tqdm(df_users.iterrows(), total=len(df_users)):
        if r['user'] in users_ids:
            old_new[r['id']] = users_ids[r['user']]
    social_graph = nx.DiGraph()
    social_graph.add_nodes_from(old_new.values())
    for _, r in tqdm(df_edges.iterrows(), total=len(df_edges)):
        if r['origin'] in old_new and r['destination'] in old_new:
            social_graph.add_edge(old_new[r['origin']], old_new[r['destination']])
    return social_graph

social_graph = load_social('data/MRecury_data/lastfm.nodes', 'data/MRecury_data/lastfm.edges', users_mapping)

100%|██████████| 136420/136420 [00:01<00:00, 75937.23it/s]
100%|██████████| 1685524/1685524 [00:17<00:00, 98314.75it/s] 


# Transformation into PyG Graph

## Graph Data Inspection for Transformation

In [7]:
user_nodes = [node for node in test_nodes if test_graph.nodes[node]['type'] == 'user']

In [8]:
dataset = test_graph
print(f'Dataset: {dataset}:')
print('======================')

#print(f'Number of graphs: {len(full_graph)}') # seems like the number of graphs is wrong, this is identital with nodes
print(f'Number of nodes: {len(test_nodes)}')
print(f'Number of edges: {len(test_edges)}')
print(f'Number of User Nodes: {len(user_nodes)}') # i know this is the case from my inspection in Notebook 1. Also I could inspect the train_graph_df if needed.
print(f'Number of Tracks Nodes: {(len(test_nodes) - len(user_nodes))}')



#print(f'Number of features: {dataset.num_features}')
#print(f'Number of classes: {dataset.num_classes}')

Dataset: Graph with 159128 nodes and 453301 edges:
Number of nodes: 159128
Number of edges: 453301
Number of User Nodes: 3279
Number of Tracks Nodes: 155849


## Train Graph Preprocessing (Skip on Rerun)
Cleaning the Train Graph of missing songs and resetting Indices of each Node for Pyg Graph Init

Can be Skipped on Re-Runs for the same dataset (eg. Train)

In [9]:
## Initialize Mapping

In [10]:
artist_tracks_mapping = pd.read_csv('data/new_artist_tracks_mapping_df.csv')
artist_tracks_mapping = artist_tracks_mapping.rename(columns = {'Song_Node_ID':'Song_ID'})
artist_tracks_mapping

Unnamed: 0,Artist,Song_Name,Song_ID
0,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,3307
1,Black Kids,Hit The Heartbrakes,3308
2,Black Kids,I've Underestimated My Charm (Again),3309
3,Black Kids,Partie Traumatic,3310
4,Black Kids,I'm Making Eyes at You,3311
...,...,...,...
252008,Jamie Lancaster,Boys Don't Cry,255203
252009,Sleeperstar,I Was Wrong,255208
252010,Anthony Naples,Mad Disrespect,255228
252011,Irene,Stardust,255253


In [11]:
unique_artists = sorted(artist_tracks_mapping['Artist'].unique())
artist_id_mapping = {artist: i for i, artist in enumerate(unique_artists)}
artist_tracks_mapping['Artist_ID'] = artist_tracks_mapping['Artist'].apply(lambda x: artist_id_mapping[x])
artist_tracks_mapping

Unnamed: 0,Artist,Song_Name,Song_ID,Artist_ID
0,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,3307,3322
1,Black Kids,Hit The Heartbrakes,3308,3322
2,Black Kids,I've Underestimated My Charm (Again),3309,3322
3,Black Kids,Partie Traumatic,3310,3322
4,Black Kids,I'm Making Eyes at You,3311,3322
...,...,...,...,...
252008,Jamie Lancaster,Boys Don't Cry,255203,11994
252009,Sleeperstar,I Was Wrong,255208,22326
252010,Anthony Naples,Mad Disrespect,255228,1696
252011,Irene,Stardust,255253,11620


In [12]:
# Merge the train_graph_df with artist_tracks_mapping on the "Song_ID" column
test_graph_df = pd.merge(test_graph_df, artist_tracks_mapping, on="Song_ID", how="left")

# Display the structure of the merged DataFrame
test_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name,Artist_ID
0,0,188713,8,130,"Thursday 23 Sep 2021, 9:31am",Phoenix,Oblique City,19432.0
1,0,8573,10,131,"Thursday 23 Sep 2021, 9:22am",Phoenix,Bourgeois,19432.0
2,0,4256,53,132,"Thursday 23 Sep 2021, 9:19am",Phoenix,Don't,19432.0
3,0,4521,47,133,"Thursday 23 Sep 2021, 9:15am",Phoenix,Chloroform,19432.0
4,0,4522,47,134,"Thursday 23 Sep 2021, 9:12am",Phoenix,Drakkar Noir,19432.0
...,...,...,...,...,...,...,...,...
453296,3306,202755,1,854,"Tuesday 19 Jan 2010, 10:17pm",Hurt,Summers Lost,11221.0
453297,3306,9790,1,857,"Tuesday 19 Jan 2010, 10:05pm",Joy Division,New Dawn Fades,12887.0
453298,3306,169252,1,860,"Tuesday 19 Jan 2010, 9:51pm",k.d. lang,Fallen,27871.0
453299,3306,181742,1,862,"Tuesday 19 Jan 2010, 9:43pm",Bowling for Soup,I Gotchoo,3787.0


In [13]:
# Remove rows with NaNs in Song_ID or Song_Name
test_graph_df = test_graph_df.dropna(subset=['Song_ID', 'Song_Name'])

In [15]:
full_graph_clean = pd.read_csv('data/full_graph_df_clean.csv')

map the Song IDs from the correct Full Graph Song Ids to the Test Graph df

In [16]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Create a dictionary to map (Artist, Song_Name) to Song_ID
song_id_map = dict(zip(zip(full_graph_clean['Artist'], full_graph_clean['Song_Name']), full_graph_clean['Song_ID']))

# Combine Artist and Song_Name into a single tuple column for mapping
test_graph_df['Artist_Song'] = list(zip(test_graph_df['Artist'], test_graph_df['Song_Name']))

# Use tqdm to show progress
tqdm.pandas(desc="Updating Song_IDs")

# Map the correct Song_IDs using the dictionary with tqdm progress bar
test_graph_df['Song_ID'] = test_graph_df['Artist_Song'].progress_apply(lambda x: song_id_map.get(x, np.nan))

# Drop the temporary column
test_graph_df = test_graph_df.drop(columns=['Artist_Song'])

test_graph_df.drop_duplicates(inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_graph_df['Artist_Song'] = list(zip(test_graph_df['Artist'], test_graph_df['Song_Name']))
Updating Song_IDs: 100%|██████████| 453300/453300 [00:00<00:00, 1017560.39it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_graph_df['Song_ID'] = test_graph_df['Artist_Song'].progress_apply(lambda x: song_id_map.get(x, np.nan))


In [17]:
test_graph_df['Artist_ID'] = test_graph_df['Artist_ID'].astype(int)
test_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name,Artist_ID
0,0,185405,8,130,"Thursday 23 Sep 2021, 9:31am",Phoenix,Oblique City,19432
1,0,5266,10,131,"Thursday 23 Sep 2021, 9:22am",Phoenix,Bourgeois,19432
2,0,949,53,132,"Thursday 23 Sep 2021, 9:19am",Phoenix,Don't,19432
3,0,1214,47,133,"Thursday 23 Sep 2021, 9:15am",Phoenix,Chloroform,19432
4,0,1215,47,134,"Thursday 23 Sep 2021, 9:12am",Phoenix,Drakkar Noir,19432
...,...,...,...,...,...,...,...,...
453296,3306,199447,1,854,"Tuesday 19 Jan 2010, 10:17pm",Hurt,Summers Lost,11221
453297,3306,6483,1,857,"Tuesday 19 Jan 2010, 10:05pm",Joy Division,New Dawn Fades,12887
453298,3306,165944,1,860,"Tuesday 19 Jan 2010, 9:51pm",k.d. lang,Fallen,27871
453299,3306,178434,1,862,"Tuesday 19 Jan 2010, 9:43pm",Bowling for Soup,I Gotchoo,3787


In [18]:
## length of the Train Graph df before (and after merge). 14 is wrong, should be 13, since one song was double in the original graph and mapping. Hence we have to remove this song
len(test_graph_df["Song_ID"].unique())

155848

In [19]:
len(test_nodes) - len(user_nodes)

155849

In [20]:
## length of the Train Graph df before (and after merge). 14 is wrong, should be 13, since one song was double in the original graph and mapping. Hence we have to remove this song
len(test_graph_df["User_ID"].unique())

3279

In [21]:
len(user_nodes)

3279

In [22]:
test_graph_df["Song_ID"].unique().max()

252012

### check number of songs
ALL SONG from the FULL DATA are still "somewhere" in the train data. BUT not every user that had a "Listen to" relation to a song exists.
So eg. User 0 could have listened to track 3307 in the Full Graph, but in the Train data this connection is not seen. But for user 1 the connection remains in the training data, so this way we can see the song 3307 at least once in the training data. This means each song is at least listened to by 1 user in the training data. 

In [23]:
unique_song_ids= test_graph_df["Song_ID"].unique()
print("Length:", len(unique_song_ids))
print("Max Value:", test_graph_df["Song_ID"].unique().max())
#sorted(unique_song_ids) 

Length: 155848
Max Value: 252012


### Save Clean TrainGraph DF

In [24]:
import os

# Check if the file exists
if not os.path.exists('data/test_graph_df_clean.csv'):
    # Save the merged DataFrame to a CSV file if it doesn't exist
    test_graph_df.to_csv('data/test_graph_df_clean.csv', index=False)

## Import Clean TrainGraph DF (Dont Skip)

In [25]:
test_graph_df[test_graph_df["Song_Name"] == "Oblique City"]

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name,Artist_ID
0,0,185405,8,130,"Thursday 23 Sep 2021, 9:31am",Phoenix,Oblique City,19432


In [26]:
test_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name,Artist_ID
0,0,185405,8,130,"Thursday 23 Sep 2021, 9:31am",Phoenix,Oblique City,19432
1,0,5266,10,131,"Thursday 23 Sep 2021, 9:22am",Phoenix,Bourgeois,19432
2,0,949,53,132,"Thursday 23 Sep 2021, 9:19am",Phoenix,Don't,19432
3,0,1214,47,133,"Thursday 23 Sep 2021, 9:15am",Phoenix,Chloroform,19432
4,0,1215,47,134,"Thursday 23 Sep 2021, 9:12am",Phoenix,Drakkar Noir,19432
...,...,...,...,...,...,...,...,...
453296,3306,199447,1,854,"Tuesday 19 Jan 2010, 10:17pm",Hurt,Summers Lost,11221
453297,3306,6483,1,857,"Tuesday 19 Jan 2010, 10:05pm",Joy Division,New Dawn Fades,12887
453298,3306,165944,1,860,"Tuesday 19 Jan 2010, 9:51pm",k.d. lang,Fallen,27871
453299,3306,178434,1,862,"Tuesday 19 Jan 2010, 9:43pm",Bowling for Soup,I Gotchoo,3787


### Map Song IDS and User IDs

In [27]:
min_song_id = test_graph_df["Song_ID"].min()
max_song_id = test_graph_df["Song_ID"].max()
missing_song_ids = set(range(min_song_id, max_song_id + 1)) - set(test_graph_df["Song_ID"].unique())
len(missing_song_ids)

96165

In [28]:
# key is old mapping, value is new mapping

user_id_mapping = {id: i for i, id in enumerate(sorted(test_graph_df["User_ID"].unique()))}
song_id_mapping = {id: i for i, id in enumerate(sorted(test_graph_df["Song_ID"].unique()))}


In [29]:
test_graph_df['Mapped_User_ID'] = test_graph_df['User_ID'].map(user_id_mapping)
test_graph_df['Mapped_Song_ID'] = test_graph_df['Song_ID'].map(song_id_mapping)


In [30]:
test_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date,Artist,Song_Name,Artist_ID,Mapped_User_ID,Mapped_Song_ID
0,0,185405,8,130,"Thursday 23 Sep 2021, 9:31am",Phoenix,Oblique City,19432,0,112945
1,0,5266,10,131,"Thursday 23 Sep 2021, 9:22am",Phoenix,Bourgeois,19432,0,3705
2,0,949,53,132,"Thursday 23 Sep 2021, 9:19am",Phoenix,Don't,19432,0,794
3,0,1214,47,133,"Thursday 23 Sep 2021, 9:15am",Phoenix,Chloroform,19432,0,1015
4,0,1215,47,134,"Thursday 23 Sep 2021, 9:12am",Phoenix,Drakkar Noir,19432,0,1016
...,...,...,...,...,...,...,...,...,...,...
453296,3306,199447,1,854,"Tuesday 19 Jan 2010, 10:17pm",Hurt,Summers Lost,11221,3278,122919
453297,3306,6483,1,857,"Tuesday 19 Jan 2010, 10:05pm",Joy Division,New Dawn Fades,12887,3278,4619
453298,3306,165944,1,860,"Tuesday 19 Jan 2010, 9:51pm",k.d. lang,Fallen,27871,3278,99253
453299,3306,178434,1,862,"Tuesday 19 Jan 2010, 9:43pm",Bowling for Soup,I Gotchoo,3787,3278,108156


## Node Data Setup

### Select All Unique Users

In [31]:
user_ids = len(users_mapping)
user_ids

3307

In [32]:
user_ids = test_graph_df["User_ID"].unique()
user_ids, len(user_ids)

(array([   0,    1,    2, ..., 3304, 3305, 3306], dtype=int64), 3279)

### Select All Unique Song Nodes

In [33]:
song_ids = test_graph_df["Song_ID"].unique()
song_ids

array([185405,   5266,    949, ..., 199447, 165944, 178434], dtype=int64)

### Select All Unique Artists

In [34]:
test_graph_df["Artist_ID"].unique()

array([19432, 27034,  3708, ...,  9165, 13745,   379])

In [35]:
artist_ids = test_graph_df["Artist_ID"].unique()
artist_ids

array([19432, 27034,  3708, ...,  9165, 13745,   379])

In [36]:
unique_artists = test_graph_df["Artist"].unique()

In [37]:
# Initialize the starting node ID for artists
#reset this to 0! Important because PyG resets indices for each node type on initialization, and then my edge mapping will be wrong if it doesnt start from 0
artist_node_id = 0

# Dictionary to store node IDs for artists
artist_nodes_dic = {}
artist_nodes = []

# Iterate over each artist and assign node IDs
for artist in unique_artists:

    artist_nodes_dic[artist] = artist_node_id
    
    artist_nodes.append(artist_node_id)
    artist_node_id += 1



In [38]:
artist_nodes_dic

{'Phoenix': 0,
 'White Lies': 1,
 'Boniface': 2,
 'girl in red': 3,
 'CHVRCHES': 4,
 'Liniker e os Caramelows': 5,
 'Hospitality': 6,
 'Hot Hot Heat': 7,
 'Giovani Cidreira': 8,
 'Paramore': 9,
 'Foxing': 10,
 'Camera Obscura': 11,
 'Janelle Monáe': 12,
 'Someone Still Loves You Boris Yeltsin': 13,
 'Yeah Yeah Yeahs': 14,
 'Johnny Hooker': 15,
 'The Temper Trap': 16,
 'Jessie Ware': 17,
 'Regina Spektor': 18,
 'Gengahr': 19,
 'James Bay': 20,
 'Laura Marling': 21,
 'OK Go': 22,
 'Nao': 23,
 'Blood Orange': 24,
 'Carla Morrison': 25,
 'Foster the People': 26,
 'Jack Peñate': 27,
 'Beach Bunny': 28,
 'Ra Ra Riot': 29,
 'The Drums': 30,
 'Paloma Faith': 31,
 'Lewis Capaldi': 32,
 'Lorde': 33,
 'Marina & the Diamonds': 34,
 'Wolf Gang': 35,
 'Tom Grennan': 36,
 'Arctic Monkeys': 37,
 'AlunaGeorge': 38,
 'The Kooks': 39,
 'Mon Laferte': 40,
 "Allo Darlin'": 41,
 'Arcade Fire': 42,
 'King Krule': 43,
 'Dona Onete': 44,
 'Bob Dylan': 45,
 'Georgia': 46,
 'Jake Bugg': 47,
 'Cold War Kids': 48,

In [39]:
len(artist_nodes)

21943

## Edge Indices Setup

### User-Tracks Edges
Collect Edge Information from User-Tracks

#### Reset Song_ID Index

##### With edge attributes as List

In [40]:
# 1. Find unique User_IDs
unique_user_ids = len(test_graph_df['User_ID'].unique())
unique_user_ids

3279

#### Create User-Track Edge Indices & Attributes

In [41]:
def create_user_track_edge_index_and_attributes(graph_df):
    
    """
    Create edge index and attributes from a graph.

    Args:
    - graph as NetworkX Graph Object

    Returns:
    - user_song_edge_index (list): List of edges represented by node indices.
    - user_song_edge_scrobbel_attributes (list): List of scrobble attributes for each edge.
    """

    # Initialize index and dictionary
    user_song_edge_index = []
    user_song_edge_scrobbel_attributes = []
    
    for user_id, song_info in graph_df[["User_ID", "Song_ID", "Scrobbles"]].groupby("User_ID"):
        
        scrobbles_per_user_node = []
        
        song_ids = song_info['Song_ID']#.tolist()
        scrobbles = song_info['Scrobbles']
        #song_infos = song_info[['Song_ID', 'Scrobbles']]#
        #print(song_infos)
        #iterate over each song node and add its ID
        for song_id in song_ids:
            # get user and id information
            #print(song_id)
            user_id = user_id
            song_id = song_id

            ## for debudding wrong nodeID error which caused the problem in the LNH Sampler. 13 instead of 12
            if song_id == 252013:
                print(song_id, user_id)
            
            #store as edge tuple
            current_edge_directed = [user_id, song_id]
    
            #append to edge list
            user_song_edge_index.append(current_edge_directed)
            
        for scrobble in scrobbles:
            # Extract edge attributes and append to list
            scrobbles_per_song = scrobble
            scrobbles_per_user_node.append(scrobbles_per_song)
            
        user_song_edge_scrobbel_attributes.extend(scrobbles_per_user_node)
    return user_song_edge_index, user_song_edge_scrobbel_attributes
        
    


In [42]:
user_song_edge_index, user_song_edge_scrobbel_attributes = create_user_track_edge_index_and_attributes(test_graph_df)

In [43]:
print(len(user_song_edge_index), len(user_song_edge_scrobbel_attributes))

453300 453300


In [44]:
# Convert the list of edge attributes to a tensor
user_song_edge_attr_tensor = torch.tensor(user_song_edge_scrobbel_attributes, dtype=torch.long)
user_song_edge_attr_tensor.t().size()

torch.Size([453300])

In [46]:
# Convert the list of edge index to a tensor
user_song_edge_index = torch.tensor(user_song_edge_index , dtype=torch.long)
user_song_edge_index.t().size()

  user_song_edge_index = torch.tensor(user_song_edge_index , dtype=torch.long)


torch.Size([2, 453300])

### User User Edges
Collect User User Edge Information

#### User User Edge Index

In [47]:
def create_user_user_edge_index(social_graph):
    """
    Create edge index for user-user relationships in a social graph.

    Args:
    - social_graph: NetworkX Graph Object

    Returns:
    - user_user_edge_index (list): List of edges represented by node indices.
     One way - Directed Only. (Will be made undirected withing PyG
    """
    #initialize index
    user_user_edge_index = []
    
    # Iterate over all user nodes
    for user_node in test_graph_df['User_ID'].unique():
        #print(type(user_node))
        #print(user_node)
           
        # iterate over all edges of each user node in the social graph
        for key, value in social_graph[user_node].items():
            ## add edges twice for undirection - Not necessary since this will be done in PyG
            current_edge_directed = [user_node, key]
            current_edge_undirected = [key, user_node]
            user_user_edge_index.append(current_edge_directed)
            #user_user_edge_index.append(current_edge_undirected)
    return user_user_edge_index
    
user_user_edge_index = create_user_user_edge_index(social_graph)

In [48]:
user_user_edge_index = torch.tensor(user_user_edge_index , dtype=torch.long)
user_user_edge_index.t().size()

torch.Size([2, 141145])

In [49]:
user_user_edge_index

tensor([[   0,  763],
        [   0, 1435],
        [   0,  122],
        ...,
        [3306,  326],
        [3306,  926],
        [3306,  700]])

#### Remapping the User User Edge Index to PyG Format (meaning all Nodes have to start from ID 0 with no missings in between)

In [50]:
user_user_edge_index_df = pd.DataFrame(user_user_edge_index.numpy(), columns=['User1', 'User2'])


In [51]:
user_user_edge_index_df

Unnamed: 0,User1,User2
0,0,763
1,0,1435
2,0,122
3,0,488
4,0,170
...,...,...
141140,3306,1631
141141,3306,958
141142,3306,326
141143,3306,926


In [52]:
user_user_edge_index_df = user_user_edge_index_df.applymap(lambda x: user_id_mapping.get(x, x))
user_user_edge_index_df

  user_user_edge_index_df = user_user_edge_index_df.applymap(lambda x: user_id_mapping.get(x, x))


Unnamed: 0,User1,User2
0,0,758
1,0,1427
2,0,120
3,0,484
4,0,168
...,...,...
141140,3278,1622
141141,3278,952
141142,3278,323
141143,3278,920


In [53]:
# Convert the DataFrame back into the user-user edge index as a torch tensor
user_user_edge_index = torch.from_numpy(user_user_edge_index_df.values)


In [54]:
user_user_edge_index

tensor([[   0,  758],
        [   0, 1427],
        [   0,  120],
        ...,
        [3278,  323],
        [3278,  920],
        [3278,  695]])

###  Artist-Track Edges

#### Creating a Artist-Track Dictionary to feed into the edge Data

In [55]:
test_graph_df[["Artist","Song_ID","Song_Name"]]

Unnamed: 0,Artist,Song_ID,Song_Name
0,Phoenix,185405,Oblique City
1,Phoenix,5266,Bourgeois
2,Phoenix,949,Don't
3,Phoenix,1214,Chloroform
4,Phoenix,1215,Drakkar Noir
...,...,...,...
453296,Hurt,199447,Summers Lost
453297,Joy Division,6483,New Dawn Fades
453298,k.d. lang,165944,Fallen
453299,Bowling for Soup,178434,I Gotchoo


In [56]:
def artist_to_song(df):
    # Initialize an empty dictionary to store the mapping of artists to songs
    artist_to_songs = {}
    
    # Iterate through the DataFrame rows
    for index, row in df.iterrows():
        artist = row['Artist']
        song_name = row['Song_Name']
        song_id = row['Mapped_Song_ID']
        
        # Check if the artist is already in the dictionary
        if artist in artist_to_songs:
            # Add the song to the nested dictionary
            artist_to_songs[artist][song_id] = song_name
        else:
            # Create a new nested dictionary with the song and add it to the dictionary
            artist_to_songs[artist] = {song_id: song_name}


    return artist_to_songs

In [57]:
artist_to_songs_dic = artist_to_song(test_graph_df)


In [58]:
artist_to_songs_dic

{'Phoenix': {112945: 'Oblique City',
  3705: 'Bourgeois',
  794: "Don't",
  1015: 'Chloroform',
  1016: 'Drakkar Noir',
  3704: 'Bankrupt!',
  1083: 'Trying to Be Cool',
  866: 'S.O.S. in Bel Air',
  1017: 'The Real Thing',
  945: 'Entertainment',
  72382: 'Love for Granted',
  206: 'Lisztomania',
  294: '1901',
  101251: 'Ti Amo',
  205: 'Lasso',
  342: 'Girlfriend',
  353: 'Rome',
  324: 'Armistice',
  831: 'Too Young',
  5220: 'Everything Is Everything',
  1695: 'If I Ever Feel Better',
  93806: 'J-Boy',
  413: 'Fences',
  1479: 'Run Run Run',
  101253: 'Definitive Breaks',
  69441: 'One Time Too Many',
  106138: 'Lisztomania (Classixx Version)',
  3330: 'Second To None',
  78161: 'Sometimes In The Fall',
  79042: 'North',
  77437: 'Courtesy Laughs',
  79041: 'Lost And Found',
  1291: 'Long Distance Call',
  70736: 'Rally',
  1082: 'Consolation Prizes',
  73772: 'Napoleon Says',
  95393: 'Fences (Friendly Fires Remix)',
  82714: 'Alphabetical',
  79863: "Holdin' On Together",
  7816

In [59]:
def create_song_artist_edge_index(artist_tracks_mapping_dic):
    # init artist and songs dict from mapping df
    #artist_tracks_mapping_dic = dict(zip(artist_tracks_mapping['Artist'], artist_tracks_mapping['Song_ID']))
    
    # Initialize variables to store edge index and attributes
    artist_song_edge_index = []

     
    # Iterate over each artist and their songs
    for artist, songs in artist_tracks_mapping_dic.items():
        # Get the node ID of the current artist
        artist_node_id = artist_nodes_dic[artist]
        
        #print(artist, songs)
        #print(artist)
        #print(songs)
        
        # Iterate over each song and its node ID
        for song_node_id, song_name  in songs.items():
            
            # Create directed edges from artist to song
            #print(artist, song_node_id)
            #print(artist, song_node_id, song_name)
            artist_song_edge_index.append([artist_node_id, song_node_id])#, song_name, artist])
            
            # Optional: Create directed edges from song to artist
            #artist_song_edge_index.append([song_node_id, artist_node_id])
    return artist_song_edge_index
            
artist_song_edge_index = create_song_artist_edge_index(artist_to_songs_dic)
#artist_song_edge_index

In [60]:
artist_song_edge_index = torch.tensor(artist_song_edge_index , dtype=torch.long)
artist_song_edge_index.t().size()

torch.Size([2, 155848])

In [61]:
artist_song_edge_index

tensor([[     0, 112945],
        [     0,   3705],
        [     0,    794],
        ...,
        [ 21941,  65789],
        [ 21941,  65788],
        [ 21942, 146319]])

## Check All Edges
Edges are undirected:

In [62]:
print(len(user_song_edge_index))
print(len(user_user_edge_index))
print(len(artist_song_edge_index))

453300
141145
155848


In [63]:
print(f"The Edge {user_song_edge_index} is undirected: {is_undirected(user_song_edge_index)}.")
print(f"The Edge {user_user_edge_index} is undirected: {is_undirected(user_user_edge_index)}.")
print(f"The Edge {artist_song_edge_index} is undirected: {is_undirected(artist_song_edge_index)}.")

The Edge tensor([[     0, 185405],
        [     0,   5266],
        [     0,    949],
        ...,
        [  3306, 165944],
        [  3306, 178434],
        [  3306, 136260]]) is undirected: False.
The Edge tensor([[   0,  758],
        [   0, 1427],
        [   0,  120],
        ...,
        [3278,  323],
        [3278,  920],
        [3278,  695]]) is undirected: False.
The Edge tensor([[     0, 112945],
        [     0,   3705],
        [     0,    794],
        ...,
        [ 21941,  65789],
        [ 21941,  65788],
        [ 21942, 146319]]) is undirected: False.


# User_Song Test Matrix Graph


### Merc approach

In [64]:
test_users = test_graph_df["User_ID"].unique()
test_tracks = test_graph_df["Song_ID"].unique()

In [65]:
len(test_users), len(test_tracks)

(3279, 155848)

In [67]:
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame(data=[], columns=['User', 'Track']).astype({'User': np.int32, 'Track': np.int32})

parquet_schema = pa.Table.from_pandas(df=df, preserve_index=False).schema
# Open a Parquet file for writing
parquet_writer = pq.ParquetWriter('data/gnn_models/test_matrix.parquet', parquet_schema, compression='GZIP')



partial = []
for u in tqdm(test_users):
    data = np.zeros((len(test_tracks), 2), dtype=np.float32)
    data[:, 0] = u
    data[:, 1] = test_tracks

    df = pd.DataFrame(data=data, columns=['User', 'Track']).astype({'User': np.int32, 'Track': np.int32})
    
    no_valid = set(train_graph.neighbors(u))
    df = df[~df['Track'].isin(no_valid)]
    
    partial.append(df)
    if len(partial) >= 100:
        df = pd.concat(partial)
        partial = []
        table = pa.Table.from_pandas(df, schema=parquet_schema)
        parquet_writer.write_table(table)
        
        
if len(partial) > 0:
    df = pd.concat(partial)
    partial = []
    table = pa.Table.from_pandas(df, schema=parquet_schema)
    parquet_writer.write_table(table)
    
parquet_writer.close()

  0%|          | 0/3279 [00:00<?, ?it/s]

100%|██████████| 3279/3279 [01:34<00:00, 34.71it/s]


In [68]:
import pandas as pd

# Load the Parquet file
df_test_matrix = pd.read_parquet('data/gnn_models/test_matrix.parquet')
df_test_matrix

Unnamed: 0,User,Track
0,0,185405
1,0,949
2,0,1214
3,0,1215
4,0,1299
...,...,...
509212516,3306,196097
509212517,3306,68213
509212518,3306,199447
509212519,3306,165944


remapp these original user and track IDs to the ones we need for PyG graphs. (Which is a reset to 0, and all starting from 0 without any empties)

In [69]:
# Assuming test_users and test_tracks are arrays of unique users and tracks
test_users_pyg = np.arange(len(user_ids))
test_tracks_pyg = np.arange(len(song_ids))

In [70]:
test_user_mapping = {user: i for i, user in enumerate(sorted(test_users))}
test_track_mapping = {track: i for i, track in enumerate(sorted(test_tracks))}
test_user_mapping[3306], test_track_mapping[252012]



(3278, 155847)

In [71]:
df_test_matrix['User'] = df_test_matrix['User'].map(test_user_mapping)
df_test_matrix['Track'] = df_test_matrix['Track'].map(test_track_mapping)


In [72]:
df_test_matrix

Unnamed: 0,User,Track
0,0,112945
1,0,794
2,0,1015
3,0,1016
4,0,1083
...,...,...
509212516,3278,120584
509212517,3278,40810
509212518,3278,122919
509212519,3278,99253


In [73]:
import torch

# Convert the DataFrame to a tensor
test_matrix_tensor = torch.from_numpy(df_test_matrix.to_numpy())
full_matrix_user_song_edge_indices = test_matrix_tensor

In [74]:
full_matrix_user_song_edge_indices, len(full_matrix_user_song_edge_indices)

(tensor([[     0, 112945],
         [     0,    794],
         [     0,   1015],
         ...,
         [  3278, 122919],
         [  3278,  99253],
         [  3278, 108156]]),
 509212521)

# Test Graph


## Prepare Data 
For Graph Creation. Full Graph needs information like Song Tags and Audio features, which are not present in base graph
Could also be called "Rhich" Graph

## load tags and audio features

## Clean Audio df from missing ID

In [61]:
# Load Track Tag
audio_df = pd.read_csv('data/final_audio_df.csv')
train_graph_df = pd.read_csv('data/test_graph_df_clean.csv')


reset audio df Song Ids to 0


In [62]:
# Remove the missing Song_IDs from audio_df and reset its other IDS starting from 0, so they are consecutive
print("Removing missing Song_IDs from audio_df and resetting its other IDS starting from 0, so they are consecutive...")
audio_df = audio_df.sort_values('Song_ID').reset_index(drop=True)
audio_df['Song_ID'] = range(len(audio_df))

Removing missing Song_IDs from audio_df and resetting its other IDS starting from 0, so they are consecutive...


In [63]:
# filter the audio_df in such a way that only songs remain that are present in the train_graph_df
audio_df = audio_df[audio_df['Song_ID'].isin(train_graph_df['Song_ID'].unique())]
len(audio_df), len(train_graph_df['Song_ID'].unique())


(155848, 155848)

In [65]:
# Check if any Song_ID is missing in audio_df
audio_missing_ids = set(range(audio_df['Song_ID'].min(), audio_df['Song_ID'].max() + 1)) - set(audio_df['Song_ID'])
print(f"Missing Song_IDs in audio_df: {audio_missing_ids}")

# Check if any Song_ID is missing in train_graph_df
train_graph_missing_ids = set(range(train_graph_df['Song_ID'].min(), train_graph_df['Song_ID'].max() + 1)) - set(train_graph_df['Song_ID'])
print(f"Missing Song_IDs in train_graph_df: {train_graph_missing_ids}")



# Check if any Song_ID is missing in audio_df after resetting
audio_missing_ids = set(range(audio_df['Song_ID'].min(), audio_df['Song_ID'].max() + 1)) - set(audio_df['Song_ID'])
print(f"Missing Song_IDs in audio_df: {audio_missing_ids}")

# Check if any Song_ID is missing in train_graph_df after resetting
train_graph_missing_ids = set(range(train_graph_df['Song_ID'].min(), train_graph_df['Song_ID'].max() + 1)) - set(train_graph_df['Song_ID'])
print(f"Missing Song_IDs in train_graph_df: {train_graph_missing_ids}")

# Print the number of unique Song_IDs in both dfs
print(f"Number of unique Song_IDs in audio_df: {audio_df['Song_ID'].nunique()}")
print(f"Number of unique Song_IDs in train_graph_df: {train_graph_df['Song_ID'].nunique()}")


Missing Song_IDs in audio_df: {42, 48, 54, 57, 60, 76, 77, 93, 94, 99, 101, 103, 104, 110, 111, 112, 116, 118, 119, 121, 122, 127, 129, 132, 141, 149, 153, 158, 161, 166, 167, 168, 173, 179, 182, 189, 191, 194, 207, 212, 215, 216, 224, 231, 232, 233, 244, 246, 256, 258, 272, 288, 289, 294, 314, 325, 330, 331, 333, 342, 343, 350, 357, 360, 367, 377, 378, 385, 389, 392, 395, 398, 412, 416, 417, 421, 422, 430, 431, 435, 436, 450, 466, 486, 497, 498, 511, 518, 523, 539, 543, 553, 555, 557, 559, 572, 573, 578, 597, 604, 612, 628, 635, 648, 653, 654, 658, 660, 661, 663, 675, 676, 681, 689, 692, 694, 714, 721, 734, 736, 743, 744, 747, 749, 750, 758, 759, 761, 770, 784, 786, 800, 802, 804, 819, 820, 854, 861, 862, 867, 874, 880, 881, 883, 888, 900, 908, 916, 923, 925, 929, 931, 934, 942, 948, 955, 956, 961, 964, 972, 982, 983, 999, 1000, 1006, 1009, 1010, 1012, 1022, 1024, 1041, 1042, 1053, 1057, 1061, 1062, 1063, 1064, 1098, 1101, 1103, 1105, 1113, 1114, 1127, 1132, 1134, 1139, 1141, 1146, 11

## Process Tags
Remove NaN tags, and make a dict that assigns the tags correctly to their according SongIDs Songs


In [66]:
# show all values were tags are nan in "Song Tags"
audio_df[audio_df['Song_Tags'].isnull()]

# convert these values to a string with "Unknown"
audio_df.loc[audio_df['Song_Tags'].isnull(), 'Song_Tags'] = 'Unknown'

In [67]:
# Sort the DataFrame by Song_ID in ascending order
audio_df = audio_df.sort_values(by='Song_ID', ascending=True)

# Replace "None" with a special token
audio_df['Song_Tags'] = audio_df['Song_Tags'].str.replace('None', 'unknown')

# Ensure all entries in 'Song_Tags' are strings
audio_df['Song_Tags'] = audio_df['Song_Tags'].astype(str)

# Extract and process tags
tags = audio_df['Song_Tags'].str.split(',')

# Flatten the list of tags and count unique tags
all_tags = [tag for sublist in tags for tag in sublist]
unique_tags = set(all_tags)
print(f"Number of unique tags: {len(unique_tags)}")

Number of unique tags: 20431


In [68]:
from sklearn.preprocessing import LabelEncoder

audio_df = audio_df.sort_values(by='Song_ID', ascending=True)

# Extract and process tags
tags = audio_df['Song_Tags'].str.split(',')

# Flatten the list of tags and create a mapping from tags to indices
all_tags = [tag for sublist in tags for tag in sublist]
label_encoder = LabelEncoder()
label_encoder.fit(all_tags)
tag_to_index = {tag: idx for idx, tag in enumerate(label_encoder.classes_)}

In [75]:
tags_indices

0         [12371, 12392, 12394]
1         [12371, 12394, 12392]
2          [12371, 7640, 12392]
3         [12371, 12392, 16021]
4          [12371, 12392, 7640]
                  ...          
252008    [11978, 12665, 16579]
252009     [13046, 3445, 11315]
252010      [2872, 16579, 7873]
252011     [13046, 16021, 1355]
252012                  [17403]
Name: Song_Tags, Length: 155848, dtype: object

In [69]:
# Convert tags to indices
tags_indices = tags.apply(lambda x: [tag_to_index[tag] for tag in x])

# Pad the sequences to ensure they have the same length
from torch.nn.utils.rnn import pad_sequence

tags_indices_padded = pad_sequence([torch.tensor(t) for t in tags_indices], batch_first=True, padding_value=-1)

In [70]:
tags_indices_tensor = tags_indices_padded

In [71]:
tags_indices_tensor.shape

torch.Size([155848, 3])

create artist node tags. Top 3 most common tags for each artist


In [85]:
from collections import Counter

# Create a dictionary to store tags for each artist
artist_tags = {}

# Iterate over each song and its tags, ensuring Song_ID ascending order is kept
for idx, row in audio_df.sort_values(by='Song_ID', ascending=True).iterrows():
    artist = row['Artist']
    song_tags = row['Song_Tags'].split(',')
    
    if artist not in artist_tags:
        artist_tags[artist] = []
    
    artist_tags[artist].extend(song_tags)

# For each artist, find the top 3 most common tags
artist_top_tags = {}
for artist, tags in artist_tags.items():
    most_common_tags = [tag for tag, count in Counter(tags).most_common(3)]
    artist_top_tags[artist] = most_common_tags

# Convert artist tags to indices
artist_tags_indices = {artist: [tag_to_index[tag] for tag in tags] for artist, tags in artist_top_tags.items()}

# Create a tensor for artist tags, padding sequences to ensure they have the same length
artist_tags_indices_padded = pad_sequence([torch.tensor(tags) for tags in artist_tags_indices.values()], batch_first=True, padding_value=-1)

# Convert to tensor
artist_tags_tensor = artist_tags_indices_padded


In [86]:
artist_tags_tensor.shape

torch.Size([21943, 3])

In [87]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, to_hetero
from torch_geometric.data import HeteroData

# Convert to torch tensors
audio_features_tensor = torch.tensor(audio_df.iloc[:, 5:16].values, dtype=torch.float)
tags_indices_tensor = tags_indices_padded

final tags and audio features embeddings to use as node features in graph creation

In [88]:
audio_features_tensor.shape, tags_indices_tensor.shape

(torch.Size([155848, 11]), torch.Size([155848, 3]))

checking for error when creating users in the test graph
all looks good... all songs are correctly assigned to the actual users, the missings are not in and not referenced to


In [89]:
train_graph_df["User_ID"].unique(), len(train_graph_df["User_ID"].unique())

(array([   0,    1,    2, ..., 3304, 3305, 3306], dtype=int64), 3279)

In [90]:
missing_users = set(range(min(train_graph_df["User_ID"].unique()), max(train_graph_df["User_ID"].unique()) + 1)) - set(train_graph_df["User_ID"].unique())
print(missing_users)

{3, 1168, 1808, 2715, 1309, 33, 2727, 2732, 2991, 1840, 2484, 183, 2878, 1859, 1609, 586, 2124, 1753, 2266, 858, 2909, 1759, 1761, 2402, 2147, 3045, 362, 1905}


In [91]:
user_mapping = {user: i for i, user in enumerate(train_graph_df["User_ID"].unique())}
len(user_mapping)


3279

In [92]:
missing_users_in_edge_index = set(user_song_edge_index[0]).intersection(missing_users)
missing_users_in_edge_index

set()

# Create Full MAtrix Graph

In [93]:
artist_song_edge_index

tensor([[     0, 112945],
        [     0,   3705],
        [     0,    794],
        ...,
        [ 21941,  65789],
        [ 21941,  65788],
        [ 21942, 146319]])

In [94]:
# Create HeteroData object
data = HeteroData()


################### NODES ###################

# Filter user ids to only include those in train_graph_df["User_ID"].unique()
#filtered_user_ids = torch.tensor(train_graph_df["User_ID"].unique())
#filtered_user_ids

data["users"].node_id = torch.arange(len(user_ids))
data["songs"].node_id = torch.arange(len(song_ids))
data["artists"].node_id = torch.arange(len(artist_ids))


# Add song nodes with audio and tag features
data['songs'].x_audio = audio_features_tensor
data['songs'].x_tag = tags_indices_tensor

# Add artist nodes with tag features
data['artists'].x_tag = artist_tags_tensor


################### EDGES ###################

# Add Edge Indices and Edge Attributes
data['users', 'listens_to', 'songs'].edge_index = full_matrix_user_song_edge_indices.t().contiguous()
data['users', 'is_friends_with', 'users'].edge_index = user_user_edge_index.t().contiguous()
data['artists', 'makes', 'songs'].edge_index = artist_song_edge_index.t().contiguous()

# Add Edge Attributes
#data['users', 'listens_to', 'songs'].edge_attr = user_song_edge_attr_tensor

# enable undirected edges for message passing
data = T.ToUndirected(merge = False)(data)


In [95]:
user_user_edge_index

tensor([[   0,  758],
        [   0, 1427],
        [   0,  120],
        ...,
        [3278,  323],
        [3278,  920],
        [3278,  695]])

In [96]:
artist_song_edge_index

tensor([[     0, 112945],
        [     0,   3705],
        [     0,    794],
        ...,
        [ 21941,  65789],
        [ 21941,  65788],
        [ 21942, 146319]])

In [97]:
print('========================Nodes==============================')

# Gather some statistics about the graph.
print(f'Number of total nodes: {data.num_nodes}')
print(f'Number of user nodes: {data['users'].num_nodes}')
print(f'Number of song nodes: {data['songs'].num_nodes}')
print(f'Number of artist nodes: {data['artists'].num_nodes}')

print('========================Edges==============================')


print(f'Number of total edges: {data.num_edges}')
print(f'Number of Listening edges: {data['rev_listens_to'].num_edges}')
print(f'Number of Friends edges: {data['rev_is_friends_with'].num_edges}')
print(f'Number of Artist Makes Songs edges: {data['rev_makes'].num_edges}')

print('========================Reverse_Edges==============================')


print(f'Number of total edges: {data.num_edges}')
print(f'Number of Listening edges: {data['listens_to'].num_edges}')
print(f'Number of Friends edges: {data['is_friends_with'].num_edges}')
print(f'Number of Artist Makes Songs edges: {data['makes'].num_edges}')

print('========================Degree==============================')


print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')



Number of total nodes: 181070
Number of user nodes: 3279
Number of song nodes: 155848
Number of artist nodes: 21943
Number of total edges: 1019019028
Number of Listening edges: 509212521
Number of Friends edges: 141145
Number of Artist Makes Songs edges: 155848
Number of total edges: 1019019028
Number of Listening edges: 509212521
Number of Friends edges: 141145
Number of Artist Makes Songs edges: 155848
Average node degree: 5627.76


In [98]:
# Check max and min node IDs for all nodes and edges
max_user_node_id = data['users'].node_id.max()
min_user_node_id = data['users'].node_id.min()
max_song_node_id = data['songs'].node_id.max()
min_song_node_id = data['songs'].node_id.min()
max_artist_node_id = data['artists'].node_id.max()
min_artist_node_id = data['artists'].node_id.min()

listens_to_edge_index = data['users', 'listens_to', 'songs'].edge_index
max_listens_to_edge_id_0 = listens_to_edge_index[0].max()
min_listens_to_edge_id_0 = listens_to_edge_index[0].min()
max_listens_to_edge_id_1 = listens_to_edge_index[1].max()
min_listens_to_edge_id_1 = listens_to_edge_index[1].min()

is_friends_with_edge_index = data['users', 'is_friends_with', 'users'].edge_index
max_is_friends_with_edge_id_0 = is_friends_with_edge_index[0].max()
min_is_friends_with_edge_id_0 = is_friends_with_edge_index[0].min()
max_is_friends_with_edge_id_1 = is_friends_with_edge_index[1].max()
min_is_friends_with_edge_id_1 = is_friends_with_edge_index[1].min()

makes_edge_index = data['artists', 'makes', 'songs'].edge_index
max_makes_edge_id_0 = makes_edge_index[0].max()
min_makes_edge_id_0 = makes_edge_index[0].min()
max_makes_edge_id_1 = makes_edge_index[1].max()
min_makes_edge_id_1 = makes_edge_index[1].min()

print(f'Max User Node ID: {max_user_node_id}, Min User Node ID: {min_user_node_id}')
print(f'Max Song Node ID: {max_song_node_id}, Min Song Node ID: {min_song_node_id}')
print(f'Max Artist Node ID: {max_artist_node_id}, Min Artist Node ID: {min_artist_node_id}')
print(f'Max Listens To Edge ID (0): {max_listens_to_edge_id_0}, Min Listens To Edge ID (0): {min_listens_to_edge_id_0}')
print(f'Max Listens To Edge ID (1): {max_listens_to_edge_id_1}, Min Listens To Edge ID (1): {min_listens_to_edge_id_1}')
print(f'Max Is Friends With Edge ID (0): {max_is_friends_with_edge_id_0}, Min Is Friends With Edge ID (0): {min_is_friends_with_edge_id_0}')
print(f'Max Is Friends With Edge ID (1): {max_is_friends_with_edge_id_1}, Min Is Friends With Edge ID (1): {min_is_friends_with_edge_id_1}')
print(f'Max Makes Edge ID (0): {max_makes_edge_id_0}, Min Makes Edge ID (0): {min_makes_edge_id_0}')
print(f'Max Makes Edge ID (1): {max_makes_edge_id_1}, Min Makes Edge ID (1): {min_makes_edge_id_1}')


Max User Node ID: 3278, Min User Node ID: 0
Max Song Node ID: 155847, Min Song Node ID: 0
Max Artist Node ID: 21942, Min Artist Node ID: 0
Max Listens To Edge ID (0): 3278, Min Listens To Edge ID (0): 0
Max Listens To Edge ID (1): 155847, Min Listens To Edge ID (1): 0
Max Is Friends With Edge ID (0): 3278, Min Is Friends With Edge ID (0): 0
Max Is Friends With Edge ID (1): 3278, Min Is Friends With Edge ID (1): 0
Max Makes Edge ID (0): 21942, Min Makes Edge ID (0): 0
Max Makes Edge ID (1): 155847, Min Makes Edge ID (1): 0


In [99]:
data

HeteroData(
  users={ node_id=[3279] },
  songs={
    node_id=[155848],
    x_audio=[155848, 11],
    x_tag=[155848, 3],
  },
  artists={
    node_id=[21943],
    x_tag=[21943, 3],
  },
  (users, listens_to, songs)={ edge_index=[2, 509212521] },
  (users, is_friends_with, users)={ edge_index=[2, 141145] },
  (artists, makes, songs)={ edge_index=[2, 155848] },
  (songs, rev_listens_to, users)={ edge_index=[2, 509212521] },
  (users, rev_is_friends_with, users)={ edge_index=[2, 141145] },
  (songs, rev_makes, artists)={ edge_index=[2, 155848] }
)

In [108]:
import os

# Check if the file exists
if not os.path.exists('data/pyg_data/test_hetero_data_3_nodes_rich.pt'):
    # Save your HeteroData object if the file does not exist
    torch.save(data, 'data/pyg_data/test_hetero_data_3_nodes_rich.pt')
else:
    # Load the HeteroData object if the file exists
    data = torch.load('data/pyg_data/test_hetero_data_3_nodes_rich.pt')
    print("The HeteroData object was loaded successfully.")

The HeteroData object was loaded successfully.


## Debugging and Proofs

In [101]:
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

In [102]:
train_graph_df["User_ID"].unique().shape

(3279,)

In [103]:
torch.tensor(train_graph_df["User_ID"].unique()), len(torch.tensor(train_graph_df["User_ID"].unique()))

(tensor([   0,    1,    2,  ..., 3304, 3305, 3306]), 3279)

In [104]:
artist_song_edge_index.max(), artist_song_edge_index.min(), artist_song_edge_index[:, 1].float().mean()

(tensor(155847), tensor(0), tensor(77923.5000))

In [105]:
torch.arange(len(song_ids)).max(), torch.arange(len(song_ids)).min(), torch.arange(len(song_ids)).float().mean()

(tensor(155847), tensor(0), tensor(77923.5000))

In [106]:
torch.unique(full_matrix_user_song_edge_indices[:, 1]).float().mean()

tensor(77923.5000)