# Data Inspection and Code understanding

In [7]:
from collections import defaultdict, Counter
from tqdm.notebook import tqdm
import networkx as nx
import random
from tqdm.auto import tqdm

import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Data Import via Pickle Files

In [2]:
import pickle

# First, extract the contents of dataset.rar to a folder

# Then, load the data from the dataset.pickle file
with open('data/MRecury_data/dataset.pickle', 'rb') as f:
    dataset = pickle.load(f)


full_graph = dataset['full']
train_graph = dataset['train']
test_graph = dataset['test']
users_mapping = dataset['users']
artist_tracks_mapping = dataset['artist-tracks']

# Accessing nodes and edges of the graphs:
full_nodes = full_graph.nodes()
full_edges = full_graph.edges()

## Data Inspection of User - Track Graph Data

### Inspection of the Graph Data

In [4]:
dataset = full_graph
print(f'Dataset: {dataset}:')
print('======================')

#print(f'Number of graphs: {len(full_graph)}') # seems like the number of graphs is wrong, this is identital with nodes
print(f'Number of nodes: {len(full_nodes)}')
print(f'Number of edges: {len(full_edges)}')
print(f'Number of User Nodes: {len(users_mapping)}')
print(f'Number of Tracks Nodes: {(len(full_nodes) - len(users_mapping))}')
print(f'Number of total (non unique) Artists in the MAPPING, NOT IN THE SONG NODES: {len(artist_tracks_mapping.keys())}')


#print(f'Number of features: {dataset.num_features}')
#print(f'Number of classes: {dataset.num_classes}')

Dataset: Graph with 255321 nodes and 3018209 edges:
Number of nodes: 255321
Number of edges: 3018209
Number of User Nodes: 3307
Number of Tracks Nodes: 252014
Number of total (non unique) Artists in the MAPPING, NOT IN THE SONG NODES: 28540


In [6]:
len(artist_tracks_mapping.values())

28540

In [6]:
print("Count of elements in the Nested Dictionary = ",sum(len(v) for v in artist_tracks_mapping.values()))
print(sum(len(v) for v in artist_tracks_mapping.values()) - 252014) # this means 6137 entries in the Mappings Dict are TOO MUCH! #doubles or something

Count of elements in the Nested Dictionary =  258151
6137


This is the first user node, which shows all songs (NodeID after 3307) the User has listened to. this denotes all connections from node 0. Since node zero has connections to 10127 track (song)  nodes (at least so it seems) it starts its connection at Node 3307 (the first track node, all nodes before are USER nodes).

In [7]:
full_graph[0] # user node

AtlasView({3307: {'scrobbles': 370, 'pos': inf, 'date': None}, 3308: {'scrobbles': 357, 'pos': inf, 'date': None}, 3309: {'scrobbles': 349, 'pos': inf, 'date': None}, 3310: {'scrobbles': 347, 'pos': inf, 'date': None}, 3311: {'scrobbles': 346, 'pos': inf, 'date': None}, 3312: {'scrobbles': 346, 'pos': inf, 'date': None}, 3313: {'scrobbles': 342, 'pos': inf, 'date': None}, 3314: {'scrobbles': 332, 'pos': inf, 'date': None}, 3315: {'scrobbles': 330, 'pos': inf, 'date': None}, 3316: {'scrobbles': 328, 'pos': inf, 'date': None}, 3317: {'scrobbles': 309, 'pos': 592, 'date': 'Friday 17 Sep 2021, 2:49pm'}, 3318: {'scrobbles': 288, 'pos': 588, 'date': 'Friday 17 Sep 2021, 3:05pm'}, 3319: {'scrobbles': 285, 'pos': 591, 'date': 'Friday 17 Sep 2021, 2:54pm'}, 3320: {'scrobbles': 283, 'pos': 590, 'date': 'Friday 17 Sep 2021, 2:58pm'}, 3321: {'scrobbles': 282, 'pos': 586, 'date': 'Friday 17 Sep 2021, 3:54pm'}, 3322: {'scrobbles': 282, 'pos': 587, 'date': 'Friday 17 Sep 2021, 3:08pm'}, 3323: {'scrob

In [8]:
full_graph[255320] # this is a song (track) node, which shows all the user that have listened to this song

AtlasView({147: {'scrobbles': 33, 'pos': 514, 'date': 'Thursday 16 Sep 2021, 1:02am'}, 349: {'scrobbles': 6, 'pos': inf, 'date': None}, 724: {'scrobbles': 4, 'pos': inf, 'date': None}, 1070: {'scrobbles': 14, 'pos': inf, 'date': None}, 1172: {'scrobbles': 88, 'pos': 1278, 'date': 'Tuesday 24 Aug 2021, 5:29pm'}, 1672: {'scrobbles': 1, 'pos': inf, 'date': None}, 1677: {'scrobbles': 1, 'pos': inf, 'date': None}, 1822: {'scrobbles': 40, 'pos': inf, 'date': None}})

### Descriptive Statistics on User and Track behaviour

#### User Node Iteration

#### Tracks Per User

In [9]:
total_users = 0
total_tracks = 0
max_tracks = float('-inf')
min_tracks = float('inf')

# loop through each user node to gather data
for user_node in range(3307):
    tracks_listened = len(full_graph[user_node])
    total_users += 1
    total_tracks += tracks_listened
    max_tracks = max(max_tracks, tracks_listened)
    min_tracks = min(min_tracks, tracks_listened)

# calculate the average tracks per user
average_tracks_per_user = total_tracks / total_users

# print out the stats
print("Descriptive Statistics:")
print("=================================================")
print(f"Total number of users: {total_users}")
print(f"Total number of tracks listened to (non unique = number of edges): {total_tracks}")
print(f"Average number of tracks per user: {average_tracks_per_user}")
print(f"Maximum tracks listened to by any user: {max_tracks}")
print(f"Minimum tracks listened to by any user: {min_tracks}")


Descriptive Statistics:
Total number of users: 3307
Total number of tracks listened to (non unique = nunmber of edges): 3018209
Average number of tracks per user: 912.6728152403991
Maximum tracks listened to by any user: 30015
Minimum tracks listened to by any user: 2


#### Scrobbles Per User

In [14]:
total_scrobbles = 0
total_positions = 0
total_dates = 0
valid_positions_count = 0
valid_dates_count = 0

min_scrobbles_per_user = float('inf')
max_scrobbles_per_user = 0

scrobbles_per_song = {}

# iterate over all user nodes
for user_node in range(3307):

    # initialize variables to store per user scrobbles
    user_scrobbles = 0
    user_valid_positions_count = 0

    # iterate over the data associated with each user node
    for node_data in full_graph[user_node].values():

        # check if 'scrobbles' key exists in the node data
        if 'scrobbles' in node_data:
            scrobbles = node_data['scrobbles']
            total_scrobbles += scrobbles
            user_scrobbles += scrobbles

            # update scrobbles per song dictionary
            if scrobbles_per_song.get(scrobbles) is None:
                scrobbles_per_song[scrobbles] = 1
            else:
                scrobbles_per_song[scrobbles] += 1

        # check if 'pos' key exists and is not infinite
        if 'pos' in node_data and isinstance(node_data['pos'], int):
            total_positions += node_data['pos']
            user_valid_positions_count += 1
            valid_positions_count += 1

        # check if 'date' key exists and is not None
        if 'date' in node_data and node_data['date'] is not None:
            total_dates += 1
            valid_dates_count += 1

    # update per user statistics
    if user_scrobbles > max_scrobbles_per_user:
        max_scrobbles_per_user = user_scrobbles
    if user_scrobbles < min_scrobbles_per_user:
        min_scrobbles_per_user = user_scrobbles

# calculate average position
if valid_positions_count > 0:
    average_position = total_positions / valid_positions_count
else:
    average_position = None

# calculate average date
if valid_dates_count > 0:
    average_date = total_dates / valid_dates_count
else:
    average_date = None

# calculate average scrobbles per user
average_scrobbles_per_user = total_scrobbles / 3307

# calculate average scrobbles per song
average_scrobbles_per_song = sum(scrobbles * count for scrobbles, count in scrobbles_per_song.items()) / sum(scrobbles_per_song.values())

# print descriptive statistics
print("Descriptive Statistics:")
print("=================================================")
print(f"Total scrobbles of all users: {total_scrobbles}")
print(f"Average scrobbles per user for each song: {average_scrobbles_per_song}")
print(f"Average scrobbles per user (over all songs): {average_scrobbles_per_user}")
print(f"Minimum scrobbles per user: {min_scrobbles_per_user}")
print(f"Maximum scrobbles per user (over all his songs): {max_scrobbles_per_user}")


Descriptive Statistics:
Total scrobbles of all users: 139602350
Average scrobbles per user for each song: 46.25337410364889
Average scrobbles per user (over all songs): 42214.1971575446
Minimum scrobbles per user: 3
Maximum scrobbles per user (over all his songs): 1359884


#### Track Nodes Iteration

Iteration through all track nodes to see if the information is correct while iterating over User nodes above

In [17]:
total_scrobbles_all_users = 0
total_songs = 0
max_scrobbles = float('-inf')
min_scrobbles = float('inf')

# Iterate over all song nodes
for song_node in range(3307, 255321):
    # Iterate over the data associated with each song node
    for user_node, node_data in full_graph[song_node].items():
        # Check if 'scrobbles' key exists in the node data
        if 'scrobbles' in node_data:
            scrobbles = node_data['scrobbles']
            total_scrobbles_all_users += scrobbles
            total_songs += 1
            # Update max and min scrobbles
            if scrobbles > max_scrobbles:
                max_scrobbles = scrobbles
                max_scrobbles_user_node = user_node
                max_scrobbles_song_node = song_node
            min_scrobbles = min(min_scrobbles, scrobbles)
            

# Calculate the average scrobbles across all users
average_scrobbles_all_users = total_scrobbles_all_users / total_songs

print(f"Total Scrobbles Across All Users: {total_scrobbles_all_users}")
print(f"Total Songs listened to (non unique = number of edges): {total_songs}")
print(f"Average Scrobbles Across All Users: {average_scrobbles_all_users}")
print(f"Maximum Scrobbles (per SONG! Across All Users: {max_scrobbles}")
print(f"User node with maximum scrobbles for one song: {max_scrobbles_user_node} with {max_scrobbles} scrobbles on Song with ID {max_scrobbles_song_node} ")
print(f"Minimum Scrobbles Across All Users: {min_scrobbles}")


Total Scrobbles Across All Users: 139602350
Total Songs listened to (non unique = number of edges): 3018209
Average Scrobbles Across All Users: 46.25337410364889
Maximum Scrobbles (per SONG! Across All Users: 205689
User node with maximum scrobbles for one song: 2344 with 205689 scrobbles on Song with ID 13637 
Minimum Scrobbles Across All Users: 1


## Inspection of the Mappings Data
Good for indepth understanding but next chapter "New Mapping" is the essential thing for how my new mapping came into place.

In [12]:
users_mapping

{'Enrique-': 0,
 'EduardoMol': 1,
 'DemetriDyslexik': 2,
 'unicef41': 3,
 'losena': 4,
 'jpw130855': 5,
 'felipemusky': 6,
 'felipe_89': 7,
 'camiloei': 8,
 'EriF_JR': 9,
 'Backstage_Rock': 10,
 'nirvaana_': 11,
 'miladi': 12,
 'djchaco': 13,
 'dizzydjc': 14,
 'conversemanman': 15,
 'Yavedu': 16,
 'Param0rexx_': 17,
 'Jeff_Serozini': 18,
 'JCCAKES': 19,
 'EnricoFranchi': 20,
 'Ehsandiary': 21,
 'Creepsnight': 22,
 'zero-inch': 23,
 'violaceousest': 24,
 'the_edster': 25,
 'maikcuritiba': 26,
 'imyyy': 27,
 'chocobooo': 28,
 'c0rts': 29,
 'barkbarkdisco': 30,
 'WichitaQ': 31,
 'TheRootsLife': 32,
 'LeoMetal965': 33,
 'waltercabellon': 34,
 'pellitero': 35,
 'masud_saedi': 36,
 'corky64': 37,
 'alinzainescu': 38,
 'Vintovka': 39,
 'VRec': 40,
 'STxza': 41,
 'NaturalStudio': 42,
 'Lain12': 43,
 'KarenValensi': 44,
 'DJGabster': 45,
 'ASTOKALOSOU': 46,
 'thunder__': 47,
 'loohop15': 48,
 'kyliesaysparty': 49,
 'amakiell': 50,
 'aemea': 51,
 'Tott_Di': 52,
 'Nihilistic23': 53,
 'MarchuSykes

In [13]:
artist_tracks_mapping

defaultdict(dict,
            {'Black Kids': {"I'm Not Gonna Teach Your Boyfriend How to Dance With You": 3307,
              'Hit The Heartbrakes': 3308,
              "I've Underestimated My Charm (Again)": 3309,
              'Partie Traumatic': 3310,
              "I'm Making Eyes at You": 3311,
              'Listen To Your Body Tonight': 3312,
              'Hurricane Jane': 3313,
              'Look at Me (When I Rock Wichoo)': 3314,
              'Love Me Already': 3315,
              'I Wanna Be Your Limousine': 3316,
              'My Christian Name': 3361,
              'Obligatory Drugs': 4107,
              'V-Card (Not Nuthin’)': 4498,
              'Natural Born Kissers': 4538,
              'In A Song': 4773,
              'If My Heart Is Broken': 4829,
              'All The Emotions': 4959,
              'Way Into Leather': 4960,
              'Hurricane Jane (The Cansecos Remix)': 54755,
              "I'm Making Eyes At You (Joy Electric Remix)": 87774,
            

In [14]:
len(users_mapping) #these are already the top 5% users they selected

3307

In [15]:
len(artist_tracks_mapping) #number of unique artists in the dataset (aligns with paper numbers table 1)

28540

In [16]:
len(artist_tracks_mapping.keys()) #number of unique artists

28540

In [17]:
sum(len(v) for v in artist_tracks_mapping.values()) # this should be all unique track but why around 6k more than in the paper
# something is counted double, either tracks and / or artists appear more than once

258151

### Artist Mappings and Song Nodes Comparison
Does every song map to an artist?

In [6]:
len(artist_tracks_mapping)

28540

In [59]:
from tqdm import tqdm
import pandas as pd
import os

# Check if the file exists
file_path = 'data/pre_processing/matched_songs.csv'
if not os.path.exists(file_path):
    matched_songs = []
    unmatched_song_nodes = []
    unique_artists = set()

    pbar = tqdm(total=255321 - 3307)
    #iterate over all song nodes
    for song_node_id in range(3307, 255321):
        # Check if the song node ID has a corresponding entry in the nested dictionary
        found = False
        for artist, tracks in artist_tracks_mapping.items():
            if song_node_id in tracks.values():
                found = True
                # Find the corresponding song name for the given ID
                song_name = next(key for key, value in tracks.items() if value == song_node_id)
                matched_songs.append({'Song_Node_ID': song_node_id, 'Song_Name': song_name, 'Artist': artist})
                #print(f"Song node ID {song_node_id} corresponds to the song '{song_name}' by '{artist}'")
                break # only searches for the FIRST song name it finds in the dictionary, after that it breaks
        if not found:
            unmatched_song_nodes.append(song_node_id)
            print(f"No corresponding song mapping found for song node ID: {song_node_id}")
        else:
            # Add the artist to the set
            unique_artists.add(artist)

        pbar.update(1)
    pbar.close()

    matched_songs_df = pd.DataFrame(matched_songs)
    matched_songs_df.to_csv(file_path, index=False)
    print("Unmatched song nodes:", unmatched_song_nodes)
else:
    matched_songs_df = pd.read_csv(file_path)
    print("Loaded existing matched songs DataFrame.")

Loaded existing matched songs DataFrame.


yes, all songs map to an artist

In [20]:
matched_songs_df[matched_songs_df['Artist'].isna()] # yes, all songs map to an artist

Unnamed: 0,Song_Node_ID,Song_Name,Artist


#### Matched Dataframe
= the Mapping with unique song IDs as a pandas df

Matched_songs_df = All Song Nodes mapped to their corresponding song via song ID and their artists 

In [21]:
print("Matched songs:")
matched_songs_df

Matched songs:


Unnamed: 0,Song_Node_ID,Song_Name,Artist
0,3307,I'm Not Gonna Teach Your Boyfriend How to Danc...,Black Kids
1,3308,Hit The Heartbrakes,Black Kids
2,3309,I've Underestimated My Charm (Again),Black Kids
3,3310,Partie Traumatic,Black Kids
4,3311,I'm Making Eyes at You,Black Kids
...,...,...,...
252009,255316,Base Line,j-hope
252010,255317,No,CLC
252011,255318,UFA,Djonga
252012,255319,Sixth Sense,Brown Eyed Girls


#### Comparison of the number of unique Artists in the Mapping Dict and the actual Song Nodes

In [22]:
print("Mapping Artists:",len(artist_tracks_mapping),",", "Song Node Artists:", len(matched_songs_df["Artist"].unique()))

Mapping Artists: 28540 , Song Node Artists: 28172


In [27]:
exessive_artists = len(artist_tracks_mapping) - len(matched_songs_df["Artist"].unique())
print(f"There are {exessive_artists} excessive artists in the artist mappings, that do NOT appear in the Song Nodes")

There are 368 excessive artists in the artist mappings, that do NOT appear in the Song Nodes


In [28]:
# Check if any "Song" values are empty
empty_songs = matched_songs_df['Song_Name'].isna().any()

# Check if any "Artist" values are empty
empty_artists = matched_songs_df['Artist'].isna().any()

print(f"Are there any empty 'Song' values? {empty_songs}")
print(f"Are there any empty 'Artist' values? {empty_artists}")
print(f"Hence, Each Song Node HAS a corresponding Song ID, Song Name and Artist Name")

Are there any empty 'Song' values? False
Are there any empty 'Artist' values? False
Hence, Each Song Node HAS a corresponding Song ID, Song Name and Artist Name


#### Mapping Dataframe 
Just to check back these facts from the other way around too. Converting the mapping to a df

In [29]:
import pandas as pd

data = []

# loop through each artist and their tracks in the nested dictionary
for artist, tracks in artist_tracks_mapping.items():
    # loop through each song and its corresponding song node ID
    for song, song_node_id in tracks.items():
        # add the data as a tuple to the list
        data.append((artist, song, song_node_id))

# create a DataFrame from the list of tuples
artist_tracks_mapping_df = pd.DataFrame(data, columns=['Artist', 'Song_Name', 'Song_Node_ID'])


In [30]:
artist_tracks_mapping_df

Unnamed: 0,Artist,Song_Name,Song_Node_ID
0,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,3307
1,Black Kids,Hit The Heartbrakes,3308
2,Black Kids,I've Underestimated My Charm (Again),3309
3,Black Kids,Partie Traumatic,3310
4,Black Kids,I'm Making Eyes at You,3311
...,...,...,...
258146,Jamie Lancaster,Boys Don't Cry,255203
258147,Sleeperstar,I Was Wrong,255208
258148,Anthony Naples,Mad Disrespect,255228
258149,Irene,Stardust,255253


#### HERE LIES THE ERROR OF THE MAPPED DATA

This explains why the mapped data has more entries than there are song nodes: Some songs appear multiple times with the same ID, but different naming conventions. Additionally as we will see, it also contains artists multiple times, due to naming ambiguities.

In [31]:
# Count the occurrences of each artist in the DataFrame
id_count = artist_tracks_mapping_df['Song_Node_ID'].value_counts()

# Filter the artists that are not unique (i.e., have counts greater than 1)
non_unique_ids = id_count[id_count > 1]

# Print the non-unique artists
print("Non-unique artists / these artists have their songs listed several times with the same song ID under different ambiguous artist names")
print(non_unique_ids)
#print(sum(non_unique_ids))

Non-unique artists / these artists have their songs listed several times with the same song ID under different ambiguous artist names
Song_Node_ID
15255     8
33233     5
10468     5
45860     5
10040     5
         ..
78285     2
107063    2
37878     2
177601    2
54880     2
Name: count, Length: 5640, dtype: int64


In [32]:
## check those ids
artist_tracks_mapping_df[artist_tracks_mapping_df['Song_Node_ID'] == 15255]

Unnamed: 0,Artist,Song_Name,Song_Node_ID
59357,Yann Tiersen,Comptine d'un autre été: L'Après-midi,15255
59390,Yann Tiersen,Comptine D'un Autre Ete L'apres Midi,15255
59414,Yann Tiersen,Comptine D'un Autre ete: L'après-midi,15255
59431,Yann Tiersen,Comptine d'un autre été: L'après midi,15255
59441,Yann Tiersen,"Comptine d'un Autre Ete, L'apres-midi",15255
59444,Yann Tiersen,Comptine d'un Autre Eté-L'Après Midi,15255
59459,Yann Tiersen,"Comptine d'un autre été, L'Après-midi",15255
59502,Yann Tiersen,Comptine D'un Autre Été: L'après Midi,15255


In [34]:
print("Number of  Songs in the Mapping Data:", len(artist_tracks_mapping_df ))
print("Number of unique SongsIDs in the Mapping Data:", len(artist_tracks_mapping_df["Song_Node_ID"].unique()), ", means there are some songs doubles as the above number is bigger. BUT it also means there are no more songIds than 252014 (Which is good, since every node gets a correct mapping then.")
print("Number of unique Artists in the Mapping Data:", len(artist_tracks_mapping_df["Artist"].unique()) )

Number of  Songs in the Mapping Data: 258151
Number of unique SongsIDs in the Mapping Data: 252014 , means there are some songs doubles as the above number is bigger. BUT it also means there are no more songIds than 252014 (Which is good, since every node gets a correct mapping then.
Number of unique Artists in the Mapping Data: 28540


#### Unique Artist Issue
Solved! There are some artists in the original mapping data that appear more often in diferent forms/ versions eg. "Avalanches" or "The Avalanches". They share some songs, but at least the SongID shared between the same artists with different writing is actually the same. 

In [35]:
print("Number of unique Artists in the Mappings Data:", len(artist_tracks_mapping_df["Artist"].unique()) )
print("Number of unique Artists in the Songe Node Data:", len(matched_songs_df["Artist"].unique()) )

Number of unique Artists in the Mappings Data: 28540
Number of unique Artists in the Songe Node Data: 28172


#### comparing unique artist sets

In [36]:
# Get the unique artists in each DataFrame
artists_mapping = set(artist_tracks_mapping_df["Artist"].unique())
artists_songs = set(matched_songs_df["Artist"].unique())

# Artists present in the mapping data but not in the song node data
missing_artists_in_songs = artists_mapping - artists_songs
#missing_artists_in_songs = [missing_artists_in_songs]

# Artists present in the song node data but not in the mapping data
missing_artists_in_mapping = artists_songs - artists_mapping

len(missing_artists_in_songs)

368

In [40]:
print("Artists present in the mapping data but not in the song node data (these are mostly some collab names when 2 artists make features):")
for artist in missing_artists_in_songs:
    print(artist)

Artists present in the mapping data but not in the song node data (these are mostly some collab names when 2 artists make features):
Dua Lipa, Angèle
Deorro X Chris Brown
Kanye West Feat. T-Pain
RAÇA NEGRA
Miranda
breeders
Armin Van Buuren feat. Christian Burns
Becky G, Natti Natasha
Louis Amstrong
Ruth B
Billie
Run–D.M.C.
Yusuf / Cat Stevens
Dexy's Midnight Runners
Paul & Fritz Kalkbrenner
Selena Gomez, Marshmello
Martin Garrix & Bebe Rexha
Elis Regina & Antônio Carlos Jobim
Aṣa
Camaron De La Isla
Roy Ayers Ubiquity
Armin Van Buuren feat. Jennifer Rene
Elton John & Kiki Dee
Mana
St Germain
Jorge Ben Jor
Sergio Mendes
Prodıgy
BEAT! BEAT! BEAT!
Patti Smith Group
Olafur Arnalds
Prince  The Revolution
Robyn Hitchcock and The Egyptians
Big Brother  The Holding Company
Rasmus
J Balvin & Willy William
Touch & Go
Archie Bell & The Drells
Pino D'Angio
Smiths
Sam Smith, Normani
Ting Tings
Tornados
Dam Funk
Carl B
Bomba Estereo
Bruce Springsteen & The E Street Band
Armin Van Buuren feat. Jaren
R

In [38]:
matched_songs_df[matched_songs_df['Song_Node_ID'] == 13236]

Unnamed: 0,Song_Node_ID,Song_Name,Artist
9929,13236,Since I Left You,The Avalanches


In [39]:
print(artist_tracks_mapping["Avalanches"])
print(artist_tracks_mapping["The Avalanches"])

{'Since I Left You': 13236}
{'Since I Left You': 13236, 'Colours': 24607, 'Harmony': 24609, 'Wildflower': 43380, 'We Will Always Love You (feat. Blood Orange)': 70172, 'Light Up': 70174, 'Live A Lifetime Love': 70175, 'Park Music': 70176, 'The Noisy Eater': 70178, 'Zap!': 70181, 'Tonight May Have to Last Me All My Life': 70182, 'Sunshine': 109275, 'Over The Turnstiles': 109277, 'The Wozard Of Iz': 109278, 'We Will Always Love You': 109280, 'Frontier Psychiatrist': 130956, 'Stay Another Season': 138854, 'Electricity': 153375, 'Radio': 155173, 'Two Hearts in 3/4 Time': 157093, 'A Different Feeling': 157095, 'Flight Tonight': 163893, 'Little Journey': 166286, 'Close to You': 166287, 'Diners Only': 168513, 'Subways': 168514, 'ETOH': 171050, 'Summer Crane': 174151, "Pablo's Cruise": 177658, 'Live at Dominoes': 177659, 'Extra Kings': 181985, 'Frankie Sinatra - Extended Mix': 212883, 'Kaleidoscopic Lovers': 212885, "Livin' Underwater (Is Somethin' Wild)": 212887, 'Saturday Night Inside Out': 

#### Final Error Proof and Explenation:

In [43]:
print("Count of elements in the Nested Dictionary = ",sum(len(v) for v in artist_tracks_mapping.values())) # this means 6137 entries in the Mappings Dict are TOO MUCH! #doubles or something
print("Count of elements in the Matched Song Nodes = ", len(matched_songs_df))
print("Number of Elements (Songs) that are too much in the mapping data: ",sum(len(v) for v in artist_tracks_mapping.values()) - len(matched_songs_df) )

Count of elements in the Nested Dictionary =  258151
Count of elements in the Matched Song Nodes =  252014
Number of Elements (Songs) that are too much in the mapping data:  6137


In [53]:
print("Number of  Artists in the Mappings Data:", len(artist_tracks_mapping_df["Artist"].unique()) )
print("Number of  Artists in the Songe Node Data:", len(matched_songs_df["Artist"].unique()) )
print("Number of  Artists that are too much in the mapping data: ", len(artist_tracks_mapping_df["Artist"].unique()) - len(matched_songs_df["Artist"].unique()) ) 

Number of  Artists in the Mappings Data: 28540
Number of  Artists in the Songe Node Data: 28172
Number of  Artists that are too much in the mapping data:  368


In [54]:
len(non_unique_ids) + (len(artist_tracks_mapping_df["Artist"].unique()) - len(matched_songs_df["Artist"].unique()))

6008

In [55]:
(sum(len(v) for v in artist_tracks_mapping.values()) - len(matched_songs_df)) - (len(non_unique_ids) + (len(artist_tracks_mapping_df["Artist"].unique()) - len(matched_songs_df["Artist"].unique())))

129

There are still some ~129 elements that are unclear why they are missing that will be inspected in more detail below. BUT The main takeaway is that the mappings data is just a bit ambigous because of differences in spelling. But this doesnt matter since even when the artist or song name is spelled wrong our appears multiple times, the SongID is always unqiue and matches across those occurences.

#### Final Proof: Comparing the dfs

In [50]:
# Perform a left join on the artist_tracks_mapping_df and matched_songs_df

merged_df = artist_tracks_mapping_df.merge(matched_songs_df, how='left', indicator=True)

# Filter out the rows where the entry is only present in artist_tracks_mapping_df
unmatched_rows = merged_df[merged_df['_merge'] == 'left_only']

# Drop the indicator column '_merge'
unmatched_rows.drop(columns='_merge', inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unmatched_rows.drop(columns='_merge', inplace=True)


Song IDs that appear multiple times with different artist aliases

In [51]:
unmatched_rows

Unnamed: 0,Artist,Song_Name,Song_Node_ID
21,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,87775
56,Foster the People,Don?t Stop (Color On The Walls,3340
58,Foster the People,Don't Stop (Color On The Walls,3340
81,The Vaccines,Norgaard,3377
413,Two Door Cinema Club,Do You Want It All,3345
...,...,...,...
257906,Otis Redding & Carla Thomas,Tramp,190879
257984,Johnny Cash & June Carter,Jackson,34098
258036,Septic Flesh,A Great Mass of Death,84628
258037,Septic Flesh,Pyramid God,53285


In [52]:
matched_songs_df[matched_songs_df['Song_Name'] == "Tramp"]

Unnamed: 0,Song_Node_ID,Song_Name,Artist
175045,178352,Tramp,Buddy Guy
187572,190879,Tramp,Otis Redding


In [81]:
artist_tracks_mapping["Otis Redding"]["Tramp"]

190879

In [84]:
artist_tracks_mapping["Otis Redding & Carla Thomas"]

{'Tramp': 190879}

In [85]:
artist_tracks_mapping["Carla Thomas"]

{'Gee Whiz': 70635,
 'I Kinda Think He Does': 109779,
 'B-A-B-Y': 174185,
 "You'll Lose A Good Thing": 213825}

As we see Otis Redding has the Song "Tramp" with ID 190879. But also "Otis Redding & Carla Thomas" have the same song. but Carla Thomas alone, does not have this Song.

In [56]:
# Extract unique artists from the unmatched_rows DataFrame
unique_artists_unmatched = unmatched_rows['Artist'].unique()

# Filter out artists that are present in the Song Nodes DataFrame
artists_not_in_song_nodes = [artist for artist in unique_artists_unmatched if artist not in matched_songs_df['Artist'].unique()]

In [60]:
print("There are", len(unmatched_rows) ," songs that DONT MATCH IN ALL 3 Columns. This probably means thatthe Node ID is the same, but the artist or the Song Name doesnt match for those occurence eg. because of wrong spelling or special characters.")
print("Out of these songs, there are  ",len(artists_not_in_song_nodes), " artists that are in the Mappings data but not in the Nodes data")


There are 6137  songs that DONT MATCH IN ALL 3 Columns. This probably means thatthe Node ID is the same, but the artist or the Song Name doesnt match for those occurence eg. because of wrong spelling or special characters.
Out of these songs, there are   368  artists that are in the Mappings data but not in the Nodes data


In [61]:
# Display the artists that are not in the Song Nodes DataFrame
print("Unique artists in unmatched rows but not in Song Nodes DataFrame:")
len(artists_not_in_song_nodes)

Unique artists in unmatched rows but not in Song Nodes DataFrame:


368

## New Mapping
Create a new more accurate mapping of artist- Tracks with no doubles in artists and tracks 

#### Investigate Ambiguous Songs

In [149]:
from collections import defaultdict
from tqdm import tqdm

# Initialize a new mapping dictionary with default values as lists
song_artist_mapping = defaultdict(list)

# Create a tqdm instance to track the progress
pbar = tqdm(total=len(artist_tracks_mapping.items()))

# Iterate over the original mapping dictionary
for artist, songs in artist_tracks_mapping.items():
    # Iterate over the songs for each artist
    for song, song_id in songs.items():
        # Append the song ID to the list of IDs for the current artist
        song_artist_mapping[song].append((artist, song_id))

    # Update tqdm progress bar
    pbar.update(1)

# Close the tqdm progress bar
pbar.close()

# Convert the defaultdict to a regular dictionary
song_artist_mapping = dict(song_artist_mapping)


100%|██████████| 28540/28540 [00:00<00:00, 303445.06it/s]


In [146]:
song_artist_mapping # one song name can have mutliple artists with different song IDS, since many artists give their song the same name

{"I'm Not Gonna Teach Your Boyfriend How to Dance With You": [('Black Kids',
   3307)],
 'Hit The Heartbrakes': [('Black Kids', 3308)],
 "I've Underestimated My Charm (Again)": [('Black Kids', 3309)],
 'Partie Traumatic': [('Black Kids', 3310)],
 "I'm Making Eyes at You": [('Black Kids', 3311)],
 'Listen To Your Body Tonight': [('Black Kids', 3312)],
 'Hurricane Jane': [('Black Kids', 3313)],
 'Look at Me (When I Rock Wichoo)': [('Black Kids', 3314)],
 'Love Me Already': [('Black Kids', 3315)],
 'I Wanna Be Your Limousine': [('Black Kids', 3316)],
 'My Christian Name': [('Black Kids', 3361)],
 'Obligatory Drugs': [('Black Kids', 4107)],
 'V-Card (Not Nuthin’)': [('Black Kids', 4498)],
 'Natural Born Kissers': [('Black Kids', 4538)],
 'In A Song': [('Black Kids', 4773)],
 'If My Heart Is Broken': [('Black Kids', 4829)],
 'All The Emotions': [('Black Kids', 4959)],
 'Way Into Leather': [('Black Kids', 4960)],
 'Hurricane Jane (The Cansecos Remix)': [('Black Kids', 54755)],
 "I'm Making E

In [141]:
from collections import defaultdict
from tqdm import tqdm

# Create a dictionary to store songs associated with multiple artists
ambiguous_songs = defaultdict(list)

# Create a tqdm instance to track the progress
pbar = tqdm(total=len(artist_tracks_mapping.items()))

# Iterate over the original mapping dictionary
for artist, songs in artist_tracks_mapping.items():
    # Iterate over the songs for each artist
    for song, song_id in songs.items():
        # Append the artist to the list of artists for the current song ID
        ambiguous_songs[song_id].append(artist)

    # Update tqdm progress bar
    pbar.update(1)

# Close the tqdm progress bar
pbar.close()

# Filter the dictionary to include only song IDs associated with multiple artists
ambiguous_songs = {song_id: artists for song_id, artists in ambiguous_songs.items() if len(artists) > 1}


100%|██████████| 28540/28540 [00:00<00:00, 390460.53it/s]


In [142]:
ambiguous_songs

{87775: ['Black Kids', 'Black Kids'],
 3340: ['Foster the People', 'Foster the People', 'Foster the People'],
 3377: ['The Vaccines', 'The Vaccines'],
 3336: ['Panic! at the Disco', 'Panic at the Disco'],
 3550: ['Panic! at the Disco', 'Panic at the Disco'],
 3577: ['Panic! at the Disco', 'Panic at the Disco'],
 3721: ['Panic! at the Disco', 'Panic at the Disco'],
 3774: ['Panic! at the Disco', 'Panic at the Disco'],
 4988: ['Panic! at the Disco', 'Panic at the Disco'],
 5110: ['Panic! at the Disco', 'Panic at the Disco'],
 5230: ['Panic! at the Disco', 'Panic at the Disco'],
 5581: ['Panic! at the Disco', 'Panic at the Disco'],
 5764: ['Panic! at the Disco', 'Panic at the Disco'],
 5765: ['Panic! at the Disco', 'Panic at the Disco'],
 5985: ['Panic! at the Disco', 'Panic at the Disco'],
 6082: ['Panic! at the Disco', 'Panic at the Disco'],
 6300: ['Panic! at the Disco', 'Panic at the Disco'],
 3337: ['The xx', 'XX'],
 3360: ['The xx', 'XX'],
 3366: ['The xx', 'XX'],
 3373: ['The xx', 

In [143]:
len(ambiguous_songs.keys())

5640

In [70]:
from collections import Counter

# Convert the lists of artists to tuples and count their occurrences
unique_artist_sets = Counter(tuple(artists) for artists in ambiguous_songs.values())

# Print the count of unique sets of alternative spelling artists
print("Number of unique sets of alternative spelling artists:", len(unique_artist_sets))


Number of unique sets of alternative spelling artists: 2579


In [113]:
# Print the count of each unique set of alternative spelling artists
for artists, count in unique_artist_sets.items():
    print(f"Artist set {artists} appears {count} times")

Artist set ('Black Kids', 'Black Kids') appears 1 times
Artist set ('Foster the People', 'Foster the People', 'Foster the People') appears 1 times
Artist set ('The Vaccines', 'The Vaccines') appears 1 times
Artist set ('Panic! at the Disco', 'Panic at the Disco') appears 14 times
Artist set ('The xx', 'XX') appears 6 times
Artist set ('Two Door Cinema Club', 'Two Door Cinema Club') appears 2 times
Artist set ('White Lies', 'White Lies') appears 1 times
Artist set ('Empire of the Sun', 'Empire of the Sun') appears 1 times
Artist set ('The Ting Tings', 'Ting Tings') appears 1 times
Artist set ('Arcade Fire', 'Arcade Fire') appears 10 times
Artist set ('Arcade Fire', 'Arcade Fire', 'Arcade Fire') appears 1 times
Artist set ('Alabama Shakes', 'Alabama Shakes') appears 1 times
Artist set ('Lorde', 'Lorde') appears 1 times
Artist set ('The 1975', 'The 1975') appears 2 times
Artist set ('The Wombats', 'The Wombats') appears 2 times
Artist set ('Marina & the Diamonds', 'Marina') appears 11 tim

### New mapping Dictionary
Basically to prevent double artists with different spellings we only take the FIRST artist names appearances and map all songs belonging to this one artist

In [125]:
new_artist_song_mapping = defaultdict(dict)
existing_song_ids = []

pbar = tqdm(total=len(artist_tracks_mapping.items()))
    
# Iterate over the original mapping dictionary
for artist, songs in artist_tracks_mapping.items():
        # Iterate over the songs for each artist
        for song, song_id in songs.items():
            # Compare if Song_ids are already present in song id List
            if song_id not in existing_song_ids:
                
                #print for error tracing:
                #print("New Song",artist,song,song_id)
                
                #add to "counter" of song_ids
                existing_song_ids.append(song_id)
                # add this artist and its song with its id to the dict
                new_artist_song_mapping[artist][song] = song_id

            else:
                #print for error tracing:
                #print("Song already present",artist, song, song_id)
                pass
    
        # Update tqdm progress bar
        pbar.update(1)

# Close the tqdm progress bar
pbar.close()

100%|██████████| 28540/28540 [01:48<00:00, 261.89it/s] 


In [None]:
new_artist_song_mapping

defaultdict(dict,
            {'Black Kids': {"I'm Not Gonna Teach Your Boyfriend How to Dance With You": 3307,
              'Hit The Heartbrakes': 3308,
              "I've Underestimated My Charm (Again)": 3309,
              'Partie Traumatic': 3310,
              "I'm Making Eyes at You": 3311,
              'Listen To Your Body Tonight': 3312,
              'Hurricane Jane': 3313,
              'Look at Me (When I Rock Wichoo)': 3314,
              'Love Me Already': 3315,
              'I Wanna Be Your Limousine': 3316,
              'My Christian Name': 3361,
              'Obligatory Drugs': 4107,
              'V-Card (Not Nuthin’)': 4498,
              'Natural Born Kissers': 4538,
              'In A Song': 4773,
              'If My Heart Is Broken': 4829,
              'All The Emotions': 4959,
              'Way Into Leather': 4960,
              'Hurricane Jane (The Cansecos Remix)': 54755,
              "I'm Making Eyes At You (Joy Electric Remix)": 87774,
            

In [None]:
print("Number of Unique Artist:", len(new_artist_song_mapping))

Number of Unique Artist: 28120


In [None]:
print("Count of elements (Unique Song_ids) in the New Mappings = ",sum(len(v) for v in new_artist_song_mapping.values()))

Count of elements (Unique Song_ids) in the New Mappings =  252013


In [None]:
print(f"Has {len(artist_tracks_mapping.values()) - len(new_artist_song_mapping)} less Artist Nodes than the original mapping, by removing duplicate artists")

Has 421 less Artist Nodes than the original mapping, by removing duplicate artists


#### Ambiguous Artists
check if there are any more ambiguous songs: there are non, dic is empty. This means that 1 song is always only assigned to one artist

In [94]:
from collections import defaultdict
from tqdm import tqdm

# Create a dictionary to store songs associated with multiple artists
ambiguous_songs = defaultdict(list)

# Create a tqdm instance to track the progress
pbar = tqdm(total=len(new_artist_song_mapping.items()))

# Iterate over the original mapping dictionary
for artist, songs in new_artist_song_mapping.items():
    # Iterate over the songs for each artist
    for song, song_id in songs.items():
        # Append the artist to the list of artists for the current song ID
        ambiguous_songs[song_id].append(artist)

    # Update tqdm progress bar
    pbar.update(1)

# Close the tqdm progress bar
pbar.close()

# Filter the dictionary to include only song IDs associated with multiple artists
ambiguous_songs = {song_id: artists for song_id, artists in ambiguous_songs.items() if len(artists) > 1}
ambiguous_songs

  0%|          | 0/793661584 [01:11<?, ?it/s]
100%|██████████| 28172/28172 [00:00<00:00, 45750.58it/s]


{}

#### New Mapping as DF
We can see that we have 1 Song / Song Node ID Less the what is written in the Paper and in the our Matched original Nodes and Mapped Data from the Paper. This song ID will be identified here.

In [150]:
import pandas as pd

# Check if the DataFrame already exists
try:
    new_artist_tracks_mapping_df = pd.read_csv('data/new_artist_tracks_mapping_df.csv')
except FileNotFoundError:
    # Initialize an empty list to store the data
    data = []

    # Iterate over each artist and their tracks in the nested dictionary
    for artist, tracks in new_artist_song_mapping.items():
        # Iterate over each song and its corresponding song node ID
        for song, song_node_id in tracks.items():
            # Append the data as a tuple to the list
            data.append((artist, song, song_node_id))

    # Create a DataFrame from the list of tuples
    new_artist_tracks_mapping_df = pd.DataFrame(data, columns=['Artist', 'Song_Name', 'Song_Node_ID'])
    # Save the DataFrame to a CSV file
    new_artist_tracks_mapping_df.to_csv('data/new_artist_tracks_mapping_df.csv', index=False)

In [151]:
new_artist_tracks_mapping_df

Unnamed: 0,Artist,Song_Name,Song_Node_ID
0,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,3307
1,Black Kids,Hit The Heartbrakes,3308
2,Black Kids,I've Underestimated My Charm (Again),3309
3,Black Kids,Partie Traumatic,3310
4,Black Kids,I'm Making Eyes at You,3311
...,...,...,...
252008,Jamie Lancaster,Boys Don't Cry,255203
252009,Sleeperstar,I Was Wrong,255208
252010,Anthony Naples,Mad Disrespect,255228
252011,Irene,Stardust,255253


In [152]:
len(new_artist_tracks_mapping_df["Song_Node_ID"].unique())

252013

#### identify missing song_ID

In [153]:
df = new_artist_tracks_mapping_df
# Generate a list of expected song IDs from 3307 to 255320
expected_ids = list(range(3307, 255321))

# Check if each expected ID is present in the dataframe and if they are consistently rising by 1
consistent_rise = all(song_id == expected_ids[i] for i, song_id in enumerate(df['Song_Node_ID']))

# Check if all expected IDs are present in the dataframe
all_ids_present = all(song_id in df['Song_Node_ID'].values for song_id in expected_ids)

print("Consistently rising by 1:", consistent_rise)
print("All IDs from 3307 to 255320 present:", all_ids_present)


Consistently rising by 1: False
All IDs from 3307 to 255320 present: False


In [154]:
# Generate a list of expected song IDs from 3307 to 255320
expected_ids = set(range(3307, 255321))

# Get the set of song IDs present in the dataframe
present_ids = set(df['Song_Node_ID'])

# Find the missing IDs
missing_ids = expected_ids - present_ids

print("Missing IDs:", missing_ids)


Missing IDs: {68691}


#### Find missing Song

In [None]:
# my new mapping as a df, correctly has this song (68691, "Cavalo de Fogo") only once

In [155]:
new_artist_tracks_mapping_df[new_artist_tracks_mapping_df['Song_Node_ID'] == 68691]

Unnamed: 0,Artist,Song_Name,Song_Node_ID


In [156]:
new_artist_tracks_mapping_df[new_artist_tracks_mapping_df['Song_Node_ID'] == 194224]

Unnamed: 0,Artist,Song_Name,Song_Node_ID
72040,Banda UÓ,Cavalo de Fogo,194224


In [157]:
new_artist_tracks_mapping_df[new_artist_tracks_mapping_df['Song_Node_ID'] == 255320]


Unnamed: 0,Artist,Song_Name,Song_Node_ID
97174,BaianaSystem,Capim Guiné,255320


In [158]:
new_artist_tracks_mapping_df[new_artist_tracks_mapping_df['Song_Name'] == "Cavalo de Fogo"]

Unnamed: 0,Artist,Song_Name,Song_Node_ID
72040,Banda UÓ,Cavalo de Fogo,194224


In [159]:
new_artist_tracks_mapping_df[new_artist_tracks_mapping_df['Artist'] == "Banda UÓ"]

Unnamed: 0,Artist,Song_Name,Song_Node_ID
72028,Banda UÓ,Sonho Molhado,14865
72029,Banda UÓ,Malandro,14866
72030,Banda UÓ,I <3 Cafuçú,14867
72031,Banda UÓ,Gringo,35447
72032,Banda UÓ,Arregaçada / U Can't Touch This,59171
72033,Banda UÓ,Vânia,59173
72034,Banda UÓ,Catraca - Faixa Bônus,68684
72035,Banda UÓ,X-Bacon,68685
72036,Banda UÓ,Na Varanda,68686
72037,Banda UÓ,Buzios Do Coração,68687


#### Missing Song in OG Mapping
Original Mapping has 1 SONG mapped TWICE (2x) to the same IDs!

In [160]:
artist_tracks_mapping["Banda UÓ"]["Cavalo de Fogo"]

68691

In [161]:
artist_tracks_mapping["Banda Uó"]["Cavalo de Fogo"]

194224

In [162]:
len(artist_tracks_mapping_df["Song_Node_ID"].unique())

252014

In [163]:
artist_tracks_mapping_df[artist_tracks_mapping_df['Song_Node_ID'] == 68691]

Unnamed: 0,Artist,Song_Name,Song_Node_ID
74634,Banda UÓ,Cavalo de Fogo,68691


In [164]:
artist_tracks_mapping_df[artist_tracks_mapping_df['Song_Node_ID'] == 194224]

Unnamed: 0,Artist,Song_Name,Song_Node_ID
249885,Banda Uó,Cavalo de Fogo,194224


In [165]:
# comparing the duplicates in the respective DFs

# Social Graph from LastFM (User-User Graph)

In [167]:
def load_social(file_users, file_edges, users_ids):
    df_users = pd.read_csv(file_users, sep='\t', names=['id', 'user'])
    df_edges = pd.read_csv(file_edges, sep=' ', names=['origin', 'destination'])
    old_new = {}
    for _, r in tqdm(df_users.iterrows(), total=len(df_users)):
        if r['user'] in users_ids:
            old_new[r['id']] = users_ids[r['user']]
    social_graph = nx.DiGraph()
    social_graph.add_nodes_from(old_new.values())
    for _, r in tqdm(df_edges.iterrows(), total=len(df_edges)):
        if r['origin'] in old_new and r['destination'] in old_new:
            social_graph.add_edge(old_new[r['origin']], old_new[r['destination']])
    return social_graph

social_graph = load_social('data/MRecury_data/lastfm.nodes', 'data/MRecury_data/lastfm.edges', users_mapping)

100%|██████████| 136420/136420 [00:01<00:00, 79483.07it/s]
100%|██████████| 1685524/1685524 [00:17<00:00, 97339.90it/s] 


### Inspection of Social Graph

In [170]:
print(f'Dataset: {social_graph}:')
print('======================')

print(f'Number of user nodes: {len(social_graph.nodes)}')
print(f'Number of edges (connections between users): {len(social_graph.edges)}')
# Corrected the line to calculate the average degree
print(f'Average degree (average number of connections per user): {sum(degree for _, degree in social_graph.degree()) / len(social_graph.nodes)}')
print(f'Network density (proportion of all possible connections): {nx.density(social_graph)}')
print(f'Average clustering coefficient (measure of clustering): {nx.average_clustering(social_graph)}')

Dataset: DiGraph with 3307 nodes and 142919 edges:
Number of user nodes: 3307
Number of edges (connections between users): 142919
Average degree (average number of connections per user): 86.43423042032053
Network density (proportion of all possible connections): 0.013072327649776245
Average clustering coefficient (measure of clustering): 0.12105636762162594


# Load Tags Data

In [174]:
import pickle


# Then, load the data from the tags.pickle file
with open('data/MRecury_data/tags_artist.pickle', 'rb') as f:
    dataset = pickle.load(f)

# Extract tag_id and artist_id dictionaries
tag_id = dataset['tag_id']
artist_id = dataset['artist_id']

# Example usage:
# Get the ID of a specific tag
tag_name = 'rock'
tag_id_of_rock = tag_id.get(tag_name, None)
if tag_id_of_rock is not None:
    print(f"The ID of the tag '{tag_name}' is {tag_id_of_rock}")

# Get the ID of a specific artist
artist_name = 'adam johnson'
artist_id_of_beatles = artist_id.get(artist_name, None)
if artist_id_of_beatles is not None:
    print(f"The ID of the artist '{artist_name}' is {artist_id_of_beatles}")


The ID of the tag 'rock' is 5
The ID of the artist 'adam johnson' is 367


In [175]:
dataset["tag_id"]

{'indie pop': 1,
 'indie rock': 2,
 'indie': 3,
 'black kids': 4,
 'rock': 5,
 'alternative': 6,
 'alternative rock': 7,
 'pop': 8,
 'indie disco': 9,
 'electropop': 10,
 'love at first listen': 11,
 'summer': 12,
 'happy': 13,
 'catchy': 14,
 'electronic': 15,
 'foster the people': 16,
 '2011': 17,
 'why on earth is this just a bonus track': 18,
 'british': 19,
 'beautiful': 20,
 'post-punk': 21,
 'if this were a pokemon i would catch it': 22,
 'poptron': 23,
 'folk': 24,
 'american': 25,
 'folk rock': 26,
 'synth indie rock': 27,
 'uplifting': 28,
 'mellow': 29,
 '10s': 30,
 'icelandic': 31,
 'indie folk': 32,
 '2013': 33,
 'birp': 34,
 'acoustic': 35,
 'two door cinema club': 36,
 'lush': 37,
 'banned': 38,
 'short': 39,
 'britpop': 40,
 'indie11': 41,
 'eclectonia': 42,
 'indietronica': 43,
 'electro': 44,
 'digitalis': 45,
 'minimal': 46,
 'new wave': 47,
 'fun': 48,
 'dance-punk': 49,
 'post-punk revival': 50,
 'dance-rock': 51,
 'electrorock': 52,
 'pherfavs': 53,
 'synthpop': 5

In [176]:
dataset["artist_id"]

{'robin schulz': 0,
 'darya dadvar': 1,
 'colle der fomento': 2,
 'charlie parker quartet': 3,
 'anastasija': 4,
 'nik turner': 5,
 'sounds of blackness': 6,
 'linda perhacs': 7,
 'dark sanctuary': 8,
 'buried inside': 9,
 'the naked and famous': 10,
 'ersen': 11,
 'scouting for girls': 12,
 'sienna skies': 13,
 'tamia': 14,
 'os vips': 15,
 'john lunn': 16,
 'self esteem': 17,
 'night beds': 18,
 'vali': 19,
 '6pm': 20,
 'vinicio capossela': 21,
 'aware': 22,
 'george jackson': 23,
 'ryukyu underground': 24,
 'baxter dury': 25,
 'junior boys': 26,
 'slow club': 27,
 'sea oleena': 28,
 'lifehouse': 29,
 'evereve': 30,
 'ponytail': 31,
 'andy caldwell': 32,
 'extermination dismemberment': 33,
 'max gazze': 34,
 'bizzy bone': 35,
 'lipstick': 36,
 'hinds': 37,
 'sr-71': 38,
 'instra:mental': 39,
 'bosco': 40,
 'trentemoller': 41,
 'william onyeabor': 42,
 'our last hope lost hope': 43,
 'extreme': 44,
 'senking': 45,
 'neurotic outsiders': 46,
 'schwefelgelb': 47,
 'anohni': 48,
 'matt m

In [177]:
len(dataset["artist_id"])

28152

In [178]:
print(f'Number of Tags in Tags File: {len(dataset["tag_id"])}')

Number of Tags in Tags File: 8438


In [179]:
print(f'Number of artists in Tags File: {len(dataset["artist_id"])}')

Number of artists in Tags File: 28152


## Load dataset "Typed Pickle"
This holds the "type" information on the Graph dataset, type for "user" and "Track" node. I should have started with this, but I didnt know. 
Doesnt make much difference, but the majority of important insights i find here again, and in a more accessible form since I have all data in dfs instead of dicts.
Plus I compare Full , Train and Test data only here, not above.

In [180]:
import pickle


# Then, load the data from the tags.pickle file
with open('data/MRecury_data/dataset_typed.pickle', 'rb') as f:
    dataset = pickle.load(f)

In [181]:
# Now you can access the different parts of the dataset:
full_graph = dataset['full']
train_graph = dataset['train']
test_graph = dataset['test']
users_mapping = dataset['users']
artist_tracks_mapping = dataset['artist-tracks']

# Accessing nodes and edges of the graphs:
# For example, to access nodes and edges of the full graph:
full_nodes = full_graph.nodes()
full_edges = full_graph.edges()

In [182]:
len(users_mapping)

3307

In [183]:
len(artist_tracks_mapping)

28540

In [184]:
dataset.keys()

dict_keys(['full', 'train', 'test', 'users', 'artist-tracks'])

In [185]:
dataset['users']

{'Enrique-': 0,
 'EduardoMol': 1,
 'DemetriDyslexik': 2,
 'unicef41': 3,
 'losena': 4,
 'jpw130855': 5,
 'felipemusky': 6,
 'felipe_89': 7,
 'camiloei': 8,
 'EriF_JR': 9,
 'Backstage_Rock': 10,
 'nirvaana_': 11,
 'miladi': 12,
 'djchaco': 13,
 'dizzydjc': 14,
 'conversemanman': 15,
 'Yavedu': 16,
 'Param0rexx_': 17,
 'Jeff_Serozini': 18,
 'JCCAKES': 19,
 'EnricoFranchi': 20,
 'Ehsandiary': 21,
 'Creepsnight': 22,
 'zero-inch': 23,
 'violaceousest': 24,
 'the_edster': 25,
 'maikcuritiba': 26,
 'imyyy': 27,
 'chocobooo': 28,
 'c0rts': 29,
 'barkbarkdisco': 30,
 'WichitaQ': 31,
 'TheRootsLife': 32,
 'LeoMetal965': 33,
 'waltercabellon': 34,
 'pellitero': 35,
 'masud_saedi': 36,
 'corky64': 37,
 'alinzainescu': 38,
 'Vintovka': 39,
 'VRec': 40,
 'STxza': 41,
 'NaturalStudio': 42,
 'Lain12': 43,
 'KarenValensi': 44,
 'DJGabster': 45,
 'ASTOKALOSOU': 46,
 'thunder__': 47,
 'loohop15': 48,
 'kyliesaysparty': 49,
 'amakiell': 50,
 'aemea': 51,
 'Tott_Di': 52,
 'Nihilistic23': 53,
 'MarchuSykes

In [186]:
g = dataset['full']
node_data = g.nodes.data()
node_data[1]

{'type': 'user'}

### Inspect Node Type

In [187]:

user_node_ids = []
song_node_ids = []

for node_id, nested_data in node_data:
    for key, value in nested_data.items():
        if value == 'user':
            user_node_ids.append(node_id)
        if value == 'track':
            song_node_ids.append(node_id)
        else:
            pass

In [188]:
user_nodes_df = pd.DataFrame((user_node_ids), columns =['User_node_IDS'])
track_nodes_df = pd.DataFrame((song_node_ids), columns =['Song_node_IDS'])

In [189]:
user_nodes_df

Unnamed: 0,User_node_IDS
0,0
1,1
2,2
3,3
4,4
...,...
3302,3302
3303,3303
3304,3304
3305,3305


In [190]:
track_nodes_df

Unnamed: 0,Song_node_IDS
0,3307
1,3308
2,3309
3,3310
4,3311
...,...
252009,124803
252010,115042
252011,122571
252012,116122


In [191]:
## this number 252014  has nothing to do with the MAPPING song number / ids!!! Remember these nodes are just created because the mapping "artist-tracks" says so.

In [192]:
missing_nodes = []

for node_id in range(3307):
    if node_id not in g:
        missing_nodes.append(node_id)

print("Missing nodes in the range 0-3306:", missing_nodes)

Missing nodes in the range 0-3306: []


### Inspect Train Dataset

In [197]:
g_train = train_graph
train_node_data = g_train.nodes.data()

In [198]:

train_user_node_ids = {}
train_song_node_ids = {}

for node_id, nested_data in train_node_data:
    for key, value in nested_data.items():
        if value == 'user':
            train_user_node_ids[node_id] = value
        if value == 'track':
            train_song_node_ids[node_id] = value
        else:
            pass

In [199]:
train_user_nodes_df = pd.DataFrame(list(train_user_node_ids.items()), columns=['Node ID', 'Type'])
train_track_nodes_df = pd.DataFrame(list(train_song_node_ids.items()), columns=['Node ID', 'Type'])

In [200]:
train_user_nodes_df

Unnamed: 0,Node ID,Type
0,0,user
1,1,user
2,2,user
3,3,user
4,4,user
...,...,...
3302,3302,user
3303,3303,user
3304,3304,user
3305,3305,user


In [201]:
train_track_nodes_df

Unnamed: 0,Node ID,Type
0,6235,track
1,6346,track
2,6460,track
3,6347,track
4,6117,track
...,...,...
252009,104198,track
252010,121872,track
252011,101599,track
252012,98990,track


In [202]:
missing_nodes = []

for node_id in range(3307):
    if node_id not in g_train:
        missing_nodes.append(node_id)

print("Missing nodes in the range 0-3306:", missing_nodes)

Missing nodes in the range 0-3306: []


### Inspect Test Dataset

In [203]:
g_test = test_graph
test_node_data = g_test.nodes.data()

In [204]:

test_user_node_ids = {}
test_song_node_ids = {}

for node_id, nested_data in test_node_data:
    for key, value in nested_data.items():
        if value == 'user':
            test_user_node_ids[node_id] = value
        if value == 'track':
            test_song_node_ids[node_id] = value
        else:
            pass

In [205]:
test_user_nodes_df = pd.DataFrame(list(test_user_node_ids.items()), columns=['Node ID', 'Type'])
test_track_nodes_df = pd.DataFrame(list(test_song_node_ids.items()), columns=['Node ID', 'Type'])

In [206]:
test_user_nodes_df

Unnamed: 0,Node ID,Type
0,0,user
1,1,user
2,2,user
3,4,user
4,5,user
...,...,...
3274,3302,user
3275,3303,user
3276,3304,user
3277,3305,user


In [207]:
test_track_nodes_df

Unnamed: 0,Node ID,Type
0,188713,track
1,8573,track
2,4256,track
3,4521,track
4,4522,track
...,...,...
155844,199405,track
155845,71521,track
155846,202755,track
155847,169252,track


#### Missing Test User Nodes

In [208]:
# get test nodes that have NO edges

# Initialize an empty list to store nodes with 0 edges
test_nodes_with_zero_edges = []

# Assuming your graph is named 'g'
for node in g_test.nodes():
    if g_test.degree(node) == 0:
        #print(f"Node {node} has no connected edges.")
        test_nodes_with_zero_edges.append(node)

# Print the list of nodes with 0 edges
print("Nodes with 0 edges: (because those nodes are actually not even in the test data, which itself is problematic. they should be there, but have no edges)", test_nodes_with_zero_edges)

Nodes with 0 edges: (because those nodes are actually not even in the test data, which itself is problematic. they should be there, but have no edges) []


In [210]:
missing_nodes = []

for node_id in range(3307):
    if node_id not in g_test:
        missing_nodes.append(node_id)

print("Missing nodes in the range 0-3306:", missing_nodes)
print(f'Number of missing nodes: {len(missing_nodes)}')

Missing nodes in the range 0-3306: [3, 33, 183, 362, 586, 858, 1168, 1309, 1609, 1753, 1759, 1761, 1808, 1840, 1859, 1905, 2124, 2147, 2266, 2402, 2484, 2715, 2727, 2732, 2878, 2909, 2991, 3045]
Number of missing nodes: 28


# Split Inspection

Inspection of the Code from their NB 1, where they split the full graph data into training and test set according to some rule. As we will find out this rule should be determined by "User scrobbles were temporally split. For each user, the first 70% of listened tracks were used as training, while the remaining tracks were used as the test set." But unfortunately a lot of the "time data" that is needed to make this "temporal" split between tracks. Hence many tracks and up at random in the training or testing set, simply because the information on their listening time is missing. 


In [211]:

print(f'Full Graph Dataset: {g}:')
print('======================')

#print(f'Number of graphs: {len(full_graph)}') # seems like the number of graphs is wrong, this is identital with nodes
print(f'Number of nodes: {len(g.nodes)}')
print(f'Number of edges: {len(g.edges)}')

print(f'Train Graph Dataset: {g_train}:')
print('======================')

#print(f'Number of graphs: {len(full_graph)}') # seems like the number of graphs is wrong, this is identital with nodes
print(f'Number of nodes: {len(g_train.nodes)}')
print(f'Number of edges: {len(g_train.edges)}')

print(f'Test Graph Dataset: {g_test}:')
print('======================')

#print(f'Number of graphs: {len(full_graph)}') # seems like the number of graphs is wrong, this is identital with nodes
print(f'Number of nodes (user + song nodes): {len(g_test.nodes)}')
print(f'Number of missing User nodes (nodes with 0 edges AND missing in the original test dataset): {len(missing_nodes)}')
print(f'Number of edges: {len(g_test.edges)}')

Full Graph Dataset: Graph with 255321 nodes and 3018209 edges:
Number of nodes: 255321
Number of edges: 3018209
Train Graph Dataset: Graph with 255321 nodes and 2564908 edges:
Number of nodes: 255321
Number of edges: 2564908
Test Graph Dataset: Graph with 159128 nodes and 453301 edges:
Number of nodes (user + song nodes): 159128
Number of missing User nodes (nodes with 0 edges AND missing in the original test dataset): 28
Number of edges: 453301


#### Single User Inspection
inspect the difference of one node (user) before and after train test split

In [212]:
user_3306_full = full_graph[3306]
user_3306_train = train_graph[3306]
user_3306_test = test_graph[3306]

In [213]:
# Convert nested dictionary to DataFrame
df_user_3306_full = pd.DataFrame.from_dict(user_3306_full, orient='index').reset_index()
df_user_3306_train = pd.DataFrame.from_dict(user_3306_train, orient='index').reset_index()
df_user_3306_test = pd.DataFrame.from_dict(user_3306_test, orient='index').reset_index()

#reset index and rename ID
df_user_3306_full.rename(columns={'index': 'Song_Node_ID'}, inplace=True)
df_user_3306_train.rename(columns={'index': 'Song_Node_ID'}, inplace=True)
df_user_3306_test.rename(columns={'index': 'Song_Node_ID'}, inplace=True)

In [214]:
df_user_3306_full

Unnamed: 0,Song_Node_ID,scrobbles,pos,date
0,53072,7,1460.0,"Tuesday 29 Dec 2009, 1:52am"
1,223362,6,1357.0,"Thursday 31 Dec 2009, 1:12am"
2,14540,5,1693.0,"Tuesday 22 Dec 2009, 12:36am"
3,129761,5,1670.0,"Tuesday 22 Dec 2009, 2:24am"
4,156484,4,207.0,"Thursday 30 Sep 2010, 5:58pm"
...,...,...,...,...
759,3815,1,inf,
760,14156,1,800.0,"Saturday 23 Jan 2010, 3:01pm"
761,12191,1,inf,
762,173432,1,inf,


In [215]:
df_user_3306_train

Unnamed: 0,Song_Node_ID,scrobbles,pos,date
0,119301,1,9.0,"Wednesday 2 Dec 2015, 3:18pm"
1,116122,1,479.0,"Saturday 13 Feb 2010, 1:20am"
2,104198,1,570.0,"Tuesday 2 Feb 2010, 7:32pm"
3,121872,1,574.0,"Tuesday 2 Feb 2010, 7:13pm"
4,47263,1,868.0,"Tuesday 19 Jan 2010, 9:05pm"
...,...,...,...,...
529,222754,1,inf,
530,8500,1,inf,
531,3815,1,inf,
532,12191,1,inf,


In [216]:
df_user_3306_test

Unnamed: 0,Song_Node_ID,scrobbles,pos,date
0,133915,1,11,"Wednesday 25 Nov 2015, 3:59pm"
1,5413,1,13,"Wednesday 25 Nov 2015, 3:43pm"
2,10212,1,15,"Wednesday 25 Nov 2015, 3:36pm"
3,4114,1,28,"Tuesday 24 Nov 2015, 8:45pm"
4,3814,2,29,"Tuesday 24 Nov 2015, 8:41pm"
...,...,...,...,...
225,202755,1,854,"Tuesday 19 Jan 2010, 10:17pm"
226,9790,1,857,"Tuesday 19 Jan 2010, 10:05pm"
227,169252,1,860,"Tuesday 19 Jan 2010, 9:51pm"
228,181742,1,862,"Tuesday 19 Jan 2010, 9:43pm"


In [217]:
len(df_user_3306_train) / len(df_user_3306_full) # confirms the split by 70% ON A USER LEVEL!!! 

0.6989528795811518

In [421]:
## Add ARtist - Tracks Mapping Data

In [218]:
# Replace 'your_file.csv' with the path to your CSV file
artist_tracks_df = pd.read_csv('data/new_artist_tracks_mapping_df.csv')
artist_tracks_df

Unnamed: 0,Artist,Song_Name,Song_Node_ID
0,Black Kids,I'm Not Gonna Teach Your Boyfriend How to Danc...,3307
1,Black Kids,Hit The Heartbrakes,3308
2,Black Kids,I've Underestimated My Charm (Again),3309
3,Black Kids,Partie Traumatic,3310
4,Black Kids,I'm Making Eyes at You,3311
...,...,...,...
252008,Jamie Lancaster,Boys Don't Cry,255203
252009,Sleeperstar,I Was Wrong,255208
252010,Anthony Naples,Mad Disrespect,255228
252011,Irene,Stardust,255253


In [219]:
# Merge the two DataFrames on 'Song ID'
df_user_3306_full = pd.merge(df_user_3306_full, artist_tracks_df, on='Song_Node_ID', how='left')
df_user_3306_train = pd.merge(df_user_3306_train, artist_tracks_df, on='Song_Node_ID', how='left')
df_user_3306_test = pd.merge(df_user_3306_test, artist_tracks_df, on='Song_Node_ID', how='left')


In [220]:
df_user_3306_full

Unnamed: 0,Song_Node_ID,scrobbles,pos,date,Artist,Song_Name
0,53072,7,1460.0,"Tuesday 29 Dec 2009, 1:52am",Edie Brickell & New Bohemians,What I Am
1,223362,6,1357.0,"Thursday 31 Dec 2009, 1:12am",Fun Lovin' Criminals,The Fun Lovin' Criminal
2,14540,5,1693.0,"Tuesday 22 Dec 2009, 12:36am",Rage Against the Machine,People of the Sun
3,129761,5,1670.0,"Tuesday 22 Dec 2009, 2:24am",Soundgarden,Outshined
4,156484,4,207.0,"Thursday 30 Sep 2010, 5:58pm",(hed) Planet Earth,Firsty
...,...,...,...,...,...,...
759,3815,1,inf,,Kings of Leon,Use Somebody
760,14156,1,800.0,"Saturday 23 Jan 2010, 3:01pm",Kings of Leon,Wasted Time
761,12191,1,inf,,Kiss,Love Gun
762,173432,1,inf,,Kittie,Charlotte


In [221]:
df_user_3306_train

Unnamed: 0,Song_Node_ID,scrobbles,pos,date,Artist,Song_Name
0,119301,1,9.0,"Wednesday 2 Dec 2015, 3:18pm",Dirty Wormz,Top Of The Food Chain
1,116122,1,479.0,"Saturday 13 Feb 2010, 1:20am",Edan,The Science Of The Two
2,104198,1,570.0,"Tuesday 2 Feb 2010, 7:32pm",Godsmack,Shine Down
3,121872,1,574.0,"Tuesday 2 Feb 2010, 7:13pm",Falling Up,Escalates (Aceramic)
4,47263,1,868.0,"Tuesday 19 Jan 2010, 9:05pm",Electric Wizard,Raptus
...,...,...,...,...,...,...
529,222754,1,inf,,Killswitch Engage,Never Again
530,8500,1,inf,,Kings of Leon,The Runner
531,3815,1,inf,,Kings of Leon,Use Somebody
532,12191,1,inf,,Kiss,Love Gun


In [222]:
df_user_3306_test

Unnamed: 0,Song_Node_ID,scrobbles,pos,date,Artist,Song_Name
0,133915,1,11,"Wednesday 25 Nov 2015, 3:59pm",JAY-Z,Ni**as In Paris
1,5413,1,13,"Wednesday 25 Nov 2015, 3:43pm",Hozier,Take Me to Church
2,10212,1,15,"Wednesday 25 Nov 2015, 3:36pm",Audioslave,Cochise
3,4114,1,28,"Tuesday 24 Nov 2015, 8:45pm",Kings of Leon,Crawl
4,3814,2,29,"Tuesday 24 Nov 2015, 8:41pm",Kings of Leon,Closer
...,...,...,...,...,...,...
225,202755,1,854,"Tuesday 19 Jan 2010, 10:17pm",Hurt,Summers Lost
226,9790,1,857,"Tuesday 19 Jan 2010, 10:05pm",Joy Division,New Dawn Fades
227,169252,1,860,"Tuesday 19 Jan 2010, 9:51pm",k.d. lang,Fallen
228,181742,1,862,"Tuesday 19 Jan 2010, 9:43pm",Bowling for Soup,I Gotchoo


In [223]:
df_user_3306_full.iloc[229]

Song_Node_ID               142128
scrobbles                       1
pos                           inf
date                         None
Artist          Avenged Sevenfold
Song_Name              Sidewinder
Name: 229, dtype: object

In [224]:
user_3306_full[53072]

{'scrobbles': 7, 'pos': 1460, 'date': 'Tuesday 29 Dec 2009, 1:52am'}

In [225]:
full_train_df = pd.merge(df_user_3306_full, df_user_3306_train, on="Song_Node_ID", how="left")


In [226]:
# The scope of these changes made to
# pandas settings are local to with statement.
with pd.option_context('display.max_rows', 10,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(full_train_df)

Unnamed: 0,Song_Node_ID,scrobbles_x,pos_x,date_x,Artist_x,Song_Name_x,scrobbles_y,pos_y,date_y,Artist_y,Song_Name_y
0,53072,7,1460.0,"Tuesday 29 Dec 2009, 1:52am",Edie Brickell & New Bohemians,What I Am,7.0,1460.0,"Tuesday 29 Dec 2009, 1:52am",Edie Brickell & New Bohemians,What I Am
1,223362,6,1357.0,"Thursday 31 Dec 2009, 1:12am",Fun Lovin' Criminals,The Fun Lovin' Criminal,6.0,1357.0,"Thursday 31 Dec 2009, 1:12am",Fun Lovin' Criminals,The Fun Lovin' Criminal
2,14540,5,1693.0,"Tuesday 22 Dec 2009, 12:36am",Rage Against the Machine,People of the Sun,5.0,1693.0,"Tuesday 22 Dec 2009, 12:36am",Rage Against the Machine,People of the Sun
3,129761,5,1670.0,"Tuesday 22 Dec 2009, 2:24am",Soundgarden,Outshined,5.0,1670.0,"Tuesday 22 Dec 2009, 2:24am",Soundgarden,Outshined
4,156484,4,207.0,"Thursday 30 Sep 2010, 5:58pm",(hed) Planet Earth,Firsty,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
759,3815,1,inf,,Kings of Leon,Use Somebody,1.0,inf,,Kings of Leon,Use Somebody
760,14156,1,800.0,"Saturday 23 Jan 2010, 3:01pm",Kings of Leon,Wasted Time,,,,,
761,12191,1,inf,,Kiss,Love Gun,1.0,inf,,Kiss,Love Gun
762,173432,1,inf,,Kittie,Charlotte,1.0,inf,,Kittie,Charlotte


In [227]:

# The scope of these changes made to
# pandas settings are local to with statement.
with pd.option_context('display.max_rows', 10,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(df_user_3306_full)

Unnamed: 0,Song_Node_ID,scrobbles,pos,date,Artist,Song_Name
0,53072,7,1460.0,"Tuesday 29 Dec 2009, 1:52am",Edie Brickell & New Bohemians,What I Am
1,223362,6,1357.0,"Thursday 31 Dec 2009, 1:12am",Fun Lovin' Criminals,The Fun Lovin' Criminal
2,14540,5,1693.0,"Tuesday 22 Dec 2009, 12:36am",Rage Against the Machine,People of the Sun
3,129761,5,1670.0,"Tuesday 22 Dec 2009, 2:24am",Soundgarden,Outshined
4,156484,4,207.0,"Thursday 30 Sep 2010, 5:58pm",(hed) Planet Earth,Firsty
...,...,...,...,...,...,...
759,3815,1,inf,,Kings of Leon,Use Somebody
760,14156,1,800.0,"Saturday 23 Jan 2010, 3:01pm",Kings of Leon,Wasted Time
761,12191,1,inf,,Kiss,Love Gun
762,173432,1,inf,,Kittie,Charlotte


In [228]:

# The scope of these changes made to
# pandas settings are local to with statement.
with pd.option_context('display.max_rows', 10,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(df_user_3306_test)

Unnamed: 0,Song_Node_ID,scrobbles,pos,date,Artist,Song_Name
0,133915,1,11,"Wednesday 25 Nov 2015, 3:59pm",JAY-Z,Ni**as In Paris
1,5413,1,13,"Wednesday 25 Nov 2015, 3:43pm",Hozier,Take Me to Church
2,10212,1,15,"Wednesday 25 Nov 2015, 3:36pm",Audioslave,Cochise
3,4114,1,28,"Tuesday 24 Nov 2015, 8:45pm",Kings of Leon,Crawl
4,3814,2,29,"Tuesday 24 Nov 2015, 8:41pm",Kings of Leon,Closer
...,...,...,...,...,...,...
225,202755,1,854,"Tuesday 19 Jan 2010, 10:17pm",Hurt,Summers Lost
226,9790,1,857,"Tuesday 19 Jan 2010, 10:05pm",Joy Division,New Dawn Fades
227,169252,1,860,"Tuesday 19 Jan 2010, 9:51pm",k.d. lang,Fallen
228,181742,1,862,"Tuesday 19 Jan 2010, 9:43pm",Bowling for Soup,I Gotchoo


#### Full Graph Song Information
Creates df for Full, Train and Test data, holding ALL "listening" information (basically each edge).
So for each user all the songs they listened to.
Can have multiple songs, multiple times because different users can ofc listen to the same song

In [232]:
def create_dataframe_from_graph(graph):
    # Initialize lists to store extracted information
    user_ids = []
    song_ids = []
    scrobbles = []
    positions = []
    dates = []

    # Iterate over users
    for user_id in range(3307):
        if user_id in graph:
            # Iterate over the songs
            for song_id, songs_info in graph[user_id].items():
                user_ids.append(user_id)
                song_ids.append(song_id)
                scrobbles.append(songs_info['scrobbles'])
                positions.append(songs_info['pos'])
                dates.append(songs_info['date'])
                
    # Create a DataFrame from the lists
    graph_df = pd.DataFrame({
        'User_ID': user_ids,
        'Song_ID': song_ids,
        'Scrobbles': scrobbles,
        'Position': positions,
        'Date': dates
    })
    
    return graph_df

In [234]:
full_graph_df = pd.read_csv('data/pre_processing/full_graph.csv') if os.path.exists('data/pre_processing/full_graph.csv') else create_dataframe_from_graph(full_graph)
full_graph_df.to_csv('data/pre_processing/full_graph.csv', index=False)
train_graph_df = pd.read_csv('data/pre_processing/train_graph.csv') if os.path.exists('data/pre_processing/train_graph.csv') else create_dataframe_from_graph(train_graph)
train_graph_df.to_csv('data/pre_processing/train_graph.csv', index=False)
test_graph_df = pd.read_csv('data/pre_processing/test_graph.csv') if os.path.exists('data/pre_processing/test_graph.csv') else create_dataframe_from_graph(test_graph)
test_graph_df.to_csv('data/pre_processing/test_graph.csv', index=False)


In [235]:
print(len(test_graph_df))
print(len(full_graph_df))
print(len(train_graph_df))
print(len(test_graph_df)/len(full_graph_df))
print(len(train_graph_df)/len(full_graph_df))

453301
3018209
2564908
0.15018873775805452
0.8498112622419455


In [236]:
full_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date
0,0,3307,370,inf,
1,0,3308,357,inf,
2,0,3309,349,inf,
3,0,3310,347,inf,
4,0,3311,346,inf,
...,...,...,...,...,...
3018204,3306,3815,1,inf,
3018205,3306,14156,1,800.0,"Saturday 23 Jan 2010, 3:01pm"
3018206,3306,12191,1,inf,
3018207,3306,173432,1,inf,


In [237]:
train_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date
0,0,6235,22,196.0,"Wednesday 22 Sep 2021, 10:41am"
1,0,6346,21,197.0,"Wednesday 22 Sep 2021, 10:37am"
2,0,6460,20,198.0,"Wednesday 22 Sep 2021, 10:33am"
3,0,6347,21,199.0,"Wednesday 22 Sep 2021, 10:30am"
4,0,6117,23,200.0,"Wednesday 22 Sep 2021, 10:26am"
...,...,...,...,...,...
2564903,3306,222754,1,inf,
2564904,3306,8500,1,inf,
2564905,3306,3815,1,inf,
2564906,3306,12191,1,inf,


In [238]:
test_graph_df

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date
0,0,188713,8,130,"Thursday 23 Sep 2021, 9:31am"
1,0,8573,10,131,"Thursday 23 Sep 2021, 9:22am"
2,0,4256,53,132,"Thursday 23 Sep 2021, 9:19am"
3,0,4521,47,133,"Thursday 23 Sep 2021, 9:15am"
4,0,4522,47,134,"Thursday 23 Sep 2021, 9:12am"
...,...,...,...,...,...
453296,3306,202755,1,854,"Tuesday 19 Jan 2010, 10:17pm"
453297,3306,9790,1,857,"Tuesday 19 Jan 2010, 10:05pm"
453298,3306,169252,1,860,"Tuesday 19 Jan 2010, 9:51pm"
453299,3306,181742,1,862,"Tuesday 19 Jan 2010, 9:43pm"


In [None]:
## check if test_graph has no "inf" in "position"

In [239]:
test_graph_df[test_graph_df["Position"] == float("inf")]

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date


In [240]:
test_graph_df[test_graph_df["Position"] == "inf"]

Unnamed: 0,User_ID,Song_ID,Scrobbles,Position,Date


In [241]:
# Check how many positions in the train graph have the number "inf"
inf_positions_count = train_graph_df['Position'].value_counts().get(float('inf'), 0)

print("Number of positions with 'inf' in the train data:", inf_positions_count, "/", len(train_graph_df), "=", inf_positions_count / len(train_graph_df), "% have no info on position" )
print("Number of positions with an actual Position Info in the train data:", len(train_graph_df) -  inf_positions_count, )



Number of positions with 'inf' in the train data: 2469569 / 2564908 = 0.9628294660081376 % have no info on position
Number of positions with an actual Position Info in the train data: 95339


In [242]:
## double check

# Check if 'Position' is finite and an integer
valid_positions_count = np.sum(np.isfinite(train_graph_df['Position']))

print("Number of rows with a valid integer position:", valid_positions_count)


Number of rows with a valid integer position: 95339


ONLY ~10k from 250k, 4% Tracks in the Training Data actually have a time / position Information

#### Check Train / Test Ratio per User

looking at the % ratio of train / test splits for each user


In [243]:
import pandas as pd

# Initialize lists to store total song counts for each user
total_song_counts_train = []
total_song_counts_test = []
total_song_counts_full = []

# Iterate over each user node ID
for user_id in range(3307):
    if user_id in train_graph:
        # Get the length of song IDs for the current user ID in the training data
        song_count_train = len(train_graph[user_id])
        # Append the song count to the list
        total_song_counts_train.append(song_count_train)
    else:
        # If user ID is not in the training data, append 0
        total_song_counts_train.append(0)

    if user_id in test_graph:
        # Get the length of song IDs for the current user ID in the test data
        song_count_test = len(test_graph[user_id])
        # Append the song count to the list
        total_song_counts_test.append(song_count_test)
    else:
        # If user ID is not in the test data, append 0
        total_song_counts_test.append(0)

    if user_id in full_graph:
        # Get the length of song IDs for the current user ID in the full graph data
        song_count_full = len(full_graph[user_id])
        # Append the song count to the list
        total_song_counts_full.append(song_count_full)
    else:
        # If user ID is not in the full graph data, append 0
        total_song_counts_full.append(0)

# Create a DataFrame to store the total song counts for each user
train_test_ratio_df = pd.DataFrame({
    'userID': range(3307),
    'Train_Songs': total_song_counts_train,
    'Test_Songs': total_song_counts_test,
    'Full_Songs': total_song_counts_full
})

# Calculate the ratios
train_test_ratio_df['Train_Ratio'] = train_test_ratio_df['Train_Songs'] / train_test_ratio_df['Full_Songs']
train_test_ratio_df['Test_Ratio'] = train_test_ratio_df['Test_Songs'] / train_test_ratio_df['Full_Songs']


In [244]:
# The scope of these changes made to
# pandas settings are local to with statement.
with pd.option_context('display.max_rows', 10,
                       'display.max_columns', None,
                       'display.precision', 4,
                       ):
    display(train_test_ratio_df)

Unnamed: 0,userID,Train_Songs,Test_Songs,Full_Songs,Train_Ratio,Test_Ratio
0,0,9513,614,10127,0.9394,0.0606
1,1,12759,874,13633,0.9359,0.0641
2,2,3958,267,4225,0.9368,0.0632
3,3,638,0,638,1.0000,0.0000
4,4,5245,389,5634,0.9310,0.0690
...,...,...,...,...,...,...
3302,3302,668,214,882,0.7574,0.2426
3303,3303,534,194,728,0.7335,0.2665
3304,3304,507,170,677,0.7489,0.2511
3305,3305,266,29,295,0.9017,0.0983


In [250]:
user_df = train_test_ratio_df
# Filter the DataFrame based on the conditions
filtered_df = user_df[((user_df['Train_Ratio'] >= 0.6) & (user_df['Train_Ratio'] <= 0.8)) | ((user_df['Test_Ratio'] >= 0.2) & (user_df['Test_Ratio'] <= 0.4))]

print("Only",len(filtered_df) , "/", len(user_df), " User Nodes have their train/ test split in a ratio of ~70/30 (bound 0.2-0.4 or 0.6-0.8). \nThe rest is (due to split method with the many pos = inf) heavily leaning towards more samples in the training data" )
# Print the filtered DataFrame
with pd.option_context('display.max_rows', 20,
                       'display.max_columns', None,
                       'display.precision', 4,
                       ):
    display(filtered_df)


Only 1304 / 3307  User Nodes have their train/ test split in a ratio of ~70/30 (bound 0.2-0.4 or 0.6-0.8). 
The rest is (due to split method with the many pos = inf) heavily leaning towards more samples in the training data


Unnamed: 0,userID,Train_Songs,Test_Songs,Full_Songs,Train_Ratio,Test_Ratio
12,12,220,95,315,0.6984,0.3016
16,16,1020,305,1325,0.7698,0.2302
17,17,163,70,233,0.6996,0.3004
20,20,742,319,1061,0.6993,0.3007
26,26,872,289,1161,0.7511,0.2489
...,...,...,...,...,...,...
3300,3300,366,157,523,0.6998,0.3002
3302,3302,668,214,882,0.7574,0.2426
3303,3303,534,194,728,0.7335,0.2665
3304,3304,507,170,677,0.7489,0.2511
