In [1]:
import pandas as pd

In [2]:
import json

In [3]:
from tqdm import tqdm

In [4]:
data = pd.read_csv("../public/spotify_data.csv")

In [5]:
with open("../src/api/music_genres_tree.json", 'r') as f:
	tree = json.load(f)

In [6]:
def get_leafs(tree):
    childs = tree.get("children")
    if childs is None:
        songs_list = []
        tree["songs"] = songs_list
        yield (tree["name"], songs_list)
        return
    for child in childs:
        yield from get_leafs(child)

In [7]:
leaves = dict([*get_leafs(tree)])

In [8]:
(expected_keys:=set(['acoustic', 'afrobeat', 'alt-rock', 'ambient', 'black-metal',
       'blues', 'breakbeat', 'cantopop', 'chicago-house', 'chill',
       'classical', 'club', 'comedy', 'country', 'dance', 'dancehall',
       'death-metal', 'deep-house', 'detroit-techno', 'disco',
       'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro', 'electronic',
       'emo', 'folk', 'forro', 'french', 'funk', 'garage', 'german',
       'gospel', 'goth', 'grindcore', 'groove', 'guitar', 'hard-rock',
       'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'house',
       'indian', 'indie-pop', 'industrial', 'jazz', 'k-pop', 'metal',
       'metalcore', 'minimal-techno', 'new-age', 'opera', 'party',
       'piano', 'pop', 'pop-film', 'power-pop', 'progressive-house',
       'psych-rock', 'punk', 'punk-rock', 'rock', 'rock-n-roll',
       'romance', 'sad', 'salsa', 'samba', 'sertanejo', 'show-tunes',
       'singer-songwriter', 'ska', 'sleep', 'songwriter', 'soul',
       'spanish', 'swedish', 'tango', 'techno', 'trance', 'trip-hop'])) == set(leaves.keys())

True

In [9]:
data.head()[["track_name", "track_id"]]

Unnamed: 0,track_name,track_id
0,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6
1,93 Million Miles,1s8tP3jP4GZcyHDsjvw218
2,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F
3,Fast Car,63wsZUhUZLlh1OsyrZq7sz
4,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8


In [10]:
len(dict([*get_leafs(tree)])), len(expected_keys)

(82, 82)

In [11]:
corrected_data = data.copy()
corrected_data['track_name'] = corrected_data['track_name'].fillna("Unknown Track")

print(f"Utilisation des données corrigées: {len(corrected_data)} lignes")

leaves = dict([*get_leafs(tree)])
for song in tqdm(corrected_data.iterrows(), total=len(corrected_data)):
    leaves[dict(song[1])["genre"]].append(song[1][["track_name", "track_id"]])

Utilisation des données corrigées: 1159764 lignes


100%|██████████| 1159764/1159764 [10:45<00:00, 1795.32it/s]
100%|██████████| 1159764/1159764 [10:45<00:00, 1795.32it/s]


In [12]:
# Convertir les Series en dictionnaires 
leaves_serializable = {}
for genre, songs_list in tqdm(leaves.items(), desc="Converting to serializable format"):
    leaves_serializable[genre] = [song.to_dict() for song in songs_list]

Converting to serializable format: 100%|██████████| 82/82 [00:24<00:00,  3.29it/s]
Converting to serializable format: 100%|██████████| 82/82 [00:24<00:00,  3.29it/s]


In [18]:
# Maintenant créons la structure hiérarchique avec l'arbre des genres enrichi
def enrich_tree_with_songs(tree_node, songs_by_genre):
    """Enrichit récursivement l'arbre avec les chansons"""
    # Ajouter les chansons pour ce nœud
    tree_node['songs'] = songs_by_genre.get(tree_node['name'], [])
    
    # Si c'est une feuille, pas d'enfants à traiter
    if 'children' not in tree_node or not tree_node['children']:
        return tree_node
    
    # Traiter récursivement les enfants
    for child in tree_node['children']:
        enrich_tree_with_songs(child, songs_by_genre)
    
    # Agréger les chansons des enfants dans le parent
    all_child_songs = []
    for child in tree_node['children']:
        all_child_songs.extend(child.get('songs', []))
    
    # Combiner les chansons directes et des enfants (sans doublons)
    seen_ids = set()
    combined_songs = []
    
    for song in tree_node['songs'] + all_child_songs:
        if song['track_id'] not in seen_ids:
            seen_ids.add(song['track_id'])
            combined_songs.append(song)
    
    tree_node['songs'] = combined_songs
    return tree_node

# Enrichir l'arbre avec les chansons
enriched_tree = enrich_tree_with_songs(tree.copy(), leaves_serializable)

print(f"Arbre enrichi créé avec {len(enriched_tree['songs'])} chansons au niveau racine")

Arbre enrichi créé avec 1159764 chansons au niveau racine


In [19]:
output_file = "../public/indexByGenreSongs.json"
with tqdm(total=1, desc="Saving hierarchical tree") as pbar:
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(enriched_tree, f, indent=2, ensure_ascii=False)
    pbar.update(1)

print(f"Structure hiérarchique sauvegardée dans {output_file}")
print("Le fichier contient maintenant une structure arborescente compatible avec TreeVizProcessor")

Saving hierarchical tree: 100%|██████████| 1/1 [00:49<00:00, 49.16s/it]

Structure hiérarchique sauvegardée dans ../public/indexByGenreSongs.json
Le fichier contient maintenant une structure arborescente compatible avec TreeVizProcessor



