In [1]:
import os
import json
import random
import pandas as pd

In [2]:
# Path to the folder containing JSON files
json_folder = "data/spotify_million_playlist_dataset/data"

In [3]:
# Initialize a list to store all playlists
all_playlists = []

In [4]:
# Loop through all JSON files in the folder
for filename in os.listdir(json_folder):
    if filename.endswith(".json"):
        file_path = os.path.join(json_folder, filename)
        
        # Open and load the JSON file
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)  # Each file contains a dictionary with "playlists" key
            
            # Check if 'playlists' key exists and is a list
            playlists = data.get("playlists", [])
            if not isinstance(playlists, list):
                print(f"Unexpected playlists structure in file {filename}")
                continue
            
            # Filter playlists with num_tracks <= 40 and add them to the list
            for playlist in playlists:
                if playlist.get("num_tracks", 0) <= 40:
                    all_playlists.append(playlist)
                    if len(all_playlists) == 4000:  # Stop once 4000 playlists are collected
                        break
        
        if len(all_playlists) == 4000:  # Stop processing files if we have 4000 playlists
            break

In [5]:
all_playlists[0]

{'name': 'August',
 'collaborative': 'false',
 'pid': 549003,
 'modified_at': 1505001600,
 'num_tracks': 38,
 'num_albums': 32,
 'num_followers': 1,
 'tracks': [{'pos': 0,
   'artist_name': 'C-Trox',
   'track_uri': 'spotify:track:44uuZDQFAtfag94mDPIsEu',
   'artist_uri': 'spotify:artist:1MWyFWQ7erF4XjLph84x5J',
   'track_name': 'Passionate',
   'album_uri': 'spotify:album:1eJGL3iCFGrc0O34Wh57cA',
   'duration_ms': 206001,
   'album_name': 'Passionate'},
  {'pos': 1,
   'artist_name': 'Watsky',
   'track_uri': 'spotify:track:0Vwfd6fxFrL3kCnZSJ9vid',
   'artist_uri': 'spotify:artist:3mJ9GlkLzj8Ka7Z7EQaCMi',
   'track_name': 'Sloppy Seconds',
   'album_uri': 'spotify:album:2zgb2l9pexGdZkM5pGuwb9',
   'duration_ms': 245933,
   'album_name': 'Cardboard Castles'},
  {'pos': 2,
   'artist_name': 'Maritime',
   'track_uri': 'spotify:track:2Ze0YvSXz8CnC81hw5rXNo',
   'artist_uri': 'spotify:artist:79xu36jzq3a5nU8uubNZr9',
   'track_name': 'Roaming Empire',
   'album_uri': 'spotify:album:3A2rsJI

In [6]:
len(all_playlists)

4000

In [7]:
# Initialize a list to store flattened data
flattened_data = []

In [8]:
# Flatten the data
for playlist in all_playlists:
    for track in playlist["tracks"]:
        flattened_data.append({
            "pid": playlist["pid"],
            "name": playlist["name"],
            "artist_name": track["artist_name"],
            "artist_uri": track["artist_uri"],
            "track_name": track["track_name"],
            "track_uri": track["track_uri"],
            "album_uri": track["album_uri"],
            "duration_ms": track["duration_ms"],
            "album_name": track["album_name"]
        })

In [9]:
# Convert the flattened data to a pandas DataFrame
df = pd.DataFrame(flattened_data)

In [10]:
df.head(5)

Unnamed: 0,pid,name,artist_name,artist_uri,track_name,track_uri,album_uri,duration_ms,album_name
0,549003,August,C-Trox,spotify:artist:1MWyFWQ7erF4XjLph84x5J,Passionate,spotify:track:44uuZDQFAtfag94mDPIsEu,spotify:album:1eJGL3iCFGrc0O34Wh57cA,206001,Passionate
1,549003,August,Watsky,spotify:artist:3mJ9GlkLzj8Ka7Z7EQaCMi,Sloppy Seconds,spotify:track:0Vwfd6fxFrL3kCnZSJ9vid,spotify:album:2zgb2l9pexGdZkM5pGuwb9,245933,Cardboard Castles
2,549003,August,Maritime,spotify:artist:79xu36jzq3a5nU8uubNZr9,Roaming Empire,spotify:track:2Ze0YvSXz8CnC81hw5rXNo,spotify:album:3A2rsJIXswjqBHa9digdCb,234960,Magnetic Bodies/Maps of Bones
3,549003,August,RY X,spotify:artist:2KjAo6wVc9d2WcxdxSArpV,Howling,spotify:track:34hMOtKwf5nm8tjvkGV0Dk,spotify:album:0DvauBOn2G8RIEQR54Cmiv,309590,Dawn
4,549003,August,Liam Payne,spotify:artist:5pUo3fmmHT8bhCyHE52hA6,Strip That Down,spotify:track:6EpRaXYhGOB3fj4V2uDkMJ,spotify:album:2mnDyPSNM02LMvniaMWnLl,204502,Strip That Down


In [11]:
total_songs = len(df)
total_songs

91598

In [12]:
df['track_id'] = df['track_uri'].str.split(':').str[2]

In [13]:
df.head(5)

Unnamed: 0,pid,name,artist_name,artist_uri,track_name,track_uri,album_uri,duration_ms,album_name,track_id
0,549003,August,C-Trox,spotify:artist:1MWyFWQ7erF4XjLph84x5J,Passionate,spotify:track:44uuZDQFAtfag94mDPIsEu,spotify:album:1eJGL3iCFGrc0O34Wh57cA,206001,Passionate,44uuZDQFAtfag94mDPIsEu
1,549003,August,Watsky,spotify:artist:3mJ9GlkLzj8Ka7Z7EQaCMi,Sloppy Seconds,spotify:track:0Vwfd6fxFrL3kCnZSJ9vid,spotify:album:2zgb2l9pexGdZkM5pGuwb9,245933,Cardboard Castles,0Vwfd6fxFrL3kCnZSJ9vid
2,549003,August,Maritime,spotify:artist:79xu36jzq3a5nU8uubNZr9,Roaming Empire,spotify:track:2Ze0YvSXz8CnC81hw5rXNo,spotify:album:3A2rsJIXswjqBHa9digdCb,234960,Magnetic Bodies/Maps of Bones,2Ze0YvSXz8CnC81hw5rXNo
3,549003,August,RY X,spotify:artist:2KjAo6wVc9d2WcxdxSArpV,Howling,spotify:track:34hMOtKwf5nm8tjvkGV0Dk,spotify:album:0DvauBOn2G8RIEQR54Cmiv,309590,Dawn,34hMOtKwf5nm8tjvkGV0Dk
4,549003,August,Liam Payne,spotify:artist:5pUo3fmmHT8bhCyHE52hA6,Strip That Down,spotify:track:6EpRaXYhGOB3fj4V2uDkMJ,spotify:album:2mnDyPSNM02LMvniaMWnLl,204502,Strip That Down,6EpRaXYhGOB3fj4V2uDkMJ


In [14]:
df.to_csv(f'4000_playlists_{total_songs}.csv')

In [16]:
df[df['track_id']=='3JOF9NzQVkUXtCcJbEQuAb']

Unnamed: 0,pid,name,artist_name,artist_uri,track_name,track_uri,album_uri,duration_ms,album_name,track_id
2204,549197,Lol,Willamette Stone,spotify:artist:19gmxCK2V3jLMi5fDYyKtS,Heart Like Yours,spotify:track:3JOF9NzQVkUXtCcJbEQuAb,spotify:album:2cpfXsbsHVNweQ4USbO6wu,199960,If I Stay (Original Motion Picture Soundtrack),3JOF9NzQVkUXtCcJbEQuAb
6394,549623,Chillz,Willamette Stone,spotify:artist:19gmxCK2V3jLMi5fDYyKtS,Heart Like Yours,spotify:track:3JOF9NzQVkUXtCcJbEQuAb,spotify:album:2cpfXsbsHVNweQ4USbO6wu,199960,If I Stay (Original Motion Picture Soundtrack),3JOF9NzQVkUXtCcJbEQuAb
6395,549623,Chillz,Willamette Stone,spotify:artist:19gmxCK2V3jLMi5fDYyKtS,Heart Like Yours,spotify:track:3JOF9NzQVkUXtCcJbEQuAb,spotify:album:2cpfXsbsHVNweQ4USbO6wu,199960,If I Stay (Original Motion Picture Soundtrack),3JOF9NzQVkUXtCcJbEQuAb
